From 0940e1f0bf8b9e499717b02cefe8c59601c21673 Mon Sep 17 00:00:00 2001
From: Irene Knapp <ireneista@gmail.com>
Date: Mon, 14 Dec 2020 20:14:11 -0800
Subject: path lists parse okay now

---
 src/commandline.lalrpop | 34 ++++++++++++++++++++++++++++++----
 src/main.rs             | 42 +++++++++++++++++++++++++++++++++++++++++-
 src/path.lalrpop        | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 5 deletions(-)
 create mode 100644 src/path.lalrpop
diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop
index 73f7247..dd7d84e 100644
--- a/src/commandline.lalrpop
+++ b/src/commandline.lalrpop
@@ -8,6 +8,20 @@ pub Invocation: Vec<&'input str> = {
 // the official reference to Unicode classes, and [2] is a site that is useful
 // for browsing to get an intuitive idea of what the classes mean.
 //
+// In maintaining these regexps, it's important to understand the structure
+// of Unicode character classes. There are seven top-level categories, each
+// with a single-character name (e.g. "Z" for separators). Each top-level
+// category has several subcategories which form an exhaustive partition of it;
+// the subcategories have two-character names (e.g. "Zs" for space separators).
+// Every allocated codepoint is in exactly one top-level category and exactly
+// one subcategory.
+//
+// It is important that these regexps exhaustively cover the entirety of
+// Unicode, without omission; otherwise lalrpop's lexer will give InvalidToken
+// errors for unrecognized characters. Overlaps will be less catastrophic, as
+// they'll be resolved by the precedence rules, but for clarity's sake they
+// should be avoided.
+//
 // [1] http://www.unicode.org/reports/tr44/#General_Category_Values
 // [2] https://www.compart.com/en/unicode/category
 //
@@ -24,16 +38,28 @@ match {
   //
   r"[\p{Zl}\p{Zp}\n\r]" => NEWLINE,
 
+  // This one recognizes exactly one character, the old-school double-quote. As
+  // tempting as it is to do something clever with character classes, shells have
+  // a long history of quoting syntaxes which are subtle and quick to anger, and
+  // for this project the decision is to be radically simple instead.
+  r#"["]"# => QUOTE,
+
+  // This one matches any control character other than line feed and carriage
+  // return. The grammar doesn't reference control characters, but having a
+  // token for them makes the error messages more informative.
+  r"[\p{C}&&[^\n\r]]" => CONTROL,
+
   // Z is the unicode class for separators, including line, paragraph, and space
-  // separators. C is the class for control characters. P is the class for
-  // punctuation. This regexp tests for the intersection of the negation of these
-  // character classes, which is any character NOT in one of these three classes.
+  // separators. C is the class for control characters. This regexp tests for
+  // the intersection of the negation of these character classes, along with a
+  // negated class enumerating all the explicitly-recognized characters, which
+  // means it matches any character NOT in the regexps above.
   //
   // Note that, counterintuitively, line feed and carriage return are classified
   // as control characters, not as line separators. Either way, this regexp would
   // still exclude them, but the difference might be relevant when maintaining
   // it.
   //
-  r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
+  r#"[\P{Z}&&\P{C}&&[^"]]+"# => WORD,
 }
 
diff --git a/src/main.rs b/src/main.rs
index dd1c31d..d499ad9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,6 +7,7 @@ use std::io::prelude::*;
 #[macro_use] extern crate lalrpop_util;
 
 lalrpop_mod!(pub commandline);
+lalrpop_mod!(pub path);
 
 
 pub type Result<T> = std::result::Result<T, Error>;
@@ -17,6 +18,18 @@ pub enum Input {
     End,
 }
 
+pub struct GenericPath {
+    absolute: bool,
+    directory: bool,
+    components: Vec<GenericPathComponent>,
+}
+
+pub enum GenericPathComponent {
+    CurrentDirectory,
+    ParentDirectory,
+    Entry(String),
+}
+
 
 fn main() -> Result<()> {
     std::process::exit(match repl() {
@@ -70,7 +83,34 @@ fn execute(input: &str) -> Result<()> {
     let invocation = commandline::InvocationParser::new().parse(input)?;
 
     println!("{}", input);
-    println!("invocation '{:?}'", invocation);
+
+    match invocation.as_slice() {
+      ["paths", path_list, ..] => {
+        println!("{:?}", path_list);
+        match path::PathListParser::new().parse(path_list) {
+          Ok(parsed_paths) => {
+            println!("paths '{:?}'", parsed_paths);
+          },
+          Err(_) => {
+            match path::PathListAllowingEmptyPathsParser::new()
+              .parse(path_list)
+            {
+              Ok(parsed) => {
+                println!("path list has empty component");
+                println!("{:?}", parsed);
+              },
+              Err(error) => {
+                println!("path list is okay, does not have empty component");
+                println!("{:?}", error);
+              },
+            }
+          },
+        }
+      },
+      _ => {
+        println!("invocation '{:?}'", invocation);
+      }
+    }
 
     Ok(())
 }
diff --git a/src/path.lalrpop b/src/path.lalrpop
new file mode 100644
index 0000000..099b217
--- /dev/null
+++ b/src/path.lalrpop
@@ -0,0 +1,38 @@
+grammar;
+
+pub PathList: Vec<&'input str> = {
+  => {
+    Vec::new()
+  },
+  <mut left:(<Path> COLON)*> <right:Path> => {
+    left.push(right);
+    left
+  },
+};
+
+pub PathListAllowingEmptyPaths: Vec<&'input str> = {
+  => vec![""],
+  Path => vec![<>],
+  <mut left:PathListAllowingEmptyPaths> COLON => {
+    left.push("");
+    left
+  },
+  <mut left:PathListAllowingEmptyPaths> COLON <right:Path> => {
+    left.push(right);
+    left
+  },
+}
+
+pub Path: &'input str = {
+  <PATH_COMPONENT>,
+}
+
+// Whitespace is not allowed: class Z (separators) is excluded, like : and /.
+match {
+  r"[^\p{Z}:/]+" => PATH_COMPONENT,
+
+  r"/" => SLASH,
+
+  ":" => COLON,
+}
+
-- 
cgit 1.4.1