summary refs log tree commit diff
path: root/src/commandline.lalrpop
diff options
context:
space:
mode:
Diffstat (limited to 'src/commandline.lalrpop')
-rw-r--r--src/commandline.lalrpop34
1 files changed, 30 insertions, 4 deletions
diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop
index 73f7247..dd7d84e 100644
--- a/src/commandline.lalrpop
+++ b/src/commandline.lalrpop
@@ -8,6 +8,20 @@ pub Invocation: Vec<&'input str> = {
 // the official reference to Unicode classes, and [2] is a site that is useful
 // for browsing to get an intuitive idea of what the classes mean.
 //
+// In maintaining these regexps, it's important to understand the structure
+// of Unicode character classes. There are seven top-level categories, each
+// with a single-character name (e.g. "Z" for separators). Each top-level
+// category has several subcategories which form an exhaustive partition of it;
+// the subcategories have two-character names (e.g. "Zs" for space separators).
+// Every allocated codepoint is in exactly one top-level category and exactly
+// one subcategory.
+//
+// It is important that these regexps exhaustively cover the entirety of
+// Unicode, without omission; otherwise lalrpop's lexer will give InvalidToken
+// errors for unrecognized characters. Overlaps will be less catastrophic, as
+// they'll be resolved by the precedence rules, but for clarity's sake they
+// should be avoided.
+//
 // [1] http://www.unicode.org/reports/tr44/#General_Category_Values
 // [2] https://www.compart.com/en/unicode/category
 //
@@ -24,16 +38,28 @@ match {
   //
   r"[\p{Zl}\p{Zp}\n\r]" => NEWLINE,
 
+  // This one recognizes exactly one character, the old-school double-quote. As
+  // tempting as it is to do something clever with character classes, shells have
+  // a long history of quoting syntaxes which are subtle and quick to anger, and
+  // for this project the decision is to be radically simple instead.
+  r#"["]"# => QUOTE,
+
+  // This one matches any category-C ("Other": control, format, etc.) character
+  // except line feed and carriage return. The grammar doesn't reference such
+  // characters, but a token for them makes the error messages more informative.
+  r"[\p{C}&&[^\n\r]]" => CONTROL,
+
   // Z is the unicode class for separators, including line, paragraph, and space
-  // separators. C is the class for control characters. P is the class for
-  // punctuation. This regexp tests for the intersection of the negation of these
-  // character classes, which is any character NOT in one of these three classes.
+  // separators. C is the class for control characters. This regexp tests for
+  // the intersection of the negation of these character classes, along with a
+  // negated class enumerating all the explicitly-recognized characters, which
+  // means it matches any character NOT in the regexps above.
   //
   // Note that, counterintuitively, line feed and carriage return are classified
   // as control characters, not as line separators. Either way, this regexp would
   // still exclude them, but the difference might be relevant when maintaining
   // it.
   //
-  r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
+  r#"[\P{Z}&&\P{C}&&[^"]]+"# => WORD,
 }