From 0940e1f0bf8b9e499717b02cefe8c59601c21673 Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Mon, 14 Dec 2020 20:14:11 -0800 Subject: path lists parse okay now --- src/commandline.lalrpop | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'src/commandline.lalrpop') diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop index 73f7247..dd7d84e 100644 --- a/src/commandline.lalrpop +++ b/src/commandline.lalrpop @@ -8,6 +8,20 @@ pub Invocation: Vec<&'input str> = { // the official reference to Unicode classes, and [2] is a site that is useful // for browsing to get an intuitive idea of what the classes mean. // +// In maintaining these regexps, it's important to understand the structure +// of Unicode character classes. There are seven top-level categories, each +// with a single-character name (e.g. "Z" for separators). Each top-level +// category has several subcategories which form an exhaustive partition of it; +// the subcategories have two-character names (e.g. "Zs" for space separators). +// Every allocated codepoint is in exactly one top-level category and exactly +// one subcategory. +// +// It is important that these regexps exhaustively cover the entirety of +// Unicode, without omission; otherwise lalrpop's lexer will give InvalidToken +// errors for unrecognized characters. Overlaps will be less catastrophic, as +// they'll be resolved by the precedence rules, but for clarity's sake they +// should be avoided. +// // [1] http://www.unicode.org/reports/tr44/#General_Category_Values // [2] https://www.compart.com/en/unicode/category // @@ -24,16 +38,28 @@ match { // r"[\p{Zl}\p{Zp}\n\r]" => NEWLINE, + // This one recognizes exactly one character, the old-school double-quote. 
As + // tempting as it is to do something clever with character classes, shells have + // a long history of quoting syntaxes which are subtle and quick to anger, and + // for this project the decision is to be radically simple instead. + r#"["]"# => QUOTE, + + // This one matches any control character other than line feed and carriage + // return. The grammar doesn't reference control characters, but having a + // token for them makes the error messages more informative. + r"[\p{C}&&[^\n\r]]" => CONTROL, + // Z is the unicode class for separators, including line, paragraph, and space - // separators. C is the class for control characters. P is the class for - // punctuation. This regexp tests for the intersection of the negation of these - // character classes, which is any character NOT in one of these three classes. + // separators. C is the class for control characters (strictly, the "Other" + // category: control, format, surrogate, private-use, and unassigned + // codepoints). This regexp tests for + // the intersection of the negation of these character classes, along with a + // negated class enumerating all the explicitly-recognized characters, which + // means it matches any character NOT in the regexps above. // // Note that, counterintuitively, line feed and carriage return are classified // as control characters, not as line separators. Either way, this regexp would // still exclude them, but the difference might be relevant when maintaining // it. // - r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD, + r#"[\P{Z}&&\P{C}&&[^"]]+"# => WORD, } -- cgit 1.4.1