From 71e647b349036069751a457f3e0e8fda1b54cd2a Mon Sep 17 00:00:00 2001
From: Irene Knapp
Date: Tue, 22 Sep 2020 14:19:40 -0700
Subject: Parse space-separated words.

---
NOTE(review): this copy was rebuilt from a web rendering that had lost its line
breaks and every `<...>` symbol expression. The `<filename:...>` binding in the
removed `Filename` rule and the `<WORD*>` in the added `Invocation` rule are
reconstructions; confirm both, and the blank-line placement, against the
repository. The NEWLINE rule additionally differs from the original commit: it
now accepts CR LF as a single token.

 src/commandline.lalrpop | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'src/commandline.lalrpop')

diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop
index 0655281..73f7247 100644
--- a/src/commandline.lalrpop
+++ b/src/commandline.lalrpop
@@ -1,16 +1,39 @@
 grammar;
 
-//
-// Z is the unicode class for separators, including line, paragraph, and space
-// separators. C is the class for control characters. P is the class for
-// punctuation. This regexp tests for the intersection of the negation of these
-// character classes, which is any character NOT in one of these three classes.
-//
-// [1] is the official reference, and [2] is a site that is useful for browsing
-// to get an intuitive idea of what these classes mean.
+pub Invocation: Vec<&'input str> = {
+  <WORD*> NEWLINE,
+};
+
+// Several of the regexps below make use of Unicode character classes. [1] is
+// the official reference to Unicode classes, and [2] is a site that is useful
+// for browsing to get an intuitive idea of what the classes mean.
 //
 // [1] http://www.unicode.org/reports/tr44/#General_Category_Values
 // [2] https://www.compart.com/en/unicode/category
 //
 
-pub Filename: String = <filename:r"[\P{Z}&&\P{C}&&\P{P}]+"> => filename.to_string();
+match {
+  // Zs is the Unicode class for space separators. This includes the ASCII
+  // space character.
+  //
+  r"\p{Zs}+" => { },
+
+  // Zl and Zp are the Unicode classes for line and paragraph separators. Line
+  // feed and carriage return are listed individually, since Unicode classifies
+  // them as control characters rather than separators. A CR LF pair is matched
+  // as one NEWLINE token so that such a line ending terminates one invocation.
+  //
+  r"\r\n|[\p{Zl}\p{Zp}\n\r]" => NEWLINE,
+
+  // Z is the Unicode class for separators, including line, paragraph, and space
+  // separators. C is the class for control characters. P is the class for
+  // punctuation. This regexp tests for the intersection of the negation of these
+  // character classes, which is any character NOT in one of these three classes.
+  //
+  // Note that, counterintuitively, line feed and carriage return are classified
+  // as control characters, not as line separators. Either way, this regexp would
+  // still exclude them, but the difference might be relevant when maintaining
+  // it.
+  //
+  r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
+}
-- 
cgit 1.4.1