From 71e647b349036069751a457f3e0e8fda1b54cd2a Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Tue, 22 Sep 2020 14:19:40 -0700 Subject: Parse space-separated words. --- src/commandline.lalrpop | 41 ++++++++++++++++++++++++++++++++--------- src/main.rs | 4 ++-- 2 files changed, 34 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop index 0655281..73f7247 100644 --- a/src/commandline.lalrpop +++ b/src/commandline.lalrpop @@ -1,16 +1,39 @@ grammar; -// -// Z is the unicode class for separators, including line, paragraph, and space -// separators. C is the class for control characters. P is the class for -// punctuation. This regexp tests for the intersection of the negation of these -// character classes, which is any character NOT in one of these three classes. -// -// [1] is the official reference, and [2] is a site that is useful for browsing -// to get an intuitive idea of what these classes mean. +pub Invocation: Vec<&'input str> = { + NEWLINE, +}; + +// Several of the regexps below make use of Unicode character classes. [1] is +// the official reference to Unicode classes, and [2] is a site that is useful +// for browsing to get an intuitive idea of what the classes mean. // // [1] http://www.unicode.org/reports/tr44/#General_Category_Values // [2] https://www.compart.com/en/unicode/category // -pub Filename: String = => filename.to_string(); +match { + // Zs is the Unicode class for space separators. This includes the ASCII + // space character. + // + r"\p{Zs}+" => { }, + + // Zl is the Unicode class for line separators. Zp is the Unicode class for + // paragraph separators. Newline and carriage return are included individually + // here, since Unicode classifies them with the control characters rather than + // with the space characters. + // + r"[\p{Zl}\p{Zp}\n\r]" => NEWLINE, + + // Z is the unicode class for separators, including line, paragraph, and space + // separators. C is the class for control characters. P is the class for + // punctuation. This regexp tests for the intersection of the negation of these + // character classes, which is any character NOT in one of these three classes. + // + // Note that, counterintuitively, line feed and carriage return are classified + // as control characters, not as line separators. Either way, this regexp would + // still exclude them, but the difference might be relevant when maintaining + // it. + // + r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD, +} diff --git a/src/main.rs b/src/main.rs index 816ed49..dd1c31d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -67,10 +67,10 @@ fn read() -> Result { fn execute(input: &str) -> Result<()> { - let filename = commandline::FilenameParser::new().parse(input)?; + let invocation = commandline::InvocationParser::new().parse(input)?; println!("{}", input); - println!("filename '{}'", filename); + println!("invocation '{:?}'", invocation); Ok(()) } -- cgit 1.4.1