From 71e647b349036069751a457f3e0e8fda1b54cd2a Mon Sep 17 00:00:00 2001
From: Irene Knapp <ireneista@gmail.com>
Date: Tue, 22 Sep 2020 14:19:40 -0700
Subject: Parse space-separated words.

---
 src/commandline.lalrpop | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'src/commandline.lalrpop')

diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop
index 0655281..73f7247 100644
--- a/src/commandline.lalrpop
+++ b/src/commandline.lalrpop
@@ -1,16 +1,39 @@
 grammar;
 
-//
-// Z is the unicode class for separators, including line, paragraph, and space
-// separators. C is the class for control characters. P is the class for
-// punctuation. This regexp tests for the intersection of the negation of these
-// character classes, which is any character NOT in one of these three classes.
-//
-// [1] is the official reference, and [2] is a site that is useful for browsing
-// to get an intuitive idea of what these classes mean.
+pub Invocation: Vec<&'input str> = {  // one command line, as its list of words
+  <words:WORD+> NEWLINE => words,  // the closing NEWLINE is matched, then dropped
+};
+
+// Several of the regexps below make use of Unicode character classes. [1] is
+// the official reference to Unicode classes, and [2] is a site that is useful
+// for browsing to get an intuitive idea of what the classes mean.
 //
 // [1] http://www.unicode.org/reports/tr44/#General_Category_Values
 // [2] https://www.compart.com/en/unicode/category
 //
-pub Filename: String = <filename:r"[\P{Z}&&\P{C}&&\P{P}]+"> => filename.to_string();
+match {
+  // Zs is the Unicode class for space separators. This includes the ASCII
+  // space character. Runs of such characters are skipped and yield no token.
+  //
+  r"\p{Zs}+" => { },
+
+  // Zl and Zp are the Unicode classes for line and paragraph separators. Line
+  // feed and carriage return are listed individually, since Unicode classifies
+  // them as control characters rather than as separators. A CR LF pair is
+  // matched as ONE token (the longest match wins), so a Windows line ending
+  // terminates an Invocation once instead of lexing as two NEWLINE tokens.
+  r"\r\n|[\p{Zl}\p{Zp}\n\r]" => NEWLINE,
+
+  // Z is the Unicode class for separators, including line, paragraph, and space
+  // separators. C is the class for control characters. P is the class for
+  // punctuation. This regexp tests for the intersection of the negation of these
+  // character classes, which is any character NOT in one of these three classes.
+  //
+  // Note that, counterintuitively, line feed and carriage return are classified
+  // as control characters, not as line separators. Either way, this regexp would
+  // still exclude them, but the difference might be relevant when maintaining
+  // it.
+  //
+  r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
+}
 
-- 
cgit 1.4.1