From 0940e1f0bf8b9e499717b02cefe8c59601c21673 Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Mon, 14 Dec 2020 20:14:11 -0800 Subject: path lists parse okay now --- src/commandline.lalrpop | 34 ++++++++++++++++++++++++++++++---- src/main.rs | 42 +++++++++++++++++++++++++++++++++++++++++- src/path.lalrpop | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 5 deletions(-) create mode 100644 src/path.lalrpop diff --git a/src/commandline.lalrpop b/src/commandline.lalrpop index 73f7247..dd7d84e 100644 --- a/src/commandline.lalrpop +++ b/src/commandline.lalrpop @@ -8,6 +8,20 @@ pub Invocation: Vec<&'input str> = { // the official reference to Unicode classes, and [2] is a site that is useful // for browsing to get an intuitive idea of what the classes mean. // +// In maintaining these regexps, it's important to understand the structure +// of Unicode character classes. There are seven top-level categories, each +// with a single-character name (e.g. "Z" for separators). Each top-level +// category has several subcategories which form an exhaustive partition of it; +// the subcategories have two-character names (e.g. "Zs" for space separators). +// Every allocated codepoint is in exactly one top-level category and exactly +// one subcategory. +// +// It is important that these regexps exhaustively cover the entirety of +// Unicode, without omission; otherwise lalrpop's lexer will give InvalidToken +// errors for unrecognized characters. Overlaps will be less catastrophic, as +// they'll be resolved by the precedence rules, but for clarity's sake they +// should be avoided. +// // [1] http://www.unicode.org/reports/tr44/#General_Category_Values // [2] https://www.compart.com/en/unicode/category // @@ -24,16 +38,28 @@ match { // r"[\p{Zl}\p{Zp}\n\r]" => NEWLINE, + // This one recognizes exactly one character, the old-school double-quote. 
As + // tempting as it is to do something clever with character classes, shells have + // a long history of quoting syntaxes which are subtle and quick to anger, and + // for this project the decision is to be radically simple instead. + r#"["]"# => QUOTE, + + // This one matches any control character other than line feed and carriage + // return. The grammar doesn't reference control characters, but having a + // token for them makes the error messages more informative. + r"[\p{C}&&[^\n\r]]" => CONTROL, + // Z is the unicode class for separators, including line, paragraph, and space - // separators. C is the class for control characters. P is the class for - // punctuation. This regexp tests for the intersection of the negation of these - // character classes, which is any character NOT in one of these three classes. + // separators. C is the class for control characters. This regexp tests for + // the intersection of the negation of these character classes, along with a + // negated class enumerating all the explicitly-recognized characters, which + // means it matches any character NOT in the regexps above. // // Note that, counterintuitively, line feed and carriage return are classified // as control characters, not as line separators. Either way, this regexp would // still exclude them, but the difference might be relevant when maintaining // it. 
// - r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD, + r#"[\P{Z}&&\P{C}&&[^"]]+"# => WORD, } diff --git a/src/main.rs b/src/main.rs index dd1c31d..d499ad9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use std::io::prelude::*; #[macro_use] extern crate lalrpop_util; lalrpop_mod!(pub commandline); +lalrpop_mod!(pub path); pub type Result = std::result::Result; @@ -17,6 +18,18 @@ pub enum Input { End, } +pub struct GenericPath { + absolute: bool, + directory: bool, + components: Vec, +} + +pub enum GenericPathComponent { + CurrentDirectory, + ParentDirectory, + Entry(String), +} + fn main() -> Result<()> { std::process::exit(match repl() { @@ -70,7 +83,34 @@ fn execute(input: &str) -> Result<()> { let invocation = commandline::InvocationParser::new().parse(input)?; println!("{}", input); - println!("invocation '{:?}'", invocation); + + match invocation.as_slice() { + ["paths", path_list, ..] => { + println!("{:?}", path_list); + match path::PathListParser::new().parse(path_list) { + Ok(parsed_paths) => { + println!("paths '{:?}'", parsed_paths); + }, + Err(_) => { + match path::PathListAllowingEmptyPathsParser::new() + .parse(path_list) + { + Ok(parsed) => { + println!("path list has empty component"); + println!("{:?}", parsed); + }, + Err(error) => { + println!("path list is okay, does not have empty component"); + println!("{:?}", error); + }, + } + }, + } + }, + _ => { + println!("invocation '{:?}'", invocation); + } + } Ok(()) } diff --git a/src/path.lalrpop b/src/path.lalrpop new file mode 100644 index 0000000..099b217 --- /dev/null +++ b/src/path.lalrpop @@ -0,0 +1,38 @@ +grammar; + +pub PathList: Vec<&'input str> = { + => { + Vec::new() + }, + COLON)*> => { + left.push(right); + left + }, +}; + +pub PathListAllowingEmptyPaths: Vec<&'input str> = { + => vec![""], + Path => vec![<>], + COLON => { + left.push(""); + left + }, + COLON => { + left.push(right); + left + }, +} + +pub Path: &'input str = { + , +} + +// Whitespace is not allowed. 
+match { + r"[^z:/]+" => PATH_COMPONENT, + + r"/" => SLASH, + + ":" => COLON, +} + -- cgit 1.4.1