grammar;

// An invocation is a run of zero or more words terminated by one NEWLINE
// token. The angle brackets select the words as the value of the production,
// so the result is the list of word slices; the newline itself is discarded.
//
// NOTE(review): `WORD*` accepts a blank line and yields an empty vector. If an
// invocation must contain at least one word, use `WORD+` instead.
pub Invocation: Vec<&'input str> = {
    <WORD*> NEWLINE,
};

// Several of the regexps below make use of Unicode character classes. [1] is
// the official reference to Unicode classes, and [2] is a site that is useful
// for browsing to get an intuitive idea of what the classes mean.
//
// [1] http://www.unicode.org/reports/tr44/#General_Category_Values
// [2] https://www.compart.com/en/unicode/category
//
match {
    // Zs is the Unicode class for space separators. This includes the ASCII
    // space character. The empty action means a run of these characters is
    // skipped by the lexer and never reaches the parser.
    //
    r"\p{Zs}+" => { },

    // Zl is the Unicode class for line separators. Zp is the Unicode class for
    // paragraph separators. Line feed and carriage return are listed
    // individually, since Unicode classifies them as control characters (Cc)
    // rather than as separators.
    //
    // NOTE(review): this pattern matches exactly one character, so a CR LF
    // pair is lexed as two NEWLINE tokens.
    //
    r"[\p{Zl}\p{Zp}\n\r]" => NEWLINE,

    // Z is the Unicode class for separators, including line, paragraph, and
    // space separators. C is the Unicode class "Other", which contains the
    // control characters (Cc) along with format, surrogate, private-use, and
    // unassigned code points. P is the class for punctuation. This regexp takes
    // the intersection of the negations of these classes, so it matches any
    // character that is NOT in one of the three.
    //
    // Note that, counterintuitively, line feed and carriage return are
    // classified as control characters, not as line separators. Either way,
    // this regexp would still exclude them, but the difference might be
    // relevant when maintaining it.
    //
    r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
}