1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
grammar;
// An invocation is one or more WORD tokens terminated by a single NEWLINE
// token. The parse result is the list of words in input order; the NEWLINE
// only delimits the invocation and is not part of the result.
pub Invocation: Vec<&'input str> =
    <words:WORD+> NEWLINE => words;
// Lexer definition: these patterns decide how raw input is split into the
// NEWLINE and WORD terminals consumed by the grammar rules, and which
// characters are skipped between tokens.
//
// Several of the regexps below make use of Unicode character classes. [1] is
// the official reference to Unicode classes, and [2] is a site that is useful
// for browsing to get an intuitive idea of what the classes mean.
//
// [1] http://www.unicode.org/reports/tr44/#General_Category_Values
// [2] https://www.compart.com/en/unicode/category
//
match {
// Zs is the Unicode class for space separators. This includes the ASCII
// space character. The empty action means matched text is skipped and never
// reaches the parser as a token.
//
r"\p{Zs}+" => { },
// Zl is the Unicode class for line separators. Zp is the Unicode class for
// paragraph separators. Newline and carriage return are included individually
// here, since Unicode classifies them with the control characters rather than
// with the space characters.
//
// A CRLF pair is listed first so that it is lexed as ONE NEWLINE token.
// Invocation accepts exactly one NEWLINE, so lexing "\r\n" as two separate
// tokens would reject input that uses Windows-style line endings.
//
r"\r\n|[\p{Zl}\p{Zp}\n\r]" => NEWLINE,
// Z is the Unicode class for separators, including line, paragraph, and space
// separators. C is the Unicode class "Other", which covers control characters
// (Cc) as well as format, surrogate, private-use, and unassigned code points.
// P is the class for punctuation. This regexp tests for the intersection of
// the negation of these character classes, which is any character NOT in one
// of these three classes.
//
// Note that, counterintuitively, line feed and carriage return are classified
// as control characters, not as line separators. Either way, this regexp would
// still exclude them, but the difference might be relevant when maintaining
// it.
//
r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
}
|