grammar;

// An invocation is one or more WORD tokens followed by a single NEWLINE token.
// The parsed value is the list of word slices, borrowed from the input, in the
// order in which they appeared. The NEWLINE itself is discarded.
pub Invocation: Vec<&'input str> =
  <words:WORD+> NEWLINE => words;

// Several of the regexps below make use of Unicode character classes. [1] is
// the official reference to Unicode classes, and [2] is a site that is useful
// for browsing to get an intuitive idea of what the classes mean.
//
// [1] http://www.unicode.org/reports/tr44/#General_Category_Values
// [2] https://www.compart.com/en/unicode/category
//
match {
  // Zs is the Unicode class for space separators. This includes the ASCII
  // space character. The empty action means that runs of these characters are
  // skipped rather than being turned into tokens.
  //
  r"\p{Zs}+" => { },

  // Zl is the Unicode class for line separators. Zp is the Unicode class for
  // paragraph separators. Newline and carriage return are included individually
  // here, since Unicode classifies them with the control characters rather than
  // with the space characters.
  //
  // A carriage return immediately followed by a line feed (a Windows-style line
  // ending) is matched as one NEWLINE token instead of two. Invocation accepts
  // exactly one NEWLINE, so lexing the pair as two tokens would make every
  // CRLF-terminated input fail to parse. The two-character alternative is
  // listed first so that it is preferred over matching the lone carriage
  // return.
  //
  r"\r\n|[\p{Zl}\p{Zp}\n\r]" => NEWLINE,

  // Z is the Unicode class for separators, including line, paragraph, and space
  // separators. C is the class for "other" characters, which includes the
  // control characters. P is the class for punctuation. This regexp tests for
  // the intersection of the negation of these character classes, which is any
  // character NOT in one of these three classes.
  //
  // Note that, counterintuitively, line feed and carriage return are classified
  // as control characters, not as line separators. Either way, this regexp would
  // still exclude them, but the difference might be relevant when maintaining
  // it. Horizontal tab is likewise a control character, so it is neither skipped
  // as a space separator nor accepted as part of a WORD.
  //
  r"[\P{Z}&&\P{C}&&\P{P}]+" => WORD,
}