From 4181591c404f32fedccedcb075fd688372825a22 Mon Sep 17 00:00:00 2001 From: Karl Tastroff <76853528+gymore-io@users.noreply.github.com> Date: Mon, 1 Feb 2021 20:11:54 +0100 Subject: [PATCH] Add support for case insensitivity (#198) * Create the `IgnoreFlags` structure * Add the `IgnoreFlags` structure * Parsing logic of `IgnoreFlags` * Add `ApplieIgnoreFlags` trait * Add `ApplieIgnoreFlags` trait * Implementation of that trait for a whole lot of structures * Updated `lib.rs` to take the ignore flags in account * Add tests for `ignore(case/ascii_case)` * Remove the `ApplieIgnoreFlags` trait * Remove the `ApplieIgnoreFlags` trait -> Replaced with with `MakeAsciiInsensitive` trait which serves the same purpose but only for the flag `IgnoreAsciiCase`. * Add ignore flags support for `to_mir` * `Literal::to_mir` now takes ignore flags and parses mir following those flags. * Add `Mir::utf8_ignore_case` and `Mir::binary_ignore_case` * Add `Literal::escape_regex` * Add a simple test for escaped regex --- logos-derive/src/lib.rs | 47 ++- logos-derive/src/mir.rs | 20 + logos-derive/src/parser/definition.rs | 53 ++- logos-derive/src/parser/ignore_flags.rs | 483 ++++++++++++++++++++++++ logos-derive/src/parser/mod.rs | 2 + tests/tests/ignore_case.rs | 232 ++++++++++++ 6 files changed, 820 insertions(+), 17 deletions(-) create mode 100644 logos-derive/src/parser/ignore_flags.rs create mode 100644 tests/tests/ignore_case.rs diff --git a/logos-derive/src/lib.rs b/logos-derive/src/lib.rs index 4a1fc13b..5c92971b 100644 --- a/logos-derive/src/lib.rs +++ b/logos-derive/src/lib.rs @@ -132,14 +132,35 @@ pub fn logos(input: TokenStream) -> TokenStream { } }; - let bytes = definition.literal.to_bytes(); - let then = graph.push( - leaf(definition.literal.span()) - .priority(definition.priority.unwrap_or(bytes.len() * 2)) - .callback(definition.callback), - ); - - ropes.push(Rope::new(bytes, then)); + if definition.ignore_flags.is_empty() { + let bytes = definition.literal.to_bytes(); + let then = graph.push( + leaf(definition.literal.span()) + .priority(definition.priority.unwrap_or(bytes.len() * 2)) + .callback(definition.callback), + ); + + ropes.push(Rope::new(bytes, then)); + } else { + let mir = definition + .literal + .escape_regex() + .to_mir( + &Default::default(), + definition.ignore_flags, + &mut parser.errors, + ) + .expect("The literal should be perfectly valid regex"); + + let then = graph.push( + leaf(definition.literal.span()) + .priority(definition.priority.unwrap_or_else(|| mir.priority())) + .callback(definition.callback), + ); + let id = graph.regex(mir, then); + + regex_ids.push(id); + } } "regex" => { let definition = match parser.parse_definition(attr) { @@ -149,16 +170,18 @@ pub fn logos(input: TokenStream) -> TokenStream { continue; } }; - let mir = match definition - .literal - .to_mir(&parser.subpatterns, &mut parser.errors) - { + let mir = match definition.literal.to_mir( + &parser.subpatterns, + definition.ignore_flags, + &mut parser.errors, + ) { Ok(mir) => mir, Err(err) => { parser.err(err, definition.literal.span()); continue; } }; + let then = graph.push( leaf(definition.literal.span()) .priority(definition.priority.unwrap_or_else(|| mir.priority())) diff --git a/logos-derive/src/mir.rs b/logos-derive/src/mir.rs index d5b0b5ca..7254a400 100644 --- a/logos-derive/src/mir.rs +++ b/logos-derive/src/mir.rs @@ -27,6 +27,15 @@ impl Mir { Mir::try_from(ParserBuilder::new().build().parse(source)?) } + pub fn utf8_ignore_case(source: &str) -> Result { + Mir::try_from( + ParserBuilder::new() + .case_insensitive(true) + .build() + .parse(source)?, + ) + } + pub fn binary(source: &str) -> Result { Mir::try_from( ParserBuilder::new() @@ -37,6 +46,17 @@ impl Mir { ) } + pub fn binary_ignore_case(source: &str) -> Result { + Mir::try_from( + ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .case_insensitive(true) + .build() + .parse(source)?, + ) + } + pub fn priority(&self) -> usize { match self { Mir::Empty | Mir::Loop(_) | Mir::Maybe(_) => 0, diff --git a/logos-derive/src/parser/definition.rs b/logos-derive/src/parser/definition.rs index c22d0007..98a24dcf 100644 --- a/logos-derive/src/parser/definition.rs +++ b/logos-derive/src/parser/definition.rs @@ -5,12 +5,15 @@ use crate::error::{Errors, Result}; use crate::leaf::Callback; use crate::mir::Mir; use crate::parser::nested::NestedValue; -use crate::parser::{Parser, Subpatterns}; +use crate::parser::{IgnoreFlags, Parser, Subpatterns}; + +use super::ignore_flags::ascii_case::MakeAsciiCaseInsensitive; pub struct Definition { pub literal: Literal, pub priority: Option, pub callback: Option, + pub ignore_flags: IgnoreFlags, } pub enum Literal { @@ -24,6 +27,7 @@ impl Definition { literal, priority: None, callback: None, + ignore_flags: IgnoreFlags::Empty, } } @@ -67,6 +71,12 @@ impl Definition { ("callback", _) => { parser.err("Expected: callback = ...", name.span()); } + ("ignore", NestedValue::Group(tokens)) => { + self.ignore_flags.parse_group(name, tokens, parser); + } + ("ignore", _) => { + parser.err("Expected: ignore(, ...)", name.span()); + } (unknown, _) => { parser.err( format!( @@ -92,11 +102,44 @@ impl Literal { } } - pub fn to_mir(&self, subpatterns: &Subpatterns, errors: &mut Errors) -> Result { - let value = subpatterns.fix(self, errors); + pub fn escape_regex(&self) -> Literal { match self { - Literal::Utf8(_) => Mir::utf8(&value), - Literal::Bytes(_) => Mir::binary(&value), + Literal::Utf8(string) => Literal::Utf8(LitStr::new( + regex_syntax::escape(&string.value()).as_str(), + self.span(), + )), + Literal::Bytes(bytes) => Literal::Bytes(LitByteStr::new( + regex_syntax::escape(&bytes_to_regex_string(bytes.value())).as_bytes(), + self.span(), + )), + } + } + + pub fn to_mir( + &self, + subpatterns: &Subpatterns, + ignore_flags: IgnoreFlags, + errors: &mut Errors, + ) -> Result { + let value = subpatterns.fix(self, errors); + + if ignore_flags.contains(IgnoreFlags::IgnoreAsciiCase) { + match self { + Literal::Utf8(_) => { + Mir::utf8(&value).map(MakeAsciiCaseInsensitive::make_ascii_case_insensitive) + } + Literal::Bytes(_) => Mir::binary_ignore_case(&value), + } + } else if ignore_flags.contains(IgnoreFlags::IgnoreCase) { + match self { + Literal::Utf8(_) => Mir::utf8_ignore_case(&value), + Literal::Bytes(_) => Mir::binary_ignore_case(&value), + } + } else { + match self { + Literal::Utf8(_) => Mir::utf8(&value), + Literal::Bytes(_) => Mir::binary(&value), + } } } diff --git a/logos-derive/src/parser/ignore_flags.rs b/logos-derive/src/parser/ignore_flags.rs new file mode 100644 index 00000000..7b2f85cb --- /dev/null +++ b/logos-derive/src/parser/ignore_flags.rs @@ -0,0 +1,483 @@ +use std::ops::{BitAnd, BitOr}; + +use proc_macro2::{Ident, TokenStream, TokenTree}; + +use crate::parser::Parser; +use crate::util::is_punct; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct IgnoreFlags { + bits: u8, +} + +#[allow(non_upper_case_globals)] +impl IgnoreFlags { + pub const Empty: Self = Self::new(0x00); + pub const IgnoreCase: Self = Self::new(0x01); + pub const IgnoreAsciiCase: Self = Self::new(0x02); + + #[inline] + pub const fn new(bits: u8) -> Self { + Self { bits } + } + + /// Enables a variant. + #[inline] + pub fn enable(&mut self, variant: Self) { + self.bits |= variant.bits; + } + + /// Checks if this `IgnoreFlags` contains *any* of the given variants. + #[inline] + pub fn contains(&self, variants: Self) -> bool { + self.bits & variants.bits != 0 + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.bits == 0 + } + + /// Parses an identifier an enables it for `self`. + /// + /// Valid inputs are (that produces `true`): + /// * `"case"` (incompatible with `"ascii_case"`) + /// * `"ascii_case"` (incompatible with `"case"`) + /// + /// An error causes this function to return `false` and emits an error to + /// the given `Parser`. + fn parse_ident(&mut self, ident: Ident, parser: &mut Parser) -> bool { + match ident.to_string().as_str() { + "case" => { + if self.contains(Self::IgnoreAsciiCase) { + parser.err( + "\ + The flag \"case\" cannot be used along with \"ascii_case\"\ + ", + ident.span(), + ); + false + } else { + self.enable(Self::IgnoreCase); + true + } + } + "ascii_case" => { + if self.contains(Self::IgnoreCase) { + parser.err( + "\ + The flag \"ascii_case\" cannot be used along with \"case\"\ + ", + ident.span(), + ); + false + } else { + self.enable(Self::IgnoreAsciiCase); + true + } + } + unknown => { + parser.err( + format!( + "\ + Unknown flag: {}\n\n\ + + Expected one of: case, ascii_case\ + ", + unknown + ), + ident.span(), + ); + false + } + } + } + + pub fn parse_group(&mut self, name: Ident, tokens: TokenStream, parser: &mut Parser) { + // Little finite state machine to parse "(,)*,?" + + // FSM description for future maintenance + // 0: Initial state + // -> 1 + // _ -> error + // 1: A flag was found + // , -> 2 + // None -> done + // _ -> error + // 2: A comma was found (after a ) + // -> 1 + // None -> done + // _ -> error + let mut state = 0u8; + + let mut tokens = tokens.into_iter(); + + loop { + state = match state { + 0 => match tokens.next() { + Some(TokenTree::Ident(ident)) => { + if self.parse_ident(ident, parser) { + 1 + } else { + return; + } + } + _ => { + parser.err( + "\ + Invalid ignore flag\n\n\ + + Expected one of: case, ascii_case\ + ", + name.span(), + ); + return; + } + }, + 1 => match tokens.next() { + Some(tt) if is_punct(&tt, ',') => 2, + None => return, + Some(unexpected_tt) => { + parser.err( + format!( + "\ + Unexpected token: {:?}\ + ", + unexpected_tt.to_string(), + ), + unexpected_tt.span(), + ); + return; + } + }, + 2 => match tokens.next() { + Some(TokenTree::Ident(ident)) => { + if self.parse_ident(ident, parser) { + 1 + } else { + return; + } + } + None => return, + Some(unexpected_tt) => { + parser.err( + format!( + "\ + Unexpected token: {:?}\ + ", + unexpected_tt.to_string(), + ), + unexpected_tt.span(), + ); + return; + } + }, + _ => unreachable!("Internal Error: invalid state ({})", state), + } + } + } +} + +impl BitOr for IgnoreFlags { + type Output = Self; + + fn bitor(self, other: Self) -> Self { + Self::new(self.bits | other.bits) + } +} + +impl BitAnd for IgnoreFlags { + type Output = Self; + + fn bitand(self, other: Self) -> Self { + Self::new(self.bits & other.bits) + } +} + +pub mod ascii_case { + use regex_syntax::hir; + + use crate::mir::Mir; + use crate::parser::Literal; + + pub trait MakeAsciiCaseInsensitive { + /// Creates a equivalent regular expression which ignore the letter casing + /// of ascii characters. + fn make_ascii_case_insensitive(self) -> Mir; + } + + impl MakeAsciiCaseInsensitive for u8 { + fn make_ascii_case_insensitive(self) -> Mir { + if b'a' <= self && self <= b'z' { + Mir::Alternation(vec![ + Mir::Literal(hir::Literal::Byte(self - 32)), + Mir::Literal(hir::Literal::Byte(self)), + ]) + } else if b'A' <= self && self <= b'Z' { + Mir::Alternation(vec![ + Mir::Literal(hir::Literal::Byte(self)), + Mir::Literal(hir::Literal::Byte(self + 32)), + ]) + } else { + Mir::Literal(hir::Literal::Byte(self)) + } + } + } + + impl MakeAsciiCaseInsensitive for char { + fn make_ascii_case_insensitive(self) -> Mir { + if self.is_ascii() { + (self as u8).make_ascii_case_insensitive() + } else { + Mir::Literal(hir::Literal::Unicode(self)) + } + } + } + + impl MakeAsciiCaseInsensitive for hir::Literal { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + hir::Literal::Byte(b) => b.make_ascii_case_insensitive(), + hir::Literal::Unicode(c) => c.make_ascii_case_insensitive(), + } + } + } + + impl MakeAsciiCaseInsensitive for hir::ClassBytes { + fn make_ascii_case_insensitive(mut self) -> Mir { + self.case_fold_simple(); + Mir::Class(hir::Class::Bytes(self)) + } + } + + impl MakeAsciiCaseInsensitive for hir::ClassUnicode { + fn make_ascii_case_insensitive(mut self) -> Mir { + use std::cmp; + + // Manuall implementation to only perform the case folding on ascii characters. + + let mut ranges = Vec::new(); + + for range in self.ranges() { + #[inline] + fn overlaps(st1: u8, end1: u8, st2: u8, end2: u8) -> bool { + (st2 <= st1 && st1 <= end2) || (st1 <= st2 && st2 <= end1) + } + + #[inline] + fn make_ascii(c: char) -> Option { + if c.is_ascii() { + return Some(c as u8); + } else { + None + } + } + + match (make_ascii(range.start()), make_ascii(range.end())) { + (Some(start), Some(end)) => { + if overlaps(b'a', b'z', start, end) { + let lower = cmp::max(start, b'a'); + let upper = cmp::min(end, b'z'); + ranges.push(hir::ClassUnicodeRange::new( + (lower - 32) as char, + (upper - 32) as char, + )) + } + + if overlaps(b'A', b'Z', start, end) { + let lower = cmp::max(start, b'A'); + let upper = cmp::min(end, b'Z'); + ranges.push(hir::ClassUnicodeRange::new( + (lower + 32) as char, + (upper + 32) as char, + )) + } + } + (Some(start), None) => { + if overlaps(b'a', b'z', start, b'z') { + let lower = cmp::max(start, b'a'); + ranges.push(hir::ClassUnicodeRange::new((lower - 32) as char, 'Z')) + } + + if overlaps(b'A', b'Z', start, b'Z') { + let lower = cmp::max(start, b'A'); + ranges.push(hir::ClassUnicodeRange::new((lower + 32) as char, 'Z')) + } + } + _ => (), + } + } + + self.union(&hir::ClassUnicode::new(ranges)); + + Mir::Class(hir::Class::Unicode(self)) + } + } + + impl MakeAsciiCaseInsensitive for hir::Class { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + hir::Class::Bytes(b) => b.make_ascii_case_insensitive(), + hir::Class::Unicode(u) => u.make_ascii_case_insensitive(), + } + } + } + + impl MakeAsciiCaseInsensitive for &Literal { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + Literal::Bytes(bytes) => Mir::Concat( + bytes + .value() + .into_iter() + .map(|b| b.make_ascii_case_insensitive()) + .collect(), + ), + Literal::Utf8(s) => Mir::Concat( + s.value() + .chars() + .map(|b| b.make_ascii_case_insensitive()) + .collect(), + ), + } + } + } + + impl MakeAsciiCaseInsensitive for Mir { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + Mir::Empty => Mir::Empty, + Mir::Loop(l) => Mir::Loop(Box::new(l.make_ascii_case_insensitive())), + Mir::Maybe(m) => Mir::Maybe(Box::new(m.make_ascii_case_insensitive())), + Mir::Concat(c) => Mir::Concat( + c.into_iter() + .map(|m| m.make_ascii_case_insensitive()) + .collect(), + ), + Mir::Alternation(a) => Mir::Alternation( + a.into_iter() + .map(|m| m.make_ascii_case_insensitive()) + .collect(), + ), + Mir::Class(c) => c.make_ascii_case_insensitive(), + Mir::Literal(l) => l.make_ascii_case_insensitive(), + } + } + } + + #[cfg(test)] + mod tests { + use super::MakeAsciiCaseInsensitive; + use crate::mir::{Class, Mir}; + use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange}; + + fn assert_range(in_s: char, in_e: char, expected: &[(char, char)]) { + let range = ClassUnicodeRange::new(in_s, in_e); + let class = ClassUnicode::new(vec![range]); + + let expected = + ClassUnicode::new(expected.iter().map(|&(a, b)| ClassUnicodeRange::new(a, b))); + + if let Mir::Class(Class::Unicode(result)) = class.make_ascii_case_insensitive() { + assert_eq!(result, expected); + } else { + panic!("Not a unicode class"); + }; + } + + #[test] + fn no_letters_left() { + assert_range(' ', '+', &[(' ', '+')]); + } + + #[test] + fn no_letters_right() { + assert_range('{', '~', &[('{', '~')]); + } + + #[test] + fn no_letters_middle() { + assert_range('[', '`', &[('[', '`')]); + } + + #[test] + fn lowercase_left_edge() { + assert_range('a', 'd', &[('a', 'd'), ('A', 'D')]); + } + + #[test] + fn lowercase_right_edge() { + assert_range('r', 'z', &[('r', 'z'), ('R', 'Z')]); + } + + #[test] + fn lowercase_total() { + assert_range('a', 'z', &[('a', 'z'), ('A', 'Z')]); + } + + #[test] + fn uppercase_left_edge() { + assert_range('A', 'D', &[('a', 'd'), ('A', 'D')]); + } + + #[test] + fn uppercase_right_edge() { + assert_range('R', 'Z', &[('r', 'z'), ('R', 'Z')]); + } + + #[test] + fn uppercase_total() { + assert_range('A', 'Z', &[('a', 'z'), ('A', 'Z')]); + } + + #[test] + fn lowercase_cross_left() { + assert_range('[', 'h', &[('[', 'h'), ('A', 'H')]); + } + + #[test] + fn lowercase_cross_right() { + assert_range('d', '}', &[('d', '}'), ('D', 'Z')]); + } + + #[test] + fn uppercase_cross_left() { + assert_range(';', 'H', &[(';', 'H'), ('a', 'h')]); + } + + #[test] + fn uppercase_cross_right() { + assert_range('T', ']', &[('t', 'z'), ('T', ']')]); + } + + #[test] + fn cross_both() { + assert_range('X', 'c', &[('X', 'c'), ('x', 'z'), ('A', 'C')]); + } + + #[test] + fn all_letters() { + assert_range('+', '|', &[('+', '|')]); + } + + #[test] + fn oob_all_letters() { + assert_range('#', 'é', &[('#', 'é')]); + } + + #[test] + fn oob_from_uppercase() { + assert_range('Q', 'é', &[('A', 'é')]); + } + + #[test] + fn oob_from_lowercase() { + assert_range('q', 'é', &[('q', 'é'), ('Q', 'Z')]); + } + + #[test] + fn oob_no_letters() { + assert_range('|', 'é', &[('|', 'é')]); + } + } +} diff --git a/logos-derive/src/parser/mod.rs b/logos-derive/src/parser/mod.rs index 82326c12..2f2215af 100644 --- a/logos-derive/src/parser/mod.rs +++ b/logos-derive/src/parser/mod.rs @@ -9,11 +9,13 @@ use crate::leaf::{Callback, InlineCallback}; use crate::util::{expect_punct, MaybeVoid}; mod definition; +mod ignore_flags; mod nested; mod subpattern; mod type_params; pub use self::definition::{Definition, Literal}; +pub use self::ignore_flags::IgnoreFlags; use self::nested::{AttributeParser, Nested, NestedValue}; pub use self::subpattern::Subpatterns; use self::type_params::{replace_lifetime, traverse_type, TypeParams}; diff --git a/tests/tests/ignore_case.rs b/tests/tests/ignore_case.rs new file mode 100644 index 00000000..0ad6d705 --- /dev/null +++ b/tests/tests/ignore_case.rs @@ -0,0 +1,232 @@ +mod ignore_ascii_case { + use logos::Logos; + use tests::assert_lex; + + #[derive(Logos, Debug, PartialEq, Eq)] + enum Words { + #[error] + #[regex(" +", logos::skip)] + Error, + + #[token("lOwERCaSe", ignore(ascii_case))] + Lowercase, + #[token("or", ignore(ascii_case))] + Or, + #[token("UppeRcaSE", ignore(ascii_case))] + Uppercase, + #[token(":", ignore(ascii_case))] + Colon, + #[token("ThAT", ignore(ascii_case))] + That, + #[token("IS", ignore(ascii_case))] + Is, + #[token("the", ignore(ascii_case))] + The, + #[token("QuEsTiOn", ignore(ascii_case))] + Question, + + #[token("MON", ignore(ascii_case))] + Mon, + #[token("frèRE", ignore(ascii_case))] + Frere, + #[token("ÉTAIT", ignore(ascii_case))] + Etait, + #[token("là", ignore(ascii_case))] + La, + #[token("cET", ignore(ascii_case))] + Cet, + #[token("éTé", ignore(ascii_case))] + Ete, + } + + #[test] + fn tokens_simple() { + assert_lex( + "LowErcase or UppeRCase: ThAT iS tHe question", + &[ + (Words::Lowercase, "LowErcase", 0..9), + (Words::Or, "or", 10..12), + (Words::Uppercase, "UppeRCase", 13..22), + (Words::Colon, ":", 22..23), + (Words::That, "ThAT", 24..28), + (Words::Is, "iS", 29..31), + (Words::The, "tHe", 32..35), + (Words::Question, "question", 36..44), + ], + ) + } + + #[test] + fn tokens_nonascii() { + assert_lex( + "Mon Frère Était lÀ cet Été", + &[ + (Words::Mon, "Mon", 0..3), + (Words::Frere, "Frère", 4..10), + (Words::Etait, "Était", 11..17), + (Words::Error, "l", 18..19), + (Words::Error, "À", 19..21), + (Words::Cet, "cet", 22..25), + (Words::Error, "É", 26..28), + (Words::Error, "t", 28..29), + (Words::Error, "é", 29..31), + ], + ) + } + + #[derive(Logos, Debug, PartialEq, Eq)] + enum Letters { + #[error] + #[regex(" +", logos::skip)] + Error, + + #[regex("a", ignore(ascii_case))] + Single, + #[regex("bc", ignore(ascii_case))] + Concat, + #[regex("[de]", ignore(ascii_case))] + Altern, + #[regex("f+", ignore(ascii_case))] + Loop, + #[regex("gg?", ignore(ascii_case))] + Maybe, + #[regex("[h-k]+", ignore(ascii_case))] + Range, + + #[regex("à", ignore(ascii_case))] + NaSingle, + #[regex("éèd", ignore(ascii_case))] + NaConcat, + #[regex("[cûü]+", ignore(ascii_case))] + NaAltern, + #[regex("i§?", ignore(ascii_case))] + NaMaybe, + #[regex("[x-à]+", ignore(ascii_case))] + NaRange, + } + + #[test] + fn regex_simple() { + assert_lex( + "aA BCbC DdEE fFff g gg hHiIjJkK", + &[ + (Letters::Single, "a", 0..1), + (Letters::Single, "A", 1..2), + (Letters::Concat, "BC", 3..5), + (Letters::Concat, "bC", 5..7), + (Letters::Altern, "D", 8..9), + (Letters::Altern, "d", 9..10), + (Letters::Altern, "E", 10..11), + (Letters::Altern, "E", 11..12), + (Letters::Loop, "fFff", 13..17), + (Letters::Maybe, "g", 18..19), + (Letters::Maybe, "gg", 20..22), + (Letters::Range, "hHiIjJkK", 23..31), + ], + ) + } + + #[test] + fn regex_nonascii() { + assert_lex( + "à À éèD Éèd CcûÛüÜC i i§ xXyYzZ|{}", + &[ + (Letters::NaSingle, "à", 0..2), + (Letters::NaRange, "À", 3..5), + (Letters::NaConcat, "éèD", 6..11), + (Letters::NaRange, "É", 12..14), + (Letters::Error, "è", 14..16), + (Letters::Altern, "d", 16..17), + (Letters::NaAltern, "Ccû", 18..22), + (Letters::NaRange, "Û", 22..24), + (Letters::NaAltern, "ü", 24..26), + (Letters::NaRange, "Ü", 26..28), + (Letters::NaAltern, "C", 28..29), + (Letters::NaMaybe, "i", 30..31), + (Letters::NaMaybe, "i§", 32..35), + (Letters::NaRange, "xXyYzZ|{}", 36..45), + ], + ) + } +} + +mod ignore_case { + use logos::Logos; + use tests::assert_lex; + + #[derive(Logos, Debug, PartialEq, Eq)] + enum Words { + #[error] + #[regex(" +", logos::skip)] + Error, + + #[token("élÉphAnt", ignore(case))] + Elephant, + #[token("ÉlèvE", ignore(case))] + Eleve, + #[token("à", ignore(case))] + A, + + #[token("[abc]+", ignore(case))] + Abc, + } + + #[test] + fn tokens() { + assert_lex( + "ÉLÉPHANT Éléphant ÉLèVE à À a", + &[ + (Words::Elephant, "ÉLÉPHANT", 0..10), + (Words::Elephant, "Éléphant", 11..21), + (Words::Eleve, "ÉLèVE", 22..29), + (Words::A, "à", 30..32), + (Words::A, "À", 33..35), + (Words::Error, "a", 36..37), + ], + ) + } + + #[test] + fn tokens_regex_escaped() { + assert_lex( + "[abc]+ abccBA", + &[ + (Words::Abc, "[abc]+", 0..6), + (Words::Error, "a", 7..8), + (Words::Error, "b", 8..9), + (Words::Error, "c", 9..10), + (Words::Error, "c", 10..11), + (Words::Error, "B", 11..12), + (Words::Error, "A", 12..13), + ], + ) + } + + #[derive(Logos, PartialEq, Eq, Debug)] + enum Sink { + #[error] + #[regex(" +", logos::skip)] + Error, + + #[regex("[abcéà]+", ignore(case))] + Letters, + #[regex("[0-9]+", ignore(case))] + Numbers, + #[regex("ééààé", ignore(case))] + Sequence, + } + + #[test] + fn regex() { + assert_lex( + "aabbccééààéé 00123 ééààé ABCÉÀÀ ÉÉàÀÉ", + &[ + (Sink::Letters, "aabbccééààéé", 0..18), + (Sink::Numbers, "00123", 19..24), + (Sink::Sequence, "ééààé", 25..35), + (Sink::Letters, "ABCÉÀÀ", 36..45), + (Sink::Sequence, "ÉÉàÀÉ", 46..56), + ], + ) + } +}