Skip to content

Commit

Permalink
fix(lib): priority fixed and tested
Browse files Browse the repository at this point in the history
  • Loading branch information
jeertmans committed Nov 27, 2023
1 parent e6ca01f commit 6c2086d
Show file tree
Hide file tree
Showing 8 changed files with 120 additions and 86 deletions.
2 changes: 1 addition & 1 deletion logos-codegen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fnv = "1.0.6"
syn = { version = "2.0.13", features = ["full"] }
quote = "1.0.3"
proc-macro2 = "1.0.9"
regex-syntax = "0.6"
regex-syntax = "0.7"
lazy_static = "1.4.0"

[dev-dependencies]
Expand Down
18 changes: 4 additions & 14 deletions logos-codegen/src/graph/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
self.insert_or_push(reserved, fork)
}
Mir::Literal(literal) => {
let pattern = match literal {
Literal::Unicode(unicode) => {
unicode.encode_utf8(&mut [0; 4]).as_bytes().to_vec()
}
Literal::Byte(byte) => [byte].to_vec(),
};
let pattern = literal.0.to_vec();

self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
}
Expand All @@ -71,18 +66,13 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
let mut then = then;

let mut handle_bytes = |graph: &mut Self, mir, then: &mut NodeId| match mir {
Mir::Literal(Literal::Unicode(u)) => {
cur -= u.len_utf8();
for (i, byte) in u.encode_utf8(&mut [0; 4]).bytes().enumerate() {
Mir::Literal(Literal(bytes)) => {
cur -= bytes.len();
for (i, byte) in bytes.iter().enumerate() {
ropebuf[cur + i] = byte.into();
}
None
}
Mir::Literal(Literal::Byte(byte)) => {
cur -= 1;
ropebuf[cur] = byte.into();
None
}
Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => {
cur -= 1;
ropebuf[cur] = class.ranges()[0].into();
Expand Down
134 changes: 81 additions & 53 deletions logos-codegen/src/mir.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
use std::convert::TryFrom;

use lazy_static::lazy_static;
use regex_syntax::hir::{Hir, HirKind, RepetitionKind, RepetitionRange};
use regex_syntax::hir::{Dot, Hir, HirKind};
use regex_syntax::ParserBuilder;

pub use regex_syntax::hir::{Class, ClassUnicode, Literal};

use crate::error::{Error, Result};

lazy_static! {
/// DOT regex that matches utf8 only.
static ref DOT_UTF8: Hir = Hir::dot(false);

/// DOT regex that matches any byte.
static ref DOT_BYTES: Hir = Hir::dot(true);
static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar);
static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte);
}

/// Middle Intermediate Representation of the regex, built from
Expand Down Expand Up @@ -48,7 +45,7 @@ impl Mir {
pub fn binary(source: &str) -> Result<Mir> {
Mir::try_from(
ParserBuilder::new()
.allow_invalid_utf8(true)
.utf8(false)
.unicode(false)
.build()
.parse(source)?,
Expand All @@ -58,7 +55,7 @@ impl Mir {
pub fn binary_ignore_case(source: &str) -> Result<Mir> {
Mir::try_from(
ParserBuilder::new()
.allow_invalid_utf8(true)
.utf8(false)
.unicode(false)
.case_insensitive(true)
.build()
Expand All @@ -71,8 +68,11 @@ impl Mir {
Mir::Empty | Mir::Loop(_) | Mir::Maybe(_) => 0,
Mir::Concat(concat) => concat.iter().map(Mir::priority).sum(),
Mir::Alternation(alt) => alt.iter().map(Mir::priority).min().unwrap_or(0),
Mir::Class(_) => 1,
Mir::Literal(_) => 2,
Mir::Class(_) => 2,
Mir::Literal(lit) => match std::str::from_utf8(&lit.0) {
Ok(s) => 2 * s.chars().count(),
Err(_) => 2 * lit.0.len(),
},
}
}
}
Expand Down Expand Up @@ -118,16 +118,15 @@ impl TryFrom<Hir> for Mir {
return Err("#[regex]: non-greedy parsing is currently unsupported.".into());
}

let kind = repetition.kind;
let is_dot = if repetition.hir.is_always_utf8() {
*repetition.hir == *DOT_UTF8
let is_dot = if repetition.sub.properties().is_utf8() {
*repetition.sub == *DOT_UTF8
} else {
*repetition.hir == *DOT_BYTES
*repetition.sub == *DOT_BYTES
};
let mir = Mir::try_from(*repetition.hir)?;
let mir = Mir::try_from(*repetition.sub)?;

match kind {
RepetitionKind::ZeroOrMore | RepetitionKind::OneOrMore if is_dot => {
match (repetition.min, repetition.max) {
(0..=1, None) if is_dot => {
Err(
"#[regex]: \".+\" and \".*\" patterns will greedily consume \
the entire source till the end as Logos does not allow \
Expand All @@ -139,46 +138,47 @@ impl TryFrom<Hir> for Mir {
.into()
)
}
RepetitionKind::ZeroOrOne => Ok(Mir::Maybe(Box::new(mir))),
RepetitionKind::ZeroOrMore => Ok(Mir::Loop(Box::new(mir))),
RepetitionKind::OneOrMore => {
// 0 or 1
(0, Some(1)) => Ok(Mir::Maybe(Box::new(mir))),
// 0 or more
(0, None) => Ok(Mir::Loop(Box::new(mir))),
// 1 or more
(1, None) => {
Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))]))
}
RepetitionKind::Range(range) => match range {
RepetitionRange::Exactly(n) => {
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
out.push(mir.clone());
}
Ok(Mir::Concat(out))
// Exact {n}
(n, Some(m)) if m == n => {
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
out.push(mir.clone());
}
RepetitionRange::AtLeast(n) => {
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
out.push(mir.clone());
}
out.push(Mir::Loop(Box::new(mir)));
Ok(Mir::Concat(out))
Ok(Mir::Concat(out))
}
// At least {n,}
(n, None) => {
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
out.push(mir.clone());
}
RepetitionRange::Bounded(n, m) => {
let mut out = Vec::with_capacity(m as usize);
for _ in 0..n {
out.push(mir.clone());
}
for _ in n..m {
out.push(Mir::Maybe(Box::new(mir.clone())));
}
Ok(Mir::Concat(out))
out.push(Mir::Loop(Box::new(mir)));
Ok(Mir::Concat(out))
}
// Bounded {n, m}
(n, Some(m)) => {
let mut out = Vec::with_capacity(m as usize);
for _ in 0..n {
out.push(mir.clone());
}
},
for _ in n..m {
out.push(Mir::Maybe(Box::new(mir.clone())));
}
Ok(Mir::Concat(out))
}
}
}
HirKind::Group(group) => Mir::try_from(*group.hir),
HirKind::WordBoundary(_) => {
Err("#[regex]: word boundaries are currently unsupported.".into())
}
HirKind::Anchor(_) => {
Err("#[regex]: anchors in #[regex] are currently unsupported.".into())
HirKind::Capture(capture) => Mir::try_from(*capture.sub),
HirKind::Look(_) => {
Err("#[regex]: look-around assertions are currently unsupported.".into())
}
}
}
Expand All @@ -191,17 +191,45 @@ mod tests {
#[test]
fn priorities() {
let regexes = [
("[a-z]+", 1),
("a", 2),
("à", 2),
("京", 2),
("Eté", 6),
("Été", 6),
("[a-z]+", 2),
("a|b", 2),
("a|[b-z]", 1),
("a|[b-z]", 2),
("(foo)+", 6),
("foobar", 12),
("(fooz|bar)+qux", 12),
];

for (regex, expected) in regexes.iter() {
let mir = Mir::utf8(regex).unwrap();
assert_eq!(mir.priority(), *expected);
assert_eq!(mir.priority(), *expected, "Failed for regex \"{}\"", regex);
}
}

/// Checks that regexes written in different but equivalent forms are
/// assigned the same priority.
#[test]
fn equivalent_patterns() {
    // Each tuple holds two spellings of the same pattern.
    let pairs: [(&str, &str); 6] = [
        ("a|b", "[a-b]"),
        ("1|2|3", "[1-3]"),
        ("1+", "[1]+"),
        ("c*", "[c]*"),
        ("aaa", "a{3}"),
        ("a[a]{2}", "a{3}"),
    ];

    for &(regex_left, regex_right) in &pairs {
        let priority_left = Mir::utf8(regex_left).unwrap().priority();
        let priority_right = Mir::utf8(regex_right).unwrap().priority();

        assert_eq!(
            priority_left, priority_right,
            "Regexes \"{regex_left}\" and \"{regex_right}\" \
                are equivalent but have different priorities"
        );
    }
}
}
36 changes: 26 additions & 10 deletions logos-codegen/src/parser/ignore_flags.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,20 @@ pub mod ascii_case {
use crate::mir::Mir;
use crate::parser::Literal;

// Builds an `hir::Literal` (a boxed byte slice) from either a single byte
// or, with the `@char` prefix, from the UTF-8 encoding of a `char`.
macro_rules! literal {
    // One raw byte: the one-element array is boxed as-is.
    ($byte:expr) => {
        hir::Literal(Box::new([$byte]))
    };
    // A `char`: encode into a 4-byte scratch buffer (the maximum UTF-8
    // length) and copy only the bytes actually written.
    (@char $c:expr) => {
        hir::Literal(Box::from($c.encode_utf8(&mut [0; 4]).as_bytes()))
    };
}

pub trait MakeAsciiCaseInsensitive {
/// Creates an equivalent regular expression which ignores the letter casing
/// of ASCII characters.
Expand All @@ -210,16 +224,16 @@ pub mod ascii_case {
fn make_ascii_case_insensitive(self) -> Mir {
if self.is_ascii_lowercase() {
Mir::Alternation(vec![
Mir::Literal(hir::Literal::Byte(self - 32)),
Mir::Literal(hir::Literal::Byte(self)),
Mir::Literal(literal!(self - 32)),
Mir::Literal(literal!(self)),
])
} else if self.is_ascii_uppercase() {
Mir::Alternation(vec![
Mir::Literal(hir::Literal::Byte(self)),
Mir::Literal(hir::Literal::Byte(self + 32)),
Mir::Literal(literal!(self)),
Mir::Literal(literal!(self + 32)),
])
} else {
Mir::Literal(hir::Literal::Byte(self))
Mir::Literal(literal!(self))
}
}
}
Expand All @@ -229,17 +243,19 @@ pub mod ascii_case {
if self.is_ascii() {
(self as u8).make_ascii_case_insensitive()
} else {
Mir::Literal(hir::Literal::Unicode(self))
Mir::Literal(literal!(@char self))
}
}
}

impl MakeAsciiCaseInsensitive for hir::Literal {
fn make_ascii_case_insensitive(self) -> Mir {
match self {
hir::Literal::Byte(b) => b.make_ascii_case_insensitive(),
hir::Literal::Unicode(c) => c.make_ascii_case_insensitive(),
}
Mir::Concat(
self.0
.iter()
.map(|x| x.make_ascii_case_insensitive())
.collect(),
)
}
}

Expand Down
4 changes: 2 additions & 2 deletions logos/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ pub trait Logos<'source>: Sized {
/// enum Token<'a> {
/// // We will treat "abc" as if it was whitespace.
/// // This is identical to using `logos::skip`.
/// #[regex(" |abc", |_| Skip)]
/// #[regex(" |abc", |_| Skip, priority = 3)]
/// Ignored,
///
/// #[regex("[a-zA-Z]+")]
Expand Down Expand Up @@ -376,7 +376,7 @@ pub enum FilterResult<T, E> {
/// #[derive(Logos, Debug, PartialEq)]
/// enum Token<'a> {
/// // We will treat "abc" as if it was whitespace
/// #[regex(" |abc", logos::skip)]
/// #[regex(" |abc", logos::skip, priority = 3)]
/// Ignored,
///
/// #[regex("[a-zA-Z]+")]
Expand Down
4 changes: 2 additions & 2 deletions tests/tests/css.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ enum Token {
#[regex("em|ex|ch|rem|vw|vh|vmin|vmax")]
RelativeLength,

#[regex("cm|mm|Q|in|pc|pt|px")]
#[regex("cm|mm|Q|in|pc|pt|px", priority = 3)]
AbsoluteLength,

#[regex("[+-]?[0-9]*[.]?[0-9]+(?:[eE][+-]?[0-9]+)?", priority = 2)]
#[regex("[+-]?[0-9]*[.]?[0-9]+(?:[eE][+-]?[0-9]+)?", priority = 3)]
Number,

#[regex("[-a-zA-Z_][a-zA-Z0-9_-]*")]
Expand Down
6 changes: 3 additions & 3 deletions tests/tests/edgecase.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ mod priority_disambiguate_1 {
#[derive(Logos, Debug, PartialEq)]
#[logos(skip r"[ \n\t\f]+")]
enum Token {
#[regex("[abc]+", priority = 2)]
#[regex("[abc]+", priority = 3)]
Abc,

#[regex("[cde]+")]
Expand All @@ -399,7 +399,7 @@ mod priority_disambiguate_2 {
#[regex("[abc]+")]
Abc,

#[regex("[cde]+", priority = 2)]
#[regex("[cde]+", priority = 3)]
Cde,
}

Expand Down Expand Up @@ -497,7 +497,7 @@ mod merging_asymmetric_loops {
#[regex(r#"[!#$%&*+-./<=>?@\\^|~:]+"#)]
Operator,

#[regex(r"/([^*]*[*]+[^*/])*([^*]*[*]+|[^*])*", logos::skip)]
#[regex(r"/([^*]*[*]+[^*/])*([^*]*[*]+|[^*])*", logos::skip, priority = 3)]
Ignored,
}
}
Expand Down
2 changes: 1 addition & 1 deletion tests/tests/ignore_case.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ mod ignore_ascii_case {
NaConcat,
#[regex("[cûü]+", ignore(ascii_case))]
NaAltern,
#[regex("i§?", ignore(ascii_case))]
#[regex("i§?", priority = 3, ignore(ascii_case))]
NaMaybe,
#[regex("[x-à]+", ignore(ascii_case))]
NaRange,
Expand Down

0 comments on commit 6c2086d

Please sign in to comment.