Skip to content

Commit

Permalink
refactor(parser): made PATTERN_SET regex lazy_static
Browse files Browse the repository at this point in the history
  • Loading branch information
AurumTheEnd committed May 17, 2024
1 parent 1b56f44 commit 4981124
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 26 deletions.
26 changes: 5 additions & 21 deletions src/parser/structs/intermediate_token.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use regex::RegexSet;

use crate::parser::utils::LITERAL_IDENTIFIER;
use crate::parser::utils::PATTERN_SET;

#[derive(PartialEq, Debug)]
pub enum IntermediateToken<'a> {
Expand Down Expand Up @@ -136,26 +134,12 @@ impl<'a> IntermediateToken<'a> {

// TODO make a trait method
pub fn try_from(value: &'a str) -> Option<IntermediateToken> {
let input = Self::ALL_TOKEN_PATTERNS_FROM_LONGEST;

// escape the pattern so that e.g. "^" is not treated as regex, but as a literal character for the And operation
let set = RegexSet::new(input.iter().map(|pattern| {
format!(
r"(?i)^{}{}",
regex::escape(pattern),
if LITERAL_IDENTIFIER.is_match(pattern) {
"([^-_a-zA-Z0-9]|$)"
} else {
""
}
)
}))
.unwrap();

let pattern_or_no_match = set
let patterns = Self::ALL_TOKEN_PATTERNS_FROM_LONGEST;

let pattern_or_no_match = PATTERN_SET
.matches(value)
.into_iter()
.map(|index| &input[index])
.map(|index| &patterns[index])
.next();

pattern_or_no_match.map(|value| Self::from(value))
Expand Down
4 changes: 2 additions & 2 deletions src/parser/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ mod tests {
};
use crate::parser::error::EOL_VICINITY;
use crate::parser::structs::FinalToken::*;
use crate::parser::utils::LITERAL_IDENTIFIER;
use regex::Regex;

use super::*;

Expand Down Expand Up @@ -233,7 +233,7 @@ mod tests {

// test sanity
assert!(!all_tokens().contains(input));
assert!(!LITERAL_IDENTIFIER.is_match(input));
assert!(!Regex::new(r"[-_a-zA-Z0-9]+").unwrap().is_match(input));

let actual = tokenize(input);
let expected_err = UnknownSymbolError {
Expand Down
2 changes: 1 addition & 1 deletion src/parser/utils/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
pub use peek_until::peek_until_n;
pub use pop::pop_n_left;
pub use regex::{LITERAL_IDENTIFIER, SHOULD_END_LITERAL};
pub use regex::{PATTERN_SET, SHOULD_END_LITERAL};
pub use trim_whitespace::trim_whitespace_left;

mod peek_until;
Expand Down
21 changes: 19 additions & 2 deletions src/parser/utils/regex.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
use regex::Regex;
use crate::parser::structs::IntermediateToken;
use regex::{Regex, RegexSet};

lazy_static::lazy_static! {
// Matches a single character that is NOT in the literal-identifier alphabet
// (`-`, `_`, ASCII letters, ASCII digits).
pub static ref SHOULD_END_LITERAL: Regex = Regex::new(r"[^-_a-zA-Z0-9]").unwrap();
pub static ref LITERAL_IDENTIFIER: Regex = Regex::new(r"[-_a-zA-Z0-9]+").unwrap();
// Matches a run of one or more literal-identifier characters. The pattern is unanchored,
// so `is_match` is true for any string that CONTAINS at least one such character.
static ref LITERAL_IDENTIFIER: Regex = Regex::new(r"[-_a-zA-Z0-9]+").unwrap();

// Set of regexes built once, on first access: one per entry of
// `IntermediateToken::ALL_TOKEN_PATTERNS_FROM_LONGEST`, in the same order, so an index
// reported by the set can be used to index that pattern list.
// Each regex is case-insensitive (`(?i)`) and anchored to the start of the input (`^`).
pub static ref PATTERN_SET: RegexSet = RegexSet::new(IntermediateToken::ALL_TOKEN_PATTERNS_FROM_LONGEST
.iter()
.map(|pattern| {
format!(
r"(?i)^{}{}",
// escape the pattern so that e.g. "^" is matched as a literal character (the And operation)
// instead of being interpreted as regex syntax
regex::escape(pattern),
// patterns containing identifier characters must be followed by a non-identifier
// character or the end of input; other patterns get no extra suffix
if LITERAL_IDENTIFIER.is_match(pattern) {
"([^-_a-zA-Z0-9]|$)"
} else {
""
}
)
}))
// panics during initialization if any generated pattern fails to compile
.unwrap();
}

0 comments on commit 4981124

Please sign in to comment.