From a3f9b1e8c3540b3c323a71874a69068c6acb855a Mon Sep 17 00:00:00 2001 From: AurumTheEnd <47597303+aurumtheend@users.noreply.github.com> Date: Fri, 17 May 2024 15:41:27 +0200 Subject: [PATCH] refactor(parser): made PATTERN_SET regex lazy_static --- src/parser/structs/intermediate_token.rs | 26 +++++------------------- src/parser/tokenize.rs | 4 ++-- src/parser/utils/mod.rs | 2 +- src/parser/utils/regex.rs | 21 +++++++++++++++++-- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/parser/structs/intermediate_token.rs b/src/parser/structs/intermediate_token.rs index f6ca202..fab2c24 100644 --- a/src/parser/structs/intermediate_token.rs +++ b/src/parser/structs/intermediate_token.rs @@ -1,6 +1,4 @@ -use regex::RegexSet; - -use crate::parser::utils::LITERAL_IDENTIFIER; +use crate::parser::utils::PATTERN_SET; #[derive(PartialEq, Debug)] pub enum IntermediateToken<'a> { @@ -136,26 +134,12 @@ impl<'a> IntermediateToken<'a> { // TODO make a trait method pub fn try_from(value: &'a str) -> Option<Self> { - let input = Self::ALL_TOKEN_PATTERNS_FROM_LONGEST; - - // escape the pattern so that e.g. 
"^" is not treated as regex, but as a literal character for the And operation - let set = RegexSet::new(input.iter().map(|pattern| { - format!( - r"(?i)^{}{}", - regex::escape(pattern), - if LITERAL_IDENTIFIER.is_match(pattern) { - "([^-_a-zA-Z0-9]|$)" - } else { - "" - } - ) - })) - .unwrap(); - - let pattern_or_no_match = set + let patterns = Self::ALL_TOKEN_PATTERNS_FROM_LONGEST; + + let pattern_or_no_match = PATTERN_SET .matches(value) .into_iter() - .map(|index| &input[index]) + .map(|index| &patterns[index]) .next(); pattern_or_no_match.map(|value| Self::from(value)) diff --git a/src/parser/tokenize.rs b/src/parser/tokenize.rs index ae9cb8f..d9a01b1 100644 --- a/src/parser/tokenize.rs +++ b/src/parser/tokenize.rs @@ -183,7 +183,7 @@ mod tests { }; use crate::parser::error::EOL_VICINITY; use crate::parser::structs::FinalToken::*; - use crate::parser::utils::LITERAL_IDENTIFIER; + use regex::Regex; use super::*; @@ -233,7 +233,7 @@ mod tests { // test sanity assert!(!all_tokens().contains(input)); - assert!(!LITERAL_IDENTIFIER.is_match(input)); + assert!(!Regex::new(r"[-_a-zA-Z0-9]+").unwrap().is_match(input)); let actual = tokenize(input); let expected_err = UnknownSymbolError { diff --git a/src/parser/utils/mod.rs b/src/parser/utils/mod.rs index d0bef93..4963c54 100644 --- a/src/parser/utils/mod.rs +++ b/src/parser/utils/mod.rs @@ -1,6 +1,6 @@ pub use peek_until::peek_until_n; pub use pop::pop_n_left; -pub use regex::{LITERAL_IDENTIFIER, SHOULD_END_LITERAL}; +pub use regex::{PATTERN_SET, SHOULD_END_LITERAL}; pub use trim_whitespace::trim_whitespace_left; mod peek_until; diff --git a/src/parser/utils/regex.rs b/src/parser/utils/regex.rs index c4c830e..53afc91 100644 --- a/src/parser/utils/regex.rs +++ b/src/parser/utils/regex.rs @@ -1,6 +1,23 @@ -use regex::Regex; +use crate::parser::structs::IntermediateToken; +use regex::{Regex, RegexSet}; lazy_static::lazy_static! 
{ pub static ref SHOULD_END_LITERAL: Regex = Regex::new(r"[^-_a-zA-Z0-9]").unwrap(); - pub static ref LITERAL_IDENTIFIER: Regex = Regex::new(r"[-_a-zA-Z0-9]+").unwrap(); + static ref LITERAL_IDENTIFIER: Regex = Regex::new(r"[-_a-zA-Z0-9]+").unwrap(); + + pub static ref PATTERN_SET: RegexSet = RegexSet::new(IntermediateToken::ALL_TOKEN_PATTERNS_FROM_LONGEST + .iter() + .map(|pattern| { + format!( + r"(?i)^{}{}", + // escape the pattern so that e.g. "^" is not treated as regex, but as a literal character for the And operation + regex::escape(pattern), + if LITERAL_IDENTIFIER.is_match(pattern) { + "([^-_a-zA-Z0-9]|$)" + } else { + "" + } + ) + })) + .unwrap(); }