Skip to content

Commit

Permalink
fix(parser): fixed bug where tokenizer looped on unknown characters
Browse files Browse the repository at this point in the history
Includes a new type of error and a test.
  • Loading branch information
AurumTheEnd committed Mar 8, 2024
1 parent 07632bc commit bb19b5a
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 3 deletions.
2 changes: 2 additions & 0 deletions src/parser/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ pub enum TokenizeError {
MissingClosingCurlyBrace { position: usize, vicinity: String },
#[error("No name literal `{{}}` encountered on position {position} near '{vicinity}'")]
EmptyLiteralName { position: usize, vicinity: String },
#[error("Unknown symbol {symbol} encountered on position {position}'")]
UnknownSymbolError { position: usize, symbol: String },
#[error("Unexpected whitespace encountered in the middle of operator")]
UnexpectedWhitespace,
}
Expand Down
45 changes: 42 additions & 3 deletions src/parser/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn tokenize_level(
let intermediate_token = IntermediateToken::try_from(buffer.as_str());

match intermediate_token {
None => consume_while_literal(input, &mut result),
None => consume_while_literal(input, &mut result)?,
Some(token) => {
let (final_token, pattern_length) = match token {
IntermediateToken::And { pattern } => {
Expand Down Expand Up @@ -141,30 +141,49 @@ fn consume_until_brace(
Ok((FinalToken::Literal(literal_buffer), 0))
}

fn consume_while_literal(input: &mut TokenizerInput, result: &mut Vec<FinalToken>) {
/// Consumes consecutive characters from `input` and appends them to `result`
/// as a single `FinalToken::Literal`.
///
/// Reading stops at the first peeked character matched by `SHOULD_END_LITERAL`
/// (that character is left unconsumed) or when the input runs out.
///
/// # Errors
///
/// Returns `TokenizeError::UnknownSymbolError` when not a single character could
/// be consumed. The caller reaches this function only after failing to match a
/// token, so an empty literal means the current symbol is neither a token nor
/// part of a literal. The error carries the offending character, or
/// `EOL_VICINITY` when the input was already exhausted.
fn consume_while_literal(
    input: &mut TokenizerInput,
    result: &mut Vec<FinalToken>,
) -> Result<(), TokenizeError> {
    input.iterator.reset_peek();

    let mut name = String::new();
    // The character that terminated the literal, if any; `None` means end of input.
    let mut terminator: Option<char> = None;

    while let Some(&ch) = input.iterator.peek() {
        if SHOULD_END_LITERAL.is_match(&ch.to_string()) {
            terminator = Some(ch);
            break;
        }

        name.push(ch);
        input.next();
    }

    // Nothing was consumed: report the symbol instead of pushing an empty literal.
    if name.is_empty() {
        let symbol = terminator.map_or_else(|| EOL_VICINITY.to_string(), |ch| ch.to_string());

        return Err(TokenizeError::UnknownSymbolError {
            position: input.current_position(),
            symbol,
        });
    }

    result.push(FinalToken::Literal(name));
    Ok(())
}

#[cfg(test)]
mod tests {
use crate::parser::error::TokenizeError::{
EmptyLiteralName, MissingClosingCurlyBrace, UnexpectedClosingCurlyBrace,
UnexpectedClosingParenthesis,
UnexpectedClosingParenthesis, UnknownSymbolError,
};
use crate::parser::error::EOL_VICINITY;
use crate::parser::structs::FinalToken::*;
use crate::parser::utils::LITERAL_IDENTIFIER;

use super::*;

Expand Down Expand Up @@ -198,6 +217,26 @@ mod tests {
Ok(())
}

/// A lone character that is neither a known token nor matched by
/// `LITERAL_IDENTIFIER` must make `tokenize` fail with `UnknownSymbolError`
/// pointing at position 0.
#[test]
fn test_unknownchar_minimal_nok() -> Result<(), TokenizeError> {
    let unknown = "@";

    // Sanity check on the fixture itself: the symbol must be outside both the
    // token set and the literal pattern, otherwise the test proves nothing.
    assert!(!all_tokens().contains(unknown));
    assert!(!LITERAL_IDENTIFIER.is_match(unknown));

    let outcome = tokenize(unknown);

    assert!(outcome.is_err());
    assert_eq!(
        outcome.unwrap_err(),
        UnknownSymbolError {
            position: 0,
            symbol: "@".to_string(),
        }
    );

    Ok(())
}

#[test]
fn test_charvar_and_singlespace_ok() -> Result<(), TokenizeError> {
let actual = tokenize("a & b")?;
Expand Down

0 comments on commit bb19b5a

Please sign in to comment.