Skip to content

Commit

Permalink
fix Lexer::clone leak and UB + tests
Browse files Browse the repository at this point in the history
`Lexer::clone` shouldn't clone the inner `ManuallyDrop`, because doing so clones the inner value, which is moved out in `Lexer::next`.

This causes use-after-free if the lexer is cloned after the last-returned token is dropped, especially if the token contains an overridden implementation of `Clone` (such as `Rc`) that tries to read the dropped data.

It causes a memory leak if the token contains a heap-allocated value, because cloning makes a new allocation. This allocation is in the `ManuallyDrop` and it's guaranteed to be overridden before the call to `ManuallyDrop::take`, so it's never freed.

Another thing: #263 (make `Lexer` implement `Copy`) probably should be added (referencing here because it looks like the issue has been forgotten).
  • Loading branch information
Jakobeha committed May 2, 2024
1 parent 142788e commit a9ad714
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ where
fn clone(&self) -> Self {
Lexer {
extras: self.extras.clone(),
token: self.token.clone(),
token: ManuallyDrop::new(None),
..*self
}
}
Expand Down
62 changes: 62 additions & 0 deletions tests/tests/clone.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
use std::cell::Cell;

use logos::Logos as _;
use logos_derive::Logos;

#[derive(Logos, Clone, Debug, PartialEq)]
pub enum Token {
#[token("a", |_| Evil::default())]
Evil(Evil),
}

#[derive(Debug, Default, PartialEq)]
pub struct Evil(Box<Cell<u8>>);

impl Clone for Evil {
fn clone(&self) -> Self {
self.0.set(self.0.get() + 1);
Self::default()
}
}

#[test]
fn test_it_works_without_cloning() {
let mut lexer = Token::lexer("aaa");
assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default()))));
assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default()))));
assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default()))));
assert_eq!(lexer.next(), None);
}

#[test]
fn test_clone_ub() {
let mut lexer = Token::lexer("a");
assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default()))));

// In logos 0.14.0, this causes use-after-free (UB),
// because `Clone` dereferences the value returned by the last call to `lexer.next()`,
// which got deallocated.
// A real-life example where this could happen is with `Rc`.
// Note that it may still pass `cargo test`, it will fail with Miri.
let mut lexer2 = lexer.clone();

assert_eq!(lexer2.next(), None);
}

#[test]
fn test_clone_leak() {
let mut lexer = Token::lexer("a");
let Some(Ok(Token::Evil(evil))) = lexer.next() else {
panic!("Expected Token::Evil");
};
assert_eq!(evil.0.get(), 0);

// In logos 0.14.0, this causes a memory leak because `evil` is cloned with `lexer`.
// This produces `evil.0.get() == 1`. It will fail even on `cargo test`.
let mut lexer2 = lexer.clone();
assert_eq!(evil.0.get(), 0);

assert_eq!(lexer2.next(), None);
let _ = evil.clone();
assert_eq!(evil.0.get(), 1);
}
118 changes: 118 additions & 0 deletions tests/tests/string.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use logos::{Lexer, Logos as _};
use logos_derive::Logos;

#[derive(Logos, Clone, Debug, PartialEq)]
#[logos(skip " ")]
pub enum Token {
#[regex(r#""([^"\\]+|\\.)*""#, lex_single_line_string)]
String(String),
}

#[test]
fn test_it_works_without_cloning() {
let mut lexer = Token::lexer(r#""Hello, world!" "foo😀bar\nbaz \x3F\u{1234}""#);
assert_eq!(
lexer.next(),
Some(Ok(Token::String("Hello, world!".to_string())))
);
assert_eq!(
lexer.next(),
Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string())))
);
assert_eq!(lexer.next(), None);
}

#[test]
fn test_it_works_with_cloning() {
let mut lexer = Token::lexer(r#""Hello, world!" "foo😀bar\nbaz \x3F\u{1234}""#);
let mut lexer2 = lexer.clone();
assert_eq!(
lexer2.next(),
Some(Ok(Token::String("Hello, world!".to_string())))
);
let mut lexer3 = lexer2.clone();
let mut lexer4 = lexer3.clone();
assert_eq!(
lexer3.next(),
Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string())))
);
assert_eq!(
lexer4.next(),
Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string())))
);
assert_eq!(lexer4.next(), None);
let mut lexer5 = lexer.clone();
assert_eq!(
lexer5.next(),
Some(Ok(Token::String("Hello, world!".to_string())))
);
assert_eq!(
lexer5.next(),
Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string())))
);
assert_eq!(
lexer.next(),
Some(Ok(Token::String("Hello, world!".to_string())))
);
assert_eq!(lexer5.next(), None);
assert_eq!(
lexer2.next(),
Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string())))
);
assert_eq!(lexer2.next(), None);
assert_eq!(lexer3.next(), None);
assert_eq!(
lexer.next(),
Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string())))
);
assert_eq!(lexer.next(), None);
}

// Not important
pub fn lex_single_line_string(lexer: &mut Lexer<Token>) -> Result<String, ()> {
let mut string = String::new();
let mut chars = lexer.slice()[1..lexer.slice().len() - 1].chars();
while let Some(c) = chars.next() {
match c {
'\\' => {
let c = chars.next().ok_or(())?;
match c {
'\n' => {}
'n' => string.push('\n'),
'r' => string.push('\r'),
't' => string.push('\t'),
'0' => string.push('\0'),
'\'' | '"' | '\\' => string.push(c),
'x' => {
let mut hex = String::new();
hex.push(chars.next().ok_or(())?);
hex.push(chars.next().ok_or(())?);
let code = u8::from_str_radix(&hex, 16).map_err(|_| ())?;
if code > 0x7F {
return Err(());
}
string.push(code as char);
}
'u' => {
if chars.next() != Some('{') {
return Err(());
}
let mut hex = String::new();
for _ in 0..6 {
let c = chars.next().ok_or(())?;
if c == '}' {
break;
}
hex.push(c);
}
let code = u32::from_str_radix(&hex, 16).map_err(|_| ())?;
string.push(char::from_u32(code).ok_or(())?);
}
_ => return Err(()),
}
}
_ => string.push(c),
}
}
Ok(string)
}

0 comments on commit a9ad714

Please sign in to comment.