Skip to content

Commit

Permalink
Merge branch 'perf'
Browse files Browse the repository at this point in the history
  • Loading branch information
d0rianb committed Apr 26, 2024
2 parents 5cc09fc + c0cc7b8 commit eb1a2bd
Show file tree
Hide file tree
Showing 9 changed files with 3,874 additions and 19 deletions.
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ exclude = ["*.rtf", ".idea"]
lto = true
opt-level = 3

[profile.profiling]
inherits = "dev"
opt-level = 0
debug = true

[dependencies]
derivative = "2.2.0"
serde = { version = "1.0", optional = true, features = ["derive"] }
4 changes: 4 additions & 0 deletions bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Profile the `bench` example with samply (https://github.com/mstange/samply).
# Uses the custom `profiling` cargo profile (dev + debug symbols) so the
# recorded stacks are symbolicated.
set -euo pipefail  # abort if the build fails instead of profiling a stale binary

cargo build --profile profiling --example bench
samply record -r 10000 target/profiling/examples/bench
17 changes: 17 additions & 0 deletions examples/bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use std::time::Instant;
use rtf_parser::header::StyleSheet;
use rtf_parser::lexer::Lexer;
use rtf_parser::parser::Parser;

fn main() {
    let timer = Instant::now();
    let document;
    {
        // Keep the source text and the token vec in an inner scope so their
        // drop cost is included in the measured interval, as before.
        let source = include_str!("../resources/tests/file-sample_500kB.rtf");
        let tokens = Lexer::scan(source).expect("Invalid RTF content");
        document = Parser::new(tokens).parse().unwrap();
    }
    let elapsed = timer.elapsed();
    // Touch the parsed document so the whole pipeline cannot be optimized out.
    assert_eq!(document.header.stylesheet, StyleSheet::new());
    println!("Elapsed: {:.2?}", elapsed);
}
4 changes: 3 additions & 1 deletion examples/load_file.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
extern crate rtf_parser;
use rtf_parser::lexer::Lexer;
use rtf_parser::parser::Parser;
use rtf_parser::header::StyleSheet;

/// Example: lex and parse a large sample RTF file, then sanity-check the header.
fn main() {
    // The previous binding of `rtf_text` (test-file.rtf) was dead code —
    // immediately shadowed — so only the 500 kB sample is loaded here.
    let rtf_text = include_str!("../resources/tests/file-sample_500kB.rtf");
    let tokens = Lexer::scan(rtf_text).expect("Invalid RTF content");
    let doc = Parser::new(tokens).parse();
    // The sample file declares no stylesheet, so the parsed one equals the default.
    assert_eq!(doc.unwrap().header.stylesheet, StyleSheet::new());
}
3,788 changes: 3,788 additions & 0 deletions resources/tests/file-sample_500kB.rtf

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,20 @@ pub struct Lexer;
impl Lexer {
pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
let src = src.trim(); // Sanitize src : Trim the leading whitespaces
let mut it = src.chars();

let mut tokens: Vec<Token> = vec![];
let mut slice_start_index = 0;
let mut current_index = 0;
let mut previous_char = ' ';
while let Some(c) = it.next() {

// This is faster than using an iterator
let len = src.len();
let bytes = src.as_bytes();
let mut i = 0;
while i < len {
let c = bytes[i] as char;
i += 1;

match c {
// TODO: Handle char over code 127 for escaped chars
// Handle Escaped chars : "\" + any charcode below 127
Expand All @@ -59,9 +67,7 @@ impl Lexer {
let slice = &src[slice_start_index..current_index];
// Get the corresponding token(s)
let slice_tokens = Self::tokenize(slice)?;
for slice_token in slice_tokens {
tokens.push(slice_token);
}
tokens.extend_from_slice(&slice_tokens.as_slice());
slice_start_index = current_index;
}
}
Expand Down
46 changes: 35 additions & 11 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,29 @@ impl fmt::Display for ParserError {
ParserError::InvalidFontIdentifier(property) => write!(f, "Invalid font identifier : {:?}", property),
ParserError::InvalidColorIdentifier(property) => write!(f, "Invalid color identifier : {:?}", property),
ParserError::NoMoreToken => write!(f, "No more token to parse"),
ParserError::ValueCastError(T) => write!(f, "Unable to cast i32 to {T}"),
ParserError::ValueCastError(_type) => write!(f, "Unable to cast i32 to {_type}"),
};
}
}

/// Streaming RTF parser over a pre-lexed token list.
pub struct Parser<'a> {
    // Tokens produced by the lexer; consumed entries are overwritten with
    // `Token::Empty` rather than removed (avoids `Vec::remove`'s O(n) shift).
    tokens: Vec<Token<'a>>,
    // One flag per token: `true` once the token at that index has been consumed.
    parsed_item : Vec<bool>,
    // Index of the next token to examine.
    cursor: usize,
}

impl<'a> Parser<'a> {
pub fn new(tokens: Vec<Token<'a>>) -> Self {
return Self { tokens, cursor: 0 };
return Self {
parsed_item: vec![false; tokens.len()],
tokens,
cursor: 0
};
}

/// Returns references to every not-yet-consumed token.
///
/// Consumed tokens are overwritten with `Token::Empty` instead of being
/// deleted (deleting would shift the vec each time), so they are filtered
/// out here.
pub fn get_tokens(&self) -> Vec<&Token> {
    return self
        .tokens
        .iter()
        .filter(|t| **t != Token::Empty)
        .collect();
}

fn check_document_validity(&self) -> Result<(), ParserError> {
Expand Down Expand Up @@ -110,8 +120,18 @@ impl<'a> Parser<'a> {
// Parse the body
let mut painter_stack: Vec<Painter> = vec![Painter::default()];
let mut paragraph = Paragraph::default();
let mut it = self.tokens.iter();
while let Some(token) = it.next() {
let len = self.tokens.len();
let mut i = 0;

while i < len {
if self.parsed_item[i] {
// The item already has been parsed
i += 1;
continue;
}
let token = &self.tokens[i];
i += 1;

match token {
Token::OpeningBracket => {
painter_stack.push(Painter::default());
Expand Down Expand Up @@ -169,6 +189,7 @@ impl<'a> Parser<'a> {
Token::IgnorableDestination => {
return Err(ParserError::IgnorableDestinationParsingError);
}
Token::Empty => panic!("Try to parse an empty token, this should not happen")
};
}
return Ok(document);
Expand Down Expand Up @@ -204,21 +225,25 @@ impl<'a> Parser<'a> {
return self.get_token_at(self.cursor);
}

#[inline]
fn consume_token_at(&mut self, index: usize) -> Option<Token<'a>> {
if self.tokens.is_empty() {
if self.tokens.is_empty() || index >= self.tokens.len() {
return None;
}
Some(self.tokens.remove(index))
// PERF : vec.remove can require reallocation unlike this method
self.cursor += 1;
self.parsed_item[index] = true;
return Some(mem::replace(&mut self.tokens[index], Token::Empty));
}

/// Consume the token currently under the cursor.
/// Cursor advancement is handled inside `consume_token_at`.
fn consume_next_token(&mut self) -> Option<Token<'a>> {
    self.consume_token_at(self.cursor)
}

// Consume token from cursor to <reference-token>
fn _consume_tokens_until(&mut self, reference_token: Token<'a>) -> Vec<Token<'a>> {
fn _consume_tokens_until(&mut self, reference_token: &Token<'a>) -> Vec<Token<'a>> {
let mut ret = vec![];
let token_type_id = mem::discriminant(&reference_token);
let token_type_id = mem::discriminant(reference_token);
while let Some(token) = self.consume_next_token() {
let type_id = mem::discriminant(&token);
ret.push(token);
Expand Down Expand Up @@ -259,7 +284,6 @@ impl<'a> Parser<'a> {
self.cursor = 0; // Reset the cursor
let mut header = RtfHeader::default();
while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {

// Manage the case where there is CRLF between { and control_word
// {\n /*/ignoregroup }
let mut i = 0;
Expand Down Expand Up @@ -485,7 +509,7 @@ pub mod tests {
let tokens = Lexer::scan(rtf).unwrap();
let mut parser = Parser::new(tokens);
let document = parser.parse().unwrap();
assert_eq!(parser.tokens, vec![]); // Should have consumed all the tokens
assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
assert_eq!(document.header, RtfHeader::default());
}

Expand All @@ -497,7 +521,7 @@ pub mod tests {
let tokens = Lexer::scan(rtf).unwrap();
let mut parser = Parser::new(tokens);
let document = parser.parse().unwrap();
assert_eq!(parser.tokens, vec![]); // Should have consumed all the tokens
assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
assert_eq!(document.header, RtfHeader::default());
}

Expand Down
2 changes: 2 additions & 0 deletions src/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub enum Token<'a> {
CRLF, // Line-return \n
IgnorableDestination, // \*\ <destination-name>
ControlSymbol(ControlSymbol<'a>),
Empty // Used by the parser for optimization
}

#[allow(dead_code)]
Expand All @@ -30,6 +31,7 @@ impl<'a> fmt::Debug for Token<'a> {
Token::CRLF => write!(f, "CRLF"),
Token::IgnorableDestination => write!(f, "IgnorableDestination"),
Token::ControlSymbol(symbol) => write!(f, "ControlSymbol : {:?}", symbol),
Token::Empty => write!(f, "Empty"),
};
}
}
Expand Down
11 changes: 9 additions & 2 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,15 @@ impl StrUtils for str {
// ex : split_first_whitespace("\b I'm a bold string") -> ("\b", "I'm a bold string")
fn split_first_whitespace(&self) -> (&str, &str) {
let mut first_whitespace_index = 0;
let mut it = self.chars();
while let Some(c) = it.next() {

let len = self.len();
let bytes = self.as_bytes();
let mut i = 0;
// Faster than an iterator
while i < len {
let c = bytes[i] as char;
i += 1;

if c.is_whitespace() {
break;
} else {
Expand Down

0 comments on commit eb1a2bd

Please sign in to comment.