Skip to content

Commit

Permalink
Merge branch 'perf'
Browse files Browse the repository at this point in the history
  • Loading branch information
d0rianb committed Apr 26, 2024
2 parents 5cc09fc + c0cc7b8 commit eb1a2bd
Show file tree
Hide file tree
Showing 9 changed files with 3,874 additions and 19 deletions.
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ exclude = ["*.rtf", ".idea"]
lto = true
opt-level = 3

[profile.profiling]
inherits = "dev"
opt-level = 0
debug = true

[dependencies]
derivative = "2.2.0"
serde = { version = "1.0", optional = true, features = ["derive"] }
4 changes: 4 additions & 0 deletions bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Profile the `bench` example with samply (https://github.com/mstange/samply).
# Uses the custom `profiling` cargo profile (dev + debug symbols) so the
# recorded stacks are symbolicated.
set -euo pipefail  # abort if the build fails instead of profiling a stale binary

cargo build --profile profiling --example bench
samply record -r 10000 target/profiling/examples/bench
17 changes: 17 additions & 0 deletions examples/bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use std::time::Instant;
use rtf_parser::header::StyleSheet;
use rtf_parser::lexer::Lexer;
use rtf_parser::parser::Parser;

fn main() {
    let timer = Instant::now();
    let document;
    {
        // Keep the source text and the token vec in an inner scope so their
        // drop cost is included in the measured interval, as before.
        let source = include_str!("../resources/tests/file-sample_500kB.rtf");
        let tokens = Lexer::scan(source).expect("Invalid RTF content");
        document = Parser::new(tokens).parse().unwrap();
    }
    let elapsed = timer.elapsed();
    // Touch the parsed document so the whole pipeline cannot be optimized out.
    assert_eq!(document.header.stylesheet, StyleSheet::new());
    println!("Elapsed: {:.2?}", elapsed);
}
4 changes: 3 additions & 1 deletion examples/load_file.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
extern crate rtf_parser;
use rtf_parser::lexer::Lexer;
use rtf_parser::parser::Parser;
use rtf_parser::header::StyleSheet;

/// Example: lex and parse a large sample RTF file, then sanity-check the header.
fn main() {
    // The previous binding of `rtf_text` (test-file.rtf) was dead code —
    // immediately shadowed — so only the 500 kB sample is loaded here.
    let rtf_text = include_str!("../resources/tests/file-sample_500kB.rtf");
    let tokens = Lexer::scan(rtf_text).expect("Invalid RTF content");
    let doc = Parser::new(tokens).parse();
    // The sample file declares no stylesheet, so the parsed one equals the default.
    assert_eq!(doc.unwrap().header.stylesheet, StyleSheet::new());
}
3,788 changes: 3,788 additions & 0 deletions resources/tests/file-sample_500kB.rtf

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,20 @@ pub struct Lexer;
impl Lexer {
pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
let src = src.trim(); // Sanitize src : Trim the leading whitespaces
let mut it = src.chars();

let mut tokens: Vec<Token> = vec![];
let mut slice_start_index = 0;
let mut current_index = 0;
let mut previous_char = ' ';
while let Some(c) = it.next() {

// This is faster than using an iterator
let len = src.len();
let bytes = src.as_bytes();
let mut i = 0;
while i < len {
let c = bytes[i] as char;
i += 1;

match c {
// TODO: Handle char over code 127 for escaped chars
// Handle Escaped chars : "\" + any charcode below 127
Expand All @@ -59,9 +67,7 @@ impl Lexer {
let slice = &src[slice_start_index..current_index];
// Get the corresponding token(s)
let slice_tokens = Self::tokenize(slice)?;
for slice_token in slice_tokens {
tokens.push(slice_token);
}
tokens.extend_from_slice(&slice_tokens.as_slice());
slice_start_index = current_index;
}
}
Expand Down
46 changes: 35 additions & 11 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,29 @@ impl fmt::Display for ParserError {
ParserError::InvalidFontIdentifier(property) => write!(f, "Invalid font identifier : {:?}", property),
ParserError::InvalidColorIdentifier(property) => write!(f, "Invalid color identifier : {:?}", property),
ParserError::NoMoreToken => write!(f, "No more token to parse"),
ParserError::ValueCastError(T) => write!(f, "Unable to cast i32 to {T}"),
ParserError::ValueCastError(_type) => write!(f, "Unable to cast i32 to {_type}"),
};
}
}

/// Streaming RTF parser over a pre-lexed token list.
pub struct Parser<'a> {
    // Tokens produced by the lexer; consumed entries are overwritten with
    // `Token::Empty` rather than removed (avoids `Vec::remove`'s O(n) shift).
    tokens: Vec<Token<'a>>,
    // One flag per token: `true` once the token at that index has been consumed.
    parsed_item : Vec<bool>,
    // Index of the next token to examine.
    cursor: usize,
}

impl<'a> Parser<'a> {
pub fn new(tokens: Vec<Token<'a>>) -> Self {
return Self { tokens, cursor: 0 };
return Self {
parsed_item: vec![false; tokens.len()],
tokens,
cursor: 0
};
}

/// Returns references to every not-yet-consumed token.
///
/// Consumed tokens are overwritten with `Token::Empty` instead of being
/// deleted (deleting would shift the vec each time), so they are filtered
/// out here.
pub fn get_tokens(&self) -> Vec<&Token> {
    return self
        .tokens
        .iter()
        .filter(|t| **t != Token::Empty)
        .collect();
}

fn check_document_validity(&self) -> Result<(), ParserError> {
Expand Down Expand Up @@ -110,8 +120,18 @@ impl<'a> Parser<'a> {
// Parse the body
let mut painter_stack: Vec<Painter> = vec![Painter::default()];
let mut paragraph = Paragraph::default();
let mut it = self.tokens.iter();
while let Some(token) = it.next() {
let len = self.tokens.len();
let mut i = 0;

while i < len {
if self.parsed_item[i] {
// The item already has been parsed
i += 1;
continue;
}
let token = &self.tokens[i];
i += 1;

match token {
Token::OpeningBracket => {
painter_stack.push(Painter::default());
Expand Down Expand Up @@ -169,6 +189,7 @@ impl<'a> Parser<'a> {
Token::IgnorableDestination => {
return Err(ParserError::IgnorableDestinationParsingError);
}
Token::Empty => panic!("Try to parse an empty token, this should not happen")
};
}
return Ok(document);
Expand Down Expand Up @@ -204,21 +225,25 @@ impl<'a> Parser<'a> {
return self.get_token_at(self.cursor);
}

#[inline]
fn consume_token_at(&mut self, index: usize) -> Option<Token<'a>> {
if self.tokens.is_empty() {
if self.tokens.is_empty() || index >= self.tokens.len() {
return None;
}
Some(self.tokens.remove(index))
// PERF : vec.remove can require reallocation unlike this method
self.cursor += 1;
self.parsed_item[index] = true;
return Some(mem::replace(&mut self.tokens[index], Token::Empty));
}

/// Consume the token currently under the cursor.
/// Cursor advancement is handled inside `consume_token_at`.
fn consume_next_token(&mut self) -> Option<Token<'a>> {
    self.consume_token_at(self.cursor)
}

// Consume token from cursor to <reference-token>
fn _consume_tokens_until(&mut self, reference_token: Token<'a>) -> Vec<Token<'a>> {
fn _consume_tokens_until(&mut self, reference_token: &Token<'a>) -> Vec<Token<'a>> {
let mut ret = vec![];
let token_type_id = mem::discriminant(&reference_token);
let token_type_id = mem::discriminant(reference_token);
while let Some(token) = self.consume_next_token() {
let type_id = mem::discriminant(&token);
ret.push(token);
Expand Down Expand Up @@ -259,7 +284,6 @@ impl<'a> Parser<'a> {
self.cursor = 0; // Reset the cursor
let mut header = RtfHeader::default();
while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {

// Manage the case where there is CRLF between { and control_word
// {\n /*/ignoregroup }
let mut i = 0;
Expand Down Expand Up @@ -485,7 +509,7 @@ pub mod tests {
let tokens = Lexer::scan(rtf).unwrap();
let mut parser = Parser::new(tokens);
let document = parser.parse().unwrap();
assert_eq!(parser.tokens, vec![]); // Should have consumed all the tokens
assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
assert_eq!(document.header, RtfHeader::default());
}

Expand All @@ -497,7 +521,7 @@ pub mod tests {
let tokens = Lexer::scan(rtf).unwrap();
let mut parser = Parser::new(tokens);
let document = parser.parse().unwrap();
assert_eq!(parser.tokens, vec![]); // Should have consumed all the tokens
assert_eq!(parser.get_tokens(), Vec::<&Token>::new()); // Should have consumed all the tokens
assert_eq!(document.header, RtfHeader::default());
}

Expand Down
2 changes: 2 additions & 0 deletions src/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub enum Token<'a> {
CRLF, // Line-return \n
IgnorableDestination, // \*\ <destination-name>
ControlSymbol(ControlSymbol<'a>),
Empty // Used by the parser for optimization
}

#[allow(dead_code)]
Expand All @@ -30,6 +31,7 @@ impl<'a> fmt::Debug for Token<'a> {
Token::CRLF => write!(f, "CRLF"),
Token::IgnorableDestination => write!(f, "IgnorableDestination"),
Token::ControlSymbol(symbol) => write!(f, "ControlSymbol : {:?}", symbol),
Token::Empty => write!(f, "Empty"),
};
}
}
Expand Down
11 changes: 9 additions & 2 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,15 @@ impl StrUtils for str {
// ex : split_first_whitespace("\b I'm a bold string") -> ("\b", "I'm a bold string")
fn split_first_whitespace(&self) -> (&str, &str) {
let mut first_whitespace_index = 0;
let mut it = self.chars();
while let Some(c) = it.next() {

let len = self.len();
let bytes = self.as_bytes();
let mut i = 0;
// Faster than an iterator
while i < len {
let c = bytes[i] as char;
i += 1;

if c.is_whitespace() {
break;
} else {
Expand Down

0 comments on commit eb1a2bd

Please sign in to comment.