Merge pull request trishume#3 from trishume/lazy-regex-compile

Binary dumps and lazy regex compilation
cobalt-org · Jun 11, 2016 · e62fd57 · e62fd57
2 parents 435903c + 37dc4dc
commit e62fd57
Show file tree

Hide file tree

Showing 13 changed files with 245 additions and 35 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 target
 Cargo.lock
+# Ignore the dumps for now until the format stabilizes
+assets/*
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,3 +12,7 @@ lazy_static = "0.2.1"
 bitflags = "^0.4"
 plist = "0.0.13"
 rustc-serialize = "0.3"
+bincode = "0.5"
+
+# [profile.release]
+# debug = true
diff --git a/Readme.md b/Readme.md
@@ -9,7 +9,7 @@ It is currently mostly complete and can parse, interpret and highlight based on
 ## Goals
 
 - Work with many languages (accomplished through using existing grammar formats)
-- Be super fast
+- Be super fast, both in terms of highlighting and startup time
 - API that is both easy to use, and allows use in fancy text editors with piece tables and incremental re-highlighting and the like
 - Expose internals of the parsing process so text editors can do things like cache parse states and use semantic info for code intelligence
 - High quality highlighting, supporting things like heredocs and complex syntaxes (like Rust's).
@@ -29,6 +29,8 @@ There's currently an example program called `syncat` that prints one of the sour
 - [x] Write an interpreter for the `.sublime-syntax` state machine that highlights an incoming iterator of file lines into an iterator of scope-annotated text.
 - [x] Parse TextMate/Sublime Text theme files
 - [x] Highlight a scope-annotated iterator into a colour-annotated iterator for display.
+- [x] Ability to dump loaded packages as a binary file and load them with lazy regex compilation for fast startup times.
+- [ ] Bundle dumped default syntaxes into the library binary so library users don't need an assets folder with Sublime Text packages.
 - [ ] Add nice API wrappers for simple use cases. The base APIs are designed for deep high performance integration with arbitrary text editor data structures.
 - [ ] Make syncat a better demo, and maybe more demo programs
 - [ ] Document the API better and make things private that don't need to be public
@@ -45,7 +47,7 @@ Currently `syntect` is reasonably fast but not as fast as it could be. The follo
 - [x] Determine if a scope is a prefix of another scope using bit manipulation in only a few instructions
 - [ ] Cache regex matches to reduce number of times oniguruma is asked to search a line
 - [ ] Cache scope lookups to reduce how much scope matching has to be done to highlight a list of scope operations
-- [ ] Lazily compile regexes so startup time isn't taken compiling a thousand regexs for Actionscript that nobody will use
+- [x] Lazily compile regexes so startup time isn't spent compiling a thousand regexes for ActionScript that nobody will use
 - [ ] Use a better regex engine, perhaps the in progress fancy-regex crate
 
 The current perf numbers are below. These numbers should get better once I implement more of the things above, but they're on par with many other text editors.
@@ -57,6 +59,7 @@ The current perf numbers are below. These numbers should get better once I imple
     - Vim is instantaneous but that isn't a fair comparison since vim's highlighting is far more basic than the other editors.
     - These comparisons aren't totally fair, except the one to Sublime Text since that is using the same theme and the same complex definition for ES6 syntax.
 - ~220ms to load and link all the syntax definitions in the default Sublime package set. This is ~60% regex compilation and ~35% YAML parsing.
+    - but only ~16ms to load and link all the syntax definitions from a pre-made binary dump with lazy regex compilation.
 - ~1.9ms to parse and highlight the 30 line 791 character `testdata/highlight_test.erb` file. This works out to around 16,000 lines/second or 422 kilobytes/second.
 - ~250ms end to end for `syncat` to start, load the definitions, highlight the test file and shut down. This is mostly spent loading.
 

diff --git a/benches/loading.rs b/benches/loading.rs
@@ -6,10 +6,27 @@ use test::Bencher;
 
 use syntect::package_set::PackageSet;
 
+/// Measures how long it takes to load a package set from the pre-made
+/// binary dump at `assets/default_newlines.packdump`.
+#[bench]
+fn bench_load_syntax_dump(b: &mut Bencher) {
+    b.iter(|| {
+        // Unwrap so that a missing or corrupt dump fails the benchmark loudly
+        // instead of silently timing the error path of a failed load.
+        let ps = PackageSet::from_dump_file("assets/default_newlines.packdump").unwrap();
+        test::black_box(&ps);
+    });
+}
+
 #[bench]
 fn bench_load_syntaxes(b: &mut Bencher) {
     b.iter(|| {
         let mut ps = PackageSet::new();
         ps.load_syntaxes("testdata/Packages", false).unwrap();
     });
 }
+
+#[bench]
+fn bench_link_syntaxes(b: &mut Bencher) {
+    let mut ps = PackageSet::new();
+    ps.load_syntaxes("testdata/Packages", false).unwrap();
+    b.iter(|| {
+        ps.link_syntaxes();
+    });
+}
diff --git a/examples/gendata.rs b/examples/gendata.rs
@@ -0,0 +1,12 @@
+extern crate syntect;
+use syntect::package_set::PackageSet;
+
+/// Generates the binary syntax dumps under `assets/` from the packages in
+/// `testdata/Packages`, once with and once without newline-aware loading.
+fn main() {
+    // The dumps are git-ignored, so the `assets` directory may not exist on a
+    // fresh checkout; create it up front so that writing the dumps cannot
+    // fail just because the directory is missing.
+    std::fs::create_dir_all("assets").unwrap();
+
+    // Dump with `lines_include_newline` set to true.
+    let mut ps = PackageSet::new();
+    ps.load_syntaxes("testdata/Packages", true).unwrap();
+    ps.dump_to_file("assets/default_newlines.packdump").unwrap();
+
+    // Dump with `lines_include_newline` set to false.
+    let mut ps2 = PackageSet::new();
+    ps2.load_syntaxes("testdata/Packages", false).unwrap();
+    ps2.dump_to_file("assets/default_nonewlines.packdump").unwrap();
+}
diff --git a/examples/syncat.rs b/examples/syncat.rs
@@ -4,7 +4,7 @@ use syntect::package_set::PackageSet;
 use syntect::parser::*;
 use syntect::theme::highlighter::*;
 use syntect::theme::style::*;
-use syntect::util::{as_24_bit_terminal_escaped, debug_print_ops};
+use syntect::util::as_24_bit_terminal_escaped;
 
 use std::io::BufReader;
 use std::io::BufRead;

diff --git a/src/dumps.rs b/src/dumps.rs
@@ -0,0 +1,48 @@
+use bincode::SizeLimit;
+use bincode::rustc_serialize::*;
+use std::fs::File;
+use std::io::BufReader;
+use package_set::PackageSet;
+use std::path::Path;
+
+impl PackageSet {
+    /// Serializes this package set into an in-memory binary dump.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the set is marked as linked (`is_linked` is true) or if
+    /// encoding fails.
+    pub fn dump_binary(&self) -> Vec<u8> {
+        assert!(!self.is_linked);
+        encode(self, SizeLimit::Infinite).unwrap()
+    }
+
+    /// Serializes this package set into a newly created file at `path`.
+    ///
+    /// Unlike `dump_binary`, this does not check `is_linked`; the set should
+    /// be dumped before `link_syntaxes` is called on it.
+    ///
+    /// Returns an error if the file cannot be created or written, or if
+    /// encoding fails.
+    pub fn dump_to_file<P: AsRef<Path>>(&self, path: P) -> EncodingResult<()> {
+        use std::io::{BufWriter, Write};
+
+        let f = try!(File::create(path).map_err(EncodingError::IoError));
+        // Buffer the output so the encoder's many small writes are batched
+        // instead of each one hitting the file directly.
+        let mut writer = BufWriter::new(f);
+        try!(encode_into(self, &mut writer, SizeLimit::Infinite));
+        // Flush explicitly: dropping a BufWriter discards any write error.
+        writer.flush().map_err(EncodingError::IoError)
+    }
+
+    /// Returns a fully loaded and linked package set from
+    /// a binary dump. Panics if the dump is invalid.
+    pub fn from_binary(v: Vec<u8>) -> PackageSet {
+        let mut ps: PackageSet = decode(&v[..]).unwrap();
+        ps.link_syntaxes();
+        ps
+    }
+
+    /// Returns a fully loaded and linked package set from
+    /// a binary dump file.
+    ///
+    /// Returns an error if the file cannot be opened or read, or if its
+    /// contents cannot be decoded.
+    pub fn from_dump_file<P: AsRef<Path>>(path: P) -> DecodingResult<PackageSet> {
+        let f = try!(File::open(path).map_err(DecodingError::IoError));
+        let mut reader = BufReader::new(f);
+        let mut ps: PackageSet = try!(decode_from(&mut reader, SizeLimit::Infinite));
+        // Link here, as `from_binary` does, so the returned set matches the
+        // documented "linked" contract and is ready for parsing.
+        ps.link_syntaxes();
+        Ok(ps)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use package_set::PackageSet;
+
+    /// Round-trips the test packages through an in-memory binary dump and
+    /// checks that the reloaded set is complete and linked.
+    #[test]
+    fn can_dump_and_load() {
+        let mut ps = PackageSet::new();
+        ps.load_syntaxes("testdata/Packages", false).unwrap();
+
+        let bin = ps.dump_binary();
+        let ps2 = PackageSet::from_binary(bin);
+        assert_eq!(ps.syntaxes.len(), ps2.syntaxes.len());
+        // `from_binary` promises a linked set, so verify that as well.
+        assert!(ps2.is_linked);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -5,6 +5,7 @@ extern crate regex_syntax;
 #[macro_use]
 extern crate lazy_static;
 extern crate plist;
+extern crate bincode;
 extern crate rustc_serialize;
 #[macro_use]
 extern crate bitflags;
@@ -15,3 +16,4 @@ pub mod scope;
 pub mod parser;
 pub mod theme;
 pub mod util;
+pub mod dumps;
diff --git a/src/package_set.rs b/src/package_set.rs
@@ -15,9 +15,10 @@ use std::mem;
 use std::rc::Rc;
 use std::ascii::AsciiExt;
 
-#[derive(Debug)]
+#[derive(Debug, RustcEncodable, RustcDecodable)]
 pub struct PackageSet {
     pub syntaxes: Vec<SyntaxDefinition>,
+    pub is_linked: bool,
 }
 
 #[derive(Debug)]
@@ -65,15 +66,20 @@ fn load_syntax_file(p: &Path,
 
 impl PackageSet {
     pub fn new() -> PackageSet {
-        PackageSet { syntaxes: Vec::new() }
+        PackageSet {
+            syntaxes: Vec::new(),
+            is_linked: true,
+        }
     }
 
     /// Convenience constructor calling `new` and then `load_syntaxes` on the resulting set
     /// defaults to lines given not including newline characters, see the
     /// `load_syntaxes` method docs for an explanation as to why this might not be the best.
+    /// It also links all the syntaxes together, see `link_syntaxes` for what that means.
     pub fn load_from_folder<P: AsRef<Path>>(folder: P) -> Result<PackageSet, PackageError> {
         let mut ps = Self::new();
         try!(ps.load_syntaxes(folder, false));
+        ps.link_syntaxes();
         Ok(ps)
     }
 
@@ -89,7 +95,8 @@ impl PackageSet {
         Ok(themes)
     }
 
-    /// Loads all the .sublime-syntax files in a folder and links them together into this package set
+    /// Loads all the .sublime-syntax files in a folder into this package set.
+    /// It does not link the syntaxes, in case you want to serialize this package set.
     ///
     /// The `lines_include_newline` parameter is used to work around the fact that Sublime Text normally
     /// passes line strings including newline characters (`\n`) to its regex engine. This results in many
@@ -104,14 +111,14 @@ impl PackageSet {
                                          folder: P,
                                          lines_include_newline: bool)
                                          -> Result<(), PackageError> {
+        self.is_linked = false;
         for entry in WalkDir::new(folder) {
             let entry = try!(entry.map_err(|e| PackageError::WalkDir(e)));
             if entry.path().extension().map(|e| e == "sublime-syntax").unwrap_or(false) {
                 // println!("{}", entry.path().display());
                 self.syntaxes.push(try!(load_syntax_file(entry.path(), lines_include_newline)));
             }
         }
-        self.link_syntaxes();
         Ok(())
     }
 
@@ -141,13 +148,18 @@ impl PackageSet {
         self.syntaxes.iter().find(|&s| lower == s.name.to_ascii_lowercase())
     }
 
-    fn link_syntaxes(&mut self) {
+    /// This links all the syntaxes in this set directly with pointers for performance purposes.
+    /// It is necessary to do this before parsing anything with these syntaxes.
+    /// However, it is not possible to serialize a package set that has been linked,
+    /// which is why it isn't done by default, except by the load_from_folder constructor.
+    pub fn link_syntaxes(&mut self) {
         for syntax in self.syntaxes.iter() {
             for (_, ref context_ptr) in syntax.contexts.iter() {
                 let mut mut_ref = context_ptr.borrow_mut();
                 self.link_context(syntax, mut_ref.deref_mut());
             }
         }
+        self.is_linked = true;
     }
 
     fn link_context(&self, syntax: &SyntaxDefinition, context: &mut Context) {
@@ -182,7 +194,7 @@ impl PackageSet {
             Direct(_) => None,
         };
         if let Some(new_context) = maybe_new_context {
-            let mut new_ref = Direct(Rc::downgrade(&new_context));
+            let mut new_ref = Direct(LinkerLink { link: Rc::downgrade(&new_context) });
             mem::swap(context_ref, &mut new_ref);
         }
     }
@@ -213,6 +225,7 @@ impl PackageSet {
         Ok(try!(read_plist(try!(Self::read_file(path)))))
     }
 
+    /// Loads a theme given a path to a .tmTheme file
     pub fn get_theme<P: AsRef<Path>>(path: P) -> Result<Theme, PackageError> {
         Ok(try!(Theme::parse_settings(try!(Self::read_plist(path.as_ref())))))
     }

diff --git a/src/parser.rs b/src/parser.rs
@@ -56,7 +56,11 @@ impl ParseState {
 
         let mut regions = Region::with_capacity(8);
         let mut match_cache: MatchCache = Vec::with_capacity(64); // TODO find best capacity
-        while self.parse_next_token(line, &mut match_start, &mut match_cache, &mut regions, &mut res) {
+        while self.parse_next_token(line,
+                                    &mut match_start,
+                                    &mut match_cache,
+                                    &mut regions,
+                                    &mut res) {
         }
         return res;
     }
@@ -84,11 +88,12 @@ impl ParseState {
                         overall_index += 1;
                         continue; // we've determined this pattern doesn't match this line anywhere
                     }
-                    let pat_context = pat_context_ptr.borrow();
-                    let match_pat = pat_context.match_at(pat_index);
+                    let mut pat_context = pat_context_ptr.borrow_mut();
+                    let mut match_pat = pat_context.match_at_mut(pat_index);
 
                     // println!("{:?}", match_pat.regex_str);
-                    let refs_regex = if cur_level.captures.is_some() && match_pat.regex.is_none() {
+                    match_pat.ensure_compiled_if_possible();
+                    let refs_regex = if cur_level.captures.is_some() && match_pat.has_captures {
                         let &(ref region, ref s) = cur_level.captures.as_ref().unwrap();
                         Some(match_pat.compile_with_refs(region, s))
                     } else {
@@ -104,7 +109,7 @@ impl ParseState {
                                                             line.len(),
                                                             onig::SEARCH_OPTION_NONE,
                                                             Some(regions));
-                    if overall_index >= cache.len() { // add it to the cache
+                    if overall_index >= cache.len() {
                         cache.push(matched.is_some());
                     } // TODO update the cache even if this is another time over
                     if let Some(match_start) = matched {
@@ -149,7 +154,8 @@ impl ParseState {
                     line: &str,
                     reg_match: RegexMatch,
                     level_context_ptr: ContextPtr,
-                    ops: &mut Vec<(usize, ScopeStackOp)>) -> bool {
+                    ops: &mut Vec<(usize, ScopeStackOp)>)
+                    -> bool {
         let (match_start, match_end) = reg_match.regions.pos(0).unwrap();
         let context = reg_match.context.borrow();
         let pat = context.match_at(reg_match.pat_index);

diff --git a/src/scope.rs b/src/scope.rs
@@ -5,6 +5,7 @@ use std::sync::Mutex;
 use std::fmt;
 use std::str::FromStr;
 use std::u64;
+use rustc_serialize::{Encodable, Encoder, Decodable, Decoder};
 
 lazy_static! {
     pub static ref SCOPE_REPO: Mutex<ScopeRepository> = Mutex::new(ScopeRepository::new());
@@ -129,7 +130,7 @@ impl Scope {
         (shifted & 0xFFFF) as u16
     }
 
-    #[inline(always)]
+    #[inline]
     fn missing_atoms(self) -> u32 {
         let trail = if self.b == 0 {
             self.a.trailing_zeros() + 64
@@ -145,6 +146,13 @@ impl Scope {
         8 - self.missing_atoms()
     }
 
+    /// Builds the textual form of this scope by asking the global scope
+    /// repository. This takes the `SCOPE_REPO` lock, so avoid calling it
+    /// frequently.
+    fn build_string(self) -> String {
+        SCOPE_REPO.lock().unwrap().to_string(self)
+    }
+
     /// Tests if this scope is a prefix of another scope.
     /// Note that the empty scope is always a prefix.
     ///
@@ -202,20 +210,32 @@ impl FromStr for Scope {
 
 impl fmt::Display for Scope {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let repo = SCOPE_REPO.lock().unwrap();
-        let s = repo.to_string(*self);
+        let s = self.build_string();
         write!(f, "{}", s)
     }
 }
 
 impl fmt::Debug for Scope {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let repo = SCOPE_REPO.lock().unwrap();
-        let s = repo.to_string(*self);
+        let s = self.build_string();
         write!(f, "<{}>", s)
     }
 }
 
+/// A scope is encoded as the string produced by `build_string`.
+impl Encodable for Scope {
+    fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
+        s.emit_str(&self.build_string())
+    }
+}
+
+/// A scope is decoded by reading a string and parsing it with `Scope::new`.
+impl Decodable for Scope {
+    fn decode<D: Decoder>(d: &mut D) -> Result<Scope, D::Error> {
+        let s: String = try!(d.read_str());
+        // Report an unparsable scope string as a decoding error rather than
+        // panicking, so a corrupt dump surfaces through the returned Result.
+        match Scope::new(&s) {
+            Ok(scope) => Ok(scope),
+            Err(e) => Err(d.error(&format!("invalid scope {:?}: {:?}", s, e))),
+        }
+    }
+}
+
 impl ScopeStack {
     pub fn new() -> ScopeStack {
         ScopeStack { scopes: Vec::new() }