diff --git a/.gitignore b/.gitignore index e44e8f90..a9d37c56 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ target Cargo.lock -# Ignore the dumps for now until the format stabilizes -assets/* diff --git a/Cargo.toml b/Cargo.toml index d0f8c165..88c7f421 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ bitflags = "^0.4" plist = "0.0.13" rustc-serialize = "0.3" bincode = "0.5" +flate2 = "*" # [profile.release] # debug = true diff --git a/Readme.md b/Readme.md index 65b0d5b2..1e9096e4 100644 --- a/Readme.md +++ b/Readme.md @@ -6,13 +6,16 @@ If you are writing a text editor (or something else needing highlighting) in Rus It is currently mostly complete and can parse, interpret and highlight based on Sublime Text syntax and `tmTheme` files. -## Goals +## Features/Goals -- Work with many languages (accomplished through using existing grammar formats) -- Be super fast, both in terms of highlighting and startup time -- API that is both easy to use, and allows use in fancy text editors with piece tables and incremental re-highlighting and the like -- Expose internals of the parsing process so text editors can do things like cache parse states and use semantic info for code intelligence -- High quality highlighting, supporting things like heredocs and complex syntaxes (like Rust's). +- [x] Work with many languages (accomplished through using existing grammar formats) +- [ ] Highlight super quickly, as fast as Sublime Text (not there yet but matching most editors) +- [x] Load up quickly, currently in around 23ms but could potentially be even faster. +- [ ] Include an easy-to-use API for basic cases +- [x] API allows use in fancy text editors with piece tables and incremental re-highlighting and the like. +- [x] Expose internals of the parsing process so text editors can do things like cache parse states and use semantic info for code intelligence +- [x] High quality highlighting, supporting things like heredocs and complex syntaxes (like Rust's). 
+- [x] Include a compressed dump of all the default syntax definitions in the library binary so users don't have to manage a folder of syntaxes. ## Screenshots @@ -30,7 +33,7 @@ There's currently an example program called `syncat` that prints one of the sour - [x] Parse TextMate/Sublime Text theme files - [x] Highlight a scope-annotated iterator into a colour-annotated iterator for display. - [x] Ability to dump loaded packages as binary file and load them with lazy regex compilation for fast start up times. -- [ ] Bundle dumped default syntaxes into the library binary so library users don't need an assets folder with Sublime Text packages. +- [x] Bundle dumped default syntaxes into the library binary so library users don't need an assets folder with Sublime Text packages. - [ ] Add nice API wrappers for simple use cases. The base APIs are designed for deep high performance integration with arbitrary text editor data structures. - [ ] Make syncat a better demo, and maybe more demo programs - [ ] Document the API better and make things private that don't need to be public @@ -59,7 +62,7 @@ The current perf numbers are below. These numbers should get better once I imple - Vim is instantaneous but that isn't a fair comparison since vim's highlighting is far more basic than the other editors. - These comparisons aren't totally fair, except the one to Sublime Text since that is using the same theme and the same complex defintion for ES6 syntax. - ~220ms to load and link all the syntax definitions in the default Sublime package set. This is ~60% regex compilation and ~35% YAML parsing. - - but only ~16ms to load and link all the syntax definitions from a pre-made binary dump with lazy regex compilation. + - but only ~23ms to load and link all the syntax definitions from an internal pre-made binary dump with lazy regex compilation. - ~1.9ms to parse and highlight the 30 line 791 character `testdata/highlight_test.erb` file. 
This works out to around 16,000 lines/second or 422 kilobytes/second. - ~250ms end to end for `syncat` to start, load the definitions, highlight the test file and shut down. This is mostly spent loading. diff --git a/assets/default_newlines.packdump b/assets/default_newlines.packdump new file mode 100644 index 00000000..f201e244 Binary files /dev/null and b/assets/default_newlines.packdump differ diff --git a/assets/default_nonewlines.packdump b/assets/default_nonewlines.packdump new file mode 100644 index 00000000..a1b5e2f5 Binary files /dev/null and b/assets/default_nonewlines.packdump differ diff --git a/benches/loading.rs b/benches/loading.rs index 7dbfa732..9f58eed1 100644 --- a/benches/loading.rs +++ b/benches/loading.rs @@ -14,6 +14,14 @@ fn bench_load_syntax_dump(b: &mut Bencher) { }); } +#[bench] +fn bench_load_internal_dump(b: &mut Bencher) { + b.iter(|| { + let ps = PackageSet::load_defaults_newlines(); + test::black_box(&ps); + }); +} + #[bench] fn bench_load_syntaxes(b: &mut Bencher) { b.iter(|| { diff --git a/examples/syncat.rs b/examples/syncat.rs index 2acb1856..7a4ab6ce 100644 --- a/examples/syncat.rs +++ b/examples/syncat.rs @@ -12,7 +12,7 @@ use std::path::Path; use std::fs::File; fn main() { - let ps = PackageSet::load_from_folder("testdata/Packages").unwrap(); + let ps = PackageSet::load_defaults_nonewlines(); let highlighter = Highlighter::new(PackageSet::get_theme("testdata/spacegray/base16-ocean.\ dark.tmTheme") .unwrap()); diff --git a/src/dumps.rs b/src/dumps.rs index 1fa167ef..1539310b 100644 --- a/src/dumps.rs +++ b/src/dumps.rs @@ -1,25 +1,62 @@ use bincode::SizeLimit; use bincode::rustc_serialize::*; use std::fs::File; -use std::io::BufReader; +use std::io::{BufReader, BufWriter}; use package_set::PackageSet; use std::path::Path; +use flate2::write::ZlibEncoder; +use flate2::read::ZlibDecoder; +use flate2::Compression; impl PackageSet { + /// Instantiates a new package set from a binary dump of + /// Sublime Text's default open source 
syntax definitions and then links it. + /// These dumps are included in this library's binary for convenience. + /// This method loads the version for parsing line strings with no `\n` characters at the end. + /// + /// This is the recommended way of creating a package set for + /// non-advanced use cases. It is also significantly faster than loading the YAML files. + /// + /// Note that you can load additional syntaxes after doing this, + /// you'll just have to link again. If you want you can even + /// use the fact that SyntaxDefinitions are serializable with + /// the bincode crate to cache dumps of additional syntaxes yourself. + pub fn load_defaults_nonewlines() -> PackageSet { + let mut ps = Self::from_binary(include_bytes!("../assets/default_nonewlines.packdump")); + ps.link_syntaxes(); + ps + } + + /// Same as `load_defaults_nonewlines` but for parsing line strings with newlines at the end. + /// These are separate methods because thanks to linker garbage collection, only the serialized + /// dumps for the method(s) you call will be included in the binary (each is ~200kb for now). 
+ pub fn load_defaults_newlines() -> PackageSet { + let mut ps = Self::from_binary(include_bytes!("../assets/default_newlines.packdump")); + ps.link_syntaxes(); + ps + } + pub fn dump_binary(&self) -> Vec { assert!(!self.is_linked); - encode(self, SizeLimit::Infinite).unwrap() + let mut v = Vec::new(); + { + let mut encoder = ZlibEncoder::new(&mut v, Compression::Best); + encode_into(self, &mut encoder, SizeLimit::Infinite).unwrap(); + } + v } pub fn dump_to_file>(&self, path: P) -> EncodingResult<()> { - let mut f = try!(File::create(path).map_err(EncodingError::IoError)); - encode_into(self, &mut f, SizeLimit::Infinite) + let f = BufWriter::new(try!(File::create(path).map_err(EncodingError::IoError))); + let mut encoder = ZlibEncoder::new(f, Compression::Best); + encode_into(self, &mut encoder, SizeLimit::Infinite) } /// Returns a fully loaded and linked package set from /// a binary dump. Panics if the dump is invalid. - pub fn from_binary(v: Vec) -> PackageSet { - let mut ps: PackageSet = decode(&v[..]).unwrap(); + pub fn from_binary(v: &[u8]) -> PackageSet { + let mut decoder = ZlibDecoder::new(v); + let mut ps: PackageSet = decode_from(&mut decoder, SizeLimit::Infinite).unwrap(); ps.link_syntaxes(); ps } @@ -28,8 +65,8 @@ impl PackageSet { /// a binary dump file. 
pub fn from_dump_file>(path: P) -> DecodingResult { let f = try!(File::open(path).map_err(DecodingError::IoError)); - let mut reader = BufReader::new(f); - decode_from(&mut reader, SizeLimit::Infinite) + let mut decoder = ZlibDecoder::new(BufReader::new(f)); + decode_from(&mut decoder, SizeLimit::Infinite) } } @@ -42,7 +79,7 @@ mod tests { ps.load_syntaxes("testdata/Packages", false).unwrap(); let bin = ps.dump_binary(); - let ps2 = PackageSet::from_binary(bin); + let ps2 = PackageSet::from_binary(&bin[..]); assert_eq!(ps.syntaxes.len(), ps2.syntaxes.len()); } } diff --git a/src/lib.rs b/src/lib.rs index a2c28d86..da331455 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ extern crate bincode; extern crate rustc_serialize; #[macro_use] extern crate bitflags; +extern crate flate2; pub mod syntax_definition; pub mod yaml_load; pub mod package_set; diff --git a/src/package_set.rs b/src/package_set.rs index 030209d8..8b59be3e 100644 --- a/src/package_set.rs +++ b/src/package_set.rs @@ -15,6 +15,13 @@ use std::mem; use std::rc::Rc; use std::ascii::AsciiExt; +/// A package set holds a bunch of syntaxes and manages +/// loading them and the crucial operation of *linking*. +/// +/// Linking replaces the references between syntaxes with direct +/// pointers. See `link_syntaxes` for more. +/// Linking, followed by adding more unlinked syntaxes with `load_syntaxes` +/// and then linking again is allowed. #[derive(Debug, RustcEncodable, RustcDecodable)] pub struct PackageSet { pub syntaxes: Vec, @@ -152,6 +159,7 @@ impl PackageSet { /// It is necessary to do this before parsing anything with these syntaxes. /// However, it is not possible to serialize a package set that has been linked, /// which is why it isn't done by default, except by the load_from_folder constructor. + /// This operation is idempotent, but takes time even on already linked package sets. 
pub fn link_syntaxes(&mut self) { for syntax in self.syntaxes.iter() { for (_, ref context_ptr) in syntax.contexts.iter() {