diff --git a/Cargo.toml b/Cargo.toml
index a6e4374..574ae52 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,13 +11,17 @@ Fast implementation of a trie data structure
 """
 
 [dev-dependencies]
-rstest = "0.22.0"
 criterion = "0.5.1"
-radix_trie = "0.2.1"
-paste = "1.0.15"
+divan = "0.1.14"
 once_cell = "1.19.0"
+paste = "1.0.15"
+phf = { version = "0.11.2", default-features = false }
+radix_trie = "0.2.1"
+rstest = "0.22.0"
 
 [[bench]]
 name = "bench"
 harness = false
 
+[build-dependencies]
+phf_codegen = "0.11.2"
diff --git a/benches/bench.rs b/benches/bench.rs
index d83eb0c..26ab344 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -1,13 +1,10 @@
 use std::collections::{HashMap, HashSet};
 
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use divan::black_box;
 use once_cell::sync::Lazy;
 use radix_trie::Trie;
 use trie_hard::TrieHard;
 
-/// This is a rip off of the benchmark suite for for
-/// [`radix_trie`](https://github.com/michaelsproul/rust_radix_trie/blob/master/Cargo.toml)
-
 const OW_1984: &str = include_str!("../data/1984.txt");
 const SUN_RISING: &str = include_str!("../data/sun-rising.txt");
 const RANDOM: &str = include_str!("../data/random.txt");
@@ -16,13 +13,184 @@ const RANDOM: &str = include_str!("../data/random.txt");
 const HEADERS: &str = include_str!("../data/headers.txt");
 static HEADERS_REV: Lazy<Vec<String>> = Lazy::new(|| {
     HEADERS
-        .split(|c: char| c.is_whitespace())
+        .lines()
         .collect::<HashSet<_>>()
         .into_iter()
         .map(|s| s.chars().rev().collect::<String>())
         .collect()
 });
 
+// Compile-time generated PHF Sets
+include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
+
+const PERCENT: &[i32] = &[100, 75, 50, 25, 10, 5, 2, 1];
+
+fn main() {
+    divan::main();
+}
+
+/* -------------------------------------------------------------------------- */
+/*                                 BENCHMARKS                                  */
+/* -------------------------------------------------------------------------- */
+
+#[divan::bench(args = args())]
+fn trie_get(bencher: divan::Bencher, input: &Input) {
+    bencher
+        .with_inputs(|| {
+            let words = match input.size {
+                Size::Header => get_header_text(),
+                Size::Big => get_big_text(),
+                Size::Small => get_small_text(),
+            };
+            let trie = make_trie(&words);
+            (generate_samples(&words, input.percent), trie)
+        })
+        .bench_values(|(samples, trie): (Vec<&str>, TrieHard<'_, &str>)| {
+            samples
+                .iter()
+                .filter_map(|w| trie.get(black_box(&w[..])))
+                .count()
+        });
+}
+
+#[divan::bench(args = args())]
+fn radix_trie_get(bencher: divan::Bencher, input: &Input) {
+    bencher
+        .with_inputs(|| {
+            let words = match input.size {
+                Size::Header => get_header_text(),
+                Size::Big => get_big_text(),
+                Size::Small => get_small_text(),
+            };
+            let trie = make_radix_trie(&words);
+            (generate_samples(&words, input.percent), trie)
+        })
+        .bench_values(|(samples, trie): (Vec<&str>, Trie<&str, usize>)| {
+            samples
+                .iter()
+                .filter_map(|w| trie.get(black_box(&w[..])))
+                .count()
+        });
+}
+
+#[divan::bench(args = args())]
+fn hashmap_get(bencher: divan::Bencher, input: &Input) {
+    bencher
+        .with_inputs(|| {
+            let words = match input.size {
+                Size::Header => get_header_text(),
+                Size::Big => get_big_text(),
+                Size::Small => get_small_text(),
+            };
+            let hashmap = make_hashmap(&words);
+            (generate_samples(&words, input.percent), hashmap)
+        })
+        .bench_values(
+            |(samples, hashmap): (Vec<&str>, HashMap<&str, &str>)| {
+                samples
+                    .iter()
+                    .filter_map(|w| hashmap.get(black_box(&w[..])))
+                    .count()
+            },
+        );
+}
+
+#[divan::bench(args = args())]
+fn phf_get(bencher: divan::Bencher, input: &Input) {
+    bencher
+        .with_inputs(|| {
+            let (words, phf) = match input.size {
+                Size::Header => (get_header_text(), &HEADERS_PHF),
+                Size::Big => (get_big_text(), &BIG_PHF),
+                Size::Small => (get_small_text(), &SMALL_PHF),
+            };
+            (generate_samples(&words, input.percent), phf)
+        })
+        .bench_values(|(samples, phf): (Vec<&str>, &phf::Set<&str>)| {
+            samples
+                .iter()
+                .filter_map(|w| phf.get_key(black_box(&w[..])))
+                .count()
+        });
+}
+
+#[divan::bench(args = &[Size::Big, Size::Small])]
+fn trie_insert(bencher: divan::Bencher, size: &Size) {
+    bencher
+        .with_inputs(|| match size {
+            Size::Header => get_header_text(),
+            Size::Big => get_big_text(),
+            Size::Small => get_small_text(),
+        })
+        .bench_values(|words: Vec<&str>| make_trie(black_box(&words)));
+}
+
+#[divan::bench(args = &[Size::Big, Size::Small])]
+fn radix_trie_insert(bencher: divan::Bencher, size: &Size) {
+    bencher
+        .with_inputs(|| match size {
+            Size::Header => get_header_text(),
+            Size::Big => get_big_text(),
+            Size::Small => get_small_text(),
+        })
+        .bench_values(|words: Vec<&str>| make_radix_trie(black_box(&words)));
+}
+
+#[divan::bench(args = &[Size::Big, Size::Small])]
+fn hashmap_insert(bencher: divan::Bencher, size: &Size) {
+    bencher
+        .with_inputs(|| match size {
+            Size::Header => get_header_text(),
+            Size::Big => get_big_text(),
+            Size::Small => get_small_text(),
+        })
+        .bench_values(|words: Vec<&str>| make_hashmap(black_box(&words)));
+}
+
+/* -------------------------------------------------------------------------- */
+/*                                   INPUTS                                   */
+/* -------------------------------------------------------------------------- */
+
+#[derive(Debug)]
+enum Size {
+    Header,
+    Big,
+    Small,
+}
+
+struct Input {
+    size: Size,
+    percent: i32,
+}
+
+impl std::fmt::Display for Input {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // divan sorts by lexicographic order, so we add padding to the percentage
+        f.write_fmt(format_args!("{:?} - {:03}%", self.size, self.percent))
+    }
+}
+
+fn args() -> impl Iterator<Item = Input> {
+    PERCENT
+        .iter()
+        .map(|p| Input {
+            size: Size::Header,
+            percent: *p,
+        })
+        .chain(PERCENT.iter().map(|p| Input {
+            size: Size::Big,
+            percent: *p,
+        }))
+        .chain(PERCENT.iter().map(|p| Input {
+            size: Size::Small,
+            percent: *p,
+        }))
+}
+
+/* -------------------------------------------------------------------------- */
+/*                                   HELPERS                                  */
+/* -------------------------------------------------------------------------- */
+
 fn get_big_text() -> Vec<&'static str> {
     OW_1984
         .split(|c: char| c.is_whitespace())
@@ -67,48 +235,6 @@ fn make_radix_trie<'a>(words: &[&'a str]) -> Trie<&'a str, usize> {
     trie
 }
 
-fn trie_insert_big(b: &mut Criterion) {
-    let words = get_big_text();
-    b.bench_function("trie hard insert - big", |b| {
-        b.iter(|| make_trie(black_box(&words)))
-    });
-}
-
-fn trie_insert_small(b: &mut Criterion) {
-    let words = get_small_text();
-    b.bench_function("trie hard insert - small", |b| {
-        b.iter(|| make_trie(black_box(&words)))
-    });
-}
-
-fn radix_trie_insert_big(b: &mut Criterion) {
-    let words = get_big_text();
-    b.bench_function("radix trie insert - big", |b| {
-        b.iter(|| make_radix_trie(black_box(&words)))
-    });
-}
-
-fn radix_trie_insert_small(b: &mut Criterion) {
-    let words = get_small_text();
-    b.bench_function("radix trie insert - small", |b| {
-        b.iter(|| make_radix_trie(black_box(&words)))
-    });
-}
-
-fn hashmap_insert_big(b: &mut Criterion) {
-    let words = get_big_text();
-    b.bench_function("hashmap insert - big", |b| {
-        b.iter(|| make_hashmap(black_box(&words)))
-    });
-}
-
-fn hashmap_insert_small(b: &mut Criterion) {
-    let words = get_small_text();
-    b.bench_function("hashmap insert - small", |b| {
-        b.iter(|| make_hashmap(black_box(&words)))
-    });
-}
-
 fn generate_samples<'a>(hits: &[&'a str], hit_percent: i32) -> Vec<&'a str> {
     let roulette_inc = hit_percent as f64 / 100.;
     let mut roulette = 0.;
@@ -126,118 +252,3 @@ fn generate_samples<'a>(hits: &[&'a str], hit_percent: i32) -> Vec<&'a str> {
 
     result
 }
-
-macro_rules! bench_percents_impl {
-    ( [ $( ($size:expr, $percent:expr ), )+ ] ) => {$(
-        paste::paste! {
-            // Trie Hard
-            fn [< trie_get_ $size _ $percent >] (b: &mut Criterion) {
-                let words = [< get_ $size _text >]();
-                let trie = make_trie(&words);
-                let samples = generate_samples(&words, $percent);
-                b.bench_function(
-                    concat!(
-                        "trie hard get - ",
-                        stringify!($size),
-                        " - ",
-                        stringify!($percent),
-                        "%"
-                    ), |b| {
-                    b.iter(|| {
-                        samples.iter()
-                            .filter_map(|w| trie.get(black_box(&w[..])))
-                            .count()
-                    })
-                });
-            }
-
-            // Radix Trie
-            fn [< radix_trie_get_ $size _ $percent >] (b: &mut Criterion) {
-                let words = [< get_ $size _text >]();
-                let trie = make_radix_trie(&words);
-                let samples = generate_samples(&words, $percent);
-                b.bench_function(concat!(
-                    "radix trie get - ",
-                    stringify!($size),
-                    " - ",
-                    stringify!($percent),
-                    "%"
-                ), |b| {
-                    b.iter(|| {
-                        samples.iter()
-                            .filter_map(|w| trie.get(black_box(&w[..])))
-                            .count()
-                    })
-                });
-            }
-
-            // Hashmap
-            fn [< hashmap_get_ $size _ $percent >](b: &mut Criterion) {
-                let words = [< get_ $size _text >] ();
-                let hashmap = make_hashmap(&words);
-                let samples = generate_samples(&words, $percent);
-                b.bench_function(concat!(
-                    "hashmap get - ",
-                    stringify!($size),
-                    " - ",
-                    stringify!($percent),
-                    "%"
-                ), |b| {
-                    b.iter(|| {
-                        samples.iter()
-                            .filter_map(|w| hashmap.get(black_box(&w[..])))
-                            .count()
-                    })
-                });
-            }
-        }
-
-
-    )+};
-
-    ( _groups [ $( ($size:expr, $percent:expr ), )+ ] ) => {
-        paste::paste! {
-            criterion_group!(
-                get_benches,
-                $(
-                    [< trie_get_ $size _ $percent >],
-                    [< radix_trie_get_ $size _ $percent >],
-                    [< hashmap_get_ $size _ $percent >],
-                )+
-            );
-        }
-    };
-}
-
-macro_rules! cartesian_impl {
-    ($out:tt [] $b:tt $init_b:tt) => {
-        bench_percents_impl!($out);
-        bench_percents_impl!(_groups $out);
-    };
-    ($out:tt [$a:expr, $($at:tt)*] [] $init_b:tt) => {
-        cartesian_impl!($out [$($at)*] $init_b $init_b);
-    };
-    ([$($out:tt)*] [$a:expr, $($at:tt)*] [$b:expr, $($bt:tt)*] $init_b:tt) => {
-        cartesian_impl!([$($out)* ($a, $b),] [$a, $($at)*] [$($bt)*] $init_b);
-    };
-}
-
-macro_rules! bench_get_percents {
-    ([$($size:tt)*], [$($percent:tt)*]) => {
-        cartesian_impl!([] [$($size)*,] [$($percent)*,] [$($percent)*,]);
-    };
-}
-
-bench_get_percents!([header, big, small], [100, 75, 50, 25, 10, 5, 2, 1]);
-
-criterion_group!(
-    insert_benches,
-    trie_insert_big,
-    radix_trie_insert_big,
-    hashmap_insert_big,
-    trie_insert_small,
-    radix_trie_insert_small,
-    hashmap_insert_small,
-);
-
-criterion_main!(get_benches, insert_benches);
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..6a6366c
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,62 @@
+use std::{
+    collections::HashSet,
+    env,
+    fs::File,
+    io::{BufWriter, Write as _},
+    path::Path,
+};
+
+const HEADERS: &str = include_str!("data/headers.txt");
+const SUN_RISING: &str = include_str!("data/sun-rising.txt");
+
+fn main() {
+    let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
+    let mut file = BufWriter::new(File::create(path).unwrap());
+    let mut headers_set = phf_codegen::Set::<&str>::new();
+    let headers_rev: Vec<_> = HEADERS
+        .lines()
+        .collect::<HashSet<_>>()
+        .into_iter()
+        .map(|s| s.chars().rev().collect::<String>())
+        .collect();
+    for s in &headers_rev {
+        headers_set.entry(s);
+    }
+    write!(
+        &mut file,
+        "static HEADERS_PHF: phf::Set<&str> = {};",
+        headers_set.build()
+    )
+    .unwrap();
+
+    let mut small_set = phf_codegen::Set::<&str>::new();
+    for s in SUN_RISING
+        .split(|c: char| c.is_whitespace())
+        .collect::<HashSet<_>>()
+        .into_iter()
+    {
+        small_set.entry(s);
+    }
+    write!(
+        &mut file,
+        "static SMALL_PHF: phf::Set<&str> = {};",
+        small_set.build()
+    )
+    .unwrap();
+
+    let mut big_set = phf_codegen::Set::<&str>::new();
+    const OW_1984: &str = include_str!("data/1984.txt");
+    for s in OW_1984
+        .split(|c: char| c.is_whitespace())
+        .collect::<HashSet<_>>()
+        .into_iter()
+    {
+        big_set.entry(s);
+    }
+    write!(
+        &mut file,
+        "static BIG_PHF: phf::Set<&str> = {};",
+        big_set.build()
+    )
+    .unwrap();
+}