From 857290356503a7b2cb2e2184783fe94b54822ff8 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Wed, 23 Oct 2024 14:56:15 +0800 Subject: [PATCH] extend json parse bench (#5516) * extend json parse bench * remove Some(()) --- quickwit/Cargo.lock | 101 +++++++++++++++- quickwit/Cargo.toml | 1 + quickwit/quickwit-doc-mapper/Cargo.toml | 2 +- .../benches/doc_to_json_bench.rs | 110 ++++++++++++++---- .../benches/routing_expression_bench.rs | 76 +++++------- 5 files changed, 219 insertions(+), 71 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index f5b1bada6fe..64094d11111 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -93,6 +93,15 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" +[[package]] +name = "alloca" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" +dependencies = [ + "cc", +] + [[package]] name = "allocator-api2" version = "0.2.18" @@ -1016,6 +1025,23 @@ dependencies = [ "serde", ] +[[package]] +name = "binggan" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16cf7e26155ca336ba3a7220c817cdfe73a10f867fa352349fd425b43814bd9" +dependencies = [ + "alloca", + "bpu_trasher", + "miniserde", + "peakmem-alloc", + "perf-event", + "rustc-hash 2.0.0", + "rustop", + "unicode-width", + "yansi", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -1109,6 +1135,15 @@ dependencies = [ "syn_derive", ] +[[package]] +name = "bpu_trasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da7e5e16c1949a2ba90dd1d9c8330af4b72a3b18cc6bcc738c5fe81bf4be61" +dependencies = [ + "rand 0.8.5", +] + [[package]] name = "bs58" version = "0.5.1" @@ -4080,12 +4115,34 @@ dependencies = [ "unicase", ] +[[package]] +name = "mini-internal" 
+version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cd9f9bbedc1b92683a9847b8db12f3203cf32af6a11db085fa007708dc9555" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.79", +] + [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniserde" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9b650e926368ad21aaabe6055341d1874df696178f47d70b6d9a691f616274e" +dependencies = [ + "itoa", + "mini-internal", + "ryu", +] + [[package]] name = "miniz_oxide" version = "0.8.0" @@ -4904,6 +4961,12 @@ dependencies = [ "hmac", ] +[[package]] +name = "peakmem-alloc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb7428a977a472465aced57d8d2335d6167c0ce9c05c283fd6faed3d8d948f6" + [[package]] name = "peeking_take_while" version = "1.0.0" @@ -4944,6 +5007,25 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "perf-event" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4d6393d9238342159080d79b78cb59c67399a8e7ecfa5d410bd614169e4e823" +dependencies = [ + "libc", + "perf-event-open-sys", +] + +[[package]] +name = "perf-event-open-sys" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c44fb1c7651a45a3652c4afc6e754e40b3d6e6556f1487e2b230bfc4f33c2a8" +dependencies = [ + "libc", +] + [[package]] name = "pest" version = "2.7.13" @@ -5984,7 +6066,7 @@ version = "0.8.0" dependencies = [ "anyhow", "base64 0.22.1", - "criterion", + "binggan", "fnv", "hex", "indexmap 2.1.0", @@ -7117,6 +7199,12 @@ version = "1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + [[package]] name = "rustc_version" version = "0.4.1" @@ -7192,6 +7280,12 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustop" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a6a926633a8ce739286680df905e1d1d01db609fc0e09d28e9b901ac7b22f" + [[package]] name = "rustversion" version = "1.0.17" @@ -8266,7 +8360,7 @@ dependencies = [ "rayon", "regex", "rust-stemmers", - "rustc-hash", + "rustc-hash 1.1.0", "serde", "serde_json", "sketches-ddsketch", @@ -9852,6 +9946,9 @@ name = "yansi" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" +dependencies = [ + "is-terminal", +] [[package]] name = "zerocopy" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index ba0963bf938..82419148e2f 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -86,6 +86,7 @@ async-compression = { version = "0.4", features = ["tokio", "gzip"] } async-speed-limit = "0.4" async-trait = "0.1" base64 = "0.22" +binggan = { version = "0.14" } bytes = { version = "1", features = ["serde"] } bytesize = { version = "1.3.0", features = ["serde"] } bytestring = "1.3.0" diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index be75ec7c02e..44b846157bd 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -36,7 +36,7 @@ quickwit-proto = { workspace = true } quickwit-query = { workspace = true } [dev-dependencies] -criterion = { workspace = true } +binggan = { workspace = true } matches = { workspace = 
true } serde_yaml = { workspace = true } time = { workspace = true } diff --git a/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs b/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs index 58bcfa413dc..5da11f3d747 100644 --- a/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs +++ b/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs @@ -17,13 +17,15 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use binggan::plugins::*; +use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM}; use quickwit_doc_mapper::DocMapper; use tantivy::TantivyDocument; -const JSON_TEST_DATA: &str = include_str!("data/simple-parse-bench.json"); +const SIMPLE_JSON_TEST_DATA: &str = include_str!("data/simple-parse-bench.json"); +const ROUTING_TEST_DATA: &str = include_str!("data/simple-routing-expression-bench.json"); -const DOC_MAPPER_CONF: &str = r#"{ +const DOC_MAPPER_CONF_SIMPLE_JSON: &str = r#"{ "type": "default", "default_search_fields": [], "tag_fields": [], @@ -35,28 +37,92 @@ const DOC_MAPPER_CONF: &str = r#"{ ] }"#; -pub fn simple_json_to_doc_benchmark(c: &mut Criterion) { - let doc_mapper: Box<DocMapper> = serde_json::from_str(DOC_MAPPER_CONF).unwrap(); - let lines: Vec<&str> = JSON_TEST_DATA.lines().map(|line| line.trim()).collect(); +/// Note that {"name": "date", "type": "datetime", "input_formats": ["%Y-%m-%d"], "output_format": +/// "%Y-%m-%d"}, is removed since tantivy parsing only supports RFC3339 +const ROUTING_DOC_MAPPER_CONF: &str = r#"{ + "type": "default", + "default_search_fields": [], + "tag_fields": [], + "field_mappings": [ + {"name": "timestamp", "type": "datetime", "input_formats": ["unix_timestamp"], "output_format": "%Y-%m-%d %H:%M:%S", "output_format": "%Y-%m-%d %H:%M:%S", "fast": true }, + {"name": "source", "type": "text" }, + {"name": "vin", "type": "text" }, + {"name": "vid",
"type": "text" }, + {"name": "domain", "type": "text" }, + {"name": "seller", "type": "object", "field_mappings": [ + {"name": "id", "type": "text" }, + {"name": "name", "type": "text" }, + {"name": "address", "type": "text" }, + {"name": "zip", "type": "text" } + ]} + ], + "partition_key": "seller.id" +}"#; + +#[global_allocator] +pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM; + +fn get_test_data( + name: &'static str, + raw: &'static str, + doc_mapper: &'static str, +) -> (&'static str, usize, Vec<&'static str>, Box<DocMapper>) { + let lines: Vec<&str> = raw.lines().map(|line| line.trim()).collect(); + ( + name, + raw.len(), + lines, + serde_json::from_str(doc_mapper).unwrap(), + ) +} - let mut group = c.benchmark_group("simple-json-to-doc"); - group.throughput(Throughput::Bytes(JSON_TEST_DATA.len() as u64)); - group.bench_function("simple-json-to-doc", |b| { - b.iter(|| { - for line in &lines { - doc_mapper.doc_from_json_str(line).unwrap(); +fn run_bench() { + let inputs: Vec<(&str, usize, Vec<&str>, Box<DocMapper>)> = vec![ + (get_test_data( + "flat_json", + SIMPLE_JSON_TEST_DATA, + DOC_MAPPER_CONF_SIMPLE_JSON, + )), + (get_test_data("routing_json", ROUTING_TEST_DATA, ROUTING_DOC_MAPPER_CONF)), + ]; + + let mut runner: BenchRunner = BenchRunner::new(); + + runner.config().set_num_iter_for_bench(1); + runner.config().set_num_iter_for_group(100); + runner + .add_plugin(CacheTrasher::default()) + .add_plugin(BPUTrasher::default()) + .add_plugin(PeakMemAllocPlugin::new(GLOBAL)); + + for (input_name, size, data, doc_mapper) in inputs.iter() { + let dynamic_doc_mapper: DocMapper = + serde_json::from_str(r#"{ "mode": "dynamic" }"#).unwrap(); + let mut group = runner.new_group(); + group.set_name(input_name); + group.set_input_size(*size); + group.register_with_input("doc_mapper", data, |lines| { + for line in lines { + black_box(doc_mapper.doc_from_json_str(line).unwrap()); } - }) - }); - group.bench_function("simple-json-to-doc-tantivy", |b| { - b.iter(|| { + }); + + 
group.register_with_input("doc_mapper_dynamic", data, |lines| { + for line in lines { + black_box(dynamic_doc_mapper.doc_from_json_str(line).unwrap()); + } + }); + + group.register_with_input("tantivy parse json", data, |lines| { let schema = doc_mapper.schema(); - for line in &lines { - let _doc = TantivyDocument::parse_json(&schema, line).unwrap(); + for line in lines { + let _doc = black_box(TantivyDocument::parse_json(&schema, line).unwrap()); } - }) - }); + }); + group.run(); + } } -criterion_group!(benches, simple_json_to_doc_benchmark); -criterion_main!(benches); +fn main() { + run_bench(); +} diff --git a/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs b/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs index 8f4daa46d0f..03498778b5a 100644 --- a/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs +++ b/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Quickwit, Inc. +// Copyright (C) 2024 Quickwit, Inc. // // Quickwit is offered under the AGPL v3.0 and as commercial software. // For commercial licensing, contact us at hello@quickwit.io. @@ -17,60 +17,44 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. 
-use criterion::{criterion_group, criterion_main, Criterion, Throughput}; -use quickwit_doc_mapper::{DocMapper, RoutingExpr}; +use binggan::plugins::*; +use binggan::{BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM}; +use quickwit_doc_mapper::RoutingExpr; use serde_json::Value as JsonValue; -const JSON_TEST_DATA: &str = include_str!("data/simple-routing-expression-bench.json"); - -const DOC_MAPPER_CONF: &str = r#"{ - "type": "default", - "default_search_fields": [], - "tag_fields": [], - "field_mappings": [ - {"name": "timestamp", "type": "datetime", "input_formats": ["unix_timestamp"], "output_format": "%Y-%m-%d %H:%M:%S", "output_format": "%Y-%m-%d %H:%M:%S", "fast": true }, - {"name": "source", "type": "text" }, - {"name": "vin", "type": "text" }, - {"name": "vid", "type": "text" }, - {"name": "date", "type": "datetime", "input_formats": ["%Y-%m-%d"], "output_format": "%Y-%m-%d"}, - {"name": "domain", "type": "text" }, - {"name": "seller", "type": "object", "field_mappings": [ - {"name": "id", "type": "text" }, - {"name": "name", "type": "text" }, - {"name": "address", "type": "text" }, - {"name": "zip", "type": "text" } - ]} - ], - "partition_key": "seller.id" -}"#; +#[global_allocator] +pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM; -pub fn simple_routing_expression_benchmark(c: &mut Criterion) { - let doc_mapper: Box<DocMapper> = serde_json::from_str(DOC_MAPPER_CONF).unwrap(); - let lines: Vec<&str> = JSON_TEST_DATA.lines().map(|line| line.trim()).collect(); +const JSON_TEST_DATA: &str = include_str!("data/simple-routing-expression-bench.json"); - let json_lines: Vec<serde_json::Map<String, JsonValue>> = lines - .iter() +fn run_bench() { + let json_lines: Vec<serde_json::Map<String, JsonValue>> = JSON_TEST_DATA + .lines() .map(|line| serde_json::from_str(line).unwrap()) .collect(); - let mut group = c.benchmark_group("simple-routing-expression"); - group.throughput(Throughput::Bytes(JSON_TEST_DATA.len() as u64)); - group.bench_function("simple-json-to-doc", |b| { - b.iter(|| { - for line in &lines { - 
doc_mapper.doc_from_json_str(line).unwrap(); - } - }) - }); - group.bench_function("simple-eval-hash", |b| { - b.iter(|| { + let mut runner: BenchRunner = BenchRunner::new(); + + runner + .add_plugin(CacheTrasher::default()) + .add_plugin(PeakMemAllocPlugin::new(GLOBAL)); + + { + let (input_name, size, data) = &("routing_expr", JSON_TEST_DATA.len(), &json_lines); + let mut group = runner.new_group(); + group.set_name(input_name); + group.set_input_size(*size); + group.register_with_input("simple-eval-hash", data, |lines| { let routing_expr = RoutingExpr::new("seller.id").unwrap(); - for json in &json_lines { + for json in lines.iter() { routing_expr.eval_hash(json); } - }) - }); + }); + + group.run(); + } } -criterion_group!(benches, simple_routing_expression_benchmark); -criterion_main!(benches); +fn main() { + run_bench(); +}