diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 27411942af1..0fa6c4bd34e 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -679,7 +679,7 @@ dependencies = [ "regex-lite", "roxmltree 0.14.1", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1052,7 +1052,7 @@ dependencies = [ "miniserde", "peakmem-alloc", "perf-event", - "rustc-hash 2.0.0", + "rustc-hash", "rustop", "unicode-width 0.1.14", "yansi", @@ -2908,7 +2908,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -2923,7 +2923,7 @@ checksum = "f8bdaaa4bc036e8318274d1b25f0f2265b3e95418b765fd1ea1c7ef938fd69bd" dependencies = [ "google-cloud-token", "http 0.2.12", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-retry", "tonic", @@ -2949,7 +2949,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96e4ad0802d3f416f62e7ce01ac1460898ee0efc98f8b45cd4aab7611607012f" dependencies = [ "reqwest", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -2966,7 +2966,7 @@ dependencies = [ "google-cloud-googleapis", "google-cloud-token", "prost-types 0.11.9", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -3723,9 +3723,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", ] [[package]] @@ -4139,7 +4136,7 @@ dependencies = [ "log", "once_cell", "serde", - "thiserror", + "thiserror 1.0.69", "yada", ] @@ -4419,11 +4416,10 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measure_time" -version = "0.8.3" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" dependencies = [ - "instant", "log", ] @@ -4553,7 +4549,7 @@ dependencies = [ "rustc_version", "smallvec", "tagptr", - "thiserror", + "thiserror 1.0.69", "triomphe", "uuid", ] @@ -4567,7 +4563,7 @@ dependencies = [ "crc32fast", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -4863,7 +4859,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -4999,7 +4995,7 @@ dependencies = [ "serde_with", "sha2", "subtle", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -5096,7 +5092,7 @@ dependencies = [ "opentelemetry_sdk", "prost 0.11.9", "reqwest", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", ] @@ -5134,7 +5130,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -5156,7 +5152,7 @@ dependencies = [ "rand 0.8.5", "regex", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -5238,7 +5234,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "stable_deref_trait", ] @@ -5440,7 +5436,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" dependencies = [ "memchr", - "thiserror", + "thiserror 1.0.69", "ucd-trie", ] @@ -5793,7 +5789,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5996,7 +5992,7 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6282,7 +6278,7 @@ dependencies = [ "serde", "serde_json", "sync_wrapper", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -6346,7 +6342,7 @@ dependencies = [ "serde_json", "tabled", "tempfile", - "thiserror", + "thiserror 1.0.69", "thousands", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -6416,7 +6412,7 @@ dependencies = [ "quickwit-common", "quickwit-proto", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tonic", @@ -6454,7 +6450,7 @@ dependencies = [ "serde_json", "siphasher", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-metrics", "tokio-stream", @@ -6585,7 +6581,7 @@ dependencies = [ "serde_yaml", "siphasher", "tantivy", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "utoipa", @@ -6605,7 +6601,7 @@ dependencies = [ "quickwit-metastore", "quickwit-proto", "quickwit-storage", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6663,7 +6659,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6701,7 +6697,7 @@ dependencies = [ "serde_json", "serde_json_borrow", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", "tower", @@ -6796,7 +6792,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6890,7 +6886,7 @@ dependencies = [ "serial_test", "sqlx", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -6917,7 +6913,7 @@ dependencies = [ "quickwit-proto", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tonic", @@ -6948,7 +6944,7 @@ dependencies = [ "serde", "serde_json", "sqlx", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", "tonic-build", @@ -6979,7 +6975,7 @@ dependencies = [ "serde_json", "serde_with", "tantivy", - "thiserror", + "thiserror 1.0.69", "time", "whichlang", ] @@ -7001,7 +6997,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wiremock", @@ -7041,7 +7037,7 @@ dependencies = [ "serde", "serde_json", "tantivy", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tower", @@ -7104,7 +7100,7 @@ dependencies = [ "serde_qs 0.12.0", "serde_with", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -7156,7 +7152,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-util", @@ -7375,7 +7371,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7709,12 +7705,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.0.0" @@ -7944,7 +7934,7 @@ dependencies = [ "proc-macro2", "quote", "syn 2.0.89", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -8106,7 +8096,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -8118,7 +8108,7 @@ dependencies = [ "futures", "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "warp", ] @@ -8330,7 +8320,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -8515,7 +8505,7 @@ dependencies = [ "sha2", "smallvec", "sqlformat", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -8600,7 +8590,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "whoami", @@ -8639,7 +8629,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "whoami", @@ -8860,7 +8850,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "aho-corasick", "arc-swap", @@ -8889,7 +8879,7 @@ dependencies = [ "rayon", "regex", "rust-stemmers", - "rustc-hash 1.1.0", + "rustc-hash", "serde", "serde_json", "sketches-ddsketch", @@ -8903,7 +8893,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror", + "thiserror 2.0.6", "time", "uuid", "winapi 0.3.9", @@ -8913,7 +8903,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "bitpacking", ] @@ -8921,7 +8911,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "downcast-rs", "fastdivide", @@ -8936,7 +8926,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "async-trait", "byteorder", @@ -8959,7 +8949,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "nom", ] @@ -8967,7 +8957,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -8978,7 +8968,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "murmurhash32", "rand_distr", @@ -8988,7 +8978,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=1a25b6125#1a25b612507ba380d81a648c68bd3d4a3bb6f37d" dependencies = [ "serde", ] @@ -9055,7 +9045,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47" +dependencies = [ + "thiserror-impl 2.0.6", ] [[package]] @@ -9069,6 +9068,17 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "thiserror-impl" +version = "2.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "thousands" version = "0.2.0" @@ -9145,7 +9155,7 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78bfd61bca99323ce96911bd2c443259115460615e44f1d449cee8cb3831a1dd" dependencies = [ - "thiserror", + "thiserror 1.0.69", "time", ] @@ -9607,7 +9617,7 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] @@ -9942,7 +9952,7 @@ dependencies = [ "strip-ansi-escapes", "syslog_loose", "termcolor", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "uaparser", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 9c91d6efd58..8cd422d66a2 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -328,7 +328,7 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "2f2db16", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "1a25b6125", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-common/src/path_hasher.rs b/quickwit/quickwit-common/src/path_hasher.rs index 68fe97db463..481d203a13e 100644 --- a/quickwit/quickwit-common/src/path_hasher.rs +++ b/quickwit/quickwit-common/src/path_hasher.rs @@ -19,12 +19,20 @@ use std::hash::Hasher; +/// We use 255 as a separator as it isn't used by utf-8. +/// +/// Tantivy uses 1 because it is more convenient for range queries, but we don't +/// care about the sort order here. +/// +/// Note: changing this is not retro-compatible! +const SEPARATOR: &[u8] = &[255]; + /// Mini wrapper over the FnvHasher to incrementally hash nodes /// in a tree. /// -/// The wrapper does not do too much. Its main purpose to -/// work around the lack of Clone in the fnv Hasher -/// and enforce a 0 byte separator between segments. +/// Its purpose is to: +/// - work around the lack of Clone in the fnv Hasher +/// - enforce a 1 byte separator between segments #[derive(Default)] pub struct PathHasher { hasher: fnv::FnvHasher, @@ -40,13 +48,13 @@ impl Clone for PathHasher { } impl PathHasher { - /// Helper function, mostly for tests. + #[cfg(any(test, feature = "testsuite"))] pub fn hash_path(segments: &[&[u8]]) -> u64 { let mut hasher = Self::default(); for segment in segments { hasher.append(segment); } - hasher.finish() + hasher.finish_leaf() } /// Appends a new segment to our path. @@ -56,13 +64,18 @@ impl PathHasher { #[inline] pub fn append(&mut self, payload: &[u8]) { self.hasher.write(payload); - // We use 255 as a separator as all utf8 bytes contain a 0 - // in position 0-5. - self.hasher.write(&[255u8]); + self.hasher.write(SEPARATOR); } #[inline] - pub fn finish(&self) -> u64 { + pub fn finish_leaf(&self) -> u64 { self.hasher.finish() } + + #[inline] + pub fn finish_intermediate(&self) -> u64 { + let mut intermediate = fnv::FnvHasher::with_key(self.hasher.finish()); + intermediate.write(SEPARATOR); + intermediate.finish() + } } diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index 44b846157bd..ae0239e53c5 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -41,6 +41,7 @@ matches = { workspace = true } serde_yaml = { workspace = true } time = { workspace = true } +quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-query = { workspace = true, features = ["multilang"] } [features] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 42f233013a4..e5ab815102e 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -557,7 +557,7 @@ impl DocMapper { if self.index_field_presence { let field_presence_hashes: FnvHashSet = - populate_field_presence(&document, &self.schema); + populate_field_presence(&document, &self.schema, true); for field_presence_hash in field_presence_hashes { document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash); } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs index 95f7dcba632..ad88a4c4493 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs @@ -32,6 +32,7 @@ use tantivy::Document; pub(crate) fn populate_field_presence( document: &D, schema: &Schema, + populate_object_fields: bool, ) -> FnvHashSet { let mut field_presence_hashes: FnvHashSet = FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default()); @@ -50,72 +51,169 @@ pub(crate) fn populate_field_presence( } else { false }; - populate_field_presence_for_json_obj( - json_obj, - path_hasher, + let mut subfields_populator = SubfieldsPopulator { + populate_object_fields, is_expand_dots_enabled, - &mut field_presence_hashes, - ); + field_presence_hashes, + }; + subfields_populator.populate_field_presence_for_json_obj(path_hasher, json_obj); + field_presence_hashes = subfields_populator.field_presence_hashes; } else { - field_presence_hashes.insert(path_hasher.finish()); + field_presence_hashes.insert(path_hasher.finish_leaf()); } } field_presence_hashes } -#[inline] -fn populate_field_presence_for_json_value<'a>( - json_value: impl Value<'a>, - path_hasher: &PathHasher, +/// A struct to help populate field presence hashes for nested JSON field. +struct SubfieldsPopulator { + populate_object_fields: bool, is_expand_dots_enabled: bool, - output: &mut FnvHashSet, -) { - match json_value.as_value() { - ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} - ReferenceValue::Leaf(_) => { - output.insert(path_hasher.finish()); - } - ReferenceValue::Array(items) => { - for item in items { - populate_field_presence_for_json_value( - item, - path_hasher, - is_expand_dots_enabled, - output, - ); + field_presence_hashes: FnvHashSet, +} + +impl SubfieldsPopulator { + #[inline] + fn populate_field_presence_for_json_value<'a>( + &mut self, + path_hasher: PathHasher, + json_value: impl Value<'a>, + ) { + match json_value.as_value() { + ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} + ReferenceValue::Leaf(_) => { + self.field_presence_hashes.insert(path_hasher.finish_leaf()); + } + ReferenceValue::Array(items) => { + for item in items { + self.populate_field_presence_for_json_value(path_hasher.clone(), item); + } + } + ReferenceValue::Object(json_obj) => { + self.populate_field_presence_for_json_obj(path_hasher, json_obj); } } - ReferenceValue::Object(json_obj) => { - populate_field_presence_for_json_obj( - json_obj, - path_hasher.clone(), - is_expand_dots_enabled, - output, - ); + } + + fn populate_field_presence_for_json_obj<'a, I, V>( + &mut self, + path_hasher: PathHasher, + json_obj: I, + ) where + I: Iterator, + V: Value<'a>, + { + if self.populate_object_fields { + self.field_presence_hashes + .insert(path_hasher.finish_intermediate()); + } + for (field_key, field_value) in json_obj { + let mut child_path_hasher = path_hasher.clone(); + if self.is_expand_dots_enabled { + let mut expanded_key = field_key.split('.').peekable(); + while let Some(segment) = expanded_key.next() { + child_path_hasher.append(segment.as_bytes()); + if self.populate_object_fields && expanded_key.peek().is_some() { + self.field_presence_hashes + .insert(child_path_hasher.finish_intermediate()); + } + } + } else { + child_path_hasher.append(field_key.as_bytes()); + }; + self.populate_field_presence_for_json_value(child_path_hasher, field_value); } } } -fn populate_field_presence_for_json_obj<'a, Iter: Iterator)>>( - json_obj: Iter, - path_hasher: PathHasher, - is_expand_dots_enabled: bool, - output: &mut FnvHashSet, -) { - for (field_key, field_value) in json_obj { - let mut child_path_hasher = path_hasher.clone(); - if is_expand_dots_enabled { - for segment in field_key.split('.') { - child_path_hasher.append(segment.as_bytes()); - } - } else { - child_path_hasher.append(field_key.as_bytes()); - }; - populate_field_presence_for_json_value( - field_value, - &child_path_hasher, - is_expand_dots_enabled, - output, +#[cfg(test)] +mod tests { + use tantivy::schema::*; + use tantivy::TantivyDocument; + + use super::*; + + #[test] + fn test_populate_field_presence_basic() { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("indexed_text", TEXT); + schema_builder.add_text_field("text_not_indexed", STORED); + let schema = schema_builder.build(); + let json_doc = r#"{"indexed_text": "hello", "text_not_indexed": "world"}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 1); + } + + #[test] + fn test_populate_field_presence_with_array() { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("list", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"list": ["value1", "value2"]}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 1); + } + + #[test] + fn test_populate_field_presence_with_json() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"subfield": "a"}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 1); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 2); + } + + #[test] + fn test_populate_field_presence_with_nested_jsons() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"subfield": {"subsubfield": "a"}}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 1); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 3); + } + + #[test] + fn test_populate_field_presence_with_array_of_objects() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"list": [{"key1":"value1"}, {"key2":"value2"}]}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 2); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 4); + } + + #[test] + fn test_populate_field_presence_with_expand_dots() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field( + "json", + Into::::into(TEXT).set_expand_dots_enabled(), ); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"key.with.dots": "value"}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 1); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 4); } } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 146c2f1f51c..c0aaf27d487 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -85,6 +85,15 @@ pub struct TermRange { pub limit: Option, } +/// Description of how a fast field should be warmed up +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct FastFieldWarmupInfo { + /// Name of the fast field + pub name: String, + /// Whether subfields should also be loaded for warmup + pub with_subfields: bool, +} + /// Information about what a DocMapper think should be warmed up before /// running the query. #[derive(Debug, Default, Clone, PartialEq, Eq)] @@ -92,8 +101,8 @@ pub struct WarmupInfo { /// Name of fields from the term dictionary and posting list which needs to /// be entirely loaded pub term_dict_fields: HashSet, - /// Name of fast fields which needs to be loaded - pub fast_field_names: HashSet, + /// Fast fields which needs to be loaded + pub fast_fields: HashSet, /// Whether to warmup field norms. Used mostly for scoring. pub field_norms: bool, /// Terms to warmup, and whether their position is needed too. @@ -106,9 +115,18 @@ impl WarmupInfo { /// Merge other WarmupInfo into self. pub fn merge(&mut self, other: WarmupInfo) { self.term_dict_fields.extend(other.term_dict_fields); - self.fast_field_names.extend(other.fast_field_names); self.field_norms |= other.field_norms; + for fast_field_warmup_info in other.fast_fields.into_iter() { + // avoid overwriting with a less demanding warmup + if !self.fast_fields.contains(&FastFieldWarmupInfo { + name: fast_field_warmup_info.name.clone(), + with_subfields: true, + }) { + self.fast_fields.insert(fast_field_warmup_info); + } + } + for (field, term_and_pos) in other.terms_grouped_by_field.into_iter() { let sub_map = self.terms_grouped_by_field.entry(field).or_default(); @@ -571,8 +589,14 @@ mod tests { } } - fn hashset(elements: &[&str]) -> HashSet { - elements.iter().map(|elem| elem.to_string()).collect() + fn hashset_fast(elements: &[&str]) -> HashSet { + elements + .iter() + .map(|elem| FastFieldWarmupInfo { + name: elem.to_string(), + with_subfields: false, + }) + .collect() } fn hashset_field(elements: &[u32]) -> HashSet { @@ -617,7 +641,7 @@ mod tests { fn test_warmup_info_merge() { let wi_base = WarmupInfo { term_dict_fields: hashset_field(&[1, 2]), - fast_field_names: hashset(&["fast1", "fast2"]), + fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[(1, "term1", false), (1, "term2", false)]), term_ranges_grouped_by_field: hashmap_ranges(&[ @@ -634,7 +658,7 @@ mod tests { let mut wi_base = wi_base; let wi_2 = WarmupInfo { term_dict_fields: hashset_field(&[2, 3]), - fast_field_names: hashset(&["fast2", "fast3"]), + fast_fields: hashset_fast(&["fast2", "fast3"]), field_norms: true, terms_grouped_by_field: hashmap(&[(2, "term1", false), (1, "term2", true)]), term_ranges_grouped_by_field: hashmap_ranges(&[ @@ -646,8 +670,8 @@ mod tests { assert_eq!(wi_base.term_dict_fields, hashset_field(&[1, 2, 3])); assert_eq!( - wi_base.fast_field_names, - hashset(&["fast1", "fast2", "fast3"]) + wi_base.fast_fields, + hashset_fast(&["fast1", "fast2", "fast3"]) ); assert!(wi_base.field_norms); @@ -698,7 +722,7 @@ mod tests { fn test_warmup_info_simplify() { let mut warmup_info = WarmupInfo { term_dict_fields: hashset_field(&[1]), - fast_field_names: hashset(&["fast1", "fast2"]), + fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[ (1, "term1", false), @@ -713,7 +737,7 @@ mod tests { }; let expected = WarmupInfo { term_dict_fields: hashset_field(&[1]), - fast_field_names: hashset(&["fast1", "fast2"]), + fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[(1, "term2", true), (2, "term3", false)]), term_ranges_grouped_by_field: hashmap_ranges(&[ diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index c592616e86a..9c940076a5a 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -35,9 +35,9 @@ mod routing_expression; pub mod tag_pruning; pub use doc_mapper::{ - analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FieldMappingEntry, FieldMappingType, - JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, TokenizerConfig, - TokenizerEntry, WarmupInfo, + analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo, + FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, + QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo, }; use doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 9dffeef0ad7..047cb9ea131 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -30,7 +30,9 @@ use quickwit_query::{find_field_or_hit_dynamic, InvalidQuery}; use tantivy::query::Query; use tantivy::schema::{Field, Schema}; use tantivy::Term; +use tracing::error; +use crate::doc_mapper::FastFieldWarmupInfo; use crate::{QueryParserError, TermRange, WarmupInfo}; #[derive(Default)] @@ -48,22 +50,37 @@ impl<'a> QueryAstVisitor<'a> for RangeQueryFields { } } -#[derive(Default)] -struct ExistsQueryFields { - exists_query_field_names: HashSet, +struct ExistsQueryFastFields { + fields: HashSet, + schema: Schema, } -impl<'a> QueryAstVisitor<'a> for ExistsQueryFields { +impl<'a> QueryAstVisitor<'a> for ExistsQueryFastFields { type Err = Infallible; fn visit_exists(&mut self, exists_query: &'a FieldPresenceQuery) -> Result<(), Infallible> { - // If the field is a fast field, we will rely on the `ColumnIndex`. - // If the field is not a fast, we will rely on the field presence field. - // - // After all field names are collected they are checked against schema and - // non-fast fields are removed from warmup operation. - self.exists_query_field_names - .insert(exists_query.field.to_string()); + let fields = exists_query.find_field_and_subfields(&self.schema); + for (_, field_entry, path) in fields { + if field_entry.is_fast() { + if field_entry.field_type().is_json() { + let full_path = format!("{}.{}", field_entry.name(), path); + self.fields.insert(FastFieldWarmupInfo { + name: full_path, + with_subfields: true, + }); + } else if path.is_empty() { + self.fields.insert(FastFieldWarmupInfo { + name: field_entry.name().to_string(), + with_subfields: false, + }); + } else { + error!( + field_entry = field_entry.name(), + path, "only JSON type supports subfields" + ); + } + } + } Ok(()) } } @@ -80,18 +97,24 @@ pub(crate) fn build_query( // This cannot fail. The error type is Infallible. let _: Result<(), Infallible> = range_query_fields.visit(query_ast); - let mut exists_query_fields = ExistsQueryFields::default(); + let mut exists_query_fields = ExistsQueryFastFields { + fields: HashSet::new(), + schema: schema.clone(), + }; // This cannot fail. The error type is Infallible. let _: Result<(), Infallible> = exists_query_fields.visit(query_ast); - let mut fast_field_names = HashSet::new(); - fast_field_names.extend(range_query_fields.range_query_field_names); - fast_field_names.extend( - exists_query_fields - .exists_query_field_names + let mut fast_fields = HashSet::new(); + let range_query_fast_fields = + range_query_fields + .range_query_field_names .into_iter() - .filter(|field| is_fast_field(&schema, field)), - ); + .map(|name| FastFieldWarmupInfo { + name, + with_subfields: false, + }); + fast_fields.extend(range_query_fast_fields); + fast_fields.extend(exists_query_fields.fields); let query = query_ast.build_tantivy_query( &schema, @@ -118,20 +141,13 @@ pub(crate) fn build_query( term_dict_fields: term_set_query_fields, terms_grouped_by_field, term_ranges_grouped_by_field, - fast_field_names, + fast_fields, ..WarmupInfo::default() }; Ok((query, warmup_info)) } -fn is_fast_field(schema: &Schema, field_name: &str) -> bool { - if let Ok((_field, field_entry, _path)) = find_field_or_hit_dynamic(field_name, schema) { - return field_entry.is_fast(); - } - false -} - struct ExtractTermSetFields<'a> { term_dict_fields_to_warm_up: HashSet, schema: &'a Schema, @@ -151,7 +167,8 @@ impl<'a> QueryAstVisitor<'a> for ExtractTermSetFields<'_> { fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> anyhow::Result<()> { for field in term_set_query.terms_per_field.keys() { - if let Ok((field, _field_entry, _path)) = find_field_or_hit_dynamic(field, self.schema) + if let Some((field, _field_entry, _path)) = + find_field_or_hit_dynamic(field, self.schema) { self.term_dict_fields_to_warm_up.insert(field); } else { @@ -285,6 +302,7 @@ fn extract_prefix_term_ranges( mod test { use std::ops::Bound; + use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_query::query_ast::{ query_ast_from_user_text, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, UserInputQuery, @@ -305,6 +323,7 @@ mod test { fn make_schema(dynamic_mode: bool) -> Schema { let mut schema_builder = Schema::builder(); + schema_builder.add_i64_field(FIELD_PRESENCE_FIELD_NAME, INDEXED); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("desc", TEXT | STORED); schema_builder.add_text_field("server.name", TEXT | STORED); @@ -321,6 +340,8 @@ mod test { schema_builder.add_u64_field("u64_fast", FAST | STORED); schema_builder.add_i64_field("i64_fast", FAST | STORED); schema_builder.add_f64_field("f64_fast", FAST | STORED); + schema_builder.add_json_field("json_fast", FAST); + schema_builder.add_json_field("json_text", TEXT); if dynamic_mode { schema_builder.add_json_field(DYNAMIC_FIELD_NAME, TEXT); } @@ -420,7 +441,7 @@ mod test { "foo:bar", Vec::new(), TestExpectation::Ok( - r#"TermQuery(Term(field=13, type=Json, path=foo, type=Str, "bar"))"#, + r#"TermQuery(Term(field=16, type=Json, path=foo, type=Str, "bar"))"#, ), ); check_build_query_dynamic_mode( @@ -460,7 +481,7 @@ mod test { check_build_query_static_mode( "title:bar", Vec::new(), - TestExpectation::Ok(r#"TermQuery(Term(field=0, type=Str, "bar"))"#), + TestExpectation::Ok(r#"TermQuery(Term(field=1, type=Str, "bar"))"#), ); check_build_query_static_mode( "bar", @@ -559,6 +580,41 @@ mod test { ); } + #[test] + fn test_existence_query() { + check_build_query_static_mode( + "title:*", + Vec::new(), + TestExpectation::Ok("TermQuery(Term(field=0, type=U64"), + ); + + check_build_query_static_mode( + "ip:*", + Vec::new(), + TestExpectation::Ok("ExistsQuery { field_name: \"ip\", json_subpaths: true }"), + ); + check_build_query_static_mode( + "json_text:*", + Vec::new(), + TestExpectation::Ok("TermSetQuery"), + ); + check_build_query_static_mode( + "json_fast:*", + Vec::new(), + TestExpectation::Ok("ExistsQuery { field_name: \"json_fast\", json_subpaths: true }"), + ); + check_build_query_static_mode( + "foo:*", + Vec::new(), + TestExpectation::Err("invalid query: field does not exist: `foo`"), + ); + check_build_query_static_mode( + "server:*", + Vec::new(), + TestExpectation::Ok("BooleanQuery { subqueries: [(Should, TermQuery(Term"), + ); + } + #[test] fn test_datetime_range_query() { { @@ -611,8 +667,8 @@ mod test { "ip:[127.0.0.1 TO 127.1.1.1]", Vec::new(), TestExpectation::Ok( - "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=6, \ - type=IpAddr, ::ffff:127.0.0.1)), upper_bound: Included(Term(field=6, \ + "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=7, \ + type=IpAddr, ::ffff:127.0.0.1)), upper_bound: Included(Term(field=7, \ type=IpAddr, ::ffff:127.1.1.1)) } }", ), ); @@ -620,7 +676,7 @@ mod test { "ip:>127.0.0.1", Vec::new(), TestExpectation::Ok( - "RangeQuery { bounds: BoundsRange { lower_bound: Excluded(Term(field=6, \ + "RangeQuery { bounds: BoundsRange { lower_bound: Excluded(Term(field=7, \ type=IpAddr, ::ffff:127.0.0.1)), upper_bound: Unbounded } }", ), ); @@ -632,14 +688,14 @@ mod test { "f64_fast:[7.7 TO 77.7]", Vec::new(), TestExpectation::Ok( - r#"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=12, type=F64, 7.7)), upper_bound: Included(Term(field=12, type=F64, 77.7)) } }"#, + r#"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=13, type=F64, 7.7)), upper_bound: Included(Term(field=13, type=F64, 77.7)) } }"#, ), ); check_build_query_static_mode( "f64_fast:>7", Vec::new(), TestExpectation::Ok( - r#"RangeQuery { bounds: BoundsRange { lower_bound: Excluded(Term(field=12, type=F64, 7.0)), upper_bound: Unbounded } }"#, + r#"RangeQuery { bounds: BoundsRange { lower_bound: Excluded(Term(field=13, type=F64, 7.0)), upper_bound: Unbounded } }"#, ), ); } @@ -649,12 +705,12 @@ mod test { check_build_query_static_mode( "i64_fast:[-7 TO 77]", Vec::new(), - TestExpectation::Ok(r#"field=11"#), + TestExpectation::Ok(r#"field=12"#), ); check_build_query_static_mode( "i64_fast:>7", Vec::new(), - TestExpectation::Ok(r#"field=11"#), + TestExpectation::Ok(r#"field=12"#), ); } @@ -663,12 +719,12 @@ mod test { check_build_query_static_mode( "u64_fast:[7 TO 77]", Vec::new(), - TestExpectation::Ok(r#"field=10,"#), + TestExpectation::Ok(r#"field=11,"#), ); check_build_query_static_mode( "u64_fast:>7", Vec::new(), - TestExpectation::Ok(r#"field=10,"#), + TestExpectation::Ok(r#"field=11,"#), ); } @@ -678,8 +734,8 @@ mod test { "ips:[127.0.0.1 TO 127.1.1.1]", Vec::new(), TestExpectation::Ok( - "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=7, \ - type=IpAddr, ::ffff:127.0.0.1)), upper_bound: Included(Term(field=7, \ + "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=8, \ + type=IpAddr, ::ffff:127.0.0.1)), upper_bound: Included(Term(field=8, \ type=IpAddr, ::ffff:127.1.1.1)) } }", ), ); @@ -723,7 +779,7 @@ mod test { assert_eq!(warmup_info.term_dict_fields.len(), 1); assert!(warmup_info .term_dict_fields - .contains(&tantivy::schema::Field::from_field_id(1))); + .contains(&tantivy::schema::Field::from_field_id(2))); let (_, warmup_info) = build_query( &query_without_set, @@ -773,7 +829,7 @@ mod test { extractor2.term_ranges_to_warm_up ); - let field = tantivy::schema::Field::from_field_id(0); + let field = tantivy::schema::Field::from_field_id(1); let mut expected_inner = std::collections::HashMap::new(); expected_inner.insert( TermRange { diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index bee650198c8..e94f8aef4ec 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -34,6 +34,8 @@ criterion = { workspace = true } proptest = { workspace = true } time = { workspace = true } +quickwit-common = { workspace = true, features = ["testsuite"] } + [features] multilang = [ "lindera-core", diff --git a/quickwit/quickwit-query/src/query_ast/field_presence.rs b/quickwit/quickwit-query/src/query_ast/field_presence.rs index df82ab3591a..24a6881e2bc 100644 --- a/quickwit/quickwit-query/src/query_ast/field_presence.rs +++ b/quickwit/quickwit-query/src/query_ast/field_presence.rs @@ -20,13 +20,15 @@ use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_common::PathHasher; use serde::{Deserialize, Serialize}; -use tantivy::schema::{Field, IndexRecordOption, Schema as TantivySchema}; +use tantivy::schema::{Field, FieldEntry, IndexRecordOption, Schema as TantivySchema}; use tantivy::Term; +use super::tantivy_query_ast::TantivyBoolQuery; +use super::utils::{find_subfields, DYNAMIC_FIELD_NAME}; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; use crate::query_ast::{BuildTantivyAst, QueryAst}; use crate::tokenizers::TokenizerManager; -use crate::{find_field_or_hit_dynamic, InvalidQuery}; +use crate::{find_field_or_hit_dynamic, BooleanOperand, InvalidQuery}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub struct FieldPresenceQuery { @@ -39,7 +41,7 @@ impl From for QueryAst { } } -fn compute_field_presence_hash(field: Field, field_path: &str) -> u64 { +fn compute_field_presence_hash(field: Field, field_path: &str) -> PathHasher { let mut path_hasher: PathHasher = PathHasher::default(); path_hasher.append(&field.field_id().to_le_bytes()[..]); let mut escaped = false; @@ -66,7 +68,59 @@ fn compute_field_presence_hash(field: Field, field_path: &str) -> u64 { if !current_segment.is_empty() { path_hasher.append(current_segment.as_bytes()); } - path_hasher.finish() + path_hasher +} + +fn build_existence_query( + field_presence_field: Field, + field: Field, + field_entry: &FieldEntry, + path: &str, +) -> TantivyQueryAst { + if field_entry.is_fast() { + let full_path = if path.is_empty() { + field_entry.name().to_string() + } else { + format!("{}.{}", field_entry.name(), path) + }; + let exists_query = tantivy::query::ExistsQuery::new(full_path, true); + TantivyQueryAst::from(exists_query) + } else { + // fallback to the presence field + let presence_hasher = compute_field_presence_hash(field, path); + let leaf_term = Term::from_field_u64(field_presence_field, presence_hasher.finish_leaf()); + if field_entry.field_type().is_json() { + let intermediate_term = + Term::from_field_u64(field_presence_field, presence_hasher.finish_intermediate()); + let query = tantivy::query::TermSetQuery::new([leaf_term, intermediate_term]); + TantivyQueryAst::from(query) + } else { + let query = tantivy::query::TermQuery::new(leaf_term, IndexRecordOption::Basic); + TantivyQueryAst::from(query) + } + } +} + +impl FieldPresenceQuery { + /// Identify the field and potential subfields that are required for this query. + /// + /// This is only based on the schema and cannot now about dynamic fields. + pub fn find_field_and_subfields<'a>( + &'a self, + schema: &'a TantivySchema, + ) -> Vec<(Field, &'a FieldEntry, &'a str)> { + let mut fields = Vec::new(); + if let Some((field, entry, path)) = find_field_or_hit_dynamic(&self.field, schema) { + fields.push((field, entry, path)); + }; + // if `self.field` was not found, it might still be an `object` field + if fields.is_empty() || fields[0].1.name() == DYNAMIC_FIELD_NAME { + for (field, entry) in find_subfields(&self.field, schema) { + fields.push((field, entry, "")); + } + } + fields + } } impl BuildTantivyAst for FieldPresenceQuery { @@ -80,24 +134,23 @@ impl BuildTantivyAst for FieldPresenceQuery { let field_presence_field = schema.get_field(FIELD_PRESENCE_FIELD_NAME).map_err(|_| { InvalidQuery::SchemaError("field presence is not available for this split".to_string()) })?; - let (field, field_entry, path) = find_field_or_hit_dynamic(&self.field, schema)?; - if field_entry.is_fast() { - let full_path = if path.is_empty() { - field_entry.name().to_string() - } else { - format!("{}.{}", field_entry.name(), path) - }; - let exists_query = tantivy::query::ExistsQuery::new_exists_query(full_path); - Ok(TantivyQueryAst::from(exists_query)) - } else { - // fallback to the presence field - let field_presence_hash = compute_field_presence_hash(field, path); - let field_presence_term: Term = - Term::from_field_u64(field_presence_field, field_presence_hash); - let field_presence_term_query = - tantivy::query::TermQuery::new(field_presence_term, IndexRecordOption::Basic); - Ok(TantivyQueryAst::from(field_presence_term_query)) + let fields = self.find_field_and_subfields(schema); + if fields.is_empty() { + // the schema is not dynamic and no subfields are defined + return Err(InvalidQuery::FieldDoesNotExist { + full_path: self.field.clone(), + }); } + let queries = fields + .into_iter() + .map(|(field, entry, path)| { + build_existence_query(field_presence_field, field, entry, path) + }) + .collect(); + Ok(TantivyQueryAst::Bool(TantivyBoolQuery::build_clause( + BooleanOperand::Or, + queries, + ))) } } @@ -109,7 +162,7 @@ mod tests { #[test] fn test_field_presence_single() { let field_presence_term: u64 = - compute_field_presence_hash(Field::from_field_id(17u32), "attributes"); + compute_field_presence_hash(Field::from_field_id(17u32), "attributes").finish_leaf(); assert_eq!( field_presence_term, PathHasher::hash_path(&[&17u32.to_le_bytes()[..], b"attributes"]) @@ -119,7 +172,8 @@ mod tests { #[test] fn test_field_presence_hash_simple() { let field_presence_term: u64 = - compute_field_presence_hash(Field::from_field_id(17u32), "attributes.color"); + compute_field_presence_hash(Field::from_field_id(17u32), "attributes.color") + .finish_leaf(); assert_eq!( field_presence_term, PathHasher::hash_path(&[&17u32.to_le_bytes()[..], b"attributes", b"color"]) @@ -129,7 +183,8 @@ mod tests { #[test] fn test_field_presence_hash_escaped_dot() { let field_presence_term: u64 = - compute_field_presence_hash(Field::from_field_id(17u32), r"attributes\.color.hello"); + compute_field_presence_hash(Field::from_field_id(17u32), r"attributes\.color.hello") + .finish_leaf(); assert_eq!( field_presence_term, PathHasher::hash_path(&[&17u32.to_le_bytes()[..], b"attributes.color", b"hello"]) diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index 661bb89039f..55efd9b7131 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -271,8 +271,7 @@ impl FullTextQuery { return None; }; - let (field, field_entry, json_path) = - find_field_or_hit_dynamic(&self.field, schema).ok()?; + let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; let field_type: &FieldType = field_entry.field_type(); match field_type { FieldType::Str(text_options) => { diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs index 1675b22d760..3181764821d 100644 --- a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs @@ -48,7 +48,10 @@ impl PhrasePrefixQuery { schema: &TantivySchema, tokenizer_manager: &TokenizerManager, ) -> Result<(Field, Vec<(usize, Term)>), InvalidQuery> { - let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; + let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema) + .ok_or_else(|| InvalidQuery::FieldDoesNotExist { + full_path: self.field.clone(), + })?; let field_type = field_entry.field_type(); match field_type { diff --git a/quickwit/quickwit-query/src/query_ast/range_query.rs b/quickwit/quickwit-query/src/query_ast/range_query.rs index 52ce26bc306..bca8f00e897 100644 --- a/quickwit/quickwit-query/src/query_ast/range_query.rs +++ b/quickwit/quickwit-query/src/query_ast/range_query.rs @@ -121,7 +121,11 @@ impl BuildTantivyAst for RangeQuery { _with_validation: bool, ) -> Result { let (field, field_entry, json_path) = - super::utils::find_field_or_hit_dynamic(&self.field, schema)?; + super::utils::find_field_or_hit_dynamic(&self.field, schema).ok_or_else(|| { + InvalidQuery::FieldDoesNotExist { + full_path: self.field.clone(), + } + })?; if !field_entry.is_fast() { return Err(InvalidQuery::SchemaError(format!( "range queries are only supported for fast fields. (`{}` is not a fast field)", diff --git a/quickwit/quickwit-query/src/query_ast/utils.rs b/quickwit/quickwit-query/src/query_ast/utils.rs index 310056c0309..99f68d6e658 100644 --- a/quickwit/quickwit-query/src/query_ast/utils.rs +++ b/quickwit/quickwit-query/src/query_ast/utils.rs @@ -32,35 +32,44 @@ use crate::tokenizers::TokenizerManager; use crate::InvalidQuery; use crate::MatchAllOrNone::MatchNone as TantivyEmptyQuery; -const DYNAMIC_FIELD_NAME: &str = "_dynamic"; +pub(crate) const DYNAMIC_FIELD_NAME: &str = "_dynamic"; fn make_term_query(term: Term) -> TantivyQueryAst { TantivyTermQuery::new(term, IndexRecordOption::WithFreqs).into() } +/// Find the field or fallback to the dynamic field if it exists pub fn find_field_or_hit_dynamic<'a>( full_path: &'a str, schema: &'a TantivySchema, -) -> Result<(Field, &'a FieldEntry, &'a str), InvalidQuery> { +) -> Option<(Field, &'a FieldEntry, &'a str)> { let (field, path) = if let Some((field, path)) = schema.find_field(full_path) { (field, path) } else { - let dynamic_field = - schema - .get_field(DYNAMIC_FIELD_NAME) - .map_err(|_| InvalidQuery::FieldDoesNotExist { - full_path: full_path.to_string(), - })?; + let dynamic_field = schema.get_field(DYNAMIC_FIELD_NAME).ok()?; (dynamic_field, full_path) }; let field_entry = schema.get_field_entry(field); let typ = field_entry.field_type().value_type(); if !path.is_empty() && typ != Type::Json { - return Err(InvalidQuery::FieldDoesNotExist { - full_path: full_path.to_string(), - }); + return None; } - Ok((field, field_entry, path)) + Some((field, field_entry, path)) +} + +/// Find all the fields that are below the given path. +/// +/// This will return a list of fields only when the path is that of a composite +/// type in the doc mapping. +pub fn find_subfields<'a>( + path: &'a str, + schema: &'a TantivySchema, +) -> Vec<(Field, &'a FieldEntry)> { + let prefix = format!("{}.", path); + schema + .fields() + .filter(|(_, field_entry)| field_entry.name().starts_with(&prefix)) + .collect() } /// Creates a full text query. @@ -74,12 +83,14 @@ pub(crate) fn full_text_query( tokenizer_manager: &TokenizerManager, lenient: bool, ) -> Result { - let (field, field_entry, path) = match find_field_or_hit_dynamic(full_path, schema) { - Ok(res) => res, - Err(InvalidQuery::FieldDoesNotExist { .. }) if lenient => { - return Ok(TantivyEmptyQuery.into()) + let Some((field, field_entry, path)) = find_field_or_hit_dynamic(full_path, schema) else { + if lenient { + return Ok(TantivyEmptyQuery.into()); + } else { + return Err(InvalidQuery::FieldDoesNotExist { + full_path: full_path.to_string(), + }); } - Err(e) => return Err(e), }; compute_query_with_field( field, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 145e5a45bd1..81a44932f22 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -110,7 +110,12 @@ impl WildcardQuery { schema: &TantivySchema, tokenizer_manager: &TokenizerManager, ) -> Result<(Field, Term), InvalidQuery> { - let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; + let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema) + else { + return Err(InvalidQuery::FieldDoesNotExist { + full_path: self.field.clone(), + }); + }; let field_type = field_entry.field_type(); let prefix = unescape_with_final_wildcard(&self.value)?; diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 67beb8090cb..454946f6fe8 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -23,7 +23,7 @@ use std::collections::HashSet; use itertools::Itertools; use quickwit_common::binary_heap::{SortKeyMapper, TopK}; -use quickwit_doc_mapper::WarmupInfo; +use quickwit_doc_mapper::{FastFieldWarmupInfo, WarmupInfo}; use quickwit_proto::search::{ LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortByValue, SortOrder, SortValue, SplitSearchError, @@ -753,7 +753,14 @@ impl QuickwitCollector { pub fn warmup_info(&self) -> WarmupInfo { WarmupInfo { - fast_field_names: self.fast_field_names(), + fast_fields: self + .fast_field_names() + .into_iter() + .map(|name| FastFieldWarmupInfo { + name, + with_subfields: false, + }) + .collect(), field_norms: self.requires_scoring(), ..WarmupInfo::default() } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 236149ca038..7242e62c409 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -29,7 +29,7 @@ use bytesize::ByteSize; use futures::future::try_join_all; use quickwit_common::pretty::PrettySample; use quickwit_directories::{CachingDirectory, HotDirectory, StorageDirectory}; -use quickwit_doc_mapper::{DocMapper, TermRange, WarmupInfo}; +use quickwit_doc_mapper::{DocMapper, FastFieldWarmupInfo, TermRange, WarmupInfo}; use quickwit_proto::search::{ CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError, @@ -219,7 +219,7 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any let warm_up_term_dict_future = warm_up_term_dict_fields(searcher, &warmup_info.term_dict_fields) .instrument(debug_span!("warm_up_term_dicts")); - let warm_up_fastfields_future = warm_up_fastfields(searcher, &warmup_info.fast_field_names) + let warm_up_fastfields_future = warm_up_fastfields(searcher, &warmup_info.fast_fields) .instrument(debug_span!("warm_up_fastfields")); let warm_up_fieldnorms_future = warm_up_fieldnorms(searcher, warmup_info.field_norms) .instrument(debug_span!("warm_up_fieldnorms")); @@ -271,11 +271,17 @@ async fn warm_up_postings(searcher: &Searcher, fields: &HashSet) -> anyho async fn warm_up_fastfield( fast_field_reader: &FastFieldReaders, - fast_field_name: &str, + fast_field: &FastFieldWarmupInfo, ) -> anyhow::Result<()> { - let columns = fast_field_reader - .list_dynamic_column_handles(fast_field_name) + let mut columns = fast_field_reader + .list_dynamic_column_handles(&fast_field.name) .await?; + if fast_field.with_subfields { + let subpath_columns = fast_field_reader + .list_subpath_dynamic_column_handles(&fast_field.name) + .await?; + columns.extend(subpath_columns); + } futures::future::try_join_all( columns .into_iter() @@ -289,13 +295,13 @@ async fn warm_up_fastfield( /// all of the fast fields passed as argument. async fn warm_up_fastfields( searcher: &Searcher, - fast_field_names: &HashSet, + fast_fields: &HashSet, ) -> anyhow::Result<()> { let mut warm_up_futures = Vec::new(); for segment_reader in searcher.segment_readers() { let fast_field_reader = segment_reader.fast_fields(); - for fast_field_name in fast_field_names { - let warm_up_fut = warm_up_fastfield(fast_field_reader, fast_field_name); + for fast_field in fast_fields { + let warm_up_fut = warm_up_fastfield(fast_field_reader, fast_field); warm_up_futures.push(Box::pin(warm_up_fut)); } } diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index 724687148f2..d9debcc9cbd 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -1339,7 +1339,7 @@ pub async fn search_plan( searched_splits: split_ids, storage_requests: StorageRequestCount { footer: 1, - fastfield: warmup_info.fast_field_names.len(), + fastfield: warmup_info.fast_fields.len(), fieldnorm: fieldnorm_query_count, sstable: sstable_query_count, posting: sstable_query_count, diff --git a/quickwit/quickwit-search/src/search_stream/leaf.rs b/quickwit/quickwit-search/src/search_stream/leaf.rs index 0659965b40d..a1728a889a1 100644 --- a/quickwit/quickwit-search/src/search_stream/leaf.rs +++ b/quickwit/quickwit-search/src/search_stream/leaf.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use futures::{FutureExt, StreamExt}; use quickwit_common::pretty::PrettySample; -use quickwit_doc_mapper::DocMapper; +use quickwit_doc_mapper::{DocMapper, FastFieldWarmupInfo, WarmupInfo}; use quickwit_proto::search::{ LeafSearchStreamResponse, OutputFormat, SearchRequest, SearchStreamRequest, SplitIdAndFooterOffsets, @@ -181,12 +181,21 @@ async fn leaf_search_stream_single_split( .iter() .any(|sort| sort.field_name == "_score"); - // TODO no test fail if this line get removed - warmup_info.field_norms |= requires_scoring; - - let fast_field_names = - request_fields.fast_fields_for_request(timestamp_filter_builder_opt.as_ref()); - warmup_info.fast_field_names.extend(fast_field_names); + let fast_fields = request_fields + .fast_fields_for_request(timestamp_filter_builder_opt.as_ref()) + .into_iter() + .map(|name| FastFieldWarmupInfo { + name, + with_subfields: false, + }) + .collect(); + let stream_warmup_info = WarmupInfo { + fast_fields, + // TODO no test fail if this line get removed + field_norms: requires_scoring, + ..Default::default() + }; + warmup_info.merge(stream_warmup_info); warmup_info.simplify(); warmup(&searcher, &warmup_info).await?; diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0011-exists-query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0011-exists-query.yaml index 8de231391e5..154ce7a432f 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0011-exists-query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0011-exists-query.yaml @@ -25,6 +25,16 @@ expected: total: value: 60 --- +json: + query: + exists: + field: payload +expected: + hits: + total: + # one of the docs contains `"payload":{}` + value: 99 +--- # Fortunately, ES does not accept this quirky syntax in the # case of exists query. json: diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml new file mode 100644 index 00000000000..7d7ee138cdc --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml @@ -0,0 +1,68 @@ +endpoint: nested/search +params: + query: "doesnotexist:*" +expected: + num_hits: 0 +--- +# json fast fields: +endpoint: nested/search +params: + query: "json_fast:*" +expected: + num_hits: 1 +--- +endpoint: nested/search +params: + query: "json_fast.field_c:*" +expected: + num_hits: 1 +--- +endpoint: nested/search +params: + query: "json_fast.doesnotexist:*" +expected: + num_hits: 0 +--- +# json text fields: +endpoint: nested/search +params: + query: "json_text.field_a:*" +expected: + num_hits: 2 +--- +endpoint: nested/search +params: + query: "json_text.field_b:*" +expected: + num_hits: 1 +--- +endpoint: nested/search +params: + query: "json_text:*" +expected: + num_hits: 2 +--- +# object fields: +endpoint: nested/search +params: + query: "object_multi.object_fast_field:*" +expected: + num_hits: 2 +--- +endpoint: nested/search +params: + query: "object_multi.doesnotexist:*" +expected: + num_hits: 0 +--- +endpoint: nested/search +params: + query: "object_multi.object_text_field:*" +expected: + num_hits: 1 +--- +endpoint: nested/search +params: + query: "object_multi:*" +expected: + num_hits: 3 diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index 84270d975b4..e5865e2c2c1 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -68,7 +68,8 @@ params: ndjson: - {"seq": 1, "tag": 1} - {"seq": 2, "tag": 2} ----method: POST +--- +method: POST endpoint: tagged/ingest params: commit: force @@ -82,3 +83,45 @@ params: commit: force ndjson: - {"seq": 4, "tag": 1} +--- +method: DELETE +endpoint: indexes/nested +status_code: null +--- +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: nested + doc_mapping: + index_field_presence: true + # default mode is dynamic + field_mappings: + - name: json_text + type: json + indexed: true + - name: json_fast + type: json + stored: true + fast: true + - name: object_multi + type: object + field_mappings: + - name: object_text_field + type: text + - name: object_fast_field + type: u64 + fast: true + +--- +method: POST +endpoint: nested/ingest +params: + commit: force +ndjson: + - {"json_text": {"field_a": "hello", "field_b": "world"}} + - {"json_text": {"field_a": "hi"}} + - {"json_fast": {"field_c": 1}} + - {"object_multi": {"object_text_field": "multi hello"}} + - {"object_multi": {"object_fast_field": 1}} + - {"object_multi": {"object_fast_field": 2}} diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index a2896f541b1..36dd9c24422 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -4,3 +4,6 @@ endpoint: indexes/simple --- method: DELETE endpoint: indexes/tagged +--- +method: DELETE +endpoint: indexes/nested