From fba56b86cfea62ab1a4c716cbed8438664cc52ea Mon Sep 17 00:00:00 2001 From: Adrien Guillo Date: Tue, 10 Dec 2024 17:08:05 -0500 Subject: [PATCH 1/2] Bump pulsar from 5.1.1 to 6.3 (#5584) --- quickwit/Cargo.lock | 321 +++++++++++++----- quickwit/Cargo.toml | 2 +- .../src/source/pulsar_source.rs | 2 - 3 files changed, 231 insertions(+), 94 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 1662803ed0b..27411942af1 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -580,7 +580,7 @@ dependencies = [ "http 0.2.12", "http 1.1.0", "once_cell", - "p256", + "p256 0.11.1", "percent-encoding", "ring 0.17.8", "sha2", @@ -823,7 +823,7 @@ dependencies = [ "serde", "serde_dynamo", "serde_json", - "serde_with 3.11.0", + "serde_with", ] [[package]] @@ -980,6 +980,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base62" version = "2.0.3" @@ -1498,7 +1504,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim 0.11.1", + "strsim", ] [[package]] @@ -1841,8 +1847,10 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ + "generic-array", "rand_core 0.6.4", "subtle", + "zeroize", ] [[package]] @@ -1902,37 +1910,40 @@ dependencies = [ ] [[package]] -name = "darling" -version = "0.13.4" +name = "curve25519-dalek" +version = "4.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" dependencies = [ - "darling_core 0.13.4", - "darling_macro 0.13.4", + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", ] [[package]] -name = "darling" -version = "0.20.10" +name = "curve25519-dalek-derive" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ - "darling_core 0.20.10", - "darling_macro 0.20.10", + "proc-macro2", + "quote", + "syn 2.0.89", ] [[package]] -name = "darling_core" -version = "0.13.4" +name = "darling" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim 0.10.0", - "syn 1.0.109", + "darling_core", + "darling_macro", ] [[package]] @@ -1945,28 +1956,17 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim 0.11.1", + "strsim", "syn 2.0.89", ] -[[package]] -name = "darling_macro" -version = "0.13.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" -dependencies = [ - "darling_core 0.13.4", - "quote", - "syn 1.0.109", -] - [[package]] name = "darling_macro" version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "darling_core 0.20.10", + "darling_core", "quote", "syn 2.0.89", ] @@ -1979,9 +1979,9 @@ checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2" [[package]] name = "data-url" -version = "0.2.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d7439c3735f405729d52c3fbbe4de140eaf938a1fe47d227c27f8254d4302a5" +checksum = "5c297a1c74b71ae29df00c3e22dd9534821d60eb9af5a0192823fa2acea70c2a" [[package]] name = "deadpool" @@ -2208,11 +2208,49 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ "der 0.6.1", - "elliptic-curve", - "rfc6979", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", "signature 1.6.4", ] +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.9", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8 0.10.2", + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + [[package]] name = "either" version = "1.13.0" @@ -2240,16 +2278,37 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ - "base16ct", + "base16ct 0.1.1", "crypto-bigint 0.4.9", "der 0.6.1", "digest", - "ff", + "ff 0.12.1", "generic-array", - "group", + "group 0.12.1", "pkcs8 0.9.0", "rand_core 0.6.4", - "sec1", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.0", + "generic-array", + "group 0.13.0", + "hkdf", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", "subtle", "zeroize", ] @@ -2515,6 +2574,22 @@ dependencies = [ "subtle", ] +[[package]] +name = "ff" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.25" @@ -2922,7 +2997,18 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" dependencies = [ - "ff", + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.0", "rand_core 0.6.4", "subtle", ] @@ -4887,26 +4973,31 @@ dependencies = [ [[package]] name = "openidconnect" -version = "2.5.1" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98dd5b7049bac4fdd2233b8c9767d42c05da8006fdb79cc903258556d2b18009" +checksum = "f47e80a9cfae4462dd29c41e987edd228971d6565553fbc14b8a11e666d91590" dependencies = [ "base64 0.13.1", "chrono", + "dyn-clone", + "ed25519-dalek", + "hmac", "http 0.2.12", "itertools 0.10.5", "log", - "num-bigint", "oauth2", + "p256 0.13.2", + "p384", "rand 0.8.5", - "ring 0.16.20", + "rsa", "serde", "serde-value", "serde_derive", "serde_json", "serde_path_to_error", "serde_plain", - "serde_with 1.14.0", + "serde_with", + "sha2", "subtle", "thiserror", "url", @@ -5158,8 +5249,32 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" dependencies = [ - "ecdsa", - "elliptic-curve", + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", + "sha2", +] + +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", "sha2", ] @@ -5770,6 +5885,15 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", +] + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -6045,9 +6169,9 @@ dependencies = [ [[package]] name = "pulsar" -version = "5.1.1" +version = "6.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20f237570b5665b38c7d5228f9a1d2990e369c00e635704528996bcd5219f540" +checksum = "d7f3541ff84e39da334979ac4bf171e0f277f4f782603aeae65bf5795dc7275a" dependencies = [ "async-trait", "bit-vec", @@ -6065,7 +6189,7 @@ dependencies = [ "nom", "oauth2", "openidconnect", - "pem 1.1.1", + "pem 3.0.4", "prost 0.11.9", "prost-build", "prost-derive 0.11.9", @@ -6079,7 +6203,7 @@ dependencies = [ "tokio-util", "url", "uuid", - "zstd 0.11.2+zstd.1.5.2", + "zstd 0.12.4", ] [[package]] @@ -6362,7 +6486,7 @@ dependencies = [ "regex", "serde", "serde_json", - "serde_with 3.11.0", + "serde_with", "serde_yaml", "siphasher", "tokio", @@ -6762,7 +6886,7 @@ dependencies = [ "sea-query-binder", "serde", "serde_json", - "serde_with 3.11.0", + "serde_with", "serial_test", "sqlx", "tempfile", @@ -6853,7 +6977,7 @@ dependencies = [ "quickwit-datetime", "serde", "serde_json", - "serde_with 3.11.0", + "serde_with", "tantivy", "thiserror", "time", @@ -6978,7 +7102,7 @@ dependencies = [ "serde", "serde_json", "serde_qs 0.12.0", - "serde_with 3.11.0", + "serde_with", "tempfile", "thiserror", "time", @@ -7405,6 +7529,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "rgb" version = "0.8.50" @@ -7805,7 +7939,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9834af2c4bd8c5162f00c89f1701fb6886119a88062cf76fe842ea9e232b9839" dependencies = [ - "darling 0.20.10", + "darling", "heck 0.4.1", "proc-macro2", "quote", @@ -7825,7 +7959,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ - "base16ct", + "base16ct 0.1.1", "der 0.6.1", "generic-array", "pkcs8 0.9.0", @@ -7833,6 +7967,20 @@ dependencies = [ "zeroize", ] +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.9", + "generic-array", + "pkcs8 0.10.2", + "subtle", + "zeroize", +] + [[package]] name = "security-framework" version = "2.11.1" @@ -7996,16 +8144,6 @@ dependencies = [ "serde", ] -[[package]] -name = "serde_with" -version = "1.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678b5a069e50bf00ecd22d0cd8ddf7c236f68581b03db652061ed5eb13a312ff" -dependencies = [ - "serde", - "serde_with_macros 1.5.2", -] - [[package]] name = "serde_with" version = "3.11.0" @@ -8020,29 +8158,17 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "serde_with_macros 3.11.0", + "serde_with_macros", "time", ] -[[package]] -name = "serde_with_macros" -version = "1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" -dependencies = [ - "darling 0.13.4", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "serde_with_macros" version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d846214a9854ef724f3da161b426242d8de7c1fc7de2f89bb1efcb154dca79d" dependencies = [ - "darling 0.20.10", + "darling", "proc-macro2", "quote", "syn 2.0.89", @@ -8594,12 +8720,6 @@ dependencies = [ "vte 0.11.1", ] -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - [[package]] name = "strsim" version = "0.11.1" @@ -10549,6 +10669,15 @@ dependencies = [ "zstd-safe 5.0.2+zstd.1.5.2", ] +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe 6.0.6", +] + [[package]] name = "zstd" version = "0.13.2" @@ -10568,6 +10697,16 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + [[package]] name = "zstd-safe" version = "7.2.1" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index c3e3051470c..9c91d6efd58 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -181,7 +181,7 @@ prost = { version = "0.11.6", default-features = false, features = [ ] } prost-build = "0.11.6" prost-types = "0.11.6" -pulsar = { version = "5.1.1", default-features = false, features = [ +pulsar = { version = "6.3", default-features = false, features = [ "auth-oauth2", "compression", "tokio-runtime", diff --git a/quickwit/quickwit-indexing/src/source/pulsar_source.rs b/quickwit/quickwit-indexing/src/source/pulsar_source.rs index e7e1ce6e9b0..528ef6e12ec 100644 --- a/quickwit/quickwit-indexing/src/source/pulsar_source.rs +++ b/quickwit/quickwit-indexing/src/source/pulsar_source.rs @@ -429,9 +429,7 @@ async fn connect_pulsar(params: &PulsarSourceParams) -> anyhow::Result = builder.build().await?; - Ok(pulsar) } From 2121ba1ce3ceff8d867671848d0c212803f10b42 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Wed, 11 Dec 2024 16:05:42 +0100 Subject: [PATCH 2/2] Fix lenient option with wildcard queries (#5575) * Better error messages in integ tests * Initial fix suggestion from Trinity * Add rest api test and clarify docs about leniency * Add missing field test on wildcard query * Fix add query building unit tests * Forgotten staging file --- docs/reference/es_compatible_api.md | 22 +++++- .../quickwit-doc-mapper/src/query_builder.rs | 72 +++++++++++++++++-- .../src/tests/update_tests/mod.rs | 2 +- .../src/elastic_query_dsl/match_query.rs | 6 +- .../src/elastic_query_dsl/mod.rs | 8 +++ .../src/elastic_query_dsl/multi_match.rs | 6 +- .../elastic_query_dsl/phrase_prefix_query.rs | 1 + .../elastic_query_dsl/query_string_query.rs | 6 +- .../src/query_ast/full_text_query.rs | 1 + .../src/query_ast/phrase_prefix_query.rs | 10 ++- .../src/query_ast/user_input_query.rs | 3 + .../src/query_ast/wildcard_query.rs | 67 ++++++++++------- .../0005-query_string_query.yaml | 16 ++++- 13 files changed, 169 insertions(+), 51 deletions(-) diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 3ead0fa927c..c3c4c94e1c9 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -394,6 +394,7 @@ The following query types are supported. | `fields` | `String[]` (Optional) | Default search target fields. | - | | `default_operator` | `"AND"` or `"OR"` | In the absence of boolean operator defines whether terms should be combined as a conjunction (`AND`) or disjunction (`OR`). | `OR` | | `boost` | `Number` | Multiplier boost for score computation. | 1.0 | +| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false | ### `bool` @@ -494,7 +495,7 @@ The following query types are supported. | `operator` | `"AND"` or `"OR"` | Defines whether all terms should be present (`AND`) or if at least one term is sufficient to match (`OR`). | OR | | `zero_terms_query` | `all` or `none` | Defines if all (`all`) or no documents (`none`) should be returned if the query does not contain any terms after tokenization. | `none` | | `boost` | `Number` | Multiplier boost for score computation | 1.0 | - +| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false | @@ -637,8 +638,17 @@ Contrary to ES/Opensearch, in Quickwit, at most 50 terms will be considered when } ``` -#### Supported Multi-match Queries -| Type | Description | +#### Supported parameters + +| Variable | Type | Description | Default value | +| ------------------ | --------------------- | ---------------------------------------------| ------------- | +| `type` | `String` | See supported types below | `most_fields` | +| `fields` | `String[]` (Optional) | Default search target fields. | - | +| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false | + +Supported types: + +| `type` value | Description | | --------------- | ------------------------------------------------------------------------------------------- | | `most_fields` | Finds documents matching any field and combines the `_score` from each field (default). | | `phrase` | Runs a `match_phrase` query on each field. | @@ -721,6 +731,12 @@ Query matching only documents containing a non-null value for a given field. | `field` | String | Only documents with a value for field will be returned. | - | +### About the `lenient` argument + +Quickwit and Elasticsearch have different interpretations of the `lenient` setting: +- In Quickwit, lenient mode allows ignoring parts of the query that reference non-existing columns. This is a behavior that Elasticsearch supports by default. +- In Elasticsearch, lenient mode primarily addresses type errors (such as searching for text in an integer field). Quickwit always supports this behavior, regardless of the `lenient` setting. + ## Search multiple indices Search APIs that accept requests path parameter also support multi-target syntax. diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index dbc663794e5..9dffeef0ad7 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -248,7 +248,9 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { ) -> Result<(), Self::Err> { let terms = match phrase_prefix.get_terms(self.schema, self.tokenizer_manager) { Ok((_, terms)) => terms, - Err(InvalidQuery::SchemaError(_)) => return Ok(()), /* the query will be nullified when casting to a tantivy ast */ + Err(InvalidQuery::SchemaError(_)) | Err(InvalidQuery::FieldDoesNotExist { .. }) => { + return Ok(()) + } /* the query will be nullified when casting to a tantivy ast */ Err(e) => return Err(e), }; if let Some((_, term)) = terms.last() { @@ -258,7 +260,12 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { - let (_, term) = wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager)?; + let term = match wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager) { + Ok((_, term)) => term, + /* the query will be nullified when casting to a tantivy ast */ + Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()), + Err(e) => return Err(e), + }; self.add_prefix_term(term, u32::MAX, false); Ok(()) } @@ -280,8 +287,11 @@ mod test { use quickwit_query::query_ast::{ query_ast_from_user_text, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, + UserInputQuery, + }; + use quickwit_query::{ + create_default_quickwit_tokenizer_manager, BooleanOperand, MatchAllOrNone, }; - use quickwit_query::{create_default_quickwit_tokenizer_manager, MatchAllOrNone}; use tantivy::schema::{DateOptions, DateTimePrecision, Schema, FAST, INDEXED, STORED, TEXT}; use tantivy::Term; @@ -323,7 +333,7 @@ mod test { search_fields: Vec, expected: TestExpectation, ) { - check_build_query(user_query, search_fields, expected, true); + check_build_query(user_query, search_fields, expected, true, false); } #[track_caller] @@ -332,15 +342,31 @@ mod test { search_fields: Vec, expected: TestExpectation, ) { - check_build_query(user_query, search_fields, expected, false); + check_build_query(user_query, search_fields, expected, false, false); + } + + #[track_caller] + fn check_build_query_static_lenient_mode( + user_query: &str, + search_fields: Vec, + expected: TestExpectation, + ) { + check_build_query(user_query, search_fields, expected, false, true); } fn test_build_query( user_query: &str, search_fields: Vec, dynamic_mode: bool, + lenient: bool, ) -> Result { - let query_ast = query_ast_from_user_text(user_query, Some(search_fields)) + let user_input_query = UserInputQuery { + user_text: user_query.to_string(), + default_fields: Some(search_fields), + default_operator: BooleanOperand::And, + lenient, + }; + let query_ast = user_input_query .parse_user_query(&[]) .map_err(|err| err.to_string())?; let schema = make_schema(dynamic_mode); @@ -362,8 +388,9 @@ mod test { search_fields: Vec, expected: TestExpectation, dynamic_mode: bool, + lenient: bool, ) { - let query_result = test_build_query(user_query, search_fields, dynamic_mode); + let query_result = test_build_query(user_query, search_fields, dynamic_mode, lenient); match (query_result, expected) { (Err(query_err_msg), TestExpectation::Err(sub_str)) => { assert!( @@ -425,6 +452,11 @@ mod test { Vec::new(), TestExpectation::Err("invalid query: field does not exist: `foo`"), ); + check_build_query_static_lenient_mode( + "foo:bar", + Vec::new(), + TestExpectation::Ok("EmptyQuery"), + ); check_build_query_static_mode( "title:bar", Vec::new(), @@ -435,6 +467,11 @@ mod test { vec!["fieldnotinschema".to_string()], TestExpectation::Err("invalid query: field does not exist: `fieldnotinschema`"), ); + check_build_query_static_lenient_mode( + "bar", + vec!["fieldnotinschema".to_string()], + TestExpectation::Ok("EmptyQuery"), + ); check_build_query_static_mode( "title:[a TO b]", Vec::new(), @@ -503,6 +540,25 @@ mod test { ); } + #[test] + fn test_wildcard_query() { + check_build_query_static_mode( + "title:hello*", + Vec::new(), + TestExpectation::Ok("PhrasePrefixQuery"), + ); + check_build_query_static_mode( + "foo:bar*", + Vec::new(), + TestExpectation::Err("invalid query: field does not exist: `foo`"), + ); + check_build_query_static_mode( + "title:hello*yo", + Vec::new(), + TestExpectation::Err("Wildcard query contains wildcard in non final position"), + ); + } + #[test] fn test_datetime_range_query() { { @@ -695,12 +751,14 @@ mod test { phrase: "short".to_string(), max_expansions: 50, params: params.clone(), + lenient: false, }; let long = PhrasePrefixQuery { field: "title".to_string(), phrase: "not so short".to_string(), max_expansions: 50, params: params.clone(), + lenient: false, }; let mut extractor1 = ExtractPrefixTermRanges::with_schema(&schema, &tokenizer_manager); extractor1.visit_phrase_prefix(&short).unwrap(); diff --git a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs index ad6bb67bcc5..835102c89eb 100644 --- a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs +++ b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs @@ -41,7 +41,7 @@ async fn assert_hits_unordered( ) .await; if let Ok(expected_hits) = expected_result { - let resp = search_res.unwrap_or_else(|_| panic!("query: {}", query)); + let resp = search_res.unwrap_or_else(|err| panic!("query: {}, error: {}", query, err)); assert_eq!(resp.errors.len(), 0, "query: {}", query); assert_eq!( resp.num_hits, diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs index 18c565976f7..1547dcaeae9 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs @@ -19,6 +19,7 @@ use serde::Deserialize; +use super::LeniencyBool; use crate::elastic_query_dsl::{ ConvertibleToQueryAst, ElasticQueryDslInner, StringOrStructForSerialization, }; @@ -42,11 +43,8 @@ pub(crate) struct MatchQueryParams { pub(crate) operator: BooleanOperand, #[serde(default)] pub(crate) zero_terms_query: MatchAllOrNone, - // Quickwit and Elastic have different notions of lenient. For us, it means it's okay to - // disregard part of the query where which uses non-existing collumn (which Elastic does by - // default). For Elastic, it covers type errors (searching text in an integer field). #[serde(default)] - pub(crate) lenient: bool, + pub(crate) lenient: LeniencyBool, } impl ConvertibleToQueryAst for MatchQuery { diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs index 9e49c866d95..2140b659138 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs @@ -50,6 +50,14 @@ use crate::elastic_query_dsl::terms_query::TermsQuery; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::QueryAst; +/// Quickwit and Elasticsearch have different interpretations of leniency: +/// - In Quickwit, lenient mode allows ignoring parts of the query that reference non-existing +/// columns. This is a behavior that Elasticsearch supports by default. +/// - In Elasticsearch, lenient mode primarily addresses type errors (such as searching for text in +/// an integer field). Quickwit always supports this behavior, regardless of the `lenient` +/// setting. +pub type LeniencyBool = bool; + fn default_max_expansions() -> u32 { 50 } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs b/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs index 9b607151a31..8f5f8313a53 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs @@ -21,6 +21,7 @@ use serde::Deserialize; use serde_with::formats::PreferMany; use serde_with::{serde_as, OneOrMany}; +use super::LeniencyBool; use crate::elastic_query_dsl::bool_query::BoolQuery; use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery; use crate::elastic_query_dsl::match_phrase_query::{MatchPhraseQuery, MatchPhraseQueryParams}; @@ -48,11 +49,8 @@ struct MultiMatchQueryForDeserialization { #[serde_as(deserialize_as = "OneOrMany<_, PreferMany>")] #[serde(default)] fields: Vec, - // Quickwit and Elastic have different notions of lenient. For us, it means it's okay to - // disregard part of the query where which uses non-existing collumn (which Elastic does by - // default). For Elastic, it covers type errors (searching text in an integer field). #[serde(default)] - lenient: bool, + lenient: LeniencyBool, } fn deserialize_match_query_for_one_field( diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs index 3955a175c64..4579b6530bf 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs @@ -67,6 +67,7 @@ impl ConvertibleToQueryAst for MatchPhrasePrefixQuery { phrase: query, params: analyzer, max_expansions, + lenient: false, }; Ok(phrase_prefix_query_ast.into()) } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs index f7192f8928e..9e7e6ce180f 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs @@ -19,6 +19,7 @@ use serde::Deserialize; +use super::LeniencyBool; use crate::elastic_query_dsl::ConvertibleToQueryAst; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::UserInputQuery; @@ -40,11 +41,8 @@ pub(crate) struct QueryStringQuery { default_operator: BooleanOperand, #[serde(default)] boost: Option, - // Regardless of this option Quickwit behaves in elasticsearch definition of - // lenient. We include this property here just to accept user queries containing - // this option. #[serde(default)] - lenient: bool, + lenient: LeniencyBool, } impl ConvertibleToQueryAst for QueryStringQuery { diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index d77b39e67df..661bb89039f 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -227,6 +227,7 @@ pub struct FullTextQuery { pub field: String, pub text: String, pub params: FullTextParams, + /// Support missing fields pub lenient: bool, } diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs index d0107f885f9..1675b22d760 100644 --- a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs @@ -38,6 +38,8 @@ pub struct PhrasePrefixQuery { pub phrase: String, pub max_expansions: u32, pub params: FullTextParams, + /// Support missing fields + pub lenient: bool, } impl PhrasePrefixQuery { @@ -117,7 +119,13 @@ impl BuildTantivyAst for PhrasePrefixQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, terms) = self.get_terms(schema, tokenizer_manager)?; + let (_, terms) = match self.get_terms(schema, tokenizer_manager) { + Ok(res) => res, + Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => { + return Ok(TantivyQueryAst::match_none()) + } + Err(e) => return Err(e), + }; if terms.is_empty() { if self.params.zero_terms_query.is_none() { diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs index 8a910567982..279f41b4676 100644 --- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs +++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs @@ -49,6 +49,7 @@ pub struct UserInputQuery { #[serde(default, skip_serializing_if = "Option::is_none")] pub default_fields: Option>, pub default_operator: BooleanOperand, + /// Support missing fields pub lenient: bool, } @@ -273,12 +274,14 @@ fn convert_user_input_literal( phrase: phrase.clone(), params: full_text_params.clone(), max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION, + lenient, } .into() } else if wildcard { query_ast::WildcardQuery { field: field_name, value: phrase.clone(), + lenient, } .into() } else { diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 86afb68a7d3..145e5a45bd1 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -34,6 +34,8 @@ use crate::{find_field_or_hit_dynamic, InvalidQuery}; pub struct WildcardQuery { pub field: String, pub value: String, + /// Support missing fields + pub lenient: bool, } impl From for QueryAst { @@ -42,16 +44,6 @@ impl From for QueryAst { } } -impl WildcardQuery { - #[cfg(test)] - pub fn from_field_value(field: impl ToString, value: impl ToString) -> Self { - Self { - field: field.to_string(), - value: value.to_string(), - } - } -} - fn extract_unique_token(mut tokens: Vec) -> anyhow::Result { let term = tokens .pop() @@ -77,7 +69,7 @@ fn unescape_with_final_wildcard(phrase: &str) -> anyhow::Result { .scan(State::Normal, |state, c| { if *saw_wildcard { return Some(Some(Err(anyhow!( - "Wildcard iquery contains wildcard in non final position" + "Wildcard query contains wildcard in non final position" )))); } match state { @@ -190,7 +182,13 @@ impl BuildTantivyAst for WildcardQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?; + let (_, term) = match self.extract_prefix_term(schema, tokenizer_manager) { + Ok(res) => res, + Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => { + return Ok(TantivyQueryAst::match_none()) + } + Err(e) => return Err(e), + }; let mut phrase_prefix_query = tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); @@ -206,20 +204,24 @@ mod tests { use super::*; use crate::create_default_quickwit_tokenizer_manager; + fn single_text_field_schema(field_name: &str, tokenizer: &str) -> TantivySchema { + let mut schema_builder = TantivySchema::builder(); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); + schema_builder.add_text_field(field_name, text_options); + schema_builder.build() + } + #[test] fn test_extract_term_for_wildcard() { let query = WildcardQuery { field: "my_field".to_string(), value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(), + lenient: false, }; let tokenizer_manager = create_default_quickwit_tokenizer_manager(); for tokenizer in ["raw", "whitespace"] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - + let schema = single_text_field_schema("my_field", tokenizer); let (_field, term) = query .extract_prefix_term(&schema, &tokenizer_manager) .unwrap(); @@ -237,19 +239,34 @@ mod tests { "source_code_default", "source_code_with_hex", ] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - + let schema = single_text_field_schema("my_field", tokenizer); let (_field, term) = query .extract_prefix_term(&schema, &tokenizer_manager) .unwrap(); - let value = term.value(); let text = value.as_str().unwrap(); assert_eq!(text, &query.value.trim_end_matches('*').to_lowercase()); } } + + #[test] + fn test_extract_term_for_wildcard_missing_field() { + let query = WildcardQuery { + field: "my_missing_field".to_string(), + value: "My query value*".to_string(), + lenient: false, + }; + let tokenizer_manager = create_default_quickwit_tokenizer_manager(); + let schema = single_text_field_schema("my_field", "whitespace"); + let err = query + .extract_prefix_term(&schema, &tokenizer_manager) + .unwrap_err(); + let InvalidQuery::FieldDoesNotExist { + full_path: missing_field_full_path, + } = err + else { + panic!("unexpected error: {:?}", err); + }; + assert_eq!(missing_field_full_path, "my_missing_field"); + } } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml index 8cb495379c3..668e4877cfc 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml @@ -226,13 +226,25 @@ json: query: query_string: query: "true" - fields: ["public", "public.inner"] + fields: ["public", "public.notdefined", "notdefined"] lenient: true expected: hits: total: value: 100 --- +# trailing wildcard +json: + query: + query_string: + query: "jour*" + fields: ["payload.description", "payload.notdefined", "notdefined"] + lenient: true +expected: + hits: + total: + value: 3 +--- # elasticsearch accepts this query engines: - quickwit @@ -240,5 +252,5 @@ json: query: query_string: query: "true" - fields: ["public", "public.inner"] + fields: ["public", "public.notdefined"] status_code: 400