diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 3ead0fa927c..c3c4c94e1c9 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -394,6 +394,7 @@ The following query types are supported. | `fields` | `String[]` (Optional) | Default search target fields. | - | | `default_operator` | `"AND"` or `"OR"` | In the absence of boolean operator defines whether terms should be combined as a conjunction (`AND`) or disjunction (`OR`). | `OR` | | `boost` | `Number` | Multiplier boost for score computation. | 1.0 | +| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false | ### `bool` @@ -494,7 +495,7 @@ The following query types are supported. | `operator` | `"AND"` or `"OR"` | Defines whether all terms should be present (`AND`) or if at least one term is sufficient to match (`OR`). | OR | | `zero_terms_query` | `all` or `none` | Defines if all (`all`) or no documents (`none`) should be returned if the query does not contain any terms after tokenization. | `none` | | `boost` | `Number` | Multiplier boost for score computation | 1.0 | - +| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false | @@ -637,8 +638,17 @@ Contrary to ES/Opensearch, in Quickwit, at most 50 terms will be considered when } ``` -#### Supported Multi-match Queries -| Type | Description | +#### Supported parameters + +| Variable | Type | Description | Default value | +| ------------------ | --------------------- | ---------------------------------------------| ------------- | +| `type` | `String` | See supported types below | `most_fields` | +| `fields` | `String[]` (Optional) | Default search target fields. | - | +| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). 
| false | + +Supported types: + +| `type` value | Description | | --------------- | ------------------------------------------------------------------------------------------- | | `most_fields` | Finds documents matching any field and combines the `_score` from each field (default). | | `phrase` | Runs a `match_phrase` query on each field. | @@ -721,6 +731,12 @@ Query matching only documents containing a non-null value for a given field. | `field` | String | Only documents with a value for field will be returned. | - | +### About the `lenient` argument + +Quickwit and Elasticsearch have different interpretations of the `lenient` setting: +- In Quickwit, lenient mode allows ignoring parts of the query that reference non-existing columns. This is a behavior that Elasticsearch supports by default. +- In Elasticsearch, lenient mode primarily addresses type errors (such as searching for text in an integer field). Quickwit always supports this behavior, regardless of the `lenient` setting. + ## Search multiple indices Search APIs that accept requests path parameter also support multi-target syntax. diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index dbc663794e5..9dffeef0ad7 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -248,7 +248,9 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { ) -> Result<(), Self::Err> { let terms = match phrase_prefix.get_terms(self.schema, self.tokenizer_manager) { Ok((_, terms)) => terms, - Err(InvalidQuery::SchemaError(_)) => return Ok(()), /* the query will be nullified when casting to a tantivy ast */ + Err(InvalidQuery::SchemaError(_)) | Err(InvalidQuery::FieldDoesNotExist { .. 
}) => { + return Ok(()) + } /* the query will be nullified when casting to a tantivy ast */ Err(e) => return Err(e), }; if let Some((_, term)) = terms.last() { @@ -258,7 +260,12 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { - let (_, term) = wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager)?; + let term = match wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager) { + Ok((_, term)) => term, + /* the query will be nullified when casting to a tantivy ast */ + Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()), + Err(e) => return Err(e), + }; self.add_prefix_term(term, u32::MAX, false); Ok(()) } @@ -280,8 +287,11 @@ mod test { use quickwit_query::query_ast::{ query_ast_from_user_text, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, + UserInputQuery, + }; + use quickwit_query::{ + create_default_quickwit_tokenizer_manager, BooleanOperand, MatchAllOrNone, }; - use quickwit_query::{create_default_quickwit_tokenizer_manager, MatchAllOrNone}; use tantivy::schema::{DateOptions, DateTimePrecision, Schema, FAST, INDEXED, STORED, TEXT}; use tantivy::Term; @@ -323,7 +333,7 @@ mod test { search_fields: Vec, expected: TestExpectation, ) { - check_build_query(user_query, search_fields, expected, true); + check_build_query(user_query, search_fields, expected, true, false); } #[track_caller] @@ -332,15 +342,31 @@ mod test { search_fields: Vec, expected: TestExpectation, ) { - check_build_query(user_query, search_fields, expected, false); + check_build_query(user_query, search_fields, expected, false, false); + } + + #[track_caller] + fn check_build_query_static_lenient_mode( + user_query: &str, + search_fields: Vec, + expected: TestExpectation, + ) { + check_build_query(user_query, search_fields, expected, false, true); } fn test_build_query( user_query: &str, search_fields: Vec, dynamic_mode: 
bool, + lenient: bool, ) -> Result { - let query_ast = query_ast_from_user_text(user_query, Some(search_fields)) + let user_input_query = UserInputQuery { + user_text: user_query.to_string(), + default_fields: Some(search_fields), + default_operator: BooleanOperand::And, + lenient, + }; + let query_ast = user_input_query .parse_user_query(&[]) .map_err(|err| err.to_string())?; let schema = make_schema(dynamic_mode); @@ -362,8 +388,9 @@ mod test { search_fields: Vec, expected: TestExpectation, dynamic_mode: bool, + lenient: bool, ) { - let query_result = test_build_query(user_query, search_fields, dynamic_mode); + let query_result = test_build_query(user_query, search_fields, dynamic_mode, lenient); match (query_result, expected) { (Err(query_err_msg), TestExpectation::Err(sub_str)) => { assert!( @@ -425,6 +452,11 @@ mod test { Vec::new(), TestExpectation::Err("invalid query: field does not exist: `foo`"), ); + check_build_query_static_lenient_mode( + "foo:bar", + Vec::new(), + TestExpectation::Ok("EmptyQuery"), + ); check_build_query_static_mode( "title:bar", Vec::new(), @@ -435,6 +467,11 @@ mod test { vec!["fieldnotinschema".to_string()], TestExpectation::Err("invalid query: field does not exist: `fieldnotinschema`"), ); + check_build_query_static_lenient_mode( + "bar", + vec!["fieldnotinschema".to_string()], + TestExpectation::Ok("EmptyQuery"), + ); check_build_query_static_mode( "title:[a TO b]", Vec::new(), @@ -503,6 +540,25 @@ mod test { ); } + #[test] + fn test_wildcard_query() { + check_build_query_static_mode( + "title:hello*", + Vec::new(), + TestExpectation::Ok("PhrasePrefixQuery"), + ); + check_build_query_static_mode( + "foo:bar*", + Vec::new(), + TestExpectation::Err("invalid query: field does not exist: `foo`"), + ); + check_build_query_static_mode( + "title:hello*yo", + Vec::new(), + TestExpectation::Err("Wildcard query contains wildcard in non final position"), + ); + } + #[test] fn test_datetime_range_query() { { @@ -695,12 +751,14 @@ mod test { 
phrase: "short".to_string(), max_expansions: 50, params: params.clone(), + lenient: false, }; let long = PhrasePrefixQuery { field: "title".to_string(), phrase: "not so short".to_string(), max_expansions: 50, params: params.clone(), + lenient: false, }; let mut extractor1 = ExtractPrefixTermRanges::with_schema(&schema, &tokenizer_manager); extractor1.visit_phrase_prefix(&short).unwrap(); diff --git a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs index ad6bb67bcc5..835102c89eb 100644 --- a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs +++ b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs @@ -41,7 +41,7 @@ async fn assert_hits_unordered( ) .await; if let Ok(expected_hits) = expected_result { - let resp = search_res.unwrap_or_else(|_| panic!("query: {}", query)); + let resp = search_res.unwrap_or_else(|err| panic!("query: {}, error: {}", query, err)); assert_eq!(resp.errors.len(), 0, "query: {}", query); assert_eq!( resp.num_hits, diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs index 18c565976f7..1547dcaeae9 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs @@ -19,6 +19,7 @@ use serde::Deserialize; +use super::LeniencyBool; use crate::elastic_query_dsl::{ ConvertibleToQueryAst, ElasticQueryDslInner, StringOrStructForSerialization, }; @@ -42,11 +43,8 @@ pub(crate) struct MatchQueryParams { pub(crate) operator: BooleanOperand, #[serde(default)] pub(crate) zero_terms_query: MatchAllOrNone, - // Quickwit and Elastic have different notions of lenient. For us, it means it's okay to - // disregard part of the query where which uses non-existing collumn (which Elastic does by - // default). For Elastic, it covers type errors (searching text in an integer field). 
#[serde(default)] - pub(crate) lenient: bool, + pub(crate) lenient: LeniencyBool, } impl ConvertibleToQueryAst for MatchQuery { diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs index 9e49c866d95..2140b659138 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs @@ -50,6 +50,14 @@ use crate::elastic_query_dsl::terms_query::TermsQuery; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::QueryAst; +/// Quickwit and Elasticsearch have different interpretations of leniency: +/// - In Quickwit, lenient mode allows ignoring parts of the query that reference non-existing +/// columns. This is a behavior that Elasticsearch supports by default. +/// - In Elasticsearch, lenient mode primarily addresses type errors (such as searching for text in +/// an integer field). Quickwit always supports this behavior, regardless of the `lenient` +/// setting. +pub type LeniencyBool = bool; + fn default_max_expansions() -> u32 { 50 } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs b/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs index 9b607151a31..8f5f8313a53 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs @@ -21,6 +21,7 @@ use serde::Deserialize; use serde_with::formats::PreferMany; use serde_with::{serde_as, OneOrMany}; +use super::LeniencyBool; use crate::elastic_query_dsl::bool_query::BoolQuery; use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery; use crate::elastic_query_dsl::match_phrase_query::{MatchPhraseQuery, MatchPhraseQueryParams}; @@ -48,11 +49,8 @@ struct MultiMatchQueryForDeserialization { #[serde_as(deserialize_as = "OneOrMany<_, PreferMany>")] #[serde(default)] fields: Vec, - // Quickwit and Elastic have different notions of lenient. 
For us, it means it's okay to - // disregard part of the query where which uses non-existing collumn (which Elastic does by - // default). For Elastic, it covers type errors (searching text in an integer field). #[serde(default)] - lenient: bool, + lenient: LeniencyBool, } fn deserialize_match_query_for_one_field( diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs index 3955a175c64..4579b6530bf 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs @@ -67,6 +67,7 @@ impl ConvertibleToQueryAst for MatchPhrasePrefixQuery { phrase: query, params: analyzer, max_expansions, + lenient: false, }; Ok(phrase_prefix_query_ast.into()) } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs index f7192f8928e..9e7e6ce180f 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs @@ -19,6 +19,7 @@ use serde::Deserialize; +use super::LeniencyBool; use crate::elastic_query_dsl::ConvertibleToQueryAst; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::UserInputQuery; @@ -40,11 +41,8 @@ pub(crate) struct QueryStringQuery { default_operator: BooleanOperand, #[serde(default)] boost: Option, - // Regardless of this option Quickwit behaves in elasticsearch definition of - // lenient. We include this property here just to accept user queries containing - // this option. 
#[serde(default)] - lenient: bool, + lenient: LeniencyBool, } impl ConvertibleToQueryAst for QueryStringQuery { diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index d77b39e67df..661bb89039f 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -227,6 +227,7 @@ pub struct FullTextQuery { pub field: String, pub text: String, pub params: FullTextParams, + /// Lenient mode: tolerate a `field` that does not exist in the schema instead of failing. pub lenient: bool, } diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs index d0107f885f9..1675b22d760 100644 --- a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs @@ -38,6 +38,8 @@ pub struct PhrasePrefixQuery { pub phrase: String, pub max_expansions: u32, pub params: FullTextParams, + /// When true, a `field` missing from the schema yields a match-nothing query, not an error. + pub lenient: bool, } impl PhrasePrefixQuery { @@ -117,7 +119,13 @@ impl BuildTantivyAst for PhrasePrefixQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, terms) = self.get_terms(schema, tokenizer_manager)?; + let (_, terms) = match self.get_terms(schema, tokenizer_manager) { + Ok(res) => res, + Err(InvalidQuery::FieldDoesNotExist { ..
}) if self.lenient => { + return Ok(TantivyQueryAst::match_none()) + } + Err(e) => return Err(e), + }; if terms.is_empty() { if self.params.zero_terms_query.is_none() { diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs index 8a910567982..279f41b4676 100644 --- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs +++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs @@ -49,6 +49,7 @@ pub struct UserInputQuery { #[serde(default, skip_serializing_if = "Option::is_none")] pub default_fields: Option>, pub default_operator: BooleanOperand, + /// Lenient mode: tolerate references to fields missing from the schema instead of failing. pub lenient: bool, } @@ -273,12 +274,14 @@ fn convert_user_input_literal( phrase: phrase.clone(), params: full_text_params.clone(), max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION, + lenient, } .into() } else if wildcard { query_ast::WildcardQuery { field: field_name, value: phrase.clone(), + lenient, } .into() } else { diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 86afb68a7d3..145e5a45bd1 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -34,6 +34,8 @@ use crate::{find_field_or_hit_dynamic, InvalidQuery}; pub struct WildcardQuery { pub field: String, pub value: String, + /// When true, a `field` missing from the schema yields a match-nothing query, not an error. + pub lenient: bool, } impl From for QueryAst { @@ -42,16 +44,6 @@ impl From for QueryAst { } } -impl WildcardQuery { - #[cfg(test)] - pub fn from_field_value(field: impl ToString, value: impl ToString) -> Self { - Self { - field: field.to_string(), - value: value.to_string(), - } - } -} - fn extract_unique_token(mut tokens: Vec) -> anyhow::Result { let term = tokens .pop() @@ -77,7 +69,7 @@ fn unescape_with_final_wildcard(phrase: &str) -> anyhow::Result { .scan(State::Normal, |state, c| { if *saw_wildcard { return Some(Some(Err(anyhow!( -
"Wildcard iquery contains wildcard in non final position" + "Wildcard query contains wildcard in non final position" )))); } match state { @@ -190,7 +182,13 @@ impl BuildTantivyAst for WildcardQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?; + let (_, term) = match self.extract_prefix_term(schema, tokenizer_manager) { + Ok(res) => res, + Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => { + return Ok(TantivyQueryAst::match_none()) + } + Err(e) => return Err(e), + }; let mut phrase_prefix_query = tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); @@ -206,20 +204,24 @@ mod tests { use super::*; use crate::create_default_quickwit_tokenizer_manager; + fn single_text_field_schema(field_name: &str, tokenizer: &str) -> TantivySchema { + let mut schema_builder = TantivySchema::builder(); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); + schema_builder.add_text_field(field_name, text_options); + schema_builder.build() + } + #[test] fn test_extract_term_for_wildcard() { let query = WildcardQuery { field: "my_field".to_string(), value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(), + lenient: false, }; let tokenizer_manager = create_default_quickwit_tokenizer_manager(); for tokenizer in ["raw", "whitespace"] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - + let schema = single_text_field_schema("my_field", tokenizer); let (_field, term) = query .extract_prefix_term(&schema, &tokenizer_manager) .unwrap(); @@ -237,19 +239,34 @@ mod tests { "source_code_default", "source_code_with_hex", ] { - let mut schema_builder = 
TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - + let schema = single_text_field_schema("my_field", tokenizer); let (_field, term) = query .extract_prefix_term(&schema, &tokenizer_manager) .unwrap(); - let value = term.value(); let text = value.as_str().unwrap(); assert_eq!(text, &query.value.trim_end_matches('*').to_lowercase()); } } + + #[test] + fn test_extract_term_for_wildcard_missing_field() { + let query = WildcardQuery { + field: "my_missing_field".to_string(), + value: "My query value*".to_string(), + lenient: false, + }; + let tokenizer_manager = create_default_quickwit_tokenizer_manager(); + let schema = single_text_field_schema("my_field", "whitespace"); + let err = query + .extract_prefix_term(&schema, &tokenizer_manager) + .unwrap_err(); + let InvalidQuery::FieldDoesNotExist { + full_path: missing_field_full_path, + } = err + else { + panic!("unexpected error: {:?}", err); + }; + assert_eq!(missing_field_full_path, "my_missing_field"); + } } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml index 8cb495379c3..668e4877cfc 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml @@ -226,13 +226,25 @@ json: query: query_string: query: "true" - fields: ["public", "public.inner"] + fields: ["public", "public.notdefined", "notdefined"] lenient: true expected: hits: total: value: 100 --- +# trailing wildcard +json: + query: + query_string: + query: "jour*" + fields: ["payload.description", "payload.notdefined", "notdefined"] + lenient: true +expected: + hits: + total: + value: 3 +--- # elasticsearch accepts this query 
engines: - quickwit @@ -240,5 +252,5 @@ json: query: query_string: query: "true" - fields: ["public", "public.inner"] + fields: ["public", "public.notdefined"] status_code: 400