Skip to content

Commit

Permalink
Fix lenient option with wildcard queries (#5575)
Browse files Browse the repository at this point in the history
* Better error messages in integ tests

* Initial fix suggestion from Trinity

* Add rest api test and clarify docs about leniency

* Add missing field test on wildcard query

* Fix add query building unit tests

* Forgotten staging file
  • Loading branch information
rdettai authored Dec 11, 2024
1 parent fba56b8 commit 2121ba1
Show file tree
Hide file tree
Showing 13 changed files with 169 additions and 51 deletions.
22 changes: 19 additions & 3 deletions docs/reference/es_compatible_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ The following query types are supported.
| `fields` | `String[]` (Optional) | Default search target fields. | - |
| `default_operator` | `"AND"` or `"OR"` | In the absence of boolean operator defines whether terms should be combined as a conjunction (`AND`) or disjunction (`OR`). | `OR` |
| `boost` | `Number` | Multiplier boost for score computation. | 1.0 |
| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false |


### `bool`
Expand Down Expand Up @@ -494,7 +495,7 @@ The following query types are supported.
| `operator` | `"AND"` or `"OR"` | Defines whether all terms should be present (`AND`) or if at least one term is sufficient to match (`OR`). | OR |
| `zero_terms_query` | `all` or `none` | Defines if all (`all`) or no documents (`none`) should be returned if the query does not contain any terms after tokenization. | `none` |
| `boost` | `Number` | Multiplier boost for score computation | 1.0 |

| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false |



Expand Down Expand Up @@ -637,8 +638,17 @@ Contrary to ES/Opensearch, in Quickwit, at most 50 terms will be considered when
}
```

#### Supported Multi-match Queries
| Type | Description |
#### Supported parameters

| Variable | Type | Description | Default value |
| ------------------ | --------------------- | ---------------------------------------------| ------------- |
| `type` | `String` | See supported types below | `most_fields` |
| `fields` | `String[]` (Optional) | Default search target fields. | - |
| `lenient` | `Boolean` | [See note](#about-the-lenient-argument). | false |

Supported types:

| `type` value | Description |
| --------------- | ------------------------------------------------------------------------------------------- |
| `most_fields` | Finds documents matching any field and combines the `_score` from each field (default). |
| `phrase` | Runs a `match_phrase` query on each field. |
Expand Down Expand Up @@ -721,6 +731,12 @@ Query matching only documents containing a non-null value for a given field.
| `field` | String | Only documents with a value for field will be returned. | - |


### About the `lenient` argument

Quickwit and Elasticsearch have different interpretations of the `lenient` setting:
- In Quickwit, lenient mode allows ignoring parts of the query that reference non-existing columns. This is a behavior that Elasticsearch supports by default.
- In Elasticsearch, lenient mode primarily addresses type errors (such as searching for text in an integer field). Quickwit always supports this behavior, regardless of the `lenient` setting.

## Search multiple indices

Search APIs that accept <index_id> requests path parameter also support multi-target syntax.
Expand Down
72 changes: 65 additions & 7 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,9 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
) -> Result<(), Self::Err> {
let terms = match phrase_prefix.get_terms(self.schema, self.tokenizer_manager) {
Ok((_, terms)) => terms,
Err(InvalidQuery::SchemaError(_)) => return Ok(()), /* the query will be nullified when casting to a tantivy ast */
Err(InvalidQuery::SchemaError(_)) | Err(InvalidQuery::FieldDoesNotExist { .. }) => {
return Ok(())
} /* the query will be nullified when casting to a tantivy ast */
Err(e) => return Err(e),
};
if let Some((_, term)) = terms.last() {
Expand All @@ -258,7 +260,12 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
}

fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> {
let (_, term) = wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager)?;
let term = match wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager) {
Ok((_, term)) => term,
/* the query will be nullified when casting to a tantivy ast */
Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()),
Err(e) => return Err(e),
};
self.add_prefix_term(term, u32::MAX, false);
Ok(())
}
Expand All @@ -280,8 +287,11 @@ mod test {

use quickwit_query::query_ast::{
query_ast_from_user_text, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor,
UserInputQuery,
};
use quickwit_query::{
create_default_quickwit_tokenizer_manager, BooleanOperand, MatchAllOrNone,
};
use quickwit_query::{create_default_quickwit_tokenizer_manager, MatchAllOrNone};
use tantivy::schema::{DateOptions, DateTimePrecision, Schema, FAST, INDEXED, STORED, TEXT};
use tantivy::Term;

Expand Down Expand Up @@ -323,7 +333,7 @@ mod test {
search_fields: Vec<String>,
expected: TestExpectation,
) {
check_build_query(user_query, search_fields, expected, true);
check_build_query(user_query, search_fields, expected, true, false);
}

#[track_caller]
Expand All @@ -332,15 +342,31 @@ mod test {
search_fields: Vec<String>,
expected: TestExpectation,
) {
check_build_query(user_query, search_fields, expected, false);
check_build_query(user_query, search_fields, expected, false, false);
}

#[track_caller]
fn check_build_query_static_lenient_mode(
user_query: &str,
search_fields: Vec<String>,
expected: TestExpectation,
) {
check_build_query(user_query, search_fields, expected, false, true);
}

fn test_build_query(
user_query: &str,
search_fields: Vec<String>,
dynamic_mode: bool,
lenient: bool,
) -> Result<String, String> {
let query_ast = query_ast_from_user_text(user_query, Some(search_fields))
let user_input_query = UserInputQuery {
user_text: user_query.to_string(),
default_fields: Some(search_fields),
default_operator: BooleanOperand::And,
lenient,
};
let query_ast = user_input_query
.parse_user_query(&[])
.map_err(|err| err.to_string())?;
let schema = make_schema(dynamic_mode);
Expand All @@ -362,8 +388,9 @@ mod test {
search_fields: Vec<String>,
expected: TestExpectation,
dynamic_mode: bool,
lenient: bool,
) {
let query_result = test_build_query(user_query, search_fields, dynamic_mode);
let query_result = test_build_query(user_query, search_fields, dynamic_mode, lenient);
match (query_result, expected) {
(Err(query_err_msg), TestExpectation::Err(sub_str)) => {
assert!(
Expand Down Expand Up @@ -425,6 +452,11 @@ mod test {
Vec::new(),
TestExpectation::Err("invalid query: field does not exist: `foo`"),
);
check_build_query_static_lenient_mode(
"foo:bar",
Vec::new(),
TestExpectation::Ok("EmptyQuery"),
);
check_build_query_static_mode(
"title:bar",
Vec::new(),
Expand All @@ -435,6 +467,11 @@ mod test {
vec!["fieldnotinschema".to_string()],
TestExpectation::Err("invalid query: field does not exist: `fieldnotinschema`"),
);
check_build_query_static_lenient_mode(
"bar",
vec!["fieldnotinschema".to_string()],
TestExpectation::Ok("EmptyQuery"),
);
check_build_query_static_mode(
"title:[a TO b]",
Vec::new(),
Expand Down Expand Up @@ -503,6 +540,25 @@ mod test {
);
}

#[test]
fn test_wildcard_query() {
check_build_query_static_mode(
"title:hello*",
Vec::new(),
TestExpectation::Ok("PhrasePrefixQuery"),
);
check_build_query_static_mode(
"foo:bar*",
Vec::new(),
TestExpectation::Err("invalid query: field does not exist: `foo`"),
);
check_build_query_static_mode(
"title:hello*yo",
Vec::new(),
TestExpectation::Err("Wildcard query contains wildcard in non final position"),
);
}

#[test]
fn test_datetime_range_query() {
{
Expand Down Expand Up @@ -695,12 +751,14 @@ mod test {
phrase: "short".to_string(),
max_expansions: 50,
params: params.clone(),
lenient: false,
};
let long = PhrasePrefixQuery {
field: "title".to_string(),
phrase: "not so short".to_string(),
max_expansions: 50,
params: params.clone(),
lenient: false,
};
let mut extractor1 = ExtractPrefixTermRanges::with_schema(&schema, &tokenizer_manager);
extractor1.visit_phrase_prefix(&short).unwrap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async fn assert_hits_unordered(
)
.await;
if let Ok(expected_hits) = expected_result {
let resp = search_res.unwrap_or_else(|_| panic!("query: {}", query));
let resp = search_res.unwrap_or_else(|err| panic!("query: {}, error: {}", query, err));
assert_eq!(resp.errors.len(), 0, "query: {}", query);
assert_eq!(
resp.num_hits,
Expand Down
6 changes: 2 additions & 4 deletions quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

use serde::Deserialize;

use super::LeniencyBool;
use crate::elastic_query_dsl::{
ConvertibleToQueryAst, ElasticQueryDslInner, StringOrStructForSerialization,
};
Expand All @@ -42,11 +43,8 @@ pub(crate) struct MatchQueryParams {
pub(crate) operator: BooleanOperand,
#[serde(default)]
pub(crate) zero_terms_query: MatchAllOrNone,
// Quickwit and Elastic have different notions of lenient. For us, it means it's okay to
// disregard part of the query where which uses non-existing collumn (which Elastic does by
// default). For Elastic, it covers type errors (searching text in an integer field).
#[serde(default)]
pub(crate) lenient: bool,
pub(crate) lenient: LeniencyBool,
}

impl ConvertibleToQueryAst for MatchQuery {
Expand Down
8 changes: 8 additions & 0 deletions quickwit/quickwit-query/src/elastic_query_dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ use crate::elastic_query_dsl::terms_query::TermsQuery;
use crate::not_nan_f32::NotNaNf32;
use crate::query_ast::QueryAst;

/// Quickwit and Elasticsearch have different interpretations of leniency:
/// - In Quickwit, lenient mode allows ignoring parts of the query that reference non-existing
/// columns. This is a behavior that Elasticsearch supports by default.
/// - In Elasticsearch, lenient mode primarily addresses type errors (such as searching for text in
/// an integer field). Quickwit always supports this behavior, regardless of the `lenient`
/// setting.
pub type LeniencyBool = bool;

fn default_max_expansions() -> u32 {
50
}
Expand Down
6 changes: 2 additions & 4 deletions quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use serde::Deserialize;
use serde_with::formats::PreferMany;
use serde_with::{serde_as, OneOrMany};

use super::LeniencyBool;
use crate::elastic_query_dsl::bool_query::BoolQuery;
use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery;
use crate::elastic_query_dsl::match_phrase_query::{MatchPhraseQuery, MatchPhraseQueryParams};
Expand Down Expand Up @@ -48,11 +49,8 @@ struct MultiMatchQueryForDeserialization {
#[serde_as(deserialize_as = "OneOrMany<_, PreferMany>")]
#[serde(default)]
fields: Vec<String>,
// Quickwit and Elastic have different notions of lenient. For us, it means it's okay to
// disregard part of the query where which uses non-existing collumn (which Elastic does by
// default). For Elastic, it covers type errors (searching text in an integer field).
#[serde(default)]
lenient: bool,
lenient: LeniencyBool,
}

fn deserialize_match_query_for_one_field(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ impl ConvertibleToQueryAst for MatchPhrasePrefixQuery {
phrase: query,
params: analyzer,
max_expansions,
lenient: false,
};
Ok(phrase_prefix_query_ast.into())
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

use serde::Deserialize;

use super::LeniencyBool;
use crate::elastic_query_dsl::ConvertibleToQueryAst;
use crate::not_nan_f32::NotNaNf32;
use crate::query_ast::UserInputQuery;
Expand All @@ -40,11 +41,8 @@ pub(crate) struct QueryStringQuery {
default_operator: BooleanOperand,
#[serde(default)]
boost: Option<NotNaNf32>,
// Regardless of this option Quickwit behaves in elasticsearch definition of
// lenient. We include this property here just to accept user queries containing
// this option.
#[serde(default)]
lenient: bool,
lenient: LeniencyBool,
}

impl ConvertibleToQueryAst for QueryStringQuery {
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-query/src/query_ast/full_text_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ pub struct FullTextQuery {
pub field: String,
pub text: String,
pub params: FullTextParams,
/// Support missing fields
pub lenient: bool,
}

Expand Down
10 changes: 9 additions & 1 deletion quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ pub struct PhrasePrefixQuery {
pub phrase: String,
pub max_expansions: u32,
pub params: FullTextParams,
/// Support missing fields
pub lenient: bool,
}

impl PhrasePrefixQuery {
Expand Down Expand Up @@ -117,7 +119,13 @@ impl BuildTantivyAst for PhrasePrefixQuery {
_search_fields: &[String],
_with_validation: bool,
) -> Result<TantivyQueryAst, InvalidQuery> {
let (_, terms) = self.get_terms(schema, tokenizer_manager)?;
let (_, terms) = match self.get_terms(schema, tokenizer_manager) {
Ok(res) => res,
Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => {
return Ok(TantivyQueryAst::match_none())
}
Err(e) => return Err(e),
};

if terms.is_empty() {
if self.params.zero_terms_query.is_none() {
Expand Down
3 changes: 3 additions & 0 deletions quickwit/quickwit-query/src/query_ast/user_input_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ pub struct UserInputQuery {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub default_fields: Option<Vec<String>>,
pub default_operator: BooleanOperand,
/// Support missing fields
pub lenient: bool,
}

Expand Down Expand Up @@ -273,12 +274,14 @@ fn convert_user_input_literal(
phrase: phrase.clone(),
params: full_text_params.clone(),
max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION,
lenient,
}
.into()
} else if wildcard {
query_ast::WildcardQuery {
field: field_name,
value: phrase.clone(),
lenient,
}
.into()
} else {
Expand Down
Loading

0 comments on commit 2121ba1

Please sign in to comment.