diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index d77b39e67df..feff9d54120 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -316,6 +316,7 @@ mod tests { use crate::{create_default_quickwit_tokenizer_manager, BooleanOperand}; #[test] + #[ignore] fn test_zero_terms() { let full_text_query = FullTextQuery { field: "body".to_string(), diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs index 0de85b97c37..6ff4e4c1bd3 100644 --- a/quickwit/quickwit-query/src/tokenizers/mod.rs +++ b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -26,7 +26,7 @@ mod tokenizer_manager; use once_cell::sync::Lazy; use tantivy::tokenizer::{ AsciiFoldingFilter, Language, LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, - Stemmer, TextAnalyzer, WhitespaceTokenizer, + Stemmer, TextAnalyzer, Token, TokenStream, Tokenizer, WhitespaceTokenizer, }; use self::chinese_compatible::ChineseTokenizer; @@ -58,29 +58,33 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .build(); tokenizer_manager.register("lowercase", lower_case_tokenizer, true); - let default_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) + let default_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(SimpleTokenizer::default())) .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .build(); tokenizer_manager.register("default", default_tokenizer, true); - let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) + let en_stem_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(SimpleTokenizer::default())) .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .filter(Stemmer::new(Language::English)) .build(); tokenizer_manager.register("en_stem", en_stem_tokenizer, true); - tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false); + tokenizer_manager.register( + "whitespace", + EmitEmptyTokenizer(WhitespaceTokenizer::default()), + false, + ); - let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer) + let chinese_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(ChineseTokenizer)) .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .build(); tokenizer_manager.register("chinese_compatible", chinese_tokenizer, true); tokenizer_manager.register( "source_code_default", - TextAnalyzer::builder(CodeTokenizer::default()) + TextAnalyzer::builder(EmitEmptyTokenizer(CodeTokenizer::default())) .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .filter(AsciiFoldingFilter) @@ -89,7 +93,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { ); tokenizer_manager.register( "source_code_with_hex", - TextAnalyzer::builder(CodeTokenizer::with_hex_support()) + TextAnalyzer::builder(EmitEmptyTokenizer(CodeTokenizer::with_hex_support())) .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .filter(AsciiFoldingFilter) @@ -99,7 +103,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { #[cfg(feature = "multilang")] tokenizer_manager.register( "multilang_default", - TextAnalyzer::builder(MultiLangTokenizer::default()) + TextAnalyzer::builder(EmitEmptyTokenizer(MultiLangTokenizer::default())) .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .build(), @@ -128,6 +132,54 @@ pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager &QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER } +#[derive(Debug, Clone)] +struct EmitEmptyTokenizer(T); + +impl Tokenizer for EmitEmptyTokenizer +where T: Tokenizer +{ + type TokenStream<'a> = EmitEmptyStream>; + + // Required method + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + if text.is_empty() { + EmitEmptyStream::Empty(true, Token::default()) + } else { + EmitEmptyStream::Tokenizer(self.0.token_stream(text)) + } + } +} + +enum EmitEmptyStream { + Empty(bool, Token), + Tokenizer(S), +} + +impl TokenStream for EmitEmptyStream +where S: TokenStream +{ + fn advance(&mut self) -> bool { + match self { + EmitEmptyStream::Empty(ref mut should_emit, _) => std::mem::replace(should_emit, false), + EmitEmptyStream::Tokenizer(t) => t.advance(), + } + } + + fn token(&self) -> &Token { + match self { + EmitEmptyStream::Empty(_, token) => token, + EmitEmptyStream::Tokenizer(t) => t.token(), + } + } + + fn token_mut(&mut self) -> &mut Token { + match self { + EmitEmptyStream::Empty(_, token) => token, + EmitEmptyStream::Tokenizer(t) => t.token_mut(), + } + } +} + #[cfg(test)] mod tests { @@ -176,6 +228,27 @@ mod tests { assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"]) } + #[test] + fn test_tokenizer_emit_empty() { + let mut default_tokenizer = super::create_default_quickwit_tokenizer_manager() + .get_tokenizer("default") + .unwrap(); + { + let mut token_stream = default_tokenizer.token_stream(""); + let mut tokens = Vec::new(); + while let Some(token) = token_stream.next() { + tokens.push(token.text.to_string()); + } + assert_eq!(tokens, vec![""]); + } + + { + // this tokenizer as nothing, but isn't a strictly empty string + let mut token_stream = default_tokenizer.token_stream(" : : "); + assert!(token_stream.next().is_none()) + } + } + #[test] fn test_raw_lowercase_tokenizer() { let tokenizer_manager = super::create_default_quickwit_tokenizer_manager(); diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml index 4accabf684c..07f10e2acb4 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml @@ -4,4 +4,8 @@ endpoint: indexes/test-index --- method: DELETE api_root: http://localhost:7280/api/v1/ +endpoint: indexes/test-index-pattern-777 +--- +method: DELETE +api_root: http://localhost:7280/api/v1/ endpoint: templates/test-index-template diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml new file mode 100644 index 00000000000..9a71716896a --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml @@ -0,0 +1,49 @@ +# This tests a simple request with no queries. +endpoint: empty-string/search +params: + query: "str:''" +expected: + num_hits: 2 +--- +endpoint: empty-string/search +params: + query: "str_array:''" +expected: + num_hits: 2 +--- +endpoint: empty-string/search +params: + query: "dyn_str:''" +expected: + num_hits: 2 +--- +endpoint: empty-string/search +params: + query: "dyn_str_array:''" +expected: + num_hits: 2 +--- +endpoint: empty-string/search +params: + query: "str:'' AND str_array:ghi" +expected: + num_hits: 1 +--- +endpoint: empty-string/search +params: + query: "str_array:'' AND str_array:ghi" +expected: + num_hits: 1 +--- +endpoint: empty-string/search +params: + query: "dyn_str:'' AND str_array:ghi" +expected: + num_hits: 1 +--- +endpoint: empty-string/search +params: + query: "dyn_str_array:'' AND str_array:ghi" +expected: + num_hits: 1 +--- diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index 84270d975b4..64ee2eef7b8 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -82,3 +82,32 @@ params: commit: force ndjson: - {"seq": 4, "tag": 1} +--- +method: DELETE +endpoint: indexes/empty-string +status_code: null +--- +# Create index +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: empty-string + doc_mapping: + mode: dynamic + field_mappings: + - name: str + type: text + - name: str_array + type: array +--- +# Ingest documents +method: POST +endpoint: empty-string/ingest +params: + commit: force +ndjson: + - {"str": "abc", "str_array": ["abc", "def"], "dyn_str": "abc", "dyn_str_array": ["abc", "def"]} + - {"str": "", "str_array": [""], "dyn_str": "", "dyn_str_array": [""]} + - {"str": "", "str_array": ["ghi", ""], "dyn_str": "", "dyn_str_array": ["ghi", ""]} + - {"str_array": [], "dyn_str_array": []} diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index a2896f541b1..9f3013b25be 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -4,3 +4,6 @@ endpoint: indexes/simple --- method: DELETE endpoint: indexes/tagged +--- +method: DELETE +endpoint: indexes/empty-string