From 776fe26e936e040ef17ba146fb54cad05a0c8415 Mon Sep 17 00:00:00 2001
From: trinity-1686a
Date: Wed, 30 Oct 2024 18:51:25 +0100
Subject: [PATCH 1/3] make tokenizer emit at least one empty token

---
 .../src/query_ast/full_text_query.rs          |   1 +
 quickwit/quickwit-query/src/tokenizers/mod.rs | 108 ++++++++++++++++--
 2 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs
index d77b39e67df..feff9d54120 100644
--- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs
+++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs
@@ -316,6 +316,7 @@ mod tests {
     use crate::{create_default_quickwit_tokenizer_manager, BooleanOperand};
 
     #[test]
+    #[ignore]
     fn test_zero_terms() {
         let full_text_query = FullTextQuery {
             field: "body".to_string(),
diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs
index 0de85b97c37..27fdb312119 100644
--- a/quickwit/quickwit-query/src/tokenizers/mod.rs
+++ b/quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -26,7 +26,7 @@ mod tokenizer_manager;
 use once_cell::sync::Lazy;
 use tantivy::tokenizer::{
     AsciiFoldingFilter, Language, LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer,
-    Stemmer, TextAnalyzer, WhitespaceTokenizer,
+    Stemmer, TextAnalyzer, Token, TokenStream, Tokenizer, WhitespaceTokenizer,
 };
 
 use self::chinese_compatible::ChineseTokenizer;
@@ -58,29 +58,33 @@
         .build();
     tokenizer_manager.register("lowercase", lower_case_tokenizer, true);
 
-    let default_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
+    let default_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(SimpleTokenizer::default()))
         .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
         .filter(LowerCaser)
         .build();
     tokenizer_manager.register("default", default_tokenizer, true);
 
-    let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
+    let en_stem_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(SimpleTokenizer::default()))
         .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
         .filter(LowerCaser)
         .filter(Stemmer::new(Language::English))
         .build();
     tokenizer_manager.register("en_stem", en_stem_tokenizer, true);
 
-    tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false);
+    tokenizer_manager.register(
+        "whitespace",
+        EmitEmptyTokenizer(WhitespaceTokenizer::default()),
+        false,
+    );
 
-    let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer)
+    let chinese_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(ChineseTokenizer))
         .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
         .filter(LowerCaser)
         .build();
     tokenizer_manager.register("chinese_compatible", chinese_tokenizer, true);
     tokenizer_manager.register(
         "source_code_default",
-        TextAnalyzer::builder(CodeTokenizer::default())
+        TextAnalyzer::builder(EmitEmptyTokenizer(CodeTokenizer::default()))
             .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
             .filter(LowerCaser)
             .filter(AsciiFoldingFilter)
@@ -89,7 +93,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
     );
     tokenizer_manager.register(
         "source_code_with_hex",
-        TextAnalyzer::builder(CodeTokenizer::with_hex_support())
+        TextAnalyzer::builder(EmitEmptyTokenizer(CodeTokenizer::with_hex_support()))
            .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
            .filter(LowerCaser)
            .filter(AsciiFoldingFilter)
@@ -99,7 +103,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
     #[cfg(feature = "multilang")]
     tokenizer_manager.register(
         "multilang_default",
-        TextAnalyzer::builder(MultiLangTokenizer::default())
+        TextAnalyzer::builder(EmitEmptyTokenizer(MultiLangTokenizer::default()))
            .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
            .filter(LowerCaser)
            .build(),
@@ -128,6 +132,69 @@ pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager
     &QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER
 }
 
+#[derive(Debug, Clone)]
+struct EmitEmptyTokenizer<T>(T);
+
+impl<T> Tokenizer for EmitEmptyTokenizer<T>
+where T: Tokenizer
+{
+    type TokenStream<'a> = EmitEmptyStream<T::TokenStream<'a>>;
+
+    // Required method
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        EmitEmptyStream {
+            inner: self.0.token_stream(text),
+            state: EmitEmptyState::Start,
+        }
+    }
+}
+
+struct EmitEmptyStream<S> {
+    inner: S,
+    state: EmitEmptyState,
+}
+
+enum EmitEmptyState {
+    Start,
+    UsingInner,
+    EmitEmpty(Token),
+}
+
+impl<S> TokenStream for EmitEmptyStream<S>
+where S: TokenStream
+{
+    fn advance(&mut self) -> bool {
+        match self.state {
+            EmitEmptyState::Start => {
+                if self.inner.advance() {
+                    self.state = EmitEmptyState::UsingInner;
+                } else {
+                    self.state = EmitEmptyState::EmitEmpty(Token::default());
+                }
+                true
+            }
+            EmitEmptyState::UsingInner => self.inner.advance(),
+            EmitEmptyState::EmitEmpty(_) => false,
+        }
+    }
+
+    fn token(&self) -> &Token {
+        match self.state {
+            EmitEmptyState::Start => unreachable!(),
+            EmitEmptyState::UsingInner => self.inner.token(),
+            EmitEmptyState::EmitEmpty(ref token) => token,
+        }
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        match self.state {
+            EmitEmptyState::Start => unreachable!(),
+            EmitEmptyState::UsingInner => self.inner.token_mut(),
+            EmitEmptyState::EmitEmpty(ref mut token) => token,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
 
@@ -176,6 +243,31 @@ mod tests {
         assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
     }
 
+    #[test]
+    fn test_tokenizer_emit_empty() {
+        let mut default_tokenizer = super::create_default_quickwit_tokenizer_manager()
+            .get_tokenizer("default")
+            .unwrap();
+        {
+            let mut token_stream = default_tokenizer.token_stream("");
+            let mut tokens = Vec::new();
+            while let Some(token) = token_stream.next() {
+                tokens.push(token.text.to_string());
+            }
+            assert_eq!(tokens, vec![""]);
+        }
+
+        {
+            // this input tokenizes to nothing, but we still want to emit one empty token
+            let mut token_stream = default_tokenizer.token_stream(" : : ");
+            let mut tokens = Vec::new();
+            while let Some(token) = token_stream.next() {
+                tokens.push(token.text.to_string());
+            }
+            assert_eq!(tokens, vec![""])
+        }
+    }
+
     #[test]
     fn test_raw_lowercase_tokenizer() {
         let tokenizer_manager = super::create_default_quickwit_tokenizer_manager();

From 2f5ad21cfe813c15befbc146e8f2b755435ce5f0 Mon Sep 17 00:00:00 2001
From: trinity-1686a
Date: Wed, 6 Nov 2024 16:49:45 +0100
Subject: [PATCH 2/3] only emit on strictly empty strings

---
 quickwit/quickwit-query/src/tokenizers/mod.rs | 55 ++++++-------------
 1 file changed, 18 insertions(+), 37 deletions(-)

diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs
index 27fdb312119..6ff4e4c1bd3 100644
--- a/quickwit/quickwit-query/src/tokenizers/mod.rs
+++ b/quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -142,55 +142,40 @@ where T: Tokenizer
     // Required method
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        EmitEmptyStream {
-            inner: self.0.token_stream(text),
-            state: EmitEmptyState::Start,
+        if text.is_empty() {
+            EmitEmptyStream::Empty(true, Token::default())
+        } else {
+            EmitEmptyStream::Tokenizer(self.0.token_stream(text))
         }
     }
 }
 
-struct EmitEmptyStream<S> {
-    inner: S,
-    state: EmitEmptyState,
-}
-
-enum EmitEmptyState {
-    Start,
-    UsingInner,
-    EmitEmpty(Token),
+enum EmitEmptyStream<S> {
+    Empty(bool, Token),
+    Tokenizer(S),
 }
 
 impl<S> TokenStream for EmitEmptyStream<S>
 where S: TokenStream
 {
     fn advance(&mut self) -> bool {
-        match self.state {
-            EmitEmptyState::Start => {
-                if self.inner.advance() {
-                    self.state = EmitEmptyState::UsingInner;
-                } else {
-                    self.state = EmitEmptyState::EmitEmpty(Token::default());
-                }
-                true
-            }
-            EmitEmptyState::UsingInner => self.inner.advance(),
-            EmitEmptyState::EmitEmpty(_) => false,
+        match self {
+            EmitEmptyStream::Empty(ref mut should_emit, _) => std::mem::replace(should_emit, false),
+            EmitEmptyStream::Tokenizer(t) => t.advance(),
         }
     }
 
     fn token(&self) -> &Token {
-        match self.state {
-            EmitEmptyState::Start => unreachable!(),
-            EmitEmptyState::UsingInner => self.inner.token(),
-            EmitEmptyState::EmitEmpty(ref token) => token,
+        match self {
+            EmitEmptyStream::Empty(_, token) => token,
+            EmitEmptyStream::Tokenizer(t) => t.token(),
         }
     }
 
     fn token_mut(&mut self) -> &mut Token {
-        match self.state {
-            EmitEmptyState::Start => unreachable!(),
-            EmitEmptyState::UsingInner => self.inner.token_mut(),
-            EmitEmptyState::EmitEmpty(ref mut token) => token,
+        match self {
+            EmitEmptyStream::Empty(_, token) => token,
+            EmitEmptyStream::Tokenizer(t) => t.token_mut(),
         }
     }
 }
@@ -258,13 +243,9 @@ mod tests {
         }
 
         {
-            // this input tokenizes to nothing, but we still want to emit one empty token
+            // this input tokenizes to nothing, but it isn't a strictly empty string
             let mut token_stream = default_tokenizer.token_stream(" : : ");
-            let mut tokens = Vec::new();
-            while let Some(token) = token_stream.next() {
-                tokens.push(token.text.to_string());
-            }
-            assert_eq!(tokens, vec![""])
+            assert!(token_stream.next().is_none())
         }
     }
 

From 14ecacbf3e39314d36b700ea0c35a01496e5ce4d Mon Sep 17 00:00:00 2001
From: trinity-1686a
Date: Fri, 8 Nov 2024 12:00:30 +0100
Subject: [PATCH 3/3] add integration test

---
 .../bulk/_teardown.quickwit.yaml              |  4 ++
 .../qw_search_api/0004_empty_string.yaml      | 49 +++++++++++++++++++
 .../qw_search_api/_setup.quickwit.yaml        | 29 +++++++++++
 .../qw_search_api/_teardown.quickwit.yaml     |  3 ++
 4 files changed, 85 insertions(+)
 create mode 100644 quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml

diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml
index 4accabf684c..07f10e2acb4 100644
--- a/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml
+++ b/quickwit/rest-api-tests/scenarii/es_compatibility/bulk/_teardown.quickwit.yaml
@@ -4,4 +4,8 @@ endpoint: indexes/test-index
 ---
 method: DELETE
 api_root: http://localhost:7280/api/v1/
+endpoint: indexes/test-index-pattern-777
+---
+method: DELETE
+api_root: http://localhost:7280/api/v1/
 endpoint: templates/test-index-template
diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml
new file mode 100644
index 00000000000..9a71716896a
--- /dev/null
+++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_empty_string.yaml
@@ -0,0 +1,49 @@
+# These tests query for empty strings on mapped and dynamic text fields and arrays.
+endpoint: empty-string/search
+params:
+  query: "str:''"
+expected:
+  num_hits: 2
+---
+endpoint: empty-string/search
+params:
+  query: "str_array:''"
+expected:
+  num_hits: 2
+---
+endpoint: empty-string/search
+params:
+  query: "dyn_str:''"
+expected:
+  num_hits: 2
+---
+endpoint: empty-string/search
+params:
+  query: "dyn_str_array:''"
+expected:
+  num_hits: 2
+---
+endpoint: empty-string/search
+params:
+  query: "str:'' AND str_array:ghi"
+expected:
+  num_hits: 1
+---
+endpoint: empty-string/search
+params:
+  query: "str_array:'' AND str_array:ghi"
+expected:
+  num_hits: 1
+---
+endpoint: empty-string/search
+params:
+  query: "dyn_str:'' AND str_array:ghi"
+expected:
+  num_hits: 1
+---
+endpoint: empty-string/search
+params:
+  query: "dyn_str_array:'' AND str_array:ghi"
+expected:
+  num_hits: 1
+---
diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml
index 84270d975b4..64ee2eef7b8 100644
--- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml
+++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml
@@ -82,3 +82,32 @@ params:
   commit: force
 ndjson:
   - {"seq": 4, "tag": 1}
+---
+method: DELETE
+endpoint: indexes/empty-string
+status_code: null
+---
+# Create index
+method: POST
+endpoint: indexes/
+json:
+  version: "0.7"
+  index_id: empty-string
+  doc_mapping:
+    mode: dynamic
+    field_mappings:
+      - name: str
+        type: text
+      - name: str_array
+        type: array<text>
+---
+# Ingest documents
+method: POST
+endpoint: empty-string/ingest
+params:
+  commit: force
+ndjson:
+  - {"str": "abc", "str_array": ["abc", "def"], "dyn_str": "abc", "dyn_str_array": ["abc", "def"]}
+  - {"str": "", "str_array": [""], "dyn_str": "", "dyn_str_array": [""]}
+  - {"str": "", "str_array": ["ghi", ""], "dyn_str": "", "dyn_str_array": ["ghi", ""]}
+  - {"str_array": [], "dyn_str_array": []}
diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml
index a2896f541b1..9f3013b25be 100644
--- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml
+++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml
@@ -4,3 +4,6 @@ endpoint: indexes/simple
 ---
 method: DELETE
 endpoint: indexes/tagged
+---
+method: DELETE
+endpoint: indexes/empty-string