make tokenizer emit at least one empty token on empty strings #5532

Open · trinity-1686a wants to merge 3 commits into main
1 change: 1 addition & 0 deletions quickwit/quickwit-query/src/query_ast/full_text_query.rs
@@ -316,6 +316,7 @@ mod tests {
use crate::{create_default_quickwit_tokenizer_manager, BooleanOperand};

#[test]
#[ignore]
@trinity-1686a (Contributor, PR author), Oct 30, 2024:
The behavior this test checks for has changed, so I don't think it makes sense to keep it. Note that it doesn't test a user query but a body:"" clause, so it makes sense that it emits no FullTextQuery and is still a MatchAll.

fn test_zero_terms() {
let full_text_query = FullTextQuery {
field: "body".to_string(),
89 changes: 81 additions & 8 deletions quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -26,7 +26,7 @@ mod tokenizer_manager;
use once_cell::sync::Lazy;
use tantivy::tokenizer::{
AsciiFoldingFilter, Language, LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer,
Stemmer, TextAnalyzer, WhitespaceTokenizer,
Stemmer, TextAnalyzer, Token, TokenStream, Tokenizer, WhitespaceTokenizer,
};

use self::chinese_compatible::ChineseTokenizer;
@@ -58,29 +58,33 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
.build();
tokenizer_manager.register("lowercase", lower_case_tokenizer, true);

let default_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
let default_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(SimpleTokenizer::default()))
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.build();
tokenizer_manager.register("default", default_tokenizer, true);

let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
let en_stem_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(SimpleTokenizer::default()))
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English))
.build();
tokenizer_manager.register("en_stem", en_stem_tokenizer, true);

tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false);
tokenizer_manager.register(
"whitespace",
EmitEmptyTokenizer(WhitespaceTokenizer::default()),
false,
);

let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer)
let chinese_tokenizer = TextAnalyzer::builder(EmitEmptyTokenizer(ChineseTokenizer))
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.build();
tokenizer_manager.register("chinese_compatible", chinese_tokenizer, true);
tokenizer_manager.register(
"source_code_default",
TextAnalyzer::builder(CodeTokenizer::default())
TextAnalyzer::builder(EmitEmptyTokenizer(CodeTokenizer::default()))
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
@@ -89,7 +93,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
);
tokenizer_manager.register(
"source_code_with_hex",
TextAnalyzer::builder(CodeTokenizer::with_hex_support())
TextAnalyzer::builder(EmitEmptyTokenizer(CodeTokenizer::with_hex_support()))
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
@@ -99,7 +103,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
#[cfg(feature = "multilang")]
tokenizer_manager.register(
"multilang_default",
TextAnalyzer::builder(MultiLangTokenizer::default())
TextAnalyzer::builder(EmitEmptyTokenizer(MultiLangTokenizer::default()))
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.build(),
@@ -128,6 +132,54 @@ pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager
&QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER
}

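/// Wraps a tokenizer so that an empty input string yields exactly one empty token
/// (instead of no token at all), which lets empty string values be indexed and
/// matched by an empty-string query.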
#[derive(Debug, Clone)]
struct EmitEmptyTokenizer<T>(T);
Contributor:
We need a comment expressing the intent of this kind of tokenizer. It is very difficult for future readers to infer the point of it otherwise.

Contributor:
This tokenizer emits the empty token whenever the underlying tokenizer emits no token. I am not sure this is the behavior we want.

For instance, a stop word would result in the emission of the empty token.

A stricter behavior, emitting an empty token iff the text is effectively empty, might actually make more sense. I suspect the code would be simpler too.
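
To illustrate the difference: the looser behavior described above (emit an empty token whenever the wrapped tokenizer produces nothing, a lone stop word included) needs per-stream bookkeeping, while the stricter behavior is a single text.is_empty() branch in token_stream, as in the diff below. What follows is a rough sketch of the looser variant, not part of this PR, reusing the Token and TokenStream imports added in this change; the struct and field names are illustrative only.

struct EmitEmptyIfNoTokenStream<S> {
    inner: S,
    inner_exhausted: bool,
    produced_any: bool,
    fallback_pending: bool,
    empty_token: Token,
}

impl<S: TokenStream> TokenStream for EmitEmptyIfNoTokenStream<S> {
    fn advance(&mut self) -> bool {
        if !self.inner_exhausted {
            if self.inner.advance() {
                self.produced_any = true;
                return true;
            }
            self.inner_exhausted = true;
            // The wrapped stream ended without emitting anything (empty input,
            // or input that tokenized to nothing): schedule one empty token.
            self.fallback_pending = !self.produced_any;
        }
        std::mem::replace(&mut self.fallback_pending, false)
    }

    fn token(&self) -> &Token {
        if self.inner_exhausted && !self.produced_any {
            &self.empty_token
        } else {
            self.inner.token()
        }
    }

    fn token_mut(&mut self) -> &mut Token {
        if self.inner_exhausted && !self.produced_any {
            &mut self.empty_token
        } else {
            self.inner.token_mut()
        }
    }
}

A matching Tokenizer impl would construct this with the three flags set to false and empty_token set to Token::default(). The EmitEmptyStream enum in this PR avoids that bookkeeping by deciding up front from the input text.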


impl<T> Tokenizer for EmitEmptyTokenizer<T>
where T: Tokenizer
{
type TokenStream<'a> = EmitEmptyStream<T::TokenStream<'a>>;

// Required method
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
if text.is_empty() {
EmitEmptyStream::Empty(true, Token::default())
} else {
EmitEmptyStream::Tokenizer(self.0.token_stream(text))
}
}
}

enum EmitEmptyStream<S> {
Empty(bool, Token),
Tokenizer(S),
}

impl<S> TokenStream for EmitEmptyStream<S>
where S: TokenStream
{
fn advance(&mut self) -> bool {
match self {
EmitEmptyStream::Empty(ref mut should_emit, _) => std::mem::replace(should_emit, false),
EmitEmptyStream::Tokenizer(t) => t.advance(),
}
}

fn token(&self) -> &Token {
match self {
EmitEmptyStream::Empty(_, token) => token,
EmitEmptyStream::Tokenizer(t) => t.token(),
}
}

fn token_mut(&mut self) -> &mut Token {
match self {
EmitEmptyStream::Empty(_, token) => token,
EmitEmptyStream::Tokenizer(t) => t.token_mut(),
}
}
}

#[cfg(test)]
mod tests {

@@ -176,6 +228,27 @@ mod tests {
assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
}

#[test]
fn test_tokenizer_emit_empty() {
let mut default_tokenizer = super::create_default_quickwit_tokenizer_manager()
.get_tokenizer("default")
.unwrap();
{
let mut token_stream = default_tokenizer.token_stream("");
let mut tokens = Vec::new();
while let Some(token) = token_stream.next() {
tokens.push(token.text.to_string());
}
assert_eq!(tokens, vec![""]);
}

{
// this input tokenizes to nothing, but isn't a strictly empty string
let mut token_stream = default_tokenizer.token_stream(" : : ");
assert!(token_stream.next().is_none())
}
}

#[test]
fn test_raw_lowercase_tokenizer() {
let tokenizer_manager = super::create_default_quickwit_tokenizer_manager();
@@ -4,4 +4,8 @@ endpoint: indexes/test-index
---
method: DELETE
api_root: http://localhost:7280/api/v1/
endpoint: indexes/test-index-pattern-777
---
method: DELETE
api_root: http://localhost:7280/api/v1/
endpoint: templates/test-index-template
@@ -0,0 +1,49 @@
# These tests search for empty-string values on the empty-string index.
endpoint: empty-string/search
params:
query: "str:''"
expected:
num_hits: 2
---
endpoint: empty-string/search
params:
query: "str_array:''"
expected:
num_hits: 2
---
endpoint: empty-string/search
params:
query: "dyn_str:''"
expected:
num_hits: 2
---
endpoint: empty-string/search
params:
query: "dyn_str_array:''"
expected:
num_hits: 2
---
endpoint: empty-string/search
params:
query: "str:'' AND str_array:ghi"
expected:
num_hits: 1
---
endpoint: empty-string/search
params:
query: "str_array:'' AND str_array:ghi"
expected:
num_hits: 1
---
endpoint: empty-string/search
params:
query: "dyn_str:'' AND str_array:ghi"
expected:
num_hits: 1
---
endpoint: empty-string/search
params:
query: "dyn_str_array:'' AND str_array:ghi"
expected:
num_hits: 1
---
@@ -82,3 +82,32 @@ params:
commit: force
ndjson:
- {"seq": 4, "tag": 1}
---
method: DELETE
endpoint: indexes/empty-string
status_code: null
---
# Create index
method: POST
endpoint: indexes/
json:
version: "0.7"
index_id: empty-string
doc_mapping:
mode: dynamic
field_mappings:
- name: str
type: text
- name: str_array
type: array<text>
---
# Ingest documents
method: POST
endpoint: empty-string/ingest
params:
commit: force
ndjson:
- {"str": "abc", "str_array": ["abc", "def"], "dyn_str": "abc", "dyn_str_array": ["abc", "def"]}
- {"str": "", "str_array": [""], "dyn_str": "", "dyn_str_array": [""]}
- {"str": "", "str_array": ["ghi", ""], "dyn_str": "", "dyn_str_array": ["ghi", ""]}
- {"str_array": [], "dyn_str_array": []}
@@ -4,3 +4,6 @@ endpoint: indexes/simple
---
method: DELETE
endpoint: indexes/tagged
---
method: DELETE
endpoint: indexes/empty-string