From 549e129659585a41644970fb5d3d023787ae4f1b Mon Sep 17 00:00:00 2001
From: trinity-1686a
Date: Fri, 12 Jul 2024 11:40:14 +0200
Subject: [PATCH] add raw_lowercase tokenizer (#5216)

---
 docs/configuration/index-config.md            |  7 +++---
 quickwit/quickwit-query/src/tokenizers/mod.rs | 22 +++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md
index 27d1befb5f2..01eb922fa98 100644
--- a/docs/configuration/index-config.md
+++ b/docs/configuration/index-config.md
@@ -142,11 +142,12 @@ fast:
 
 | Tokenizer | Description |
 | ------------- | ------------- |
 | `raw` | Does not process nor tokenize text. Filters out tokens larger than 255 bytes. |
+| `raw_lowercase` | Does not tokenize the text, but lowercases it. Filters out tokens larger than 255 bytes. |
 | `default` | Chops the text on according to whitespace and punctuation, removes tokens that are too long, and converts to lowercase. Filters out tokens larger than 255 bytes. |
-| `en_stem` | Like `default`, but also applies stemming on the resulting tokens. Filters out tokens larger than 255 bytes. |
-| `whitespace` | Chops the text on according to whitespace only. Doesn't remove long tokens or converts to lowercase. |
+| `en_stem` | Like `default`, but also applies stemming on the resulting tokens. Filters out tokens larger than 255 bytes. |
+| `whitespace` | Chops the text on according to whitespace only. Doesn't remove long tokens or converts to lowercase. |
 | `chinese_compatible` | Chop between each CJK character in addition to what `default` does. Should be used with `record: position` to be able to properly search |
-| `lowercase` | Applies a lowercase transformation on the text. It does not tokenize the text. |
+| `lowercase` | Applies a lowercase transformation on the text. It does not tokenize the text. |
 
 ##### Description of available normalizers

diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs
index 3e68a55cb50..420252c52a7 100644
--- a/quickwit/quickwit-query/src/tokenizers/mod.rs
+++ b/quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -46,6 +46,12 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
         .build();
     tokenizer_manager.register("raw", raw_tokenizer, false);
 
+    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+        .filter(LowerCaser)
+        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
+        .build();
+    tokenizer_manager.register("raw_lowercase", raw_tokenizer, false);
+
     let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(LowerCaser)
         .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
@@ -160,4 +166,20 @@ mod tests {
         }
         assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
     }
+
+    #[test]
+    fn test_raw_lowercase_tokenizer() {
+        let tokenizer_manager = super::create_default_quickwit_tokenizer_manager();
+        let my_long_text = "a text, that is just too long, no one will type it, no one will like \
+                            it, no one shall find it. I just need some more chars, now you may \
+                            not pass.";
+
+        let mut tokenizer = tokenizer_manager.get_tokenizer("raw_lowercase").unwrap();
+        let mut stream = tokenizer.token_stream(my_long_text);
+        assert!(stream.advance());
+        assert_eq!(stream.token().text.len(), my_long_text.len());
+        // there are non-letter chars, so we can't check for all lowercase directly
+        assert!(stream.token().text.chars().all(|c| !c.is_uppercase()));
+        assert!(!stream.advance());
+    }
 }