Skip to content

Commit

Permalink
add raw_lowercase tokenizer (#5216)
Browse files Browse the repository at this point in the history
  • Loading branch information
trinity-1686a authored Jul 12, 2024
1 parent afee9de commit 549e129
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
7 changes: 4 additions & 3 deletions docs/configuration/index-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,12 @@ fast:
| Tokenizer | Description |
| ------------- | ------------- |
| `raw` | Does not process nor tokenize text. Filters out tokens larger than 255 bytes. |
| `raw_lowercase` | Does not tokenize text, but lowercases it. Filters out tokens larger than 255 bytes. |
| `default` | Chops the text according to whitespace and punctuation, removes tokens that are too long, and converts to lowercase. Filters out tokens larger than 255 bytes. |
| `en_stem` | Like `default`, but also applies stemming on the resulting tokens. Filters out tokens larger than 255 bytes. |
| `whitespace` | Chops the text according to whitespace only. Doesn't remove long tokens or convert to lowercase. |
| `en_stem` | Like `default`, but also applies stemming on the resulting tokens. Filters out tokens larger than 255 bytes. |
| `whitespace` | Chops the text according to whitespace only. Doesn't remove long tokens or convert to lowercase. |
| `chinese_compatible` | Chop between each CJK character in addition to what `default` does. Should be used with `record: position` to be able to properly search |
| `lowercase` | Applies a lowercase transformation on the text. It does not tokenize the text. |
| `lowercase` | Applies a lowercase transformation on the text. It does not tokenize the text. |

##### Description of available normalizers

Expand Down
22 changes: 22 additions & 0 deletions quickwit/quickwit-query/src/tokenizers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
.build();
tokenizer_manager.register("raw", raw_tokenizer, false);

// Register the `raw_lowercase` tokenizer: like `raw`, it emits the whole input
// as a single token (no splitting), but runs it through a LowerCaser first.
// Tokens longer than DEFAULT_REMOVE_TOKEN_LENGTH are filtered out, matching
// the behavior of the other registered tokenizers.
let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.build();
tokenizer_manager.register("raw_lowercase", raw_tokenizer, false);

let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
Expand Down Expand Up @@ -160,4 +166,20 @@ mod tests {
}
assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
}

#[test]
fn test_raw_lowercase_tokenizer() {
    // The `raw_lowercase` tokenizer must emit the entire input as one single,
    // lowercased token: it lowercases the text but never splits it.
    let manager = super::create_default_quickwit_tokenizer_manager();
    let input = "a text, that is just too long, no one will type it, no one will like \
                 it, no one shall find it. I just need some more chars, now you may \
                 not pass.";

    let mut analyzer = manager.get_tokenizer("raw_lowercase").unwrap();
    let mut token_stream = analyzer.token_stream(input);

    // Exactly one token is produced, and it spans the full input.
    assert!(token_stream.advance());
    let token_text = &token_stream.token().text;
    assert_eq!(token_text.len(), input.len());
    // The input contains non-letter characters, so instead of comparing against
    // a fully lowercased copy we just check that no uppercase char survived.
    assert!(token_text.chars().all(|c| !c.is_uppercase()));
    assert!(!token_stream.advance());
}
}

0 comments on commit 549e129

Please sign in to comment.