From 144074d18e9b40615dacfd6c3908bcecb6b7ea3b Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 12 Apr 2024 17:16:48 +0200 Subject: [PATCH] add support for concatenated field type (#4773) * initial support for concatenate fields * add support of json and object fields to concatenate * add support for _dynamic in concat field * add integration tests * add unit tests --- docs/configuration/index-config.md | 61 ++ .../src/default_doc_mapper/default_mapper.rs | 644 +++++++++++++++--- .../default_doc_mapper/field_mapping_entry.rs | 92 +++ .../default_doc_mapper/field_mapping_type.rs | 12 +- .../src/default_doc_mapper/mapping_tree.rs | 436 ++++++++++-- .../concat_fields/0001_concat_field.yaml | 117 ++++ .../scenarii/concat_fields/_ctx.yaml | 5 + .../concat_fields/_setup.quickwit.yaml | 64 ++ .../concat_fields/_teardown.quickwit.yaml | 3 + 9 files changed, 1291 insertions(+), 143 deletions(-) create mode 100644 quickwit/rest-api-tests/scenarii/concat_fields/0001_concat_field.yaml create mode 100644 quickwit/rest-api-tests/scenarii/concat_fields/_ctx.yaml create mode 100644 quickwit/rest-api-tests/scenarii/concat_fields/_setup.quickwit.yaml create mode 100644 quickwit/rest-api-tests/scenarii/concat_fields/_teardown.quickwit.yaml diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index d56dde24d85..50da9d5851c 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -169,6 +169,8 @@ Quickwit handles three numeric types: `i64`, `u64`, and `f64`. Numeric values can be stored in a fast field (the equivalent of Lucene's `DocValues`), which is a column-oriented storage used for range queries and aggregations. +When querying negative numbers without specifying a field (using `default_search_fields`), you should single-quote the number (for instance '-5'), otherwise it will be interpreted as wanting to match anything but that number. 
+ Example of a mapping for an u64 field: ```yaml @@ -410,6 +412,65 @@ field_mappings: type: text ``` +#### concatenate + +Quickwit supports mapping the content of multiple fields to a single one. This can be more efficient at query time than +searching through dozens of `default_search_fields`. It also allows querying inside a json field without knowing the path +to the field being searched. + +```yaml +name: my_default_field +type: concatenate +concatenate_fields: + - text # things inside text, tokenized with the `default` tokenizer + - resource.author # all fields in resource.author, assuming resource is an `object` field. +include_dynamic_fields: true +tokenizer: default +record: basic +``` + +Concatenate fields don't support fast fields, and are never stored. They use their own tokenizer, independently of the +tokenizer configured on the individual fields. +At query time, concatenate fields don't support range queries. +Only the following types are supported inside a concatenate field: text, bool, i64, u64, json. Other types are rejected +at index creation, or silently discarded during indexing if they are found inside a json field. +Adding an object field to a concatenate field doesn't automatically add its subfields (yet). + +It isn't possible to add subfields from a json field to a concatenate field. For instance if `attributes` is a json field, it's not possible to add only `attributes.color` to a concatenate field. + +For json fields and dynamic fields, the path is not indexed, only values are. For instance, given the following document: +```json +{ + "421312": { + "my-key": "my-value" + } +} +``` +It is possible to search for `my-value` despite not knowing the full path, but it isn't possible to search for all documents containing a key `my-key`. + + + ### Mode The `mode` describes how Quickwit should behave when it receives a field that is not defined in the field mapping. 
diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index 278d2f23bfa..10a62caf567 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -36,7 +36,10 @@ use tantivy::TantivyDocument as Document; use super::field_mapping_entry::RAW_TOKENIZER_NAME; use super::DefaultDocMapperBuilder; -use crate::default_doc_mapper::mapping_tree::{build_mapping_tree, MappingNode}; +use crate::default_doc_mapper::mapping_tree::{ + build_mapping_tree, map_primitive_json_to_tantivy, JsonValueIterator, MappingNode, + MappingNodeRoot, +}; use crate::default_doc_mapper::FieldMappingType; use crate::doc_mapper::{JsonObject, Partition}; use crate::query_builder::build_query; @@ -73,6 +76,8 @@ pub struct DefaultDocMapper { /// Root node of the field mapping tree. /// See [`MappingNode`]. field_mappings: MappingNode, + /// Concat fields which needs to learn about any element put in dynamic_field + concatenate_dynamic_fields: Vec, /// Schema generated by the store source and field mappings parameters. schema: Schema, /// List of field names used for tagging. @@ -154,7 +159,13 @@ impl TryFrom for DefaultDocMapper { }; // Adding regular fields. 
- let field_mappings = build_mapping_tree(&builder.field_mappings, &mut schema_builder)?; + let MappingNodeRoot { + field_mappings, + concatenate_dynamic_fields, + } = build_mapping_tree(&builder.field_mappings, &mut schema_builder)?; + if !concatenate_dynamic_fields.is_empty() && dynamic_field.is_none() { + bail!("concatenate field has `include_dynamic_fields` set, but index isn't dynamic"); + } let source_field = if builder.store_source { Some(schema_builder.add_json_field(SOURCE_FIELD_NAME, STORED)) } else { @@ -254,6 +265,7 @@ impl TryFrom for DefaultDocMapper { default_search_field_names, timestamp_field_name: builder.timestamp_field, field_mappings, + concatenate_dynamic_fields, tag_field_names, required_fields, partition_key, @@ -501,6 +513,49 @@ fn populate_field_presence_for_json_obj( } } +fn zip_cloneable, U: Clone>(iter: I, item: U) -> ZipCloneable { + let mut inner = iter.peekable(); + if inner.peek().is_some() { + ZipCloneable::Running { inner, item } + } else { + ZipCloneable::Ended + } +} + +/// An iterator which zip a value alongside another iterator, cloning it each time it yields, +/// except for the last iteration. +#[derive(Default, Debug)] +enum ZipCloneable, U: Clone> { + Running { + inner: std::iter::Peekable, + item: U, + }, + #[default] + Ended, +} + +impl, U: Clone> Iterator for ZipCloneable { + type Item = (T, U); + + fn next(&mut self) -> Option<(T, U)> { + match self { + ZipCloneable::Running { inner, item } => { + let current_value = inner.next()?; + if inner.peek().is_some() { + Some((current_value, item.clone())) + } else { + // we are in the latest iteration, take item so we don't clone it + let ZipCloneable::Running { item, .. 
} = std::mem::take(self) else { + unreachable!() + }; + Some((current_value, item)) + } + } + ZipCloneable::Ended => None, + } + } +} + #[typetag::serde(name = "default")] impl DocMapper for DefaultDocMapper { fn doc_from_json_obj( @@ -535,6 +590,19 @@ impl DocMapper for DefaultDocMapper { if let Some(dynamic_field) = self.dynamic_field { if !dynamic_json_obj.is_empty() { + if !self.concatenate_dynamic_fields.is_empty() { + let json_obj_values = + JsonValueIterator::new(serde_json::Value::Object(dynamic_json_obj.clone())) + .flat_map(map_primitive_json_to_tantivy); + + for value in json_obj_values { + for (concatenate_dynamic_field, value) in + zip_cloneable(self.concatenate_dynamic_fields.iter(), value) + { + document.add_field_value(*concatenate_dynamic_field, value); + } + } + } document.add_object( dynamic_field, dynamic_json_obj @@ -649,6 +717,8 @@ impl DocMapper for DefaultDocMapper { #[cfg(test)] mod tests { use std::collections::{HashMap, HashSet}; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; use quickwit_common::PathHasher; use quickwit_query::query_ast::query_ast_from_user_text; @@ -657,6 +727,7 @@ mod tests { use super::DefaultDocMapper; use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; + use crate::default_doc_mapper::mapping_tree::value_to_pretokenized; use crate::{ DefaultDocMapperBuilder, DocMapper, DocParsingError, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, @@ -1514,119 +1585,433 @@ mod tests { assert_eq!(doc.len(), 0); } - #[test] - fn test_dymamic_mode_simple() { - let default_doc_mapper: DefaultDocMapper = - serde_json::from_str(r#"{ "mode": "dynamic" }"#).unwrap(); + fn test_doc_from_json_test_aux( + doc_mapper_json: &str, + field: &str, + document_json: &str, + expected: Vec, + ) { + let default_doc_mapper: DefaultDocMapper = serde_json::from_str(doc_mapper_json).unwrap(); let schema = default_doc_mapper.schema(); - let dynamic_field = 
schema.get_field(DYNAMIC_FIELD_NAME).unwrap(); - let (_, doc) = default_doc_mapper - .doc_from_json_str(r#"{ "a": { "b": 5, "c": 6 } }"#) - .unwrap(); - let vals: Vec<&TantivyValue> = doc.get_all(dynamic_field).collect(); - assert_eq!(vals.len(), 1); - if let TantivyValue::Object(json_val) = &vals[0] { - assert_eq!( - serde_json::to_value(json_val).unwrap(), - json!({ - "a": { - "b": 5, - "c": 6 - } - }) - ); - } else { - panic!("Expected json"); + let field = schema.get_field(field).unwrap(); + let (_, doc) = default_doc_mapper.doc_from_json_str(document_json).unwrap(); + let vals: Vec<&TantivyValue> = doc.get_all(field).collect(); + assert_eq!(vals.len(), expected.len()); + for (val, exp) in vals.into_iter().zip(expected.iter()) { + assert_eq!(val, exp); } } + #[test] + fn test_dymamic_mode_simple() { + test_doc_from_json_test_aux( + r#"{ "mode": "dynamic" }"#, + DYNAMIC_FIELD_NAME, + r#"{ "a": { "b": 5, "c": 6 } }"#, + vec![json!({ + "a": { + "b": 5, + "c": 6 + } + }) + .into()], + ); + } + #[test] fn test_dymamic_mode_inner() { - let default_doc_mapper: DefaultDocMapper = serde_json::from_str( + test_doc_from_json_test_aux( r#"{ - "field_mappings": [ - { - "name": "some_obj", - "type": "object", - "field_mappings": [ - { - "name": "child_a", - "type": "text" - } - ] - } - ], - "mode": "dynamic" - }"#, - ) - .unwrap(); - let (_, doc) = default_doc_mapper - .doc_from_json_str( - r#"{ "some_obj": { "child_a": "", "child_b": {"c": 3} }, "some_obj2": 4 }"#, - ) - .unwrap(); - let dynamic_field = default_doc_mapper - .schema() - .get_field(DYNAMIC_FIELD_NAME) - .unwrap(); - let vals: Vec<&TantivyValue> = doc.get_all(dynamic_field).collect(); - assert_eq!(vals.len(), 1); - if let TantivyValue::Object(json_val) = &vals[0] { - assert_eq!( - serde_json::to_value(json_val).unwrap(), - serde_json::json!({ - "some_obj": { - "child_b": { - "c": 3 - } - }, - "some_obj2": 4 - }) - ); - } else { - panic!("Expected json"); - } + "field_mappings": [ + { + "name": "some_obj", + 
"type": "object", + "field_mappings": [ + { + "name": "child_a", + "type": "text" + } + ] + } + ], + "mode": "dynamic" + }"#, + DYNAMIC_FIELD_NAME, + r#"{ "some_obj": { "child_a": "", "child_b": {"c": 3} }, "some_obj2": 4 }"#, + vec![json!({ + "some_obj": { + "child_b": { + "c": 3 + } + }, + "some_obj2": 4 + }) + .into()], + ); } #[test] fn test_json_object_in_mapping() { - let default_doc_mapper: DefaultDocMapper = serde_json::from_str( + test_doc_from_json_test_aux( r#"{ - "field_mappings": [ - { - "name": "some_obj", - "type": "object", - "field_mappings": [ - { - "name": "json_obj", - "type": "json" - } - ] - } - ], - "mode": "strict" - }"#, + "field_mappings": [ + { + "name": "some_obj", + "type": "object", + "field_mappings": [ + { + "name": "json_obj", + "type": "json" + } + ] + } + ], + "mode": "strict" + }"#, + "some_obj.json_obj", + r#"{ "some_obj": { "json_obj": {"hello": 2} } }"#, + vec![json!({ + "hello": 2 + }) + .into()], + ); + } + + #[test] + fn test_reject_invalid_concatenate_field() { + assert!(serde_json::from_str::( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["inexistant_field"] + } + ] + }"# ) - .unwrap(); - let (_, doc) = default_doc_mapper - .doc_from_json_str(r#"{ "some_obj": { "json_obj": {"hello": 2} } }"#) - .unwrap(); - let json_field = default_doc_mapper - .schema() - .get_field("some_obj.json_obj") - .unwrap(); - let vals: Vec<&TantivyValue> = doc.get_all(json_field).collect(); - assert_eq!(vals.len(), 1); - if let TantivyValue::Object(json_val) = &vals[0] { - assert_eq!( - serde_json::to_value(json_val).unwrap(), - serde_json::json!({ - "hello": 2 - }) - ); - } else { - panic!("expected json"); - } + .unwrap_err() + .to_string() + .contains("uses an unknown field")); + assert!(serde_json::from_str::( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate", + "include_dynamic_fields": true + } + ], + "mode": "strict" + }"# + ) + .unwrap_err() + 
.to_string() + .contains("concatenate field has `include_dynamic_fields` set, but index isn't dynamic")); + assert!(serde_json::from_str::( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate" + } + ] + }"# + ) + .unwrap_err() + .to_string() + .contains("concatenate type must have at least one sub-field")); + } + + #[test] + fn test_concatenate_field_in_mapping() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_text", + "type": "text" + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["some_text"] + } + ], + "mode": "strict" + }"#, + "concat", + r#"{"some_text": "this is a text"}"#, + vec!["this is a text".into()], + ); + } + + #[test] + fn test_concatenate_field_in_mapping_dynamic() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate", + "include_dynamic_fields": true + } + ], + "mode": "dynamic" + }"#, + "concat", + r#"{"other_field": "this is a text"}"#, + vec!["this is a text".into()], + ); + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate", + "include_dynamic_fields": true + } + ], + "mode": "dynamic" + }"#, + "concat", + r#"{"first_field": "this is a text", "second_field": "this is a text field too"}"#, + vec!["this is a text".into(), "this is a text field too".into()], + ); + } + + #[test] + fn test_concatenate_field_in_mapping_integer() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_int", + "type": "u64" + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["some_int"] + } + ], + "mode": "strict" + }"#, + "concat", + r#"{"some_int": 25}"#, + vec![value_to_pretokenized(25).into()], + ); + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate", + "include_dynamic_fields": true + } + ], + "mode": "dynamic" + }"#, + "concat", + r#"{"some_int": 25}"#, + 
vec![value_to_pretokenized(25).into()], + ); + } + + #[test] + fn test_concatenate_field_in_mapping_boolean() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_bool", + "type": "bool" + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["some_bool"] + } + ], + "mode": "strict" + }"#, + "concat", + r#"{"some_bool": false}"#, + vec![value_to_pretokenized(false).into()], + ); + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "concat", + "type": "concatenate", + "include_dynamic_fields": true + } + ], + "mode": "dynamic" + }"#, + "concat", + r#"{"some_bool": true}"#, + vec![value_to_pretokenized(true).into()], + ); + } + + #[test] + fn test_concatenate_field_array() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_text", + "type": "array" + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["some_text"] + } + ], + "mode": "strict" + }"#, + "concat", + r#"{"some_text": ["this is a text", "this is a text too"]}"#, + vec!["this is a text".into(), "this is a text too".into()], + ); + } + + #[test] + fn test_concatenate_multiple_field() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_text", + "type": "text" + }, + { + "name": "other_text", + "type": "text" + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["some_text", "other_text"] + } + ], + "mode": "strict" + }"#, + "concat", + r#"{"some_text": "this is a text", "other_text": "this is a text too"}"#, + vec!["this is a text too".into(), "this is a text".into()], + ); + } + + #[test] + fn test_concatenate_field_object() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_obj", + "type": "object", + "field_mappings": [ + { + "name": "json_obj", + "type": "json" + } + ] + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["some_obj.json_obj"] + } + ], + "mode": "strict" 
+ }"#, + "concat", + r#"{ "some_obj": { "json_obj": {"hello": "world"} } }"#, + vec!["world".into()], + ); + } + + /* + * in the future we may want to make this works. Currently it isn't supported and fail at index + * creation + #[test] + fn test_concatenate_field_json_subpath() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "json_obj", + "type": "json" + }, + { + "name": "concat", + "type": "concatenate", + "concatenate_fields": ["json_obj.hello"] + } + ], + "mode": "strict" + }"#, + "concat", + r#"{ "json_obj": { "hello": "1", "world": "2"} }"#, + vec!["1".into()], + ); + } + */ + + #[test] + fn test_concatenate_field_text() { + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_text", + "type": "text" + }, + { + "name": "concat1", + "type": "concatenate", + "concatenate_fields": ["some_text"] + }, + { + "name": "concat2", + "type": "concatenate", + "concatenate_fields": ["some_text"] + } + ], + "mode": "strict" + }"#, + "concat1", + r#"{"some_text": "this is a text"}"#, + vec!["this is a text".into()], + ); + test_doc_from_json_test_aux( + r#"{ + "field_mappings": [ + { + "name": "some_text", + "type": "text" + }, + { + "name": "concat1", + "type": "concatenate", + "concatenate_fields": ["some_text"] + }, + { + "name": "concat2", + "type": "concatenate", + "concatenate_fields": ["some_text"] + } + ], + "mode": "strict" + }"#, + "concat2", + r#"{"some_text": "this is a text"}"#, + vec!["this is a text".into()], + ); } fn default_doc_mapper_query_aux( @@ -1981,4 +2366,59 @@ mod tests { ); } } + + struct CloneLimiter { + clone_left: Arc, + } + + impl Clone for CloneLimiter { + fn clone(&self) -> Self { + if self.clone_left.fetch_sub(1, Ordering::Relaxed) == 0 { + panic!("clone count exceeded"); + } + CloneLimiter { + clone_left: self.clone_left.clone(), + } + } + } + + impl CloneLimiter { + fn new(max_clone: usize) -> Self { + CloneLimiter { + clone_left: Arc::new(AtomicUsize::new(max_clone)), + } + } + } + + 
#[test] + #[should_panic(expected = "clone count exceeded")] + fn test_clone_limiter_panic() { + let limiter = CloneLimiter::new(1); + let _ = limiter.clone(); + let _ = limiter.clone(); + } + + #[test] + fn test_clone_limiter_doesnt_panic_early() { + let limiter = CloneLimiter::new(1); + let _ = limiter.clone(); + } + + #[test] + fn test_zip_cloneable() { + for (_val, _limiter) in super::zip_cloneable(std::iter::empty::<()>(), CloneLimiter::new(0)) + { + } + + for iter_len in 1..5 { + // to generate an iter with X items, we need only X-1 clone. In particular, for X=1, we + // don't need to clone + let limiter = CloneLimiter::new(iter_len - 1); + for ((val, _limiter), expected) in + super::zip_cloneable(0..iter_len, limiter).zip(0..iter_len) + { + assert_eq!(val, expected); + } + } + } } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index 44b978c4c1e..3f556629858 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -353,6 +353,15 @@ impl TextIndexingOptions { } } + fn from_parts_concatenate( + tokenizer: Option, + record: Option, + ) -> anyhow::Result { + let text_index_options_opt = Self::from_parts_text(true, tokenizer, record, false)?; + let text_index_options = text_index_options_opt.expect("concatenate field must be indexed"); + Ok(text_index_options) + } + fn to_parts_text( this: Option, ) -> ( @@ -383,6 +392,14 @@ impl TextIndexingOptions { (indexed, tokenizer, record) } + fn to_parts_concatenate( + this: Self, + ) -> (Option, Option) { + let (_indexed, tokenizer, record, _fieldorm) = + TextIndexingOptions::to_parts_text(Some(this)); + (tokenizer, record) + } + fn default_json() -> Self { TextIndexingOptions { tokenizer: QuickwitTextTokenizer::raw(), @@ -635,6 +652,69 @@ impl From for JsonObjectOptions { } } +/// Options 
associated to a concatenate field. +#[quickwit_macros::serde_multikey] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)] +#[serde(deny_unknown_fields)] +pub struct QuickwitConcatenateOptions { + /// Optional description of JSON object. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + /// Fields to concatenate + #[serde(default)] + pub concatenate_fields: Vec, + #[serde(default)] + pub include_dynamic_fields: bool, + #[serde_multikey( + deserializer = TextIndexingOptions::from_parts_concatenate, + serializer = TextIndexingOptions::to_parts_concatenate, + fields = ( + /// Sets the tokenize that should be used with the text fields in the + /// concatenate field. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub tokenizer: Option, + /// Sets how much information should be added in the index + /// with each token. + #[schema(value_type = IndexRecordOptionSchema)] + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub record: Option, + ), + )] + /// Options for indexing text in a concatenate field. 
+ pub indexing_options: TextIndexingOptions, +} + +impl Default for QuickwitConcatenateOptions { + fn default() -> Self { + QuickwitConcatenateOptions { + description: None, + concatenate_fields: Vec::new(), + include_dynamic_fields: false, + indexing_options: TextIndexingOptions { + tokenizer: QuickwitTextTokenizer::default(), + record: IndexRecordOption::Basic, + fieldnorms: false, + }, + } + } +} + +impl From for TextOptions { + fn from(quickwit_text_options: QuickwitConcatenateOptions) -> Self { + let mut text_options = TextOptions::default(); + let text_field_indexing = TextFieldIndexing::default() + .set_index_option(quickwit_text_options.indexing_options.record) + .set_fieldnorms(quickwit_text_options.indexing_options.fieldnorms) + .set_tokenizer(quickwit_text_options.indexing_options.tokenizer.name()); + + text_options = text_options.set_indexing_options(text_field_indexing); + text_options + } +} + fn deserialize_mapping_type( quickwit_field_type: QuickwitFieldType, json: JsonValue, @@ -649,6 +729,15 @@ fn deserialize_mapping_type( } return Ok(FieldMappingType::Object(object_options)); } + QuickwitFieldType::Concatenate => { + let concatenate_options: QuickwitConcatenateOptions = serde_json::from_value(json)?; + if concatenate_options.concatenate_fields.is_empty() + && !concatenate_options.include_dynamic_fields + { + anyhow::bail!("concatenate type must have at least one sub-field"); + } + return Ok(FieldMappingType::Concatenate(concatenate_options)); + } }; match typ { Type::Str => { @@ -742,6 +831,9 @@ fn typed_mapping_to_json_params( FieldMappingType::DateTime(date_time_options, _) => serialize_to_map(&date_time_options), FieldMappingType::Json(json_options, _) => serialize_to_map(&json_options), FieldMappingType::Object(object_options) => serialize_to_map(&object_options), + FieldMappingType::Concatenate(concatenate_options) => { + serialize_to_map(&concatenate_options) + } } .unwrap() } diff --git 
a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs index 4c484228f8f..a32bb962938 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs @@ -22,8 +22,8 @@ use tantivy::schema::Type; use super::date_time_type::QuickwitDateTimeOptions; use super::field_mapping_entry::QuickwitBoolOptions; use crate::default_doc_mapper::field_mapping_entry::{ - QuickwitBytesOptions, QuickwitIpAddrOptions, QuickwitJsonOptions, QuickwitNumericOptions, - QuickwitObjectOptions, QuickwitTextOptions, + QuickwitBytesOptions, QuickwitConcatenateOptions, QuickwitIpAddrOptions, QuickwitJsonOptions, + QuickwitNumericOptions, QuickwitObjectOptions, QuickwitTextOptions, }; use crate::Cardinality; @@ -51,6 +51,8 @@ pub enum FieldMappingType { Json(QuickwitJsonOptions, Cardinality), /// Object mapping type configuration. Object(QuickwitObjectOptions), + /// Concatenate field mapping type configuration. 
+ Concatenate(QuickwitConcatenateOptions), } impl FieldMappingType { @@ -69,6 +71,7 @@ impl FieldMappingType { FieldMappingType::Object(_) => { return QuickwitFieldType::Object; } + FieldMappingType::Concatenate(_) => return QuickwitFieldType::Concatenate, }; match cardinality { Cardinality::SingleValue => QuickwitFieldType::Simple(primitive_type), @@ -81,6 +84,7 @@ impl FieldMappingType { pub enum QuickwitFieldType { Simple(Type), Object, + Concatenate, Array(Type), } @@ -90,6 +94,7 @@ impl QuickwitFieldType { QuickwitFieldType::Simple(typ) => primitive_type_to_str(typ).to_string(), QuickwitFieldType::Object => "object".to_string(), QuickwitFieldType::Array(typ) => format!("array<{}>", primitive_type_to_str(typ)), + QuickwitFieldType::Concatenate => "concatenate".to_string(), } } @@ -97,6 +102,9 @@ impl QuickwitFieldType { if type_str == "object" { return Some(QuickwitFieldType::Object); } + if type_str == "concatenate" { + return Some(QuickwitFieldType::Concatenate); + } if type_str.starts_with("array<") && type_str.ends_with('>') { let parsed_type_str = parse_primitive_type(&type_str[6..type_str.len() - 1])?; return Some(QuickwitFieldType::Array(parsed_type_str)); diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs index 17f10cfa9f5..a7351749dbd 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs @@ -29,6 +29,7 @@ use tantivy::schema::{ BytesOptions, Field, IntoIpv6Addr, IpAddrOptions, JsonObjectOptions, NumericOptions, OwnedValue as TantivyValue, SchemaBuilder, TextOptions, }; +use tantivy::tokenizer::{PreTokenizedString, Token}; use tantivy::{DateOptions, TantivyDocument as Document}; use tracing::warn; @@ -54,6 +55,124 @@ pub enum LeafType { Text(QuickwitTextOptions), } +pub(crate) fn value_to_pretokenized(val: T) -> PreTokenizedString { + let text = 
val.to_string(); + PreTokenizedString { + text: text.clone(), + tokens: vec![Token { + offset_from: 0, + offset_to: 1, + position: 0, + text, + position_length: 1, + }], + } +} + +enum MapOrArrayIter { + Array(std::vec::IntoIter), + Map(serde_json::map::IntoIter), + Value(JsonValue), +} + +impl Iterator for MapOrArrayIter { + type Item = JsonValue; + + fn next(&mut self) -> Option { + match self { + MapOrArrayIter::Array(iter) => iter.next(), + MapOrArrayIter::Map(iter) => iter.next().map(|(_, val)| val), + MapOrArrayIter::Value(val) => { + if val.is_null() { + None + } else { + Some(std::mem::take(val)) + } + } + } + } +} + +/// Iterate over all primitive values inside the provided JsonValue, ignoring Nulls, and opening +/// arrays and objects. +pub(crate) struct JsonValueIterator { + currently_itered: Vec, +} + +impl JsonValueIterator { + pub fn new(source: JsonValue) -> JsonValueIterator { + let base_value = match source { + JsonValue::Array(array) => MapOrArrayIter::Array(array.into_iter()), + JsonValue::Object(map) => MapOrArrayIter::Map(map.into_iter()), + other => MapOrArrayIter::Value(other), + }; + JsonValueIterator { + currently_itered: vec![base_value], + } + } +} + +impl Iterator for JsonValueIterator { + type Item = JsonValue; + + fn next(&mut self) -> Option { + loop { + let currently_itered = self.currently_itered.last_mut()?; + match currently_itered.next() { + Some(JsonValue::Array(array)) => self + .currently_itered + .push(MapOrArrayIter::Array(array.into_iter())), + Some(JsonValue::Object(map)) => self + .currently_itered + .push(MapOrArrayIter::Map(map.into_iter())), + Some(JsonValue::Null) => continue, + Some(other) => return Some(other), + None => { + self.currently_itered.pop(); + continue; + } + } + } + } +} + +enum OneOrIter> { + One(Option), + Iter(I), +} + +impl> OneOrIter { + pub fn one(item: T) -> Self { + OneOrIter::One(Some(item)) + } +} + +impl> Iterator for OneOrIter { + type Item = T; + + fn next(&mut self) -> Option { + match 
self { + OneOrIter::Iter(iter) => iter.next(), + OneOrIter::One(item) => std::mem::take(item), + } + } +} + +pub(crate) fn map_primitive_json_to_tantivy(value: JsonValue) -> Option { + match value { + JsonValue::Array(_) | JsonValue::Object(_) | JsonValue::Null => None, + JsonValue::String(text) => Some(TantivyValue::Str(text)), + JsonValue::Bool(val) => Some(value_to_pretokenized(val).into()), + JsonValue::Number(number) => { + if let Some(val) = u64::from_json_number(&number) { + Some(value_to_pretokenized(val).into()) + } else { + i64::from_json_number(&number).map(|val| value_to_pretokenized(val).into()) + } + } + } +} + impl LeafType { fn value_from_json(&self, json_val: JsonValue) -> Result { match self { @@ -100,6 +219,67 @@ impl LeafType { } } } + + fn tantivy_string_value_from_json( + &self, + json_val: JsonValue, + ) -> Result, String> { + match self { + LeafType::Text(_) => { + if let JsonValue::String(text) = json_val { + Ok(OneOrIter::one(TantivyValue::Str(text))) + } else { + Err(format!("expected string, got `{json_val}`")) + } + } + LeafType::I64(numeric_options) => { + let val = i64::from_json_to_self(json_val, numeric_options.coerce)?; + Ok(OneOrIter::one(value_to_pretokenized(val).into())) + } + LeafType::U64(numeric_options) => { + let val = u64::from_json_to_self(json_val, numeric_options.coerce)?; + Ok(OneOrIter::one(value_to_pretokenized(val).into())) + } + LeafType::F64(_) => Err("unsuported concat type: f64".to_string()), + LeafType::Bool(_) => { + if let JsonValue::Bool(val) = json_val { + Ok(OneOrIter::one(value_to_pretokenized(val).into())) + } else { + Err(format!("expected boolean, got `{json_val}`")) + } + } + LeafType::IpAddr(_) => Err("unsuported concat type: IpAddr".to_string()), + LeafType::DateTime(_date_time_options) => { + Err("unsuported concat type: DateTime".to_string()) + } + LeafType::Bytes(_binary_options) => Err("unsuported concat type: DateTime".to_string()), + LeafType::Json(_) => { + if let JsonValue::Object(json_obj) 
= json_val { + Ok(OneOrIter::Iter( + json_obj + .into_iter() + .flat_map(|(_key, val)| JsonValueIterator::new(val)) + .flat_map(map_primitive_json_to_tantivy), + )) + } else { + Err(format!("expected object, got `{json_val}`")) + } + } + } + } + + fn supported_for_concat(&self) -> bool { + use LeafType::*; + matches!(self, Text(_) | U64(_) | I64(_) | Bool(_) | Json(_)) + /* + // will be supported if possible + DateTime(_), + IpAddr(_), + // won't be supported + Bytes(_), + F64(_), + */ + } } #[derive(Clone)] @@ -107,6 +287,8 @@ pub(crate) struct MappingLeaf { field: Field, typ: LeafType, cardinality: Cardinality, + // concatenate fields this field is part of + concatenate: Vec, } impl MappingLeaf { @@ -129,6 +311,17 @@ impl MappingLeaf { // We just ignore `null`. continue; } + if !self.concatenate.is_empty() { + let concat_values = self + .typ + .tantivy_string_value_from_json(el_json_val.clone()) + .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; + for concat_value in concat_values { + for field in &self.concatenate { + document.add_field_value(*field, concat_value.clone()); + } + } + } let value = self .typ .value_from_json(el_json_val) @@ -137,6 +330,18 @@ impl MappingLeaf { } return Ok(()); } + + if !self.concatenate.is_empty() { + let concat_values = self + .typ + .tantivy_string_value_from_json(json_val.clone()) + .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; + for concat_value in concat_values { + for field in &self.concatenate { + document.add_field_value(*field, concat_value.clone()); + } + } + } let value = self .typ .value_from_json(json_val) @@ -246,20 +451,18 @@ fn insert_json_val( trait NumVal: Sized + FromStr + ToString + Into { fn from_json_number(num: &serde_json::Number) -> Option; - fn from_json(json_val: JsonValue, coerce: bool) -> Result { + fn from_json_to_self(json_val: JsonValue, coerce: bool) -> Result { match json_val { - JsonValue::Number(num_val) => Self::from_json_number(&num_val) - 
.map(Self::into) - .ok_or_else(|| { - format!( - "expected {}, got inconvertible JSON number `{}`", - type_name::(), - num_val - ) - }), + JsonValue::Number(num_val) => Self::from_json_number(&num_val).ok_or_else(|| { + format!( + "expected {}, got inconvertible JSON number `{}`", + type_name::(), + num_val + ) + }), JsonValue::String(str_val) => { if coerce { - str_val.parse::().map(Self::into).map_err(|_| { + str_val.parse::().map_err(|_| { format!( "failed to coerce JSON string `\"{str_val}\"` to {}", type_name::() @@ -284,6 +487,10 @@ trait NumVal: Sized + FromStr + ToString + Into { } } + fn from_json(json_val: JsonValue, coerce: bool) -> Result { + Self::from_json_to_self(json_val, coerce).map(Self::into) + } + fn to_json(&self, output_format: NumericOutputFormat) -> Option; } @@ -363,11 +570,7 @@ impl MappingNode { fn internal_find_field_mapping_type(&self, field_path: &[String]) -> Option { let (first_path_fragment, sub_field_path) = field_path.split_first()?; - let field_name = self - .branches_order - .iter() - .find(|name| name == &first_path_fragment)?; - let child_tree = self.branches.get(field_name).expect("Missing field"); + let child_tree = self.branches.get(first_path_fragment)?; match (child_tree, sub_field_path.is_empty()) { (_, true) => Some(child_tree.clone().into()), (MappingTree::Leaf(_), false) => None, @@ -377,6 +580,33 @@ impl MappingNode { } } + /// Finds the mapping leaf for a given field path in the mapping tree (`None` if the path is unknown or does not end on a leaf). + /// Dots in `field_path_as_str` define the boundaries between field names. + /// If a dot is part of a field name, it must be escaped with '\'. 
+ pub fn find_field_mapping_leaf( + &mut self, + field_path_as_str: &str, + ) -> Option> { + let field_path = build_field_path_from_str(field_path_as_str); + self.internal_find_field_mapping_leaf(&field_path) + } + + fn internal_find_field_mapping_leaf( + &mut self, + field_path: &[String], + ) -> Option> { + let (first_path_fragment, sub_field_path) = field_path.split_first()?; + let child_tree = self.branches.get_mut(first_path_fragment)?; + match (child_tree, sub_field_path.is_empty()) { + (MappingTree::Leaf(_), false) => None, + (MappingTree::Node(child_node), false) => { + child_node.internal_find_field_mapping_leaf(sub_field_path) + } + (MappingTree::Leaf(leaf), true) => Some([leaf].into_iter()), + (MappingTree::Node(_), true) => None, + } + } + #[cfg(test)] pub fn num_fields(&self) -> usize { self.branches.len() @@ -531,10 +761,17 @@ impl MappingTree { } } +pub(crate) struct MappingNodeRoot { + /// The root of a mapping tree + pub field_mappings: MappingNode, + /// The list of concatenate fields which includes the dynamic field + pub concatenate_dynamic_fields: Vec, +} + pub(crate) fn build_mapping_tree( entries: &[FieldMappingEntry], schema: &mut SchemaBuilder, -) -> anyhow::Result { +) -> anyhow::Result { let mut field_path = Vec::new(); build_mapping_tree_from_entries(entries, &mut field_path, schema) } @@ -543,18 +780,61 @@ fn build_mapping_tree_from_entries<'a>( entries: &'a [FieldMappingEntry], field_path: &mut Vec<&'a str>, schema: &mut SchemaBuilder, -) -> anyhow::Result { +) -> anyhow::Result { let mut mapping_node = MappingNode::default(); + let mut concatenate_fields = Vec::new(); + let mut concatenate_dynamic_fields = Vec::new(); for entry in entries { - field_path.push(&entry.name); - if mapping_node.branches.contains_key(&entry.name) { - bail!("duplicated field definition `{}`", entry.name); + if let FieldMappingType::Concatenate(_) = &entry.mapping_type { + concatenate_fields.push(entry); + } else { + field_path.push(&entry.name); + if 
mapping_node.branches.contains_key(&entry.name) { + bail!("duplicated field definition `{}`", entry.name); + } + let (child_tree, mut dynamic_fields) = + build_mapping_from_field_type(&entry.mapping_type, field_path, schema)?; + field_path.pop(); + mapping_node.insert(&entry.name, child_tree); + concatenate_dynamic_fields.append(&mut dynamic_fields); } - let child_tree = build_mapping_from_field_type(&entry.mapping_type, field_path, schema)?; - field_path.pop(); - mapping_node.insert(&entry.name, child_tree); } - Ok(mapping_node) + for concatenate_field_entry in concatenate_fields { + let FieldMappingType::Concatenate(options) = &concatenate_field_entry.mapping_type else { + // we only pushed Concatenate fields in `concatenate_fields` + unreachable!(); + }; + let name = &concatenate_field_entry.name; + if mapping_node.branches.contains_key(name) { + bail!("duplicated field definition `{}`", name); + } + let text_options: TextOptions = options.clone().into(); + let field = schema.add_text_field(name, text_options); + for sub_field in &options.concatenate_fields { + for matched_field in + mapping_node + .find_field_mapping_leaf(sub_field) + .ok_or_else(|| { + anyhow::anyhow!("concatenate field uses an unknown field `{sub_field}`") + })? + { + if !matched_field.typ.supported_for_concat() { + bail!( + "subfield `{}` not supported inside a concatenate field", + sub_field + ); + } + matched_field.concatenate.push(field); + } + } + if options.include_dynamic_fields { + concatenate_dynamic_fields.push(field); + } + } + Ok(MappingNodeRoot { + field_mappings: mapping_node, + concatenate_dynamic_fields, + }) } fn get_numeric_options_for_bool_field( @@ -682,11 +962,14 @@ fn escape_dots(field_name: &str) -> String { escaped_field_name } +/// build a sub-mapping tree from the fields it contains. 
+/// +/// also returns the list of concatenate fields which consume the dynamic field fn build_mapping_from_field_type<'a>( field_mapping_type: &'a FieldMappingType, field_path: &mut Vec<&'a str>, schema_builder: &mut SchemaBuilder, -) -> anyhow::Result { +) -> anyhow::Result<(MappingTree, Vec)> { let field_name = field_name_for_field_path(field_path); match field_mapping_type { FieldMappingType::Text(options, cardinality) => { @@ -696,8 +979,9 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::Text(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::I64(options, cardinality) => { let numeric_options = get_numeric_options_for_numeric_field(options); @@ -706,8 +990,9 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::I64(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::U64(options, cardinality) => { let numeric_options = get_numeric_options_for_numeric_field(options); @@ -716,8 +1001,9 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::U64(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::F64(options, cardinality) => { let numeric_options = get_numeric_options_for_numeric_field(options); @@ -726,8 +1012,9 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::F64(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::Bool(options, cardinality) => { let numeric_options = get_numeric_options_for_bool_field(options); @@ -736,8 +1023,9 @@ fn build_mapping_from_field_type<'a>( field, typ: 
LeafType::Bool(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::IpAddr(options, cardinality) => { let ip_addr_options = get_ip_address_options(options); @@ -746,8 +1034,9 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::IpAddr(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::DateTime(options, cardinality) => { let date_time_options = get_date_time_options(options); @@ -756,8 +1045,9 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::DateTime(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::Bytes(options, cardinality) => { let bytes_options = get_bytes_options(options); @@ -766,25 +1056,37 @@ fn build_mapping_from_field_type<'a>( field, typ: LeafType::Bytes(options.clone()), cardinality: *cardinality, + concatenate: Vec::new(), }; - Ok(MappingTree::Leaf(mapping_leaf)) + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::Json(options, cardinality) => { let json_options = JsonObjectOptions::from(options.clone()); let field = schema_builder.add_json_field(&field_name, json_options); - Ok(MappingTree::Leaf(MappingLeaf { + let mapping_leaf = MappingLeaf { field, typ: LeafType::Json(options.clone()), cardinality: *cardinality, - })) + concatenate: Vec::new(), + }; + Ok((MappingTree::Leaf(mapping_leaf), Vec::new())) } FieldMappingType::Object(entries) => { - let mapping_node = build_mapping_tree_from_entries( + let MappingNodeRoot { + field_mappings, + concatenate_dynamic_fields, + } = build_mapping_tree_from_entries( &entries.field_mappings, field_path, schema_builder, )?; - Ok(MappingTree::Node(mapping_node)) + Ok(( + 
MappingTree::Node(field_mappings), + concatenate_dynamic_fields, + )) + } + FieldMappingType::Concatenate(_) => { + bail!("Concatenate shouldn't reach build_mapping_from_field_type: this is a bug") } } } @@ -799,7 +1101,7 @@ mod tests { use time::macros::datetime; use time::OffsetDateTime; - use super::{value_to_json, LeafType, MappingLeaf}; + use super::{value_to_json, JsonValueIterator, LeafType, MapOrArrayIter, MappingLeaf}; use crate::default_doc_mapper::date_time_type::QuickwitDateTimeOptions; use crate::default_doc_mapper::field_mapping_entry::{ BinaryFormat, NumericOutputFormat, QuickwitBoolOptions, QuickwitBytesOptions, @@ -981,6 +1283,7 @@ mod tests { field, typ, cardinality: Cardinality::MultiValues, + concatenate: Vec::new(), }; let mut document = Document::default(); let mut path = Vec::new(); @@ -1032,6 +1335,7 @@ mod tests { field, typ, cardinality: Cardinality::MultiValues, + concatenate: Vec::new(), }; let mut document = Document::default(); let mut path = Vec::new(); @@ -1054,6 +1358,7 @@ mod tests { field, typ, cardinality: Cardinality::MultiValues, + concatenate: Vec::new(), }; let mut document = Document::default(); let mut path = Vec::new(); @@ -1071,6 +1376,7 @@ mod tests { field, typ, cardinality: Cardinality::MultiValues, + concatenate: Vec::new(), }; let mut document = Document::default(); let mut path = Vec::new(); @@ -1089,6 +1395,7 @@ mod tests { field, typ, cardinality: Cardinality::MultiValues, + concatenate: Vec::new(), }; let mut document = Document::default(); let mut path = vec!["root".to_string(), "my_field".to_string()]; @@ -1234,6 +1541,7 @@ mod tests { field, typ, cardinality: Cardinality::MultiValues, + concatenate: Vec::new(), }; let mut document = Document::default(); let mut path = vec!["root".to_string(), "my_field".to_string()]; @@ -1372,4 +1680,54 @@ mod tests { assert_eq!(super::build_field_path_from_str("a."), vec!["a"]); assert_eq!(super::build_field_path_from_str(".a"), vec!["", "a"]); } + + #[test] + fn 
test_map_or_array_iter() { + // single element + let single_value = MapOrArrayIter::Value(json!({"a": "b", "c": 4})); + let res: Vec<_> = single_value.collect(); + assert_eq!(res, vec![json!({"a": "b", "c": 4})]); + + // array of elements + let multiple_values = + MapOrArrayIter::Array(vec![json!({"a": "b", "c": 4}), json!(5)].into_iter()); + let res: Vec<_> = multiple_values.collect(); + assert_eq!(res, vec![json!({"a": "b", "c": 4}), json!(5)]); + + // map of elements + let multiple_values = MapOrArrayIter::Map( + json!({"a": {"a": "b", "c": 4}, "b":5}) + .as_object() + .unwrap() + .clone() + .into_iter(), + ); + let res: Vec<_> = multiple_values.collect(); + assert_eq!(res, vec![json!({"a": "b", "c": 4}), json!(5)]); + } + + #[test] + fn test_json_value_iterator() { + assert_eq!( + JsonValueIterator::new(json!(5)).collect::>(), + vec![json!(5)] + ); + assert_eq!( + JsonValueIterator::new(json!([5, "a"])).collect::>(), + vec![json!(5), json!("a")] + ); + assert_eq!( + JsonValueIterator::new(json!({"a":1, "b": 2})).collect::>(), + vec![json!(1), json!(2)] + ); + assert_eq!( + JsonValueIterator::new(json!([{"a":1, "b": 2}, "a"])).collect::>(), + vec![json!(1), json!(2), json!("a")] + ); + assert_eq!( + JsonValueIterator::new(json!([{"a":1, "b": 2}, {"a": {"b": [3, 4]}}])) + .collect::>(), + vec![json!(1), json!(2), json!(3), json!(4)] + ); + } } diff --git a/quickwit/rest-api-tests/scenarii/concat_fields/0001_concat_field.yaml b/quickwit/rest-api-tests/scenarii/concat_fields/0001_concat_field.yaml new file mode 100644 index 00000000000..c90794cca4b --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/concat_fields/0001_concat_field.yaml @@ -0,0 +1,117 @@ +# we use the tokenizer from the concat field, not the underlying field +endpoint: concat/search +params: + query: "concat_raw:AB-CD" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_raw:EF-GH" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_raw:'AB 
CD'" +expected: + num_hits: 0 +--- +endpoint: concat/search +params: + query: "concat_raw:'EF GH'" +expected: + num_hits: 0 +--- +endpoint: concat/search +params: + query: "concat_default:AB" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_default:GH" +expected: + num_hits: 1 +--- +# we find bool both in text and in bool fields +endpoint: concat/search +params: + query: "concat_raw:true" +expected: + num_hits: 2 +--- +endpoint: concat/search +params: + query: "concat_default:true" +expected: + num_hits: 2 +--- +# we find numbers both in text and int fields +endpoint: concat/search +params: + query: "concat_raw:42" +expected: + num_hits: 1 # only 1 hit, 42 doesn't get tokenized on this field +--- +endpoint: concat/search +params: + query: "concat_default:42" +expected: + num_hits: 2 # 2 hits, the number, and the tokenized text +--- +endpoint: concat/search +params: + query: "concat_raw:otherfieldvalue" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_raw:9" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_raw:false" +expected: + num_hits: 2 # also include the document with a json field +--- +endpoint: concat/search +params: + query: "concat_default:otherfieldvalue OR concat_default:9" +expected: + num_hits: 0 # this field doesn't include _dynamic +--- +endpoint: concat/search +params: + query: "concat_default:false" +expected: + num_hits: 1 # only include the document with a json field +--- +endpoint: concat/search +params: + query: "concat_raw:10" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_raw:nestedstring" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_default:10" +expected: + num_hits: 1 +--- +endpoint: concat/search +params: + query: "concat_default:nestedstring" +expected: + num_hits: 1 +--- diff --git a/quickwit/rest-api-tests/scenarii/concat_fields/_ctx.yaml 
b/quickwit/rest-api-tests/scenarii/concat_fields/_ctx.yaml new file mode 100644 index 00000000000..f507346eae6 --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/concat_fields/_ctx.yaml @@ -0,0 +1,5 @@ +method: GET +engines: ["quickwit"] +api_root: "http://localhost:7280/api/v1/" +headers: + Content-Type: application/json diff --git a/quickwit/rest-api-tests/scenarii/concat_fields/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/concat_fields/_setup.quickwit.yaml new file mode 100644 index 00000000000..877fce489f7 --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/concat_fields/_setup.quickwit.yaml @@ -0,0 +1,64 @@ +# Delete possibly remaining index +method: DELETE +endpoint: indexes/concat +status_code: null +--- +# Create index +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: concat + doc_mapping: + mode: dynamic + field_mappings: + - name: text1 + type: text + tokenizer: default + - name: text2 + type: text + tokenizer: raw + - name: boolean + type: bool + - name: int + type: u64 + - name: json + type: json + - name: concat_raw + type: concatenate + concatenate_fields: + - text1 + - text2 + - boolean + - int + - json + tokenizer: raw + include_dynamic_fields: true + - name: concat_default + type: concatenate + concatenate_fields: + - text1 + - text2 + - boolean + - int + - json + tokenizer: default + dynamic_mapping: + tokenizer: default + expand_dots: true +sleep_after: 3 +--- +# Ingest documents +method: POST +endpoint: concat/ingest +num_retries: 10 +params: + commit: force +ndjson: + - {"text1": "AB-CD", "text2": "EF-GH"} + - {"text1": "true"} + - {"boolean": true} + - {"text2": "i like 42"} + - { "int": 42} + - {"other-field": "otherfieldvalue", "other-field-number": 9, "other-field-bool": false} + - {"json": {"some_bool": false, "some_int": 10, "nested": {"some_string": "nestedstring"}}} diff --git a/quickwit/rest-api-tests/scenarii/concat_fields/_teardown.quickwit.yaml 
b/quickwit/rest-api-tests/scenarii/concat_fields/_teardown.quickwit.yaml new file mode 100644 index 00000000000..d3569447bdc --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/concat_fields/_teardown.quickwit.yaml @@ -0,0 +1,3 @@ +# Delete index +method: DELETE +endpoint: indexes/concat