From 312f5c3202f3c3960ef3a24a56eb1869b6941569 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 24 Oct 2024 20:58:48 +0800 Subject: [PATCH] refactor doc mapper (#5522) * refactor doc mapper move tantivy val to json to own module remove duplicated conversion logic move field_presence to own module * use base64 for bytes --- .../src/doc_mapper/doc_mapper_impl.rs | 118 +---- .../src/doc_mapper/field_presence.rs | 121 +++++ .../src/doc_mapper/mapping_tree.rs | 377 +-------------- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 2 + .../src/doc_mapper/tantivy_val_to_json.rs | 428 ++++++++++++++++++ 5 files changed, 564 insertions(+), 482 deletions(-) create mode 100644 quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs create mode 100644 quickwit/quickwit-doc-mapper/src/doc_mapper/tantivy_val_to_json.rs diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 57594025a1b..42f233013a4 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -22,7 +22,6 @@ use std::num::NonZeroU32; use anyhow::{bail, Context}; use fnv::FnvHashSet; -use quickwit_common::PathHasher; use quickwit_proto::types::DocMappingUid; use quickwit_query::create_default_quickwit_tokenizer_manager; use quickwit_query::query_ast::QueryAst; @@ -31,13 +30,12 @@ use serde::{Deserialize, Serialize}; use serde_json::{self, Value as JsonValue}; use serde_json_borrow::Map as BorrowedJsonMap; use tantivy::query::Query; -use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf}; -use tantivy::schema::{ - Field, FieldType, OwnedValue as TantivyValue, Schema, Value, INDEXED, STORED, -}; +use tantivy::schema::{Field, FieldType, OwnedValue as TantivyValue, Schema, INDEXED, STORED}; use tantivy::TantivyDocument as Document; use super::field_mapping_entry::RAW_TOKENIZER_NAME; +use super::field_presence::populate_field_presence; +use super::tantivy_val_to_json::tantivy_value_to_json; use super::DocMapperBuilder; use crate::doc_mapper::mapping_tree::{ build_field_path_from_str, build_mapping_tree, map_primitive_json_to_tantivy, @@ -430,85 +428,6 @@ fn extract_single_obj( } } -// TODO: Formatting according to mapper if applicable -fn tantivy_value_to_json(val: TantivyValue) -> JsonValue { - match val { - TantivyValue::Null => JsonValue::Null, - TantivyValue::Str(val) => JsonValue::String(val), - TantivyValue::PreTokStr(val) => JsonValue::String(val.text), - TantivyValue::U64(val) => JsonValue::Number(val.into()), - TantivyValue::I64(val) => JsonValue::Number(val.into()), - TantivyValue::F64(val) => serde_json::json!(val), - TantivyValue::Bool(val) => JsonValue::Bool(val), - TantivyValue::Date(val) => JsonValue::String(format!("{:?}", val)), - TantivyValue::Facet(val) => JsonValue::String(val.to_string()), - TantivyValue::Bytes(val) => JsonValue::String(format!("{:?}", val)), - TantivyValue::Array(val) => val.into_iter().map(tantivy_value_to_json).collect(), - TantivyValue::Object(val) => val - .into_iter() - .map(|(key, val)| (key, tantivy_value_to_json(val))) - .collect(), - TantivyValue::IpAddr(val) => JsonValue::String(format!("{:?}", val)), - } -} - -#[inline] -fn populate_field_presence_for_json_value<'a>( - json_value: impl Value<'a>, - path_hasher: &PathHasher, - is_expand_dots_enabled: bool, - output: &mut FnvHashSet, -) { - match json_value.as_value() { - ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} - ReferenceValue::Leaf(_) => { - output.insert(path_hasher.finish()); - } - ReferenceValue::Array(items) => { - for item in items { - populate_field_presence_for_json_value( - item, - path_hasher, - is_expand_dots_enabled, - output, - ); - } - } - ReferenceValue::Object(json_obj) => { - populate_field_presence_for_json_obj( - json_obj, - path_hasher.clone(), - is_expand_dots_enabled, - output, - ); - } - } -} - -fn populate_field_presence_for_json_obj<'a, Iter: Iterator)>>( - json_obj: Iter, - path_hasher: PathHasher, - is_expand_dots_enabled: bool, - output: &mut FnvHashSet, -) { - for (field_key, field_value) in json_obj { - let mut child_path_hasher = path_hasher.clone(); - if is_expand_dots_enabled { - for segment in field_key.split('.') { - child_path_hasher.append(segment.as_bytes()); - } - } else { - child_path_hasher.append(field_key.as_bytes()); - }; - populate_field_presence_for_json_value( - field_value, - &child_path_hasher, - is_expand_dots_enabled, - output, - ); - } -} - impl DocMapper { /// Returns the unique identifier of the doc mapping. pub fn doc_mapping_uid(&self) -> DocMappingUid { @@ -636,36 +555,9 @@ impl DocMapper { document.add_u64(document_size_field, document_len); } - // The capacity is inexact here. - if self.index_field_presence { - let mut field_presence_hashes: FnvHashSet = - FnvHashSet::with_capacity_and_hasher(document.len(), Default::default()); - for (field, value) in document.field_values() { - let field_entry = self.schema.get_field_entry(field); - if !field_entry.is_indexed() || field_entry.is_fast() { - // We are using an tantivy's ExistsQuery for fast fields. - continue; - } - let mut path_hasher: PathHasher = PathHasher::default(); - path_hasher.append(&field.field_id().to_le_bytes()[..]); - if let Some(json_obj) = value.as_object() { - let is_expand_dots_enabled: bool = - if let FieldType::JsonObject(json_options) = field_entry.field_type() { - json_options.is_expand_dots_enabled() - } else { - false - }; - populate_field_presence_for_json_obj( - json_obj, - path_hasher, - is_expand_dots_enabled, - &mut field_presence_hashes, - ); - } else { - field_presence_hashes.insert(path_hasher.finish()); - } - } + let field_presence_hashes: FnvHashSet = + populate_field_presence(&document, &self.schema); for field_presence_hash in field_presence_hashes { document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash); } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs new file mode 100644 index 00000000000..95f7dcba632 --- /dev/null +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs @@ -0,0 +1,121 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use fnv::FnvHashSet; +use quickwit_common::PathHasher; +use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf}; +use tantivy::schema::{FieldType, Schema, Value}; +use tantivy::Document; + +/// Populates the field presence for a document. +/// +/// The field presence is a set of hashes that represent the fields that are present in the +/// document. Each hash is computed from the field path. +/// +/// It is only added if the field is indexed and not fast. +pub(crate) fn populate_field_presence( + document: &D, + schema: &Schema, +) -> FnvHashSet { + let mut field_presence_hashes: FnvHashSet = + FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default()); + for (field, value) in document.iter_fields_and_values() { + let field_entry = schema.get_field_entry(field); + if !field_entry.is_indexed() || field_entry.is_fast() { + // We are using an tantivy's ExistsQuery for fast fields. + continue; + } + let mut path_hasher: PathHasher = PathHasher::default(); + path_hasher.append(&field.field_id().to_le_bytes()[..]); + if let Some(json_obj) = value.as_object() { + let is_expand_dots_enabled: bool = + if let FieldType::JsonObject(json_options) = field_entry.field_type() { + json_options.is_expand_dots_enabled() + } else { + false + }; + populate_field_presence_for_json_obj( + json_obj, + path_hasher, + is_expand_dots_enabled, + &mut field_presence_hashes, + ); + } else { + field_presence_hashes.insert(path_hasher.finish()); + } + } + field_presence_hashes +} + +#[inline] +fn populate_field_presence_for_json_value<'a>( + json_value: impl Value<'a>, + path_hasher: &PathHasher, + is_expand_dots_enabled: bool, + output: &mut FnvHashSet, +) { + match json_value.as_value() { + ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} + ReferenceValue::Leaf(_) => { + output.insert(path_hasher.finish()); + } + ReferenceValue::Array(items) => { + for item in items { + populate_field_presence_for_json_value( + item, + path_hasher, + is_expand_dots_enabled, + output, + ); + } + } + ReferenceValue::Object(json_obj) => { + populate_field_presence_for_json_obj( + json_obj, + path_hasher.clone(), + is_expand_dots_enabled, + output, + ); + } + } +} + +fn populate_field_presence_for_json_obj<'a, Iter: Iterator)>>( + json_obj: Iter, + path_hasher: PathHasher, + is_expand_dots_enabled: bool, + output: &mut FnvHashSet, +) { + for (field_key, field_value) in json_obj { + let mut child_path_hasher = path_hasher.clone(); + if is_expand_dots_enabled { + for segment in field_key.split('.') { + child_path_hasher.append(segment.as_bytes()); + } + } else { + child_path_hasher.append(field_key.as_bytes()); + }; + populate_field_presence_for_json_value( + field_value, + &child_path_hasher, + is_expand_dots_enabled, + output, + ); + } +} diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs index 86c89f77359..c10ab9699fb 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs @@ -33,7 +33,8 @@ use tantivy::schema::{ use tantivy::TantivyDocument as Document; use super::date_time_type::QuickwitDateTimeOptions; -use super::field_mapping_entry::{NumericOutputFormat, QuickwitBoolOptions}; +use super::field_mapping_entry::QuickwitBoolOptions; +use super::tantivy_val_to_json::formatted_tantivy_value_to_json; use crate::doc_mapper::field_mapping_entry::{ QuickwitBytesOptions, QuickwitIpAddrOptions, QuickwitNumericOptions, QuickwitObjectOptions, QuickwitTextOptions, @@ -498,7 +499,7 @@ fn extract_json_val( }; let mut vals_with_correct_type_it = vals .into_iter() - .flat_map(|value| value_to_json(value, leaf_type)); + .flat_map(|value| formatted_tantivy_value_to_json(value, leaf_type)); match cardinality { Cardinality::SingleValued => vals_with_correct_type_it.next(), Cardinality::MultiValued => Some(JsonValue::Array(vals_with_correct_type_it.collect())), @@ -608,225 +609,6 @@ fn add_key_to_vec_map( } } -fn value_to_string(value: TantivyValue) -> Result { - match value { - TantivyValue::Str(s) => return Ok(JsonValue::String(s)), - TantivyValue::U64(number) => Some(number.to_string()), - TantivyValue::I64(number) => Some(number.to_string()), - TantivyValue::F64(number) => Some(number.to_string()), - TantivyValue::Bool(b) => Some(b.to_string()), - TantivyValue::Date(date) => { - return quickwit_datetime::DateTimeOutputFormat::default() - .format_to_json(date) - .map_err(|_| value); - } - TantivyValue::IpAddr(ip) => Some(ip.to_string()), - _ => None, - } - .map(JsonValue::String) - .ok_or(value) -} - -fn value_to_bool(value: TantivyValue) -> Result { - match &value { - TantivyValue::Str(s) => s.parse().ok(), - TantivyValue::U64(number) => match number { - 0 => Some(false), - 1 => Some(true), - _ => None, - }, - TantivyValue::I64(number) => match number { - 0 => Some(false), - 1 => Some(true), - _ => None, - }, - TantivyValue::Bool(b) => Some(*b), - _ => None, - } - .map(JsonValue::Bool) - .ok_or(value) -} - -fn value_to_ip(value: TantivyValue) -> Result { - match &value { - TantivyValue::Str(s) => s - .parse::() - .or_else(|_| { - s.parse::() - .map(|ip| ip.to_ipv6_mapped()) - }) - .ok(), - TantivyValue::IpAddr(ip) => Some(*ip), - _ => None, - } - .map(|ip| { - serde_json::to_value(TantivyValue::IpAddr(ip)) - .expect("Json serialization should never fail.") - }) - .ok_or(value) -} - -fn value_to_float( - value: TantivyValue, - numeric_options: &QuickwitNumericOptions, -) -> Result { - match &value { - TantivyValue::Str(s) => s.parse().ok(), - TantivyValue::U64(number) => Some(*number as f64), - TantivyValue::I64(number) => Some(*number as f64), - TantivyValue::F64(number) => Some(*number), - TantivyValue::Bool(b) => Some(if *b { 1.0 } else { 0.0 }), - _ => None, - } - .and_then(|f64_val| f64_val.to_json(numeric_options.output_format)) - .ok_or(value) -} - -fn value_to_u64( - value: TantivyValue, - numeric_options: &QuickwitNumericOptions, -) -> Result { - match &value { - TantivyValue::Str(s) => s.parse().ok(), - TantivyValue::U64(number) => Some(*number), - TantivyValue::I64(number) => (*number).try_into().ok(), - TantivyValue::F64(number) => { - if (0.0..=(u64::MAX as f64)).contains(number) { - Some(*number as u64) - } else { - None - } - } - TantivyValue::Bool(b) => Some(*b as u64), - _ => None, - } - .and_then(|u64_val| u64_val.to_json(numeric_options.output_format)) - .ok_or(value) -} - -fn value_to_i64( - value: TantivyValue, - numeric_options: &QuickwitNumericOptions, -) -> Result { - match &value { - TantivyValue::Str(s) => s.parse().ok(), - TantivyValue::U64(number) => (*number).try_into().ok(), - TantivyValue::I64(number) => Some(*number), - TantivyValue::F64(number) => { - if ((i64::MIN as f64)..=(i64::MAX as f64)).contains(number) { - Some(*number as i64) - } else { - None - } - } - TantivyValue::Bool(b) => Some(*b as i64), - _ => None, - } - .and_then(|u64_val| u64_val.to_json(numeric_options.output_format)) - .ok_or(value) -} - -/// Transforms a tantivy object into a serde_json one, without cloning strings. -/// It still allocates maps. -// TODO we should probably move this to tantivy, it has the opposite conversion already -fn tantivy_object_to_json_value_nocopy(object: Vec<(String, TantivyValue)>) -> JsonValue { - JsonValue::Object( - object - .into_iter() - .map(|(key, value)| (key, tantivy_value_to_json_value_nocopy(value))) - .collect(), - ) -} - -fn tantivy_value_to_json_value_nocopy(value: TantivyValue) -> JsonValue { - match value { - TantivyValue::Null => JsonValue::Null, - TantivyValue::Str(s) => JsonValue::String(s), - TantivyValue::U64(number) => JsonValue::Number(number.into()), - TantivyValue::I64(number) => JsonValue::Number(number.into()), - TantivyValue::F64(f) => { - JsonValue::Number(serde_json::Number::from_f64(f).expect("expected finite f64")) - } - TantivyValue::Bool(b) => JsonValue::Bool(b), - TantivyValue::Array(array) => JsonValue::Array( - array - .into_iter() - .map(tantivy_value_to_json_value_nocopy) - .collect(), - ), - TantivyValue::Object(object) => tantivy_object_to_json_value_nocopy(object), - // we shouldn't have these types inside a json field in quickwit - TantivyValue::PreTokStr(pretok) => JsonValue::String(pretok.text), - TantivyValue::Date(date) => quickwit_datetime::DateTimeOutputFormat::Rfc3339 - .format_to_json(date) - .expect("Invalid datetime is not allowed."), - TantivyValue::Facet(facet) => JsonValue::String(facet.to_string()), - // TantivyValue::Bytes(Vec) => (), // tantivy would do b64 here - TantivyValue::IpAddr(ip_v6) => { - let ip_str = if let Some(ip_v4) = ip_v6.to_ipv4_mapped() { - ip_v4.to_string() - } else { - ip_v6.to_string() - }; - JsonValue::String(ip_str) - } - value => unimplemented!("got unexpected type {value:?} inside json field"), - } -} - -/// Converts Tantivy::Value into Json Value. -/// -/// Makes sure the type and value are consistent before converting. -/// For certain LeafType, we use the type options to format the output. -fn value_to_json(value: TantivyValue, leaf_type: &LeafType) -> Option { - let res = match leaf_type { - LeafType::Text(_) => value_to_string(value), - LeafType::Bool(_) => value_to_bool(value), - LeafType::IpAddr(_) => value_to_ip(value), - LeafType::F64(numeric_options) => value_to_float(value, numeric_options), - LeafType::U64(numeric_options) => value_to_u64(value, numeric_options), - LeafType::I64(numeric_options) => value_to_i64(value, numeric_options), - LeafType::Json(_) => { - if let TantivyValue::Object(obj) = value { - // TODO do we want to allow almost everything here? - return Some(tantivy_object_to_json_value_nocopy(obj)); - } else { - Err(value) - } - } - LeafType::Bytes(bytes_options) => { - if let TantivyValue::Bytes(ref bytes) = value { - // TODO we could cast str to bytes - let json_value = bytes_options.output_format.format_to_json(bytes); - Ok(json_value) - } else { - Err(value) - } - } - LeafType::DateTime(date_time_options) => date_time_options - .reparse_tantivy_value(&value) - .map(|date_time| { - date_time_options - .output_format - .format_to_json(date_time) - .expect("Invalid datetime is not allowed.") - }) - .ok_or(value), - }; - match res { - Ok(res) => Some(res), - Err(value) => { - quickwit_common::rate_limited_warn!( - limit_per_min = 2, - "the value type `{:?}` doesn't match the requested type `{:?}`", - value, - leaf_type - ); - None - } - } -} - fn insert_json_val( field_path: &[&str], //< may not be empty json_val: JsonValue, @@ -846,7 +628,7 @@ fn insert_json_val( doc_json.insert(last_field_name.to_string(), json_val); } -trait NumVal: Sized + FromStr + ToString + Into { +pub(crate) trait NumVal: Sized + FromStr + ToString + Into { fn from_json_number(num: &serde_json::Number) -> Option; fn validate_json(json_val: &BorrowedJsonValue, coerce: bool) -> Result<(), String> { @@ -929,50 +711,23 @@ trait NumVal: Sized + FromStr + ToString + Into { fn from_json(json_val: JsonValue, coerce: bool) -> Result { Self::from_json_to_self(&json_val, coerce).map(Self::into) } - - fn to_json(&self, output_format: NumericOutputFormat) -> Option; } impl NumVal for u64 { fn from_json_number(num: &serde_json::Number) -> Option { num.as_u64() } - - fn to_json(&self, output_format: NumericOutputFormat) -> Option { - let json_value = match output_format { - NumericOutputFormat::String => JsonValue::String(self.to_string()), - NumericOutputFormat::Number => JsonValue::Number(serde_json::Number::from(*self)), - }; - Some(json_value) - } } impl NumVal for i64 { fn from_json_number(num: &serde_json::Number) -> Option { num.as_i64() } - - fn to_json(&self, output_format: NumericOutputFormat) -> Option { - let json_value = match output_format { - NumericOutputFormat::String => JsonValue::String(self.to_string()), - NumericOutputFormat::Number => JsonValue::Number(serde_json::Number::from(*self)), - }; - Some(json_value) - } } impl NumVal for f64 { fn from_json_number(num: &serde_json::Number) -> Option { num.as_f64() } - - fn to_json(&self, output_format: NumericOutputFormat) -> Option { - match output_format { - NumericOutputFormat::String => Some(JsonValue::String(self.to_string())), - NumericOutputFormat::Number => { - serde_json::Number::from_f64(*self).map(JsonValue::Number) - } - } - } } #[derive(Clone, Default)] @@ -1584,13 +1339,13 @@ mod tests { use time::OffsetDateTime; use super::{ - add_key_to_vec_map, extract_val_from_tantivy_val, value_to_json, JsonValueIterator, - LeafType, MapOrArrayIter, MappingLeaf, + add_key_to_vec_map, extract_val_from_tantivy_val, JsonValueIterator, LeafType, + MapOrArrayIter, MappingLeaf, }; use crate::doc_mapper::date_time_type::QuickwitDateTimeOptions; use crate::doc_mapper::field_mapping_entry::{ - BinaryFormat, NumericOutputFormat, QuickwitBoolOptions, QuickwitBytesOptions, - QuickwitIpAddrOptions, QuickwitNumericOptions, QuickwitTextOptions, + BinaryFormat, QuickwitBoolOptions, QuickwitBytesOptions, QuickwitIpAddrOptions, + QuickwitNumericOptions, QuickwitTextOptions, }; use crate::Cardinality; @@ -2054,122 +1809,6 @@ mod tests { ) } - #[test] - fn test_tantivy_value_to_json_value_bytes() { - let bytes_options_base64 = QuickwitBytesOptions::default(); - assert_eq!( - value_to_json( - TantivyValue::Bytes(vec![1, 2, 3]), - &LeafType::Bytes(bytes_options_base64) - ) - .unwrap(), - serde_json::json!("AQID") - ); - - let bytes_options_hex = QuickwitBytesOptions { - output_format: BinaryFormat::Hex, - ..Default::default() - }; - assert_eq!( - value_to_json( - TantivyValue::Bytes(vec![1, 2, 3]), - &LeafType::Bytes(bytes_options_hex) - ) - .unwrap(), - serde_json::json!("010203") - ); - } - - #[test] - fn test_tantivy_value_to_json_value_f64() { - let numeric_options_number = QuickwitNumericOptions::default(); - assert_eq!( - value_to_json( - TantivyValue::F64(0.1), - &LeafType::F64(numeric_options_number.clone()) - ) - .unwrap(), - serde_json::json!(0.1) - ); - assert_eq!( - value_to_json( - TantivyValue::U64(1), - &LeafType::F64(numeric_options_number.clone()) - ) - .unwrap(), - serde_json::json!(1.0) - ); - assert_eq!( - value_to_json( - TantivyValue::Str("0.1".to_string()), - &LeafType::F64(numeric_options_number.clone()) - ) - .unwrap(), - serde_json::json!(0.1) - ); - - let numeric_options_str = QuickwitNumericOptions { - output_format: NumericOutputFormat::String, - ..Default::default() - }; - assert_eq!( - value_to_json(TantivyValue::F64(0.1), &LeafType::F64(numeric_options_str)).unwrap(), - serde_json::json!("0.1") - ); - } - - #[test] - fn test_tantivy_value_to_json_value_i64() { - let numeric_options_number = QuickwitNumericOptions::default(); - assert_eq!( - value_to_json( - TantivyValue::I64(-1), - &LeafType::I64(numeric_options_number.clone()) - ) - .unwrap(), - serde_json::json!(-1) - ); - assert_eq!( - value_to_json(TantivyValue::I64(1), &LeafType::I64(numeric_options_number)).unwrap(), - serde_json::json!(1) - ); - - let numeric_options_str = QuickwitNumericOptions { - output_format: NumericOutputFormat::String, - ..Default::default() - }; - assert_eq!( - value_to_json(TantivyValue::I64(-1), &LeafType::I64(numeric_options_str)).unwrap(), - serde_json::json!("-1") - ); - } - - #[test] - fn test_tantivy_value_to_json_value_u64() { - let numeric_options_number = QuickwitNumericOptions::default(); - assert_eq!( - value_to_json( - TantivyValue::U64(1), - &LeafType::U64(numeric_options_number.clone()) - ) - .unwrap(), - serde_json::json!(1u64) - ); - assert_eq!( - value_to_json(TantivyValue::I64(1), &LeafType::U64(numeric_options_number)).unwrap(), - serde_json::json!(1u64) - ); - - let numeric_options_str = QuickwitNumericOptions { - output_format: NumericOutputFormat::String, - ..Default::default() - }; - assert_eq!( - value_to_json(TantivyValue::U64(1), &LeafType::U64(numeric_options_str)).unwrap(), - serde_json::json!("1") - ); - } - #[test] fn test_field_path_for_field_name() { assert_eq!(super::build_field_path_from_str(""), Vec::::new()); diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 418ad5e6471..146c2f1f51c 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -22,7 +22,9 @@ mod doc_mapper_builder; mod doc_mapper_impl; mod field_mapping_entry; mod field_mapping_type; +mod field_presence; mod mapping_tree; +mod tantivy_val_to_json; mod tokenizer_entry; use std::collections::{HashMap, HashSet}; diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/tantivy_val_to_json.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tantivy_val_to_json.rs new file mode 100644 index 00000000000..949f205451b --- /dev/null +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tantivy_val_to_json.rs @@ -0,0 +1,428 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use serde_json::Value as JsonValue; +use tantivy::schema::OwnedValue as TantivyValue; + +use super::field_mapping_entry::{NumericOutputFormat, QuickwitNumericOptions}; +use super::mapping_tree::LeafType; +use super::BinaryFormat; + +pub(crate) trait NumToJson { + fn to_json(&self, output_format: NumericOutputFormat) -> Option; +} + +impl NumToJson for u64 { + fn to_json(&self, output_format: NumericOutputFormat) -> Option { + let json_value = match output_format { + NumericOutputFormat::String => JsonValue::String(self.to_string()), + NumericOutputFormat::Number => JsonValue::Number(serde_json::Number::from(*self)), + }; + Some(json_value) + } +} + +impl NumToJson for i64 { + fn to_json(&self, output_format: NumericOutputFormat) -> Option { + let json_value = match output_format { + NumericOutputFormat::String => JsonValue::String(self.to_string()), + NumericOutputFormat::Number => JsonValue::Number(serde_json::Number::from(*self)), + }; + Some(json_value) + } +} +impl NumToJson for f64 { + fn to_json(&self, output_format: NumericOutputFormat) -> Option { + match output_format { + NumericOutputFormat::String => Some(JsonValue::String(self.to_string())), + NumericOutputFormat::Number => { + serde_json::Number::from_f64(*self).map(JsonValue::Number) + } + } + } +} + +fn value_to_string(value: TantivyValue) -> Result { + match value { + TantivyValue::Str(s) => return Ok(JsonValue::String(s)), + TantivyValue::U64(number) => Some(number.to_string()), + TantivyValue::I64(number) => Some(number.to_string()), + TantivyValue::F64(number) => Some(number.to_string()), + TantivyValue::Bool(b) => Some(b.to_string()), + TantivyValue::Date(date) => { + return quickwit_datetime::DateTimeOutputFormat::default() + .format_to_json(date) + .map_err(|_| value); + } + TantivyValue::IpAddr(ip) => Some(ip.to_string()), + _ => None, + } + .map(JsonValue::String) + .ok_or(value) +} + +fn value_to_bool(value: TantivyValue) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => match number { + 0 => Some(false), + 1 => Some(true), + _ => None, + }, + TantivyValue::I64(number) => match number { + 0 => Some(false), + 1 => Some(true), + _ => None, + }, + TantivyValue::Bool(b) => Some(*b), + _ => None, + } + .map(JsonValue::Bool) + .ok_or(value) +} + +fn value_to_ip(value: TantivyValue) -> Result { + match &value { + TantivyValue::Str(s) => s + .parse::() + .or_else(|_| { + s.parse::() + .map(|ip| ip.to_ipv6_mapped()) + }) + .ok(), + TantivyValue::IpAddr(ip) => Some(*ip), + _ => None, + } + .map(|ip| { + serde_json::to_value(TantivyValue::IpAddr(ip)) + .expect("Json serialization should never fail.") + }) + .ok_or(value) +} + +fn value_to_float( + value: TantivyValue, + numeric_options: &QuickwitNumericOptions, +) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => Some(*number as f64), + TantivyValue::I64(number) => Some(*number as f64), + TantivyValue::F64(number) => Some(*number), + TantivyValue::Bool(b) => Some(if *b { 1.0 } else { 0.0 }), + _ => None, + } + .and_then(|f64_val| f64_val.to_json(numeric_options.output_format)) + .ok_or(value) +} + +fn value_to_u64( + value: TantivyValue, + numeric_options: &QuickwitNumericOptions, +) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => Some(*number), + TantivyValue::I64(number) => (*number).try_into().ok(), + TantivyValue::F64(number) => { + if (0.0..=(u64::MAX as f64)).contains(number) { + Some(*number as u64) + } else { + None + } + } + TantivyValue::Bool(b) => Some(*b as u64), + _ => None, + } + .and_then(|u64_val| u64_val.to_json(numeric_options.output_format)) + .ok_or(value) +} + +fn value_to_i64( + value: TantivyValue, + numeric_options: &QuickwitNumericOptions, +) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => (*number).try_into().ok(), + TantivyValue::I64(number) => Some(*number), + TantivyValue::F64(number) => { + if ((i64::MIN as f64)..=(i64::MAX as f64)).contains(number) { + Some(*number as i64) + } else { + None + } + } + TantivyValue::Bool(b) => Some(*b as i64), + _ => None, + } + .and_then(|u64_val| u64_val.to_json(numeric_options.output_format)) + .ok_or(value) +} + +/// Transforms a tantivy object into a serde_json one, without cloning strings. +/// It still allocates maps. +// TODO we should probably move this to tantivy, it has the opposite conversion already +pub fn tantivy_object_to_json_value(object: Vec<(String, TantivyValue)>) -> JsonValue { + JsonValue::Object( + object + .into_iter() + .map(|(key, value)| (key, tantivy_value_to_json(value))) + .collect(), + ) +} + +/// Converts Tantivy::Value into Json Value. +/// +/// Formatting by defaults, e.g. Rfc3339 for dates. +pub fn tantivy_value_to_json(value: TantivyValue) -> JsonValue { + match value { + TantivyValue::Null => JsonValue::Null, + TantivyValue::Str(s) => JsonValue::String(s), + TantivyValue::U64(number) => JsonValue::Number(number.into()), + TantivyValue::I64(number) => JsonValue::Number(number.into()), + TantivyValue::F64(f) => { + JsonValue::Number(serde_json::Number::from_f64(f).expect("expected finite f64")) + } + TantivyValue::Bool(b) => JsonValue::Bool(b), + TantivyValue::Array(array) => { + JsonValue::Array(array.into_iter().map(tantivy_value_to_json).collect()) + } + TantivyValue::Object(object) => tantivy_object_to_json_value(object), + // we shouldn't have these types inside a json field in quickwit + TantivyValue::PreTokStr(pretok) => JsonValue::String(pretok.text), + TantivyValue::Date(date) => quickwit_datetime::DateTimeOutputFormat::Rfc3339 + .format_to_json(date) + .expect("Invalid datetime is not allowed."), + TantivyValue::Facet(facet) => JsonValue::String(facet.to_string()), + TantivyValue::Bytes(bytes) => BinaryFormat::Base64.format_to_json(&bytes), + TantivyValue::IpAddr(ip_v6) => { + let ip_str = if let Some(ip_v4) = ip_v6.to_ipv4_mapped() { + ip_v4.to_string() + } else { + ip_v6.to_string() + }; + JsonValue::String(ip_str) + } + } +} + +/// Converts TantivyValue into Json Value and formats according to the LeafType. +/// +/// Makes sure the type and value are consistent before converting. +/// For certain LeafType, we use the type options to format the output. +pub fn formatted_tantivy_value_to_json( + value: TantivyValue, + leaf_type: &LeafType, +) -> Option { + let res = match leaf_type { + LeafType::Text(_) => value_to_string(value), + LeafType::Bool(_) => value_to_bool(value), + LeafType::IpAddr(_) => value_to_ip(value), + LeafType::F64(numeric_options) => value_to_float(value, numeric_options), + LeafType::U64(numeric_options) => value_to_u64(value, numeric_options), + LeafType::I64(numeric_options) => value_to_i64(value, numeric_options), + LeafType::Json(_) => { + if let TantivyValue::Object(obj) = value { + // TODO do we want to allow almost everything here? + return Some(tantivy_object_to_json_value(obj)); + } else { + Err(value) + } + } + LeafType::Bytes(bytes_options) => { + if let TantivyValue::Bytes(ref bytes) = value { + // TODO we could cast str to bytes + let json_value = bytes_options.output_format.format_to_json(bytes); + Ok(json_value) + } else { + Err(value) + } + } + LeafType::DateTime(date_time_options) => date_time_options + .reparse_tantivy_value(&value) + .map(|date_time| { + date_time_options + .output_format + .format_to_json(date_time) + .expect("Invalid datetime is not allowed.") + }) + .ok_or(value), + }; + match res { + Ok(res) => Some(res), + Err(value) => { + quickwit_common::rate_limited_warn!( + limit_per_min = 2, + "the value type `{:?}` doesn't match the requested type `{:?}`", + value, + leaf_type + ); + None + } + } +} + +#[cfg(test)] +mod tests { + + use tantivy::schema::OwnedValue as TantivyValue; + + use super::*; + use crate::doc_mapper::field_mapping_entry::{ + BinaryFormat, NumericOutputFormat, QuickwitBytesOptions, QuickwitNumericOptions, + }; + use crate::doc_mapper::mapping_tree::LeafType; + + #[test] + fn test_tantivy_value_to_json_value_bytes() { + let bytes_options_base64 = QuickwitBytesOptions::default(); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::Bytes(vec![1, 2, 3]), + &LeafType::Bytes(bytes_options_base64) + ) + .unwrap(), + serde_json::json!("AQID") + ); + + let bytes_options_hex = QuickwitBytesOptions { + output_format: BinaryFormat::Hex, + ..Default::default() + }; + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::Bytes(vec![1, 2, 3]), + &LeafType::Bytes(bytes_options_hex) + ) + .unwrap(), + serde_json::json!("010203") + ); + } + + #[test] + fn test_tantivy_value_to_json_value_f64() { + let numeric_options_number = QuickwitNumericOptions::default(); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::F64(0.1), + &LeafType::F64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(0.1) + ); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::U64(1), + &LeafType::F64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(1.0) + ); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::Str("0.1".to_string()), + &LeafType::F64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(0.1) + ); + + let numeric_options_str = QuickwitNumericOptions { + output_format: NumericOutputFormat::String, + ..Default::default() + }; + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::F64(0.1), + &LeafType::F64(numeric_options_str) + ) + .unwrap(), + serde_json::json!("0.1") + ); + } + + #[test] + fn test_tantivy_value_to_json_value_i64() { + let numeric_options_number = QuickwitNumericOptions::default(); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::I64(-1), + &LeafType::I64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(-1) + ); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::I64(1), + &LeafType::I64(numeric_options_number) + ) + .unwrap(), + serde_json::json!(1) + ); + + let numeric_options_str = QuickwitNumericOptions { + output_format: NumericOutputFormat::String, + ..Default::default() + }; + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::I64(-1), + &LeafType::I64(numeric_options_str) + ) + .unwrap(), + serde_json::json!("-1") + ); + } + + #[test] + fn test_tantivy_value_to_json_value_u64() { + let numeric_options_number = QuickwitNumericOptions::default(); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::U64(1), + &LeafType::U64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(1u64) + ); + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::I64(1), + &LeafType::U64(numeric_options_number) + ) + .unwrap(), + serde_json::json!(1u64) + ); + + let numeric_options_str = QuickwitNumericOptions { + output_format: NumericOutputFormat::String, + ..Default::default() + }; + assert_eq!( + formatted_tantivy_value_to_json( + TantivyValue::U64(1), + &LeafType::U64(numeric_options_str) + ) + .unwrap(), + serde_json::json!("1") + ); + } +}