From 40197c26dd1c6a914e6c2bb307e4e5a7ed576836 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Wed, 13 Nov 2024 12:23:42 +0800 Subject: [PATCH] Fix json input format ignore key case --- src/Core/FormatFactorySettings.h | 1 + src/Core/SettingsChangesHistory.cpp | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Impl/JSONEachRowRowInputFormat.cpp | 21 ++++++++++++++++++- .../Formats/Impl/JSONEachRowRowInputFormat.h | 3 +++ 6 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/Core/FormatFactorySettings.h b/src/Core/FormatFactorySettings.h index a095bffc4c93..79ff202fe954 100644 --- a/src/Core/FormatFactorySettings.h +++ b/src/Core/FormatFactorySettings.h @@ -472,6 +472,7 @@ Enabled by default. DECLARE(Bool, input_format_json_ignore_unnecessary_fields, true, R"( Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields )", 0) \ + DECLARE(Bool, input_format_json_case_insensitive_column_matching, false, R"(Ignore json key case while read json field from string)", 0) \ DECLARE(Bool, input_format_try_infer_variants, false, R"( If enabled, ClickHouse will try to infer type [`Variant`](../../sql-reference/data-types/variant.md) in schema inference for text formats when there is more than one possible type for column/array elements. 
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 0ff9d0a68334..ba95d6b28333 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -77,6 +77,7 @@ static std::initializer_list #include #include +#include namespace DB { @@ -46,6 +47,15 @@ JSONEachRowRowInputFormat::JSONEachRowRowInputFormat( { const auto & header = getPort().getHeader(); name_map = header.getNamesToIndexesMap(); + if (format_settings_.json.case_insensitive_column_matching) + { + for (auto & it : name_map) + { + String key = it.first.toString(); + boost::to_lower(key); + lower_case_name_map[key] = it.first; + } + } if (format_settings_.import_nested_json) { for (size_t i = 0; i != header.columns(); ++i) @@ -168,7 +178,16 @@ void JSONEachRowRowInputFormat::readJSONObject(MutableColumns & columns) skipUnknownField(name_ref); continue; } - const size_t column_index = columnIndex(name_ref, key_index); + size_t column_index = 0; + if (format_settings.json.case_insensitive_column_matching) + { + String field_name = name_ref.toString(); + boost::to_lower(field_name); + StringRef field_name_ref = lower_case_name_map[field_name]; + column_index = columnIndex(field_name_ref, key_index); + } + else + column_index = columnIndex(name_ref, key_index); if (unlikely(ssize_t(column_index) < 0)) { diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index b1163f7e883c..0f4f47682eed 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -76,6 +76,9 @@ class JSONEachRowRowInputFormat : public IRowInputFormat /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. Block::NameMap name_map; + /// Hash table match `lower_case field name -> field name in the block`. 
+ std::unordered_map<String, StringRef> lower_case_name_map; + /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. std::vector prev_positions;