Merge pull request ClickHouse#34888 from kitaisreal/performance-tests-fix

Performance tests fix
Kyligence · Mar 16, 2022 · 9ba53ae · 9ba53ae
2 parents 663c8e9 + dc31a41
commit 9ba53ae
Show file tree

Hide file tree

Showing 12 changed files with 100 additions and 89 deletions.
diff --git a/src/Common/FrequencyHolder.h b/src/Common/FrequencyHolder.h
@@ -1,5 +1,11 @@
 #pragma once
 
+#include <base/StringRef.h>
+#include <base/logger_useful.h>
+
+#include <string_view>
+#include <unordered_map>
+
 #include <Common/Arena.h>
 #include <Common/getResource.h>
 #include <Common/HashTable/HashMap.h>
@@ -10,11 +16,6 @@
 #include <IO/readFloatText.h>
 #include <IO/ZstdInflatingReadBuffer.h>
 
-#include <base/StringRef.h>
-#include <base/logger_useful.h>
-
-#include <string_view>
-#include <unordered_map>
 
 namespace DB
 {
@@ -34,7 +35,6 @@ namespace ErrorCodes
 
 class FrequencyHolder
 {
-
 public:
     struct Language
     {
@@ -52,6 +52,7 @@ class FrequencyHolder
 public:
     using Map = HashMap<StringRef, Float64>;
     using Container = std::vector<Language>;
+
     using EncodingMap = HashMap<UInt16, Float64>;
     using EncodingContainer = std::vector<Encoding>;
 
@@ -61,6 +62,30 @@ class FrequencyHolder
         return instance;
     }
 
+    const Map & getEmotionalDict() const
+    {
+        return emotional_dict;
+    }
+
+    const EncodingContainer & getEncodingsFrequency() const
+    {
+        return encodings_freq;
+    }
+
+    const Container & getProgrammingFrequency() const
+    {
+        return programming_freq;
+    }
+
+private:
+
+    FrequencyHolder()
+    {
+        loadEmotionalDict();
+        loadEncodingsFrequency();
+        loadProgrammingFrequency();
+    }
+
     void loadEncodingsFrequency()
     {
         Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
@@ -119,7 +144,6 @@ class FrequencyHolder
         LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
     }
 
-
     void loadEmotionalDict()
     {
         Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
@@ -158,7 +182,6 @@ class FrequencyHolder
         LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
     }
 
-
     void loadProgrammingFrequency()
     {
         Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
@@ -211,42 +234,10 @@ class FrequencyHolder
         LOG_TRACE(log, "Programming languages frequencies was added");
     }
 
-    const Map & getEmotionalDict()
-    {
-        std::lock_guard lock(mutex);
-        if (emotional_dict.empty())
-            loadEmotionalDict();
-
-        return emotional_dict;
-    }
-
-
-    const EncodingContainer & getEncodingsFrequency()
-    {
-        std::lock_guard lock(mutex);
-        if (encodings_freq.empty())
-            loadEncodingsFrequency();
-
-        return encodings_freq;
-    }
-
-    const Container & getProgrammingFrequency()
-    {
-        std::lock_guard lock(mutex);
-        if (programming_freq.empty())
-            loadProgrammingFrequency();
-
-        return programming_freq;
-    }
-
-
-private:
     Arena string_pool;
 
     Map emotional_dict;
     Container programming_freq;
     EncodingContainer encodings_freq;
-
-    std::mutex mutex;
 };
 }
diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp
@@ -9,7 +9,6 @@
 #include <filesystem>
 #include <fstream>
 #include <optional>
-#include <sstream>
 #include <unordered_set>
 
 #include <fcntl.h>
@@ -21,6 +20,8 @@
 #include <sys/types.h>
 #include <dirent.h>
 
+#include <boost/algorithm/string/split.hpp>
+
 #include <base/errnoToString.h>
 
 
@@ -247,9 +248,9 @@ static_assert(sizeof(raw_events_info) / sizeof(raw_events_info[0]) == NUMBER_OF_
 #undef CACHE_EVENT
 
 // A map of event name -> event index, to parse event list in settings.
-static std::unordered_map<std::string, size_t> populateEventMap()
+static std::unordered_map<std::string_view, size_t> populateEventMap()
 {
-    std::unordered_map<std::string, size_t> name_to_index;
+    std::unordered_map<std::string_view, size_t> name_to_index;
     name_to_index.reserve(NUMBER_OF_RAW_EVENTS);
 
     for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
@@ -455,10 +456,10 @@ std::vector<size_t> PerfEventsCounters::eventIndicesFromString(const std::string
         return result;
     }
 
+    std::vector<std::string> event_names;
+    boost::split(event_names, events_list, [](char c) { return c == ','; });
 
-    std::istringstream iss(events_list);        // STYLE_CHECK_ALLOW_STD_STRING_STREAM
-    std::string event_name;
-    while (std::getline(iss, event_name, ','))
+    for (auto & event_name : event_names)
     {
         // Allow spaces at the beginning of the token, so that you can write 'a, b'.
         event_name.erase(0, event_name.find_first_not_of(' '));

diff --git a/src/Functions/FunctionsCharsetClassification.cpp b/src/Functions/FunctionsCharsetClassification.cpp
@@ -8,26 +8,21 @@
 namespace DB
 {
 
-/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
- * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
- * Using a naive Bayesian classifier, find the most likely charset and language and return it
- */
-
-template <bool detect_language>
-struct CharsetClassificationImpl
+namespace
 {
     /* We need to solve zero-frequency problem for Naive Bayes Classifier
      * If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
      * 1e-06 is minimal value in our marked-up dictionary.
      */
-    static constexpr Float64 zero_frequency = 1e-06;
+    constexpr Float64 zero_frequency = 1e-06;
 
     /// If the data size is bigger than this, behaviour is unspecified for this function.
-    static constexpr size_t max_string_size = 1u << 15;
+    constexpr size_t max_string_size = 1UL << 15;
 
-    static ALWAYS_INLINE inline Float64 naiveBayes(
+    template <typename ModelMap>
+    ALWAYS_INLINE inline Float64 naiveBayes(
         const FrequencyHolder::EncodingMap & standard,
-        const HashMap<UInt16, UInt64> & model,
+        const ModelMap & model,
         Float64 max_result)
     {
         Float64 res = 0;
@@ -52,10 +47,11 @@ struct CharsetClassificationImpl
     }
 
     /// Count how many times each bigram occurs in the text.
-    static ALWAYS_INLINE inline void calculateStats(
+    template <typename ModelMap>
+    ALWAYS_INLINE inline void calculateStats(
         const UInt8 * data,
         const size_t size,
-        HashMap<UInt16, UInt64> & model)
+        ModelMap & model)
     {
         UInt16 hash = 0;
         for (size_t i = 0; i < size; ++i)
@@ -65,7 +61,15 @@ struct CharsetClassificationImpl
             ++model[hash];
         }
     }
+}
 
+/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
+ * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
+ * Using a naive Bayesian classifier, find the most likely charset and language and return it
+ */
+template <bool detect_language>
+struct CharsetClassificationImpl
+{
     static void vector(
         const ColumnString::Chars & data,
         const ColumnString::Offsets & offsets,
@@ -74,7 +78,7 @@ struct CharsetClassificationImpl
     {
         const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
 
-        if (detect_language)
+        if constexpr (detect_language)
             /// 2 chars for ISO code + 1 zero byte
             res_data.reserve(offsets.size() * 3);
         else
@@ -83,37 +87,43 @@ struct CharsetClassificationImpl
 
         res_offsets.resize(offsets.size());
 
-        size_t res_offset = 0;
+        size_t current_result_offset = 0;
+
+        double zero_frequency_log = log(zero_frequency);
 
         for (size_t i = 0; i < offsets.size(); ++i)
         {
             const UInt8 * str = data.data() + offsets[i - 1];
             const size_t str_len = offsets[i] - offsets[i - 1] - 1;
 
-            std::string_view res;
-
-            HashMap<UInt16, UInt64> model;
+            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
             calculateStats(str, str_len, model);
 
+            std::string_view result_value;
+
             /// Go through the dictionary and find the charset with the highest weight
-            Float64 max_result = log(zero_frequency) * (max_string_size);
+            Float64 max_result = zero_frequency_log * (max_string_size);
             for (const auto & item : encodings_freq)
             {
                 Float64 score = naiveBayes(item.map, model, max_result);
                 if (max_result < score)
                 {
                     max_result = score;
-                    res = detect_language ? item.lang : item.name;
+
+                    if constexpr (detect_language)
+                        result_value = item.lang;
+                    else
+                        result_value = item.name;
                 }
             }
 
-            res_data.resize(res_offset + res.size() + 1);
-            memcpy(&res_data[res_offset], res.data(), res.size());
-
-            res_data[res_offset + res.size()] = 0;
-            res_offset += res.size() + 1;
+            size_t result_value_size = result_value.size();
+            res_data.resize(current_result_offset + result_value_size + 1);
+            memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
+            res_data[current_result_offset + result_value_size] = '\0';
+            current_result_offset += result_value_size + 1;
 
-            res_offsets[i] = res_offset;
+            res_offsets[i] = current_result_offset;
         }
     }
 };

diff --git a/src/Functions/normalizeString.cpp b/src/Functions/normalizeString.cpp
@@ -98,8 +98,6 @@ struct NormalizeUTF8Impl
         ColumnString::Offset current_from_offset = 0;
         ColumnString::Offset current_to_offset = 0;
 
-        icu::UnicodeString to_string;
-
         PODArray<UChar> from_uchars;
         PODArray<UChar> to_uchars;
 

diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp
@@ -81,7 +81,10 @@ static Block createBlockFromCollection(const Collection & collection, const Data
     size_t columns_num = types.size();
     MutableColumns columns(columns_num);
     for (size_t i = 0; i < columns_num; ++i)
+    {
         columns[i] = types[i]->createColumn();
+        columns[i]->reserve(collection.size());
+    }
 
     Row tuple_values;
     for (const auto & value : collection)

diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp
@@ -165,7 +165,7 @@ void Set::setHeader(const ColumnsWithTypeAndName & header)
 
 bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns)
 {
-    std::unique_lock lock(rwlock);
+    std::lock_guard<std::shared_mutex> lock(rwlock);
 
     if (data.empty())
         throw Exception("Method Set::setHeader must be called before Set::insertFromBlock", ErrorCodes::LOGICAL_ERROR);

diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp
@@ -4,6 +4,7 @@
 #include <Columns/ColumnsNumber.h>
 #include <Core/Block.h>
 #include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/FieldToDataType.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/convertFieldToType.h>
 #include <Interpreters/ExpressionActions.h>
@@ -32,6 +33,9 @@ namespace ErrorCodes
 
 std::pair<Field, std::shared_ptr<const IDataType>> evaluateConstantExpression(const ASTPtr & node, ContextPtr context)
 {
+    if (ASTLiteral * literal = node->as<ASTLiteral>())
+        return std::make_pair(literal->value, applyVisitor(FieldToDataType(), literal->value));
+
     NamesAndTypesList source_columns = {{ "_dummy", std::make_shared<DataTypeUInt8>() }};
     auto ast = node->clone();
     ReplaceQueryParameterVisitor param_visitor(context->getQueryParameters());

diff --git a/tests/performance/classification.xml b/tests/performance/classification.xml
@@ -7,14 +7,14 @@
         <table_exists>hits_100m_single</table_exists>
     </preconditions>
 
-    <query>SELECT detectLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
-    <query>SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
+    <query>SELECT detectLanguage(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null</query>
+    <query>SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null</query>
     <query>SELECT detectTonality(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
 
     <!-- Input is not really correct for these functions,
     but at least it gives us some idea about their performance -->
-    <query>SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
-    <query>SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
-    <query>SELECT detectCharset(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
+    <query>SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null</query>
+    <query>SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single LIMIT 500000 FORMAT Null</query>
+    <query>SELECT detectCharset(SearchPhrase) FROM hits_100m_single LIMIT 500000 FORMAT Null</query>
 
 </test>
diff --git a/tests/performance/merge_tree_many_partitions.xml b/tests/performance/merge_tree_many_partitions.xml
@@ -1,11 +1,13 @@
 <test>
-    <create_query>CREATE TABLE bad_partitions (x UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x</create_query>
-    <fill_query>INSERT INTO bad_partitions SELECT * FROM numbers(10000)</fill_query>
-
     <settings>
         <max_partitions_per_insert_block>0</max_partitions_per_insert_block>
+        <max_insert_threads>1</max_insert_threads>
+        <max_memory_usage>20G</max_memory_usage>
     </settings>
 
+    <create_query>CREATE TABLE bad_partitions (x UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x</create_query>
+    <fill_query>INSERT INTO bad_partitions SELECT * FROM numbers(10000)</fill_query>
+
     <query short="1">SELECT count() FROM bad_partitions</query>
 
     <drop_query>DROP TABLE IF EXISTS bad_partitions</drop_query>