Skip to content

Commit

Permalink
Merge pull request ClickHouse#34888 from kitaisreal/performance-tests…
Browse files Browse the repository at this point in the history
…-fix

Performance tests fix
  • Loading branch information
kitaisreal authored Mar 16, 2022
2 parents 663c8e9 + dc31a41 commit 9ba53ae
Show file tree
Hide file tree
Showing 12 changed files with 100 additions and 89 deletions.
71 changes: 31 additions & 40 deletions src/Common/FrequencyHolder.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
#pragma once

#include <base/StringRef.h>
#include <base/logger_useful.h>

#include <string_view>
#include <unordered_map>

#include <Common/Arena.h>
#include <Common/getResource.h>
#include <Common/HashTable/HashMap.h>
Expand All @@ -10,11 +16,6 @@
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>

#include <base/StringRef.h>
#include <base/logger_useful.h>

#include <string_view>
#include <unordered_map>

namespace DB
{
Expand All @@ -34,7 +35,6 @@ namespace ErrorCodes

class FrequencyHolder
{

public:
struct Language
{
Expand All @@ -52,6 +52,7 @@ class FrequencyHolder
public:
using Map = HashMap<StringRef, Float64>;
using Container = std::vector<Language>;

using EncodingMap = HashMap<UInt16, Float64>;
using EncodingContainer = std::vector<Encoding>;

Expand All @@ -61,6 +62,30 @@ class FrequencyHolder
return instance;
}

/// Returns a read-only reference to the `emotional_dict` member (a `Map`, i.e. HashMap<StringRef, Float64>).
/// This accessor only hands out the reference; it performs no loading and takes no lock itself.
const Map & getEmotionalDict() const
{
return emotional_dict;
}

/// Returns a read-only reference to the `encodings_freq` member (an `EncodingContainer`, i.e. std::vector<Encoding>).
/// This accessor only hands out the reference; it performs no loading and takes no lock itself.
const EncodingContainer & getEncodingsFrequency() const
{
return encodings_freq;
}

/// Returns a read-only reference to the `programming_freq` member (a `Container`, i.e. std::vector<Language>).
/// This accessor only hands out the reference; it performs no loading and takes no lock itself.
const Container & getProgrammingFrequency() const
{
return programming_freq;
}

private:

/// Populates all three dictionaries up front by calling each loader exactly once:
/// the emotional dictionary, the encodings frequencies and the programming-language frequencies.
FrequencyHolder()
{
loadEmotionalDict();
loadEncodingsFrequency();
loadProgrammingFrequency();
}

void loadEncodingsFrequency()
{
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
Expand Down Expand Up @@ -119,7 +144,6 @@ class FrequencyHolder
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
}


void loadEmotionalDict()
{
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
Expand Down Expand Up @@ -158,7 +182,6 @@ class FrequencyHolder
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
}


void loadProgrammingFrequency()
{
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
Expand Down Expand Up @@ -211,42 +234,10 @@ class FrequencyHolder
LOG_TRACE(log, "Programming languages frequencies was added");
}

/// Returns the emotional dictionary, loading it on demand.
/// Holds `mutex` for the duration of the call and invokes loadEmotionalDict()
/// whenever `emotional_dict` is still empty at the time of the call.
const Map & getEmotionalDict()
{
std::lock_guard lock(mutex);
if (emotional_dict.empty())
loadEmotionalDict();

return emotional_dict;
}


/// Returns the encodings frequency table, loading it on demand.
/// Holds `mutex` for the duration of the call and invokes loadEncodingsFrequency()
/// whenever `encodings_freq` is still empty at the time of the call.
const EncodingContainer & getEncodingsFrequency()
{
std::lock_guard lock(mutex);
if (encodings_freq.empty())
loadEncodingsFrequency();

return encodings_freq;
}

/// Returns the programming-language frequency table, loading it on demand.
/// Holds `mutex` for the duration of the call and invokes loadProgrammingFrequency()
/// whenever `programming_freq` is still empty at the time of the call.
const Container & getProgrammingFrequency()
{
std::lock_guard lock(mutex);
if (programming_freq.empty())
loadProgrammingFrequency();

return programming_freq;
}


private:
Arena string_pool;

Map emotional_dict;
Container programming_freq;
EncodingContainer encodings_freq;

std::mutex mutex;
};
}
13 changes: 7 additions & 6 deletions src/Common/ThreadProfileEvents.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <filesystem>
#include <fstream>
#include <optional>
#include <sstream>
#include <unordered_set>

#include <fcntl.h>
Expand All @@ -21,6 +20,8 @@
#include <sys/types.h>
#include <dirent.h>

#include <boost/algorithm/string/split.hpp>

#include <base/errnoToString.h>


Expand Down Expand Up @@ -247,9 +248,9 @@ static_assert(sizeof(raw_events_info) / sizeof(raw_events_info[0]) == NUMBER_OF_
#undef CACHE_EVENT

// A map of event name -> event index, to parse event list in settings.
static std::unordered_map<std::string, size_t> populateEventMap()
static std::unordered_map<std::string_view, size_t> populateEventMap()
{
std::unordered_map<std::string, size_t> name_to_index;
std::unordered_map<std::string_view, size_t> name_to_index;
name_to_index.reserve(NUMBER_OF_RAW_EVENTS);

for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
Expand Down Expand Up @@ -455,10 +456,10 @@ std::vector<size_t> PerfEventsCounters::eventIndicesFromString(const std::string
return result;
}

std::vector<std::string> event_names;
boost::split(event_names, events_list, [](char c) { return c == ','; });

std::istringstream iss(events_list); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
std::string event_name;
while (std::getline(iss, event_name, ','))
for (auto & event_name : event_names)
{
// Allow spaces at the beginning of the token, so that you can write 'a, b'.
event_name.erase(0, event_name.find_first_not_of(' '));
Expand Down
62 changes: 36 additions & 26 deletions src/Functions/FunctionsCharsetClassification.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,21 @@
namespace DB
{

/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
* Then we use marked-up dictionaries with distributions of byte bigrams of various languages and charsets.
* Using a naive Bayesian classifier, find the most likely charset and language and return it.
*/

template <bool detect_language>
struct CharsetClassificationImpl
namespace
{
/* We need to solve the zero-frequency problem for the Naive Bayes classifier.
* If the bigram is not found in the text, we assume that the probability of its occurrence is 1e-06.
* 1e-06 is the minimal value in our marked-up dictionary.
*/
static constexpr Float64 zero_frequency = 1e-06;
constexpr Float64 zero_frequency = 1e-06;

/// If the data size is bigger than this, behaviour is unspecified for this function.
static constexpr size_t max_string_size = 1u << 15;
constexpr size_t max_string_size = 1UL << 15;

static ALWAYS_INLINE inline Float64 naiveBayes(
template <typename ModelMap>
ALWAYS_INLINE inline Float64 naiveBayes(
const FrequencyHolder::EncodingMap & standard,
const HashMap<UInt16, UInt64> & model,
const ModelMap & model,
Float64 max_result)
{
Float64 res = 0;
Expand All @@ -52,10 +47,11 @@ struct CharsetClassificationImpl
}

/// Count how many times each bigram occurs in the text.
static ALWAYS_INLINE inline void calculateStats(
template <typename ModelMap>
ALWAYS_INLINE inline void calculateStats(
const UInt8 * data,
const size_t size,
HashMap<UInt16, UInt64> & model)
ModelMap & model)
{
UInt16 hash = 0;
for (size_t i = 0; i < size; ++i)
Expand All @@ -65,7 +61,15 @@ struct CharsetClassificationImpl
++model[hash];
}
}
}

/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
* Then we use marked-up dictionaries with distributions of byte bigrams of various languages and charsets.
* Using a naive Bayesian classifier, find the most likely charset and language and return it.
*/
template <bool detect_language>
struct CharsetClassificationImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
Expand All @@ -74,7 +78,7 @@ struct CharsetClassificationImpl
{
const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

if (detect_language)
if constexpr (detect_language)
/// 2 chars for ISO code + 1 zero byte
res_data.reserve(offsets.size() * 3);
else
Expand All @@ -83,37 +87,43 @@ struct CharsetClassificationImpl

res_offsets.resize(offsets.size());

size_t res_offset = 0;
size_t current_result_offset = 0;

double zero_frequency_log = log(zero_frequency);

for (size_t i = 0; i < offsets.size(); ++i)
{
const UInt8 * str = data.data() + offsets[i - 1];
const size_t str_len = offsets[i] - offsets[i - 1] - 1;

std::string_view res;

HashMap<UInt16, UInt64> model;
HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
calculateStats(str, str_len, model);

std::string_view result_value;

/// Go through the dictionary and find the charset with the highest weight
Float64 max_result = log(zero_frequency) * (max_string_size);
Float64 max_result = zero_frequency_log * (max_string_size);
for (const auto & item : encodings_freq)
{
Float64 score = naiveBayes(item.map, model, max_result);
if (max_result < score)
{
max_result = score;
res = detect_language ? item.lang : item.name;

if constexpr (detect_language)
result_value = item.lang;
else
result_value = item.name;
}
}

res_data.resize(res_offset + res.size() + 1);
memcpy(&res_data[res_offset], res.data(), res.size());

res_data[res_offset + res.size()] = 0;
res_offset += res.size() + 1;
size_t result_value_size = result_value.size();
res_data.resize(current_result_offset + result_value_size + 1);
memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
res_data[current_result_offset + result_value_size] = '\0';
current_result_offset += result_value_size + 1;

res_offsets[i] = res_offset;
res_offsets[i] = current_result_offset;
}
}
};
Expand Down
2 changes: 0 additions & 2 deletions src/Functions/normalizeString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@ struct NormalizeUTF8Impl
ColumnString::Offset current_from_offset = 0;
ColumnString::Offset current_to_offset = 0;

icu::UnicodeString to_string;

PODArray<UChar> from_uchars;
PODArray<UChar> to_uchars;

Expand Down
3 changes: 3 additions & 0 deletions src/Interpreters/ActionsVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ static Block createBlockFromCollection(const Collection & collection, const Data
size_t columns_num = types.size();
MutableColumns columns(columns_num);
for (size_t i = 0; i < columns_num; ++i)
{
columns[i] = types[i]->createColumn();
columns[i]->reserve(collection.size());
}

Row tuple_values;
for (const auto & value : collection)
Expand Down
2 changes: 1 addition & 1 deletion src/Interpreters/Set.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ void Set::setHeader(const ColumnsWithTypeAndName & header)

bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns)
{
std::unique_lock lock(rwlock);
std::lock_guard<std::shared_mutex> lock(rwlock);

if (data.empty())
throw Exception("Method Set::setHeader must be called before Set::insertFromBlock", ErrorCodes::LOGICAL_ERROR);
Expand Down
4 changes: 4 additions & 0 deletions src/Interpreters/evaluateConstantExpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <Columns/ColumnsNumber.h>
#include <Core/Block.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/FieldToDataType.h>
#include <Interpreters/Context.h>
#include <Interpreters/convertFieldToType.h>
#include <Interpreters/ExpressionActions.h>
Expand Down Expand Up @@ -32,6 +33,9 @@ namespace ErrorCodes

std::pair<Field, std::shared_ptr<const IDataType>> evaluateConstantExpression(const ASTPtr & node, ContextPtr context)
{
if (ASTLiteral * literal = node->as<ASTLiteral>())
return std::make_pair(literal->value, applyVisitor(FieldToDataType(), literal->value));

NamesAndTypesList source_columns = {{ "_dummy", std::make_shared<DataTypeUInt8>() }};
auto ast = node->clone();
ReplaceQueryParameterVisitor param_visitor(context->getQueryParameters());
Expand Down
10 changes: 5 additions & 5 deletions tests/performance/classification.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
<table_exists>hits_100m_single</table_exists>
</preconditions>

<query>SELECT detectLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectLanguage(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null</query>
<query>SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null</query>
<query>SELECT detectTonality(SearchPhrase) FROM hits_100m_single FORMAT Null</query>

<!-- Input is not really correct for these functions,
but at least it gives us some idea about their performance -->
<query>SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectCharset(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null</query>
<query>SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single LIMIT 500000 FORMAT Null</query>
<query>SELECT detectCharset(SearchPhrase) FROM hits_100m_single LIMIT 500000 FORMAT Null</query>

</test>
8 changes: 5 additions & 3 deletions tests/performance/merge_tree_many_partitions.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
<test>
<create_query>CREATE TABLE bad_partitions (x UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x</create_query>
<fill_query>INSERT INTO bad_partitions SELECT * FROM numbers(10000)</fill_query>

<settings>
<max_partitions_per_insert_block>0</max_partitions_per_insert_block>
<max_insert_threads>1</max_insert_threads>
<max_memory_usage>20G</max_memory_usage>
</settings>

<create_query>CREATE TABLE bad_partitions (x UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x</create_query>
<fill_query>INSERT INTO bad_partitions SELECT * FROM numbers(10000)</fill_query>

<query short="1">SELECT count() FROM bad_partitions</query>

<drop_query>DROP TABLE IF EXISTS bad_partitions</drop_query>
Expand Down
Loading

0 comments on commit 9ba53ae

Please sign in to comment.