Skip to content

Commit

Permalink
Bugfix 1268: swap out xxhash for grouping (#1416)
Browse files Browse the repository at this point in the history
Fixes #1268 

Replace `xxhash` for `std::hash` for hash-based groupings. Experimented
with [highwayhash](https://github.com/google/highwayhash) as well, but
this was slower for numeric types.

Newly added benchmarks results:

`xxhash`
```
Benchmark                                              Time             CPU   Iterations
----------------------------------------------------------------------------------------
BM_hash_grouping_int<int8_t >/100000/10/2          0.474 ms        0.474 ms            5
BM_hash_grouping_int<int8_t >/100000/100000/2      0.459 ms        0.459 ms            5
BM_hash_grouping_int<int16_t>/100000/10/2          0.459 ms        0.459 ms            5
BM_hash_grouping_int<int16_t>/100000/100000/2       2.22 ms         2.22 ms            5
BM_hash_grouping_int<int32_t>/100000/10/2          0.524 ms        0.524 ms            5
BM_hash_grouping_int<int32_t>/100000/100000/2      0.527 ms        0.527 ms            5
BM_hash_grouping_int<int64_t>/100000/10/2          0.553 ms        0.553 ms            5
BM_hash_grouping_int<int64_t>/100000/100000/2       1.34 ms         1.34 ms            5
BM_hash_grouping_string/100000/10/2/10              1.45 ms         1.45 ms            5
BM_hash_grouping_string/100000/100000/2/10          2.45 ms         2.45 ms            5
BM_hash_grouping_string/100000/10/2/100            0.378 ms        0.378 ms            5
BM_hash_grouping_string/100000/100000/2/100         1.57 ms         1.57 ms            5

```

`highwayhash`
```
Benchmark                                              Time             CPU   Iterations
----------------------------------------------------------------------------------------
BM_hash_grouping_int<int8_t>/100000/10/2            6.21 ms         6.21 ms          100
BM_hash_grouping_int<int8_t>/100000/100000/2        6.15 ms         6.15 ms          100
BM_hash_grouping_int<int16_t>/100000/10/2           6.17 ms         6.17 ms          100
BM_hash_grouping_int<int16_t>/100000/100000/2       6.17 ms         6.17 ms          100
BM_hash_grouping_int<int32_t>/100000/10/2           6.38 ms         6.38 ms          100
BM_hash_grouping_int<int32_t>/100000/100000/2       6.43 ms         6.43 ms          100
BM_hash_grouping_int<int64_t>/100000/10/2           6.54 ms         6.54 ms          100
BM_hash_grouping_int<int64_t>/100000/100000/2       6.45 ms         6.45 ms          100
BM_hash_grouping_string/100000/10/2/10             0.539 ms        0.539 ms          100
BM_hash_grouping_string/100000/100000/2/10         0.811 ms        0.811 ms          100
BM_hash_grouping_string/100000/10/2/100            0.287 ms        0.287 ms          100
BM_hash_grouping_string/100000/100000/2/100        0.655 ms        0.655 ms          100
```

`std::hash`
```
Benchmark                                              Time             CPU   Iterations
----------------------------------------------------------------------------------------
BM_hash_grouping_int<int8_t>/100000/10/2           0.225 ms        0.225 ms          100
BM_hash_grouping_int<int8_t>/100000/100000/2       0.207 ms        0.207 ms          100
BM_hash_grouping_int<int16_t>/100000/10/2          0.216 ms        0.216 ms          100
BM_hash_grouping_int<int16_t>/100000/100000/2      0.213 ms        0.213 ms          100
BM_hash_grouping_int<int32_t>/100000/10/2          0.213 ms        0.213 ms          100
BM_hash_grouping_int<int32_t>/100000/100000/2      0.207 ms        0.207 ms          100
BM_hash_grouping_int<int64_t>/100000/10/2          0.208 ms        0.208 ms          100
BM_hash_grouping_int<int64_t>/100000/100000/2      0.217 ms        0.217 ms          100
BM_hash_grouping_string/100000/10/2/10             0.504 ms        0.504 ms          100
BM_hash_grouping_string/100000/100000/2/10         0.624 ms        0.624 ms          100
BM_hash_grouping_string/100000/10/2/100            0.262 ms        0.262 ms          100
BM_hash_grouping_string/100000/100000/2/100        0.542 ms        0.542 ms          100

```
  • Loading branch information
alexowens90 authored Mar 14, 2024
1 parent 6375285 commit 01e890c
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 60 deletions.
3 changes: 1 addition & 2 deletions cpp/arcticdb/entity/ref_key.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@ namespace std {
template<>
struct hash<arcticdb::entity::RefKey> {
inline arcticdb::HashedValue operator()(const arcticdb::entity::RefKey &k) const noexcept {
auto view = k.view();
return arcticdb::hash(const_cast<uint8_t * >(reinterpret_cast<const uint8_t *>(view.data())), view.size());
return arcticdb::hash(k.view());
}
};
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/arcticdb/processing/grouper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ class HashingGroupers {
if (std::isnan(key)) {
return std::nullopt;
} else {
return hash<RawType>(&key);
return hash<RawType>(key);
}
} else {
return hash<RawType>(&key);
return hash<RawType>(key);
}
}
private:
Expand Down
91 changes: 90 additions & 1 deletion cpp/arcticdb/processing/test/benchmark_clause.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
*/

#include <random>

#include <benchmark/benchmark.h>

#include <arcticdb/processing/clause.hpp>
Expand Down Expand Up @@ -86,5 +88,92 @@ static void BM_merge_ordered(benchmark::State& state){
}
}

template <typename integer>
requires std::integral<integer>
void BM_hash_grouping_int(benchmark::State& state) {
auto num_rows = state.range(0);
auto num_unique_values = state.range(1);
auto num_buckets = state.range(2);

std::random_device rd;
std::mt19937 gen(rd());
// uniform_int_distribution (validly) undefined for int8_t in MSVC, hence the casting backwards and forwards
std::uniform_int_distribution<int64_t> dis(static_cast<int64_t>(std::numeric_limits<integer>::lowest()),
static_cast<int64_t>(std::numeric_limits<integer>::max()));
std::vector<integer> unique_values;
unique_values.reserve(num_unique_values);
for (auto idx = 0; idx < num_unique_values; ++idx) {
unique_values.emplace_back(static_cast<integer>(dis(gen)));
}

std::uniform_int_distribution<size_t> unique_values_dis(0, num_unique_values - 1);

std::vector<integer> data;
data.reserve(num_rows);
for(int idx = 0; idx < num_rows; ++idx) {
data.emplace_back(unique_values[unique_values_dis(gen)]);
}

constexpr auto data_type = data_type_from_raw_type<integer>();
Column column(make_scalar_type(data_type), num_rows, true, false);
memcpy(column.ptr(), data.data(), num_rows * sizeof(integer));
ColumnWithStrings col_with_strings(std::move(column), {});

grouping::HashingGroupers::Grouper<ScalarTagType<DataTypeTag<data_type>>> grouper;
grouping::ModuloBucketizer bucketizer(num_buckets);

for (auto _ : state) {
auto buckets = get_buckets(col_with_strings, grouper, bucketizer);
}
}

void BM_hash_grouping_string(benchmark::State& state) {
auto num_rows = state.range(0);
auto num_unique_values = state.range(1);
auto num_buckets = state.range(2);
auto string_length = state.range(3);

std::random_device rd;
std::mt19937 gen(rd());
const std::string character_set{"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`¬!\"£$%^&*()_+-=[]{};:'@#~,<.>/? "};
std::uniform_int_distribution<> dis(0, character_set.size() - 1);
std::vector<std::string> unique_values;
unique_values.reserve(num_unique_values);
for (auto idx = 0; idx < num_unique_values; ++idx) {
std::string builder;
builder.reserve(string_length);
for (auto idx_2 = 0; idx_2 < string_length; ++idx_2) {
builder += character_set[dis(gen)];
}
unique_values.emplace_back(builder);
}

std::uniform_int_distribution<size_t> unique_values_dis(0, num_unique_values - 1);

Column column(make_scalar_type(DataType::UTF_DYNAMIC64), false);
auto string_pool = std::make_shared<StringPool>();
for (auto idx = 0; idx < num_rows; ++idx) {
std::string_view str{unique_values[unique_values_dis(gen)]};
OffsetString ofstr = string_pool->get(str);
column.set_scalar(idx, ofstr.offset());
}

ColumnWithStrings col_with_strings(std::move(column), string_pool);

grouping::HashingGroupers::Grouper<ScalarTagType<DataTypeTag<DataType::UTF_DYNAMIC64>>> grouper;
grouping::ModuloBucketizer bucketizer(num_buckets);

for (auto _ : state) {
auto buckets = get_buckets(col_with_strings, grouper, bucketizer);
}
}

BENCHMARK(BM_merge_interleaved)->Args({10'000, 100});
BENCHMARK(BM_merge_ordered)->Args({10'000, 100});
BENCHMARK(BM_merge_ordered)->Args({10'000, 100});

BENCHMARK(BM_hash_grouping_int<int8_t>)->Args({100'000, 10, 2})->Args({100'000, 100'000, 2});
BENCHMARK(BM_hash_grouping_int<int16_t>)->Args({100'000, 10, 2})->Args({100'000, 100'000, 2});
BENCHMARK(BM_hash_grouping_int<int32_t>)->Args({100'000, 10, 2})->Args({100'000, 100'000, 2});
BENCHMARK(BM_hash_grouping_int<int64_t>)->Args({100'000, 10, 2})->Args({100'000, 100'000, 2});

BENCHMARK(BM_hash_grouping_string)->Args({100'000, 10, 2, 10})->Args({100'000, 100'000, 2, 10})->Args({100'000, 10, 2, 100})->Args({100'000, 100'000, 2, 100});
42 changes: 0 additions & 42 deletions cpp/arcticdb/processing/test/test_clause.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,48 +44,6 @@ void segment_string_assert_all_values_equal(const arcticdb::ProcessingUnit& proc

}

TEST(Clause, Partition) {
// This test is fragile as it depends on both the hash function used for grouping and the partitioning strategy
// Remove if either of these change and the test starts failing
using namespace arcticdb;
ScopedConfig num_buckets("Partition.NumBuckets", 16);
auto component_manager = std::make_shared<ComponentManager>();

PartitionClause<arcticdb::grouping::HashingGroupers, arcticdb::grouping::ModuloBucketizer> partition{"int8"};
partition.set_component_manager(component_manager);

auto proc_unit = ProcessingUnit{get_groupable_timeseries_segment("groupable", 30, {1,2,3,1,2,3,1,2,3})};
auto entity_ids = Composite<EntityIds>(push_entities(component_manager, std::move(proc_unit)));
auto partitioned = gather_entities(component_manager, partition.process(std::move(entity_ids)));

std::vector<std::unordered_set<int8_t>> tags = {{1, 3}, {2}};
std::array<size_t, 2> sizes = {180, 90};
for (auto inner_seg : folly::enumerate(partitioned.as_range())){
segment_scalar_assert_all_values_equal<int8_t>(*inner_seg, ColumnName("int8"), tags[inner_seg.index], sizes[inner_seg.index]);
}
}

TEST(Clause, PartitionString) {
// This test is fragile as it depends on both the hash function used for grouping and the partitioning strategy
// Remove if either of these change and the test starts failing
using namespace arcticdb;
ScopedConfig num_buckets("Partition.NumBuckets", 16);
auto component_manager = std::make_shared<ComponentManager>();

PartitionClause<arcticdb::grouping::HashingGroupers, arcticdb::grouping::ModuloBucketizer> partition{"strings"};
partition.set_component_manager(component_manager);

auto proc_unit = ProcessingUnit{get_groupable_timeseries_segment("groupable", 30, {1,1,3,3,1,1})};
auto entity_ids = Composite<EntityIds>(push_entities(component_manager, std::move(proc_unit)));
auto partitioned = gather_entities(component_manager, partition.process(std::move(entity_ids)));

std::vector<size_t> tags = {1, 3};
std::vector<size_t> sizes = {120, 60};
for (auto inner_seg : folly::enumerate(partitioned.as_range())){
segment_string_assert_all_values_equal(*inner_seg, ColumnName("strings"), fmt::format("string_{}", tags[inner_seg.index]), sizes[inner_seg.index]);
}
}

TEST(Clause, PartitionEmptyColumn) {
using namespace arcticdb;
auto component_manager = std::make_shared<ComponentManager>();
Expand Down
20 changes: 7 additions & 13 deletions cpp/arcticdb/util/hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,16 @@

namespace arcticdb {

using HashedValue = XXH64_hash_t;
constexpr std::size_t DEFAULT_SEED = 0x42;

template<class T, std::size_t seed = DEFAULT_SEED>
HashedValue hash(T *d, std::size_t count) {
return XXH64(reinterpret_cast<const void *>(d), count * sizeof(T), seed);
template<class T>
inline size_t hash(T d) {
return std::hash<T>{}(d);
}

// size argument to XXH64 being compile-time constant improves performance
template<class T, std::size_t seed = DEFAULT_SEED>
HashedValue hash(T *d) {
return XXH64(reinterpret_cast<const void *>(d), sizeof(T), seed);
inline size_t hash(std::string_view sv) {
return std::hash<std::string_view>{}(sv);
}

inline HashedValue hash(std::string_view sv) {
return hash(sv.data(), sv.size());
}
using HashedValue = XXH64_hash_t;

class HashAccum {
public:
Expand All @@ -56,6 +49,7 @@ class HashAccum {
}
private:
XXH64_state_t state_ = XXH64_state_t{};
static constexpr std::size_t DEFAULT_SEED = 0x42;
};

} // namespace arcticdb

0 comments on commit 01e890c

Please sign in to comment.