Skip to content

Commit

Permalink
Fix corner case bug in VectorHasher. (facebookincubator#6646)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#6646

VectorHasher could go into rangeOverflow_ mode while having hasRange_ true.
As a result mayUseValueIds() was returning true even with distinctOverflow_ being true.
distinctOverflow_ being true means we are no longer using that VectorHasher.
Due to mayUseValueIds() returning true the HashBuild was still adding rows to
the VectorHasher and from every batch a single row would leak into the underlying
F14Map w/o string data being backed by saving it in the VectorHasher.
When F14Map was rehashing - it detected the discrepancy in the 7-bit tag and aborted.
The issue is quite rare.

We fix by:
- Whenever we go into rangeOverflow_ mode we also set hasRange_ to false, thus fixing the bug.
- We clean unnecessary data when going to distinctOverflow_.

Reviewed By: Yuhta

Differential Revision: D49449425

fbshipit-source-id: 7bcd67b4e53f9129ecb14be0ee70767a04cb7ea0
  • Loading branch information
Sergey Pershin authored and codyschierbeck committed Sep 27, 2023
1 parent e36969b commit d04f124
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 10 deletions.
27 changes: 19 additions & 8 deletions velox/exec/VectorHasher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ void VectorHasher::analyzeValue(StringView value) {
auto data = value.data();
if (!rangeOverflow_) {
if (size > kStringASRangeMaxSize) {
rangeOverflow_ = true;
setRangeOverflow();
} else {
int64_t number = stringAsNumber(data, size);
updateRange(number);
Expand All @@ -585,7 +585,7 @@ void VectorHasher::analyzeValue(StringView value) {
auto pair = uniqueValues_.insert(unique);
if (pair.second) {
if (uniqueValues_.size() > kMaxDistinct) {
distinctOverflow_ = true;
setDistinctOverflow();
return;
}
copyStringToLocal(&*pair.first);
Expand All @@ -599,7 +599,7 @@ void VectorHasher::copyStringToLocal(const UniqueValue* unique) {
return;
}
if (distinctStringsBytes_ > kMaxDistinctStringsBytes) {
distinctOverflow_ = true;
setDistinctOverflow();
return;
}
if (uniqueValuesStorage_.empty()) {
Expand All @@ -621,6 +621,18 @@ void VectorHasher::copyStringToLocal(const UniqueValue* unique) {
reinterpret_cast<int64_t>(str->data() + start));
}

void VectorHasher::setDistinctOverflow() {
distinctOverflow_ = true;
uniqueValues_.clear();
uniqueValuesStorage_.clear();
distinctStringsBytes_ = 0;
}

void VectorHasher::setRangeOverflow() {
rangeOverflow_ = true;
hasRange_ = false;
}

std::unique_ptr<common::Filter> VectorHasher::getFilter(
bool nullAllowed) const {
switch (typeKind_) {
Expand Down Expand Up @@ -731,7 +743,7 @@ void VectorHasher::cardinality(
if (!hasRange_ || rangeOverflow_) {
asRange = kRangeTooLarge;
} else if (__builtin_sub_overflow(max_, min_, &signedRange)) {
rangeOverflow_ = true;
setRangeOverflow();
asRange = kRangeTooLarge;
} else if (signedRange < kMaxRange) {
// We check that after the extension by reservePct the range of max - min
Expand All @@ -746,7 +758,7 @@ void VectorHasher::cardinality(
extendRange(type_->kind(), reservePct, min, max);
asRange = (max - min) + 2;
} else {
rangeOverflow_ = true;
setRangeOverflow();
asRange = kRangeTooLarge;
}
if (distinctOverflow_) {
Expand Down Expand Up @@ -819,8 +831,7 @@ void VectorHasher::merge(const VectorHasher& other) {
min_ = std::min(min_, other.min_);
max_ = std::max(max_, other.max_);
} else {
hasRange_ = false;
rangeOverflow_ = true;
setRangeOverflow();
}
if (!distinctOverflow_ && !other.distinctOverflow_) {
// Unique values can be merged without dispatch on type. All the
Expand All @@ -833,7 +844,7 @@ void VectorHasher::merge(const VectorHasher& other) {
uniqueValues_.insert(value);
}
} else {
distinctOverflow_ = true;
setDistinctOverflow();
}
}

Expand Down
12 changes: 10 additions & 2 deletions velox/exec/VectorHasher.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ class VectorHasher {

std::string toString() const;

size_t numUniqueValues() const {
return uniqueValues_.size();
}

private:
static constexpr uint32_t kStringASRangeMaxSize = 7;
static constexpr uint32_t kStringBufferUnitSize = 1024;
Expand Down Expand Up @@ -415,7 +419,7 @@ class VectorHasher {
unique.setId(uniqueValues_.size() + 1);
if (uniqueValues_.insert(unique).second) {
if (uniqueValues_.size() > kMaxDistinct) {
distinctOverflow_ = true;
setDistinctOverflow();
}
}
}
Expand Down Expand Up @@ -515,6 +519,10 @@ class VectorHasher {

void copyStringToLocal(const UniqueValue* unique);

void setDistinctOverflow();

void setRangeOverflow();

static inline bool
isNullAt(const char* group, int32_t nullByte, uint8_t nullMask) {
return (group[nullByte] & nullMask) != 0;
Expand Down Expand Up @@ -613,7 +621,7 @@ inline uint64_t VectorHasher::valueId(StringView value) {
copyStringToLocal(&*pair.first);
if (!rangeOverflow_) {
if (size > kStringASRangeMaxSize) {
rangeOverflow_ = true;
setRangeOverflow();
} else {
updateRange(stringAsNumber(data, size));
}
Expand Down
41 changes: 41 additions & 0 deletions velox/exec/tests/VectorHasherTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,47 @@ TEST_F(VectorHasherTest, stringIds) {
EXPECT_EQ(numInRange, rows.countSelected());
}

// Tests distinct overflow, but starting with a small string
TEST_F(VectorHasherTest, stringDistinctOverflow) {
auto hasher = exec::VectorHasher::create(VARCHAR(), 1);

constexpr uint32_t numRows{10000};

// 7 vectors, 10000 rows each.
// The 1st row of every batch has a small string.
std::vector<FlatVectorPtr<StringView>> batches;
std::vector<std::vector<std::string>> strings;
strings.resize(7);
for (auto i = 0; i < 7; ++i) {
auto& stringVec = strings[i];
stringVec.resize(numRows);
batches.emplace_back(vectorMaker_->flatVector<StringView>(
numRows, [&i, &stringVec, numRows](vector_size_t row) {
const auto num = numRows * i + row;
stringVec[row] = (row != 0)
? fmt::format("abcdefghijabcdefghij{}", num)
: fmt::format("s{}", num);
return StringView(stringVec[row]);
}));
}

SelectivityVector rows(numRows, true);
raw_vector<uint64_t> hashes{numRows};
for (auto i = 0; i < 7; ++i) {
if (i < 5) {
ASSERT_TRUE(hasher->mayUseValueIds());
ASSERT_EQ(i * numRows, hasher->numUniqueValues());
} else {
ASSERT_FALSE(hasher->mayUseValueIds());
ASSERT_EQ(0, hasher->numUniqueValues());
}
if (hasher->mayUseValueIds()) {
hasher->decode(*batches[i], rows);
hasher->computeValueIds(rows, hashes);
}
}
}

TEST_F(VectorHasherTest, integerIds) {
auto vector = BaseVector::create(BIGINT(), 100, pool_.get());
auto ints = vector->as<FlatVector<int64_t>>();
Expand Down

0 comments on commit d04f124

Please sign in to comment.