Skip to content

Commit

Permalink
Fix segfault in conditional join (#16094)
Browse files Browse the repository at this point in the history
Closes #16066.

I found a bug that would cause the reported segfault and have fixed it in this PR. When the right table has zero rows, conditional left anti-joins were returning a vector of indices containing garbage data.

Along the way, I refactored several parts of the conditional join tests and added coverage for more cases.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #16094
  • Loading branch information
bdice authored Jun 26, 2024
1 parent e0b8ab0 commit 65b64f6
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 35 deletions.
13 changes: 3 additions & 10 deletions cpp/src/join/conditional_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
{
if (right.num_rows() == 0) {
switch (join_type) {
case join_kind::LEFT_ANTI_JOIN:
return std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first;
case join_kind::LEFT_SEMI_JOIN:
return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
default: CUDF_FAIL("Invalid join kind."); break;
Expand Down Expand Up @@ -96,10 +95,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
join_size = size.value(stream);
}

if (left.num_rows() == 0) {
return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
}

rmm::device_scalar<size_type> write_index(0, stream);

auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
Expand Down Expand Up @@ -149,8 +144,7 @@ conditional_join(table_view const& left,
// with a corresponding NULL from the right.
case join_kind::LEFT_JOIN:
case join_kind::LEFT_ANTI_JOIN:
case join_kind::FULL_JOIN:
return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource());
case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr);
// Inner and left semi joins return empty output because no matches can exist.
case join_kind::INNER_JOIN:
case join_kind::LEFT_SEMI_JOIN:
Expand All @@ -169,8 +163,7 @@ conditional_join(table_view const& left,
std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
// Full joins need to return the trivial complement.
case join_kind::FULL_JOIN: {
auto ret_flipped =
get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource());
auto ret_flipped = get_trivial_left_join_indices(right, stream, mr);
return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
}
default: CUDF_FAIL("Invalid join kind."); break;
Expand Down
92 changes: 67 additions & 25 deletions cpp/tests/join/conditional_join_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <cudf/ast/expressions.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
{
auto result_size = this->join_size(left, right, predicate);
EXPECT_TRUE(result_size == expected_outputs.size());

auto result = this->join(left, right, predicate);
std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs;
for (size_t i = 0; i < result.first->size(); ++i) {
// Note: Not trying to be terribly efficient here since these tests are
// small, otherwise a batch copy to host before constructing the tuples
// would be important.
result_pairs.push_back({result.first->element(i, cudf::get_default_stream()),
result.second->element(i, cudf::get_default_stream())});
}
EXPECT_EQ(result_size, expected_outputs.size());

auto result = this->join(left, right, predicate);
auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream());
auto rhs_result =
cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream());
std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs(lhs_result.size());
std::transform(lhs_result.begin(),
lhs_result.end(),
rhs_result.begin(),
result_pairs.begin(),
[](cudf::size_type lhs, cudf::size_type rhs) {
return std::pair{lhs, rhs};
});
std::sort(result_pairs.begin(), result_pairs.end());
std::sort(expected_outputs.begin(), expected_outputs.end());

EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin()));
EXPECT_TRUE(std::equal(
expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end()));
}

/*
Expand Down Expand Up @@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty)
this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
};

TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty)
{
this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
};

TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual)
{
this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}});
Expand Down Expand Up @@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty)
this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
};

TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty)
{
this->test({{3, 4, 5}},
{{}},
left_zero_eq_right_zero,
{{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
};

TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash)
{
auto [left, right] = gen_random_repeated_columns<TypeParam>();
Expand Down Expand Up @@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty)
{{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}});
};

TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty)
{
this->test({{3, 4, 5}},
{{}},
left_zero_eq_right_zero,
{{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
};

TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual)
{
this->test({{0, 1, 2}, {10, 20, 30}},
Expand Down Expand Up @@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
this->parse_input(left_data, right_data);
auto result_size = this->join_size(left, right, predicate);
EXPECT_TRUE(result_size == expected_outputs.size());

auto result = this->join(left, right, predicate);
std::vector<cudf::size_type> resulting_indices;
for (size_t i = 0; i < result->size(); ++i) {
// Note: Not trying to be terribly efficient here since these tests are
// small, otherwise a batch copy to host before constructing the tuples
// would be important.
resulting_indices.push_back(result->element(i, cudf::get_default_stream()));
}
std::sort(resulting_indices.begin(), resulting_indices.end());
EXPECT_EQ(result_size, expected_outputs.size());

auto result = this->join(left, right, predicate);
auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream());
std::sort(result_indices.begin(), result_indices.end());
std::sort(expected_outputs.begin(), expected_outputs.end());
EXPECT_TRUE(
std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin()));
EXPECT_TRUE(std::equal(result_indices.begin(),
result_indices.end(),
expected_outputs.begin(),
expected_outputs.end()));
}

void _compare_to_hash_join(std::unique_ptr<rmm::device_uvector<cudf::size_type>> const& result,
Expand Down Expand Up @@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest<T> {

TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);

TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty)
{
this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
};

TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty)
{
this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
};

TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual)
{
this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1});
Expand Down Expand Up @@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest<T> {

TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);

TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty)
{
this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
};

TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty)
{
this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2});
};

TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual)
{
this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});
Expand Down

0 comments on commit 65b64f6

Please sign in to comment.