Skip to content

Commit

Permalink
Rebase velox (2024_03_15)
Browse files Browse the repository at this point in the history
Signed-off-by: glutenperfbot <[email protected]>
  • Loading branch information
glutenperfbot committed Mar 15, 2024
1 parent bb32a90 commit db2c5e3
Show file tree
Hide file tree
Showing 5 changed files with 330 additions and 0 deletions.
9 changes: 9 additions & 0 deletions velox/docs/functions/spark/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ Array Functions
SELECT array_sort(ARRAY [NULL, 1, NULL]); -- [1, NULL, NULL]
SELECT array_sort(ARRAY [NULL, 2, 1]); -- [1, 2, NULL]

.. spark:function:: array_union(array(E), array(E1)) -> array(E2)
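Returns an array of the distinct elements in the union of the two input arrays. Elements of the first array come first, followed by the elements of the second array that have not been seen yet; at most one NULL and at most one NaN are kept. ::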

SELECT array_union(array(1, 2, 3), array(1, 3, 5)); -- [1, 2, 3, 5]
SELECT array_union(array(1, 3, 5), array(1, 2, 3)); -- [1, 3, 5, 2]
SELECT array_union(array(1, 2, 3), array(1, 3, 5, null)); -- [1, 2, 3, 5, null]
SELECT array_union(array(1, 2, NaN), array(1, 3, NaN)); -- [1, 2, NaN, 3]

.. spark:function:: concat(array(E), array(E1), ..., array(En)) -> array(E, E1, ..., En)
Returns the concatenation of array(E), array(E1), ..., array(En). ::
Expand Down
89 changes: 89 additions & 0 deletions velox/functions/sparksql/ArrayUnionFunction.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace facebook::velox::functions::sparksql {

/// This class implements the array union function.
///
/// DEFINITION:
/// array_union(x, y) → array
/// Returns an array of the elements in the union of x and y, without
/// duplicates.
template <typename T>
struct ArrayUnionFunction {
VELOX_DEFINE_FUNCTION_TYPES(T)

// Fast path for primitives.
template <typename Out, typename In>
void call(Out& out, const In& inputArray1, const In& inputArray2) {
folly::F14FastSet<typename In::element_t> elementSet;
bool nullAdded = false;
bool nanAdded = false;
auto addItems = [&](auto& inputArray) {
for (const auto& item : inputArray) {
if (item.has_value()) {
if constexpr (
std::is_same_v<In, arg_type<Array<float>>> ||
std::is_same_v<In, arg_type<Array<double>>>) {
bool isNaN = std::isnan(item.value());
if ((isNaN && !nanAdded) ||
(!isNaN && elementSet.insert(item.value()).second)) {
auto& newItem = out.add_item();
newItem = item.value();
}
if (!nanAdded && isNaN) {
nanAdded = true;
}
} else if (elementSet.insert(item.value()).second) {
auto& newItem = out.add_item();
newItem = item.value();
}
} else if (!nullAdded) {
nullAdded = true;
out.add_null();
}
}
};
addItems(inputArray1);
addItems(inputArray2);
}

void call(
out_type<Array<Generic<T1>>>& out,
const arg_type<Array<Generic<T1>>>& inputArray1,
const arg_type<Array<Generic<T1>>>& inputArray2) {
folly::F14FastSet<exec::GenericView> elementSet;
bool nullAdded = false;
auto addItems = [&](auto& inputArray) {
for (const auto& item : inputArray) {
if (item.has_value()) {
if (elementSet.insert(item.value()).second) {
auto& newItem = out.add_item();
newItem.copy_from(item.value());
}
} else if (!nullAdded) {
nullAdded = true;
out.add_null();
}
}
};
addItems(inputArray1);
addItems(inputArray2);
}
};
} // namespace facebook::velox::functions::sparksql
23 changes: 23 additions & 0 deletions velox/functions/sparksql/Register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "velox/functions/prestosql/StringFunctions.h"
#include "velox/functions/sparksql/ArrayMinMaxFunction.h"
#include "velox/functions/sparksql/ArraySort.h"
#include "velox/functions/sparksql/ArrayUnionFunction.h"
#include "velox/functions/sparksql/Bitwise.h"
#include "velox/functions/sparksql/DateTimeFunctions.h"
#include "velox/functions/sparksql/Hash.h"
Expand Down Expand Up @@ -122,6 +123,12 @@ inline void registerArrayMinMaxFunctions(const std::string& prefix) {
}
} // namespace

template <typename T>
inline void registerArrayUnionFunctions(const std::string& prefix) {
registerFunction<sparksql::ArrayUnionFunction, Array<T>, Array<T>, Array<T>>(
{prefix + "array_union"});
}

void registerFunctions(const std::string& prefix) {
registerAllSpecialFormGeneralFunctions();

Expand Down Expand Up @@ -357,8 +364,24 @@ void registerFunctions(const std::string& prefix) {

registerFunction<MonotonicallyIncreasingIdFunction, int64_t>(
{prefix + "monotonically_increasing_id"});
<<<<<<< HEAD

registerFunction<UuidFunction, Varchar, Constant<int64_t>>({prefix + "uuid"});
=======
registerArrayUnionFunctions<bool>(prefix);
registerArrayUnionFunctions<int8_t>(prefix);
registerArrayUnionFunctions<int16_t>(prefix);
registerArrayUnionFunctions<int32_t>(prefix);
registerArrayUnionFunctions<int64_t>(prefix);
registerArrayUnionFunctions<int128_t>(prefix);
registerArrayUnionFunctions<float>(prefix);
registerArrayUnionFunctions<double>(prefix);
registerArrayUnionFunctions<Varchar>(prefix);
registerArrayUnionFunctions<Varbinary>(prefix);
registerArrayUnionFunctions<Date>(prefix);
registerArrayUnionFunctions<Timestamp>(prefix);
registerArrayUnionFunctions<Generic<T1>>(prefix);
>>>>>>> Fix array_union on NaN (7086)
}

} // namespace sparksql
Expand Down
208 changes: 208 additions & 0 deletions velox/functions/sparksql/tests/ArrayUnionTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h"

using namespace facebook::velox;
using namespace facebook::velox::test;

namespace facebook::velox::functions::sparksql::test {
namespace {

class ArrayUnionTest : public SparkFunctionBaseTest {
protected:
void testExpression(
const std::string& expression,
const std::vector<VectorPtr>& input,
const VectorPtr& expected) {
auto result = evaluate(expression, makeRowVector(input));
assertEqualVectors(expected, result);
}

template <typename T>
void testFloatArray() {
const auto array1 = makeArrayVector<T>(
{{1.99, 2.78, 3.98, 4.01},
{3.89, 4.99, 5.13},
{7.13, 8.91, std::numeric_limits<T>::quiet_NaN()},
{10.02, 20.01, std::numeric_limits<T>::quiet_NaN()}});
const auto array2 = makeArrayVector<T>(
{{2.78, 4.01, 5.99},
{3.89, 4.99, 5.13},
{7.13, 8.91, std::numeric_limits<T>::quiet_NaN()},
{40.99, 50.12}});

VectorPtr expected;
expected = makeArrayVector<T>({
{1.99, 2.78, 3.98, 4.01, 5.99},
{3.89, 4.99, 5.13},
{7.13, 8.91, std::numeric_limits<T>::quiet_NaN()},
{10.02, 20.01, std::numeric_limits<T>::quiet_NaN(), 40.99, 50.12},
});
testExpression("array_union(c0, c1)", {array1, array2}, expected);

expected = makeArrayVector<T>({
{2.78, 4.01, 5.99, 1.99, 3.98},
{3.89, 4.99, 5.13},
{7.13, 8.91, std::numeric_limits<T>::quiet_NaN()},
{40.99, 50.12, 10.02, 20.01, std::numeric_limits<T>::quiet_NaN()},
});
testExpression("array_union(c0, c1)", {array2, array1}, expected);
}
};

// array_union over int64 arrays, checked with both argument orders.
TEST_F(ArrayUnionTest, intArray) {
  const auto lhs = makeArrayVector<int64_t>(
      {{1, 2, 3, 4}, {3, 4, 5}, {7, 8, 9}, {10, 20, 30}});
  const auto rhs =
      makeArrayVector<int64_t>({{2, 4, 5}, {3, 4, 5}, {}, {40, 50}});

  // lhs elements first, then the rhs elements not seen yet.
  const VectorPtr expectedLhsFirst = makeArrayVector<int64_t>({
      {1, 2, 3, 4, 5},
      {3, 4, 5},
      {7, 8, 9},
      {10, 20, 30, 40, 50},
  });
  testExpression("array_union(c0, c1)", {lhs, rhs}, expectedLhsFirst);

  // Same inputs with the arguments swapped.
  const VectorPtr expectedRhsFirst = makeArrayVector<int64_t>({
      {2, 4, 5, 1, 3},
      {3, 4, 5},
      {7, 8, 9},
      {40, 50, 10, 20, 30},
  });
  testExpression("array_union(c0, c1)", {rhs, lhs}, expectedRhsFirst);
}

// Union two floating point arrays. Runs the shared testFloatArray helper once
// with float elements and once with double elements.
TEST_F(ArrayUnionTest, floatArray) {
testFloatArray<float>();
testFloatArray<double>();
}

// array_union over string arrays: duplicates are dropped and first-seen order
// is kept.
TEST_F(ArrayUnionTest, stringArray) {
  const auto lhs =
      makeArrayVector<StringView>({{"foo", "bar"}, {"foo", "baz"}});
  const auto rhs =
      makeArrayVector<StringView>({{"foo", "bar"}, {"bar", "baz"}});

  const VectorPtr expectedUnion = makeArrayVector<StringView>({
      {"foo", "bar"},
      {"foo", "baz", "bar"},
  });
  testExpression("array_union(c0, c1)", {lhs, rhs}, expectedUnion);
}

// array_union over int64 arrays that contain null elements, plus one row
// where an entire input array is null.
TEST_F(ArrayUnionTest, nullArray) {
  const auto lhs = makeNullableArrayVector<int64_t>({
      {{1, std::nullopt, 3, 4}},
      {7, 8, 9},
      {{10, std::nullopt, std::nullopt}},
  });
  const auto rhs = makeNullableArrayVector<int64_t>({
      {{std::nullopt, std::nullopt, 3, 5}},
      std::nullopt,
      {{1, 10}},
  });

  // A single null element is kept at its first position; the row with a null
  // input array is expected to be null.
  const VectorPtr expectedLhsFirst = makeNullableArrayVector<int64_t>({
      {{1, std::nullopt, 3, 4, 5}},
      std::nullopt,
      {{10, std::nullopt, 1}},
  });
  testExpression("array_union(c0, c1)", {lhs, rhs}, expectedLhsFirst);

  // Same inputs with the arguments swapped.
  const VectorPtr expectedRhsFirst = makeNullableArrayVector<int64_t>({
      {{std::nullopt, 3, 5, 1, 4}},
      std::nullopt,
      {{1, 10, std::nullopt}},
  });
  testExpression("array_union(c0, c1)", {rhs, lhs}, expectedRhsFirst);
}

// array_union over arrays whose elements are themselves int64 arrays.
TEST_F(ArrayUnionTest, complexTypes) {
  const auto elements = makeArrayVector<int64_t>(
      {{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}});

  // Both inputs slice the same element vector with different offsets.
  // lhs rows:
  //   [[1, 1], [2, 2]]
  //   [[3, 3], [4, 4]]
  //   [[5, 5], [6, 6]]
  const auto lhs = makeArrayVector({0, 2, 4}, elements);
  // rhs rows:
  //   [[1, 1], [2, 2], [3, 3]]
  //   [[4, 4]]
  //   [[5, 5], [6, 6]]
  const auto rhs = makeArrayVector({0, 3, 4}, elements);

  // Expected rows:
  //   [[1, 1], [2, 2], [3, 3]]
  //   [[3, 3], [4, 4]]
  //   [[5, 5], [6, 6]]
  const auto expectedUnion = makeArrayVector(
      {0, 3, 5},
      makeArrayVector<int64_t>(
          {{1, 1}, {2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}));

  testExpression("array_union(c0, c1)", {lhs, rhs}, expectedUnion);
}

// array_union over arrays whose elements are double arrays, one of which
// contains NaN.
TEST_F(ArrayUnionTest, complexDoubleType) {
  constexpr double kNaN = std::numeric_limits<double>::quiet_NaN();

  const auto elements = makeArrayVector<double>(
      {{1.0, 1.0},
       {2.0, 2.0},
       {3.0, 3.0},
       {4.0, 4.0},
       {5.0, kNaN},
       {6.0, 6.0}});

  // Both inputs slice the same element vector with different offsets.
  // lhs rows:
  //   [[1.0, 1.0], [2.0, 2.0]]
  //   [[3.0, 3.0], [4.0, 4.0]]
  //   [[5.0, NaN], [6.0, 6.0]]
  const auto lhs = makeArrayVector({0, 2, 4}, elements);
  // rhs rows:
  //   [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]
  //   [[4.0, 4.0]]
  //   [[5.0, NaN], [6.0, 6.0]]
  const auto rhs = makeArrayVector({0, 3, 4}, elements);

  // Expected rows:
  //   [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]
  //   [[3.0, 3.0], [4.0, 4.0]]
  //   [[5.0, NaN], [6.0, 6.0]]
  const auto expectedUnion = makeArrayVector(
      {0, 3, 5},
      makeArrayVector<double>(
          {{1.0, 1.0},
           {2.0, 2.0},
           {3.0, 3.0},
           {3.0, 3.0},
           {4.0, 4.0},
           {5.0, kNaN},
           {6.0, 6.0}}));

  testExpression("array_union(c0, c1)", {lhs, rhs}, expectedUnion);
}
} // namespace
} // namespace facebook::velox::functions::sparksql::test
1 change: 1 addition & 0 deletions velox/functions/sparksql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_executable(
ArrayMaxTest.cpp
ArrayMinTest.cpp
ArraySortTest.cpp
ArrayUnionTest.cpp
BitwiseTest.cpp
ComparisonsTest.cpp
DateTimeFunctionsTest.cpp
Expand Down

0 comments on commit db2c5e3

Please sign in to comment.