From 968aef5e8f23890bb11c01dd5611bc7b6085ef8d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Apr 2024 10:03:32 -0500 Subject: [PATCH 01/52] hashing - initial Signed-off-by: brandon-b-miller --- .../user_guide/api_docs/pylibcudf/hashing.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/cpp/hash.pxd | 11 ++ python/cudf/cudf/_lib/cpp/hash.pyx | 0 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/hashing.pxd | 34 ++++++ python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 106 ++++++++++++++++++ 9 files changed, 163 insertions(+) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst create mode 100644 python/cudf/cudf/_lib/cpp/hash.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/hashing.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/hashing.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst new file mode 100644 index 00000000000..f5b8d6106da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst @@ -0,0 +1,6 @@ +======== +hashing +======== + +.. automodule:: cudf._lib.pylibcudf.hashing + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 8cad95f61ae..e148b9ca6c1 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -16,6 +16,7 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby + hashing join lists merge diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index d55e244dc2c..0200b2e34dd 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -16,6 +16,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const uint32_t seed ) except + + cdef unique_ptr[table] murmurhash3_x64_128 "cudf::hashing::murmurhash3_x64_128" ( + const table_view& input, + const uint64_t seed + ) + cdef unique_ptr[column] md5 "cudf::hashing::md5" ( const table_view& input ) except + @@ -44,3 +49,9 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input, const uint64_t seed ) except + + + cpdef enum class hash_id(int): + HASH_IDENTITY + HASH_MURMUR3 + HASH_SPARK_MURMUR3 + HASH_MD5 \ No newline at end of file diff --git a/python/cudf/cudf/_lib/cpp/hash.pyx b/python/cudf/cudf/_lib/cpp/hash.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 81d15cf95b4..12bbc90947c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -21,6 +21,7 @@ set(cython_sources filling.pyx gpumemoryview.pyx groupby.pyx + hashing.pyx interop.pyx join.pyx lists.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 48c23a9dd4c..c2216b13f0c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( copying, filling, groupby, + hashing, join, lists, merge, @@ -40,6 +41,7 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "hashing", "join", "lists", "merge", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8ccb0ecc341..2db4e950424 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ copying, filling, groupby, + hashing, interop, join, lists, @@ -39,6 +40,7 @@ "filling", "gpumemoryview", "groupby", + "hashing", "interop", "join", "lists", diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd new file mode 100644 index 00000000000..459c732e373 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.table cimport table +from libcpp.vector cimport vector + +#from cudf._lib.cpp.hash cimport hash_id + +from .column cimport Column +from .table cimport Table + + +cpdef Column murmurhash3_x86_32( + Table input, + uint32_t seed +) + +cpdef Table murmurhash3_x64_128( + Table input, + uint64_t seed +) + +cpdef Column md5(Table input) +cpdef Column sha1(Table input) +cpdef Column sha224(Table input) +cpdef Column sha256(Table input) +cpdef Column sha384(Table input) +cpdef Column sha512(Table input) + +cpdef Column xxhash_64( + Table input, + uint64_t seed +) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx new file mode 100644 index 00000000000..7147a8f417f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -0,0 +1,106 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.hash cimport( + murmurhash3_x86_32 as cpp_murmurhash3_x86_32, + murmurhash3_x64_128 as cpp_murmurhash3_x64_128, + md5 as cpp_md5, + sha1 as cpp_sha1, + sha224 as cpp_sha224, + sha256 as cpp_sha256, + sha384 as cpp_sha384, + sha512 as cpp_sha512, + xxhash_64 as cpp_xxhash_64 +) +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.table.table cimport table +from libc.stdint cimport uint32_t, uint64_t + +from .table cimport Table +from .column cimport Column + +#cpdef Column hash(Tabl) + +cpdef Column murmurhash3_x86_32( + Table input, + uint32_t seed +): + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_murmurhash3_x86_32( + input.view(), + seed + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Table murmurhash3_x64_128( + Table input, + uint64_t seed +): + cdef unique_ptr[table] c_result + with nogil: + c_result = move( + cpp_murmurhash3_x64_128( + input.view(), + seed + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Column xxhash_64( + Table input, + uint64_t seed +): + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_xxhash_64( + input.view(), + seed + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column md5(Table input): + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_md5(input.view())) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha1(Table input): + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_sha1(input.view())) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha224(Table input): + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_sha224(input.view())) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha256(Table input): + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_sha256(input.view())) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha384(Table input): + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_sha384(input.view())) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha512(Table input): + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_sha512(input.view())) + return Column.from_libcudf(move(c_result)) From f4c953c969a09d49a76c991586848b070df6dcb4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Apr 2024 11:47:45 -0500 Subject: [PATCH 02/52] minor cleanup for now Signed-off-by: brandon-b-miller --- python/cudf/cudf/_lib/cpp/hash.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index 0200b2e34dd..dce1f92c33f 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -54,4 +54,4 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: HASH_IDENTITY HASH_MURMUR3 HASH_SPARK_MURMUR3 - HASH_MD5 \ No newline at end of file + HASH_MD5 diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index 7147a8f417f..cb551dd9831 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -20,8 +20,6 @@ from libc.stdint cimport uint32_t, uint64_t from .table cimport Table from .column cimport Column -#cpdef Column hash(Tabl) - cpdef Column murmurhash3_x86_32( Table input, uint32_t seed From eeee5ee710be8cae79116474cd619021951ec82e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Apr 2024 13:35:07 -0500 Subject: [PATCH 03/52] add hash top level function --- python/cudf/cudf/_lib/hash.pyx | 44 ++------------------- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 41 ++++++++++++++----- 2 files changed, 36 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 6854cff7763..fd269c29831 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -7,19 +7,10 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector +from cudf._lib import pylibcudf + cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport ( - md5, - murmurhash3_x86_32, - sha1, - sha224, - sha256, - sha384, - sha512, - xxhash_64, -) from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -51,32 +42,5 @@ def hash_partition(list source_columns, object columns_to_hash, @acquire_spill_lock() def hash(list source_columns, str method, int seed=0): - cdef table_view c_source_view = table_view_from_columns(source_columns) - cdef unique_ptr[column] c_result - if method == "murmur3": - with nogil: - c_result = move(murmurhash3_x86_32(c_source_view, seed)) - elif method == "md5": - with nogil: - c_result = move(md5(c_source_view)) - elif method == "sha1": - with nogil: - c_result = move(sha1(c_source_view)) - elif method == "sha224": - with nogil: - c_result = move(sha224(c_source_view)) - elif method == "sha256": - with nogil: - c_result = move(sha256(c_source_view)) - elif method == "sha384": - with nogil: - c_result = move(sha384(c_source_view)) - elif method == "sha512": - with nogil: - c_result = move(sha512(c_source_view)) - elif method == "xxhash64": - with nogil: - c_result = move(xxhash_64(c_source_view, seed)) - else: - raise ValueError(f"Unsupported hash function: {method}") - return Column.from_unique_ptr(move(c_result)) + ctbl = pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]) + return Column.from_pylibcudf(pylibcudf.hashing.hash(ctbl, method, seed)) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index cb551dd9831..3d443cfa7fc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -1,27 +1,50 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf._lib.cpp.hash cimport( - murmurhash3_x86_32 as cpp_murmurhash3_x86_32, - murmurhash3_x64_128 as cpp_murmurhash3_x64_128, +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.hash cimport ( md5 as cpp_md5, + murmurhash3_x64_128 as cpp_murmurhash3_x64_128, + murmurhash3_x86_32 as cpp_murmurhash3_x86_32, sha1 as cpp_sha1, sha224 as cpp_sha224, sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, - xxhash_64 as cpp_xxhash_64 + xxhash_64 as cpp_xxhash_64, ) -from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table -from libc.stdint cimport uint32_t, uint64_t -from .table cimport Table from .column cimport Column +from .table cimport Table + + +cpdef Column hash(Table input, object method, object seed): + if method == "murmur3": + return murmurhash3_x86_32(input, seed) + elif method == "xxhash64": + return xxhash_64(input, seed) + elif method == "md5": + return md5(input) + elif method == "sha1": + return sha1(input) + elif method == "sha224": + return sha224(input) + elif method == "sha256": + return sha256(input) + elif method == "sha384": + return sha384(input) + elif method == "sha512": + return sha512(input) + else: + raise ValueError( + f"Unsupported hashing algorithm {method}." + ) cpdef Column murmurhash3_x86_32( - Table input, + Table input, uint32_t seed ): cdef unique_ptr[column] c_result @@ -52,7 +75,7 @@ cpdef Table murmurhash3_x64_128( cpdef Column xxhash_64( - Table input, + Table input, uint64_t seed ): cdef unique_ptr[column] c_result From 05764233dbf6368abb16be24b608737b8bb8df27 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 2 Apr 2024 14:05:02 -0500 Subject: [PATCH 04/52] begin tests --- .../cudf/cudf/pylibcudf_tests/common/utils.py | 1 - python/cudf/cudf/pylibcudf_tests/conftest.py | 24 ++++++++ .../cudf/cudf/pylibcudf_tests/test_copying.py | 17 ------ .../cudf/cudf/pylibcudf_tests/test_hashing.py | 55 +++++++++++++++++++ 4 files changed, 79 insertions(+), 18 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_hashing.py diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 6636ab9e5f8..596cd2c92ae 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -35,7 +35,6 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: plc_pa = plc_pa.combine_chunks() if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() - assert plc_pa.equals(pa_array) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 6d8284fb3db..69540c3d9ba 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -11,6 +11,8 @@ from utils import DEFAULT_STRUCT_TESTING_TYPE +import cudf._lib.pylibcudf as plc + # This fixture defines the standard set of types that all tests should default to # running on. If there is a need for some tests to run on a different set of types, that @@ -29,3 +31,25 @@ ) def pa_type(request): return request.param + + +# TODO: Test nullable data +@pytest.fixture(scope="session") +def pa_input_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([1, 2, 3], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["a", "b", "c"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([True, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[1], [2], [3]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="session") +def input_column(pa_input_column): + return plc.interop.from_arrow(pa_input_column) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0bf30f98636..a79aea3e12e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -18,23 +18,6 @@ from cudf._lib import pylibcudf as plc -# TODO: Test nullable data -@pytest.fixture(scope="module") -def pa_input_column(pa_type): - if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([1, 2, 3], type=pa_type) - elif pa.types.is_string(pa_type): - return pa.array(["a", "b", "c"], type=pa_type) - elif pa.types.is_boolean(pa_type): - return pa.array([True, True, False], type=pa_type) - elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) - elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - raise ValueError("Unsupported type") - - @pytest.fixture(scope="module") def input_column(pa_input_column): return plc.interop.from_arrow(pa_input_column) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py new file mode 100644 index 00000000000..ba591277d02 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import hashlib + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + +SEED = 0 +METHODS = ["" "md5", "sha1", "sha224", "sha256", "sha384", "sha512"] + + +# Full table hash +@pytest.fixture(scope="module") +def all_types_input_table(): + data = pa.Table.from_pydict( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "bool": [True, False, True], + "string": ["a", "b", "c"], + "list": [[1], [2], [3]], + # 'struct': [{'a': 1}, {'a': 2}, {'a': 3}] + } + ) + return data + + +def all_types_output_table(input, method): + def _applyfunc(x): + hasher = getattr(hashlib, method) + return hasher(str(x).encode()).hexdigest() + + result = pa.Table.from_pandas(input.to_pandas().map(_applyfunc)) + return result + + +@pytest.mark.parametrize("method", METHODS) +def test_hash_column(pa_input_column, method): + def _applyfunc(x): + hasher = getattr(hashlib, method) + return hasher(str(x).encode()).hexdigest() + + plc_tbl = plc.interop.from_arrow( + pa.Table.from_arrays([pa_input_column], names=["data"]) + ) + plc_hasher = getattr(plc.hashing, method) + + expect = pa.Array.from_pandas( + pa_input_column.to_pandas().apply(_applyfunc) + ) + got = plc_hasher(plc_tbl) + assert_column_eq(got, expect) From 306ad1d42b7bd0c4d5ac15f8d91577a1ed8addf6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 3 Apr 2024 08:39:01 -0500 Subject: [PATCH 05/52] some untested code worth saving --- python/cudf/cudf/_lib/hash.pyx | 31 +++- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 22 --- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 133 +++++++++++++++++- 3 files changed, 161 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index fd269c29831..45d3c35aa19 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -14,6 +14,16 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.pylibcudf.hashing cimport ( + md5, + murmurhash3_x86_32, + sha1, + sha224, + sha256, + sha384, + sha512, + xxhash_64, +) from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @@ -43,4 +53,23 @@ def hash_partition(list source_columns, object columns_to_hash, @acquire_spill_lock() def hash(list source_columns, str method, int seed=0): ctbl = pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]) - return Column.from_pylibcudf(pylibcudf.hashing.hash(ctbl, method, seed)) + if method == "murmur3": + return Column.from_pylibcudf(murmurhash3_x86_32(ctbl, seed)) + elif method == "xxhash64": + return Column.from_pylibcudf(xxhash_64(ctbl, seed)) + elif method == "md5": + return Column.from_pylibcudf(md5(ctbl)) + elif method == "sha1": + return Column.from_pylibcudf(sha1(ctbl)) + elif method == "sha224": + return Column.from_pylibcudf(sha224(ctbl)) + elif method == "sha256": + return Column.from_pylibcudf(sha256(ctbl)) + elif method == "sha384": + return Column.from_pylibcudf(sha384(ctbl)) + elif method == "sha512": + return Column.from_pylibcudf(sha512(ctbl)) + else: + raise ValueError( + f"Unsupported hashing algorithm {method}." + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index 3d443cfa7fc..dd4bde01a37 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -21,28 +21,6 @@ from .column cimport Column from .table cimport Table -cpdef Column hash(Table input, object method, object seed): - if method == "murmur3": - return murmurhash3_x86_32(input, seed) - elif method == "xxhash64": - return xxhash_64(input, seed) - elif method == "md5": - return md5(input) - elif method == "sha1": - return sha1(input) - elif method == "sha224": - return sha224(input) - elif method == "sha256": - return sha256(input) - elif method == "sha384": - return sha384(input) - elif method == "sha512": - return sha512(input) - else: - raise ValueError( - f"Unsupported hashing algorithm {method}." - ) - cpdef Column murmurhash3_x86_32( Table input, uint32_t seed diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index ba591277d02..97efc9da60f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -2,14 +2,16 @@ import hashlib +import numpy as np import pyarrow as pa import pytest +import xxhash from utils import assert_column_eq import cudf._lib.pylibcudf as plc SEED = 0 -METHODS = ["" "md5", "sha1", "sha224", "sha256", "sha384", "sha512"] +METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] # Full table hash @@ -22,7 +24,7 @@ def all_types_input_table(): "bool": [True, False, True], "string": ["a", "b", "c"], "list": [[1], [2], [3]], - # 'struct': [{'a': 1}, {'a': 2}, {'a': 3}] + "struct": [{"a": 1}, {"a": 2}, {"a": 3}], } ) return data @@ -53,3 +55,130 @@ def _applyfunc(x): ) got = plc_hasher(plc_tbl) assert_column_eq(got, expect) + + +@pytest.mark.parametrize( + "method", ["sha1", "sha224", "sha256", "sha384", "sha512"] +) +@pytest.mark.parametrize("dtype", ["list", "struct"]) +def test_sha_list_struct_err(all_types_input_table, dtype, method): + err_types = all_types_input_table[[dtype]] + plc_tbl = plc.interop.from_arrow(err_types) + plc_hasher = getattr(plc.hashing, method) + + with pytest.raises(TypeError): + plc_hasher(plc_tbl) + + +def test_xxhash_64_int(): + input_data = plc.interop.from_arrow( + pa.array.from_pylist( + [ + -127, + -70000, + 0, + 200000, + 128, + np.iinfo("int32").max(), + np.iinfo("int32").min(), + np.iinfo("int32").min(), + ] + ) + ) + expected = pa.array.from_pylist( + [ + 4827426872506142937, + 13867166853951622683, + 4246796580750024372, + 17339819992360460003, + 7292178400482025765, + 2971168436322821236, + 9380524276503839603, + 9380524276503839603, + ] + ) + got = plc.hashing.xxhash_64(input_data) + assert_column_eq(got, expected) + + +def test_xxhash_64_double(): + # see xxhash_64_test.cpp for details + input_data = plc.interop.from_arrow( + pa.array.from_pylist( + [ + -127.0, + -70000.125, + 128.5, + -0.0, + np.inf, + np.nan, + np.iinfo("float64").max(), + np.iinfo("float64").min(), + np.iinfo("float64").min(), + ] + ) + ) + expected = pa.array.from_pylist( + [ + 16892115221677838993, + 1686446903308179321, + 3803688792395291579, + 18250447068822614389, + 3511911086082166358, + 4558309869707674848, + 18031741628920313605, + 16838308782748609196, + 3127544388062992779, + 1692401401506680154, + 13770442912356326755, + ] + ) + got = plc.hashing.xxhash_64(input_data) + assert_column_eq(got, expected) + + +def test_xxhash_64_string(): + input_data = plc.interop.from_arrow( + [ + "The", + "quick", + "brown fox", + "jumps over the lazy dog.", + "I am Jack's complete lack of null value", + "A very long (greater than 128 bytes/characters) to test a very long string. " + "2nd half of the very long string to verify the long string hashing happening.", + "Some multi-byte characters here: ééé", + "ééé", + "ééé ééé", + "ééé ééé ééé ééé", + "", + "!@#$%^&*(())", + "0123456789", + "{}|:<>?,./;[]=-", + ] + ) + + def hasher(x): + return xxhash.xxh64(bytes(x, "utf-8")).intdigest() + + expected = pa.from_pandas(input_data.to_pandas().apply(hasher)) + got = plc.hashing.xxhash_64(input) + + assert_column_eq(got, expected) + + +def test_xxhash64_decimal(): + input_data = plc.interop.from_arrow( + pa.array([0, 100, -100, 999999999, -999999999], type=pa.decimal(-3)) + ) + expected = pa.array( + [ + 4246796580750024372, + 5959467639951725378, + 4122185689695768261, + 3249245648192442585, + 8009575895491381648, + ] + ) + got = plc.hashing.xxhash_64(input_data) + assert_column_eq(got, expected) From a15dd45302a8345d6837e66c6bff1bc1ac54cee0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 3 Apr 2024 10:32:44 -0500 Subject: [PATCH 06/52] tests run and fail --- cpp/src/hash/sha_hash.cuh | 3 +- python/cudf/cudf/_lib/cpp/hash.pxd | 19 +- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 173 +++++++++++------- 3 files changed, 117 insertions(+), 78 deletions(-) diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index 0a22ee34918..5a983e94aab 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -512,7 +512,8 @@ std::unique_ptr sha_hash(table_view const& input, CUDF_EXPECTS( std::all_of( input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }), - "Unsupported column type for hash function."); + "Unsupported column type for hash function.", + cudf::data_type_error); // Result column allocation and creation auto begin = thrust::make_constant_iterator(Hasher::digest_size); diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index dce1f92c33f..44a18862766 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -7,6 +7,7 @@ from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.exception_handler cimport cudf_exception_handler cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: @@ -14,41 +15,41 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" ( const table_view& input, const uint32_t seed - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] murmurhash3_x64_128 "cudf::hashing::murmurhash3_x64_128" ( const table_view& input, const uint64_t seed - ) + ) except +cudf_exception_handler cdef unique_ptr[column] md5 "cudf::hashing::md5" ( const table_view& input - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] sha1 "cudf::hashing::sha1" ( const table_view& input - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] sha224 "cudf::hashing::sha224" ( const table_view& input - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] sha256 "cudf::hashing::sha256" ( const table_view& input - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] sha384 "cudf::hashing::sha384" ( const table_view& input - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] sha512 "cudf::hashing::sha512" ( const table_view& input - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" ( const table_view& input, const uint64_t seed - ) except + + ) except +cudf_exception_handler cpdef enum class hash_id(int): HASH_IDENTITY diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 97efc9da60f..d0da26582c6 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -14,6 +14,92 @@ METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] +@pytest.fixture(scope="module") +def xxhash_64_int_tbl(): + arrow_tbl = pa.Table.from_arrays( + [ + pa.array( + [ + -127, + -70000, + 0, + 200000, + 128, + np.iinfo("int32").max, + np.iinfo("int32").min, + np.iinfo("int32").min, + ], + type=pa.int32(), + ) + ], + names=["data"], + ) + return plc.interop.from_arrow(arrow_tbl) + + +@pytest.fixture(scope="module") +def xxhash_64_double_tbl(): + arrow_tbl = pa.Table.from_arrays( + [ + pa.array( + [ + -127.0, + -70000.125, + 128.5, + -0.0, + np.inf, + np.nan, + np.finfo("float64").max, + np.finfo("float64").min, + np.finfo("float64").min, + ], + type=pa.float32(), + ) + ], + names=["data"], + ) + return plc.interop.from_arrow(arrow_tbl) + + +@pytest.fixture(scope="module") +def xxhash_64_string_tbl(): + arrow_tbl = pa.Table.from_arrays( + [ + pa.array( + [ + "The", + "quick", + "brown fox", + "jumps over the lazy dog.", + "I am Jack's complete lack of null value", + "A very long (greater than 128 bytes/characters) to test a very long string. " + "2nd half of the very long string to verify the long string hashing happening.", + "Some multi-byte characters here: ééé", + "ééé", + "ééé ééé", + "ééé ééé ééé ééé", + "", + "!@#$%^&*(())", + "0123456789", + "{}|:<>?,./;[]=-", + ], + type=pa.string(), + ) + ], + names=["data"], + ) + return plc.interop.from_arrow(arrow_tbl) + + +@pytest.fixture(scope="module") +def xxhash_64_decimal_tbl(): + arrow_tbl = pa.Table.from_arrays( + [pa.array([0, 100, -100, 999999999, -999999999], type=pa.decimal(-3))], + names=["data"], + ) + return plc.interop.from_arrow(arrow_tbl) + + # Full table hash @pytest.fixture(scope="module") def all_types_input_table(): @@ -62,30 +148,16 @@ def _applyfunc(x): ) @pytest.mark.parametrize("dtype", ["list", "struct"]) def test_sha_list_struct_err(all_types_input_table, dtype, method): - err_types = all_types_input_table[[dtype]] + err_types = all_types_input_table.select([dtype]) plc_tbl = plc.interop.from_arrow(err_types) plc_hasher = getattr(plc.hashing, method) - with pytest.raises(TypeError): + with pytest.raises(ValueError): plc_hasher(plc_tbl) -def test_xxhash_64_int(): - input_data = plc.interop.from_arrow( - pa.array.from_pylist( - [ - -127, - -70000, - 0, - 200000, - 128, - np.iinfo("int32").max(), - np.iinfo("int32").min(), - np.iinfo("int32").min(), - ] - ) - ) - expected = pa.array.from_pylist( +def test_xxhash_64_int(xxhash_64_int_tbl): + expected = pa.array( [ 4827426872506142937, 13867166853951622683, @@ -95,30 +167,16 @@ def test_xxhash_64_int(): 2971168436322821236, 9380524276503839603, 9380524276503839603, - ] + ], + type=pa.uint64(), ) - got = plc.hashing.xxhash_64(input_data) + got = plc.hashing.xxhash_64(xxhash_64_int_tbl, 0) assert_column_eq(got, expected) -def test_xxhash_64_double(): +def test_xxhash_64_double(xxhash_64_double_tbl): # see xxhash_64_test.cpp for details - input_data = plc.interop.from_arrow( - pa.array.from_pylist( - [ - -127.0, - -70000.125, - 128.5, - -0.0, - np.inf, - np.nan, - np.iinfo("float64").max(), - np.iinfo("float64").min(), - np.iinfo("float64").min(), - ] - ) - ) - expected = pa.array.from_pylist( + expected = pa.array( [ 16892115221677838993, 1686446903308179321, @@ -131,46 +189,24 @@ def test_xxhash_64_double(): 3127544388062992779, 1692401401506680154, 13770442912356326755, - ] + ], + type=pa.uint64(), ) - got = plc.hashing.xxhash_64(input_data) + got = plc.hashing.xxhash_64(xxhash_64_double_tbl, 0) assert_column_eq(got, expected) -def test_xxhash_64_string(): - input_data = plc.interop.from_arrow( - [ - "The", - "quick", - "brown fox", - "jumps over the lazy dog.", - "I am Jack's complete lack of null value", - "A very long (greater than 128 bytes/characters) to test a very long string. " - "2nd half of the very long string to verify the long string hashing happening.", - "Some multi-byte characters here: ééé", - "ééé", - "ééé ééé", - "ééé ééé ééé ééé", - "", - "!@#$%^&*(())", - "0123456789", - "{}|:<>?,./;[]=-", - ] - ) - +def test_xxhash_64_string(xxhash_64_string_tbl): def hasher(x): return xxhash.xxh64(bytes(x, "utf-8")).intdigest() - expected = pa.from_pandas(input_data.to_pandas().apply(hasher)) - got = plc.hashing.xxhash_64(input) + expected = pa.from_pandas(xxhash_64_string_tbl.to_pandas().apply(hasher)) + got = plc.hashing.xxhash_64(xxhash_64_string_tbl, 0) assert_column_eq(got, expected) -def test_xxhash64_decimal(): - input_data = plc.interop.from_arrow( - pa.array([0, 100, -100, 999999999, -999999999], type=pa.decimal(-3)) - ) +def test_xxhash64_decimal(xxhash_64_decimal_tbl): expected = pa.array( [ 4246796580750024372, @@ -178,7 +214,8 @@ def test_xxhash64_decimal(): 4122185689695768261, 3249245648192442585, 8009575895491381648, - ] + ], + type=pa.uint64(), ) - got = plc.hashing.xxhash_64(input_data) + got = plc.hashing.xxhash_64(xxhash_64_decimal_tbl, 0) assert_column_eq(got, expected) From 30b0f2b5786ba3e2e1518312d3409f8b3cf1c9b6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 3 Apr 2024 11:08:44 -0500 Subject: [PATCH 07/52] todo --- python/cudf/cudf/pylibcudf_tests/test_hashing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index d0da26582c6..6c61b1240ab 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -119,6 +119,7 @@ def all_types_input_table(): def all_types_output_table(input, method): def _applyfunc(x): hasher = getattr(hashlib, method) + # TODO: not how libcudf computes row hash return hasher(str(x).encode()).hexdigest() result = pa.Table.from_pandas(input.to_pandas().map(_applyfunc)) From eeb4edb68353581fcdeae82f534a708a4e46f342 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 5 Apr 2024 08:05:27 -0700 Subject: [PATCH 08/52] Apply suggestions from code review Co-authored-by: Bradley Dice Co-authored-by: Vyas Ramasubramani --- docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst | 4 ++-- python/cudf/cudf/_lib/cpp/hash.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/hashing.pxd | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst index f5b8d6106da..8a783ac5564 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst @@ -1,6 +1,6 @@ -======== +======= hashing -======== +======= .. automodule:: cudf._lib.pylibcudf.hashing :members: diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index 44a18862766..337de1b4927 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -17,7 +17,7 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const uint32_t seed ) except +cudf_exception_handler - cdef unique_ptr[table] murmurhash3_x64_128 "cudf::hashing::murmurhash3_x64_128" ( + cdef unique_ptr[table] murmurhash3_x64_128 const table_view& input, const uint64_t seed ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd index 459c732e373..66350b460ae 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd @@ -1,9 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t -from cudf._lib.cpp.types cimport size_type -from cudf._lib.cpp.table cimport table -from libcpp.vector cimport vector #from cudf._lib.cpp.hash cimport hash_id From 6762279925536c8a4a1ab5012631d6aaea79bae4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 5 Apr 2024 10:11:55 -0500 Subject: [PATCH 09/52] remove hash_id --- python/cudf/cudf/_lib/cpp/hash.pxd | 24 ++++++++------------- python/cudf/cudf/_lib/pylibcudf/hashing.pxd | 4 +--- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index 337de1b4927..ea687de6969 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -12,47 +12,41 @@ from cudf._lib.exception_handler cimport cudf_exception_handler cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: - cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" ( + cdef unique_ptr[column] murmurhash3_x86_32( const table_view& input, const uint32_t seed ) except +cudf_exception_handler - cdef unique_ptr[table] murmurhash3_x64_128 + cdef unique_ptr[table] murmurhash3_x64_128( const table_view& input, const uint64_t seed ) except +cudf_exception_handler - cdef unique_ptr[column] md5 "cudf::hashing::md5" ( + cdef unique_ptr[column] md5( const table_view& input ) except +cudf_exception_handler - cdef unique_ptr[column] sha1 "cudf::hashing::sha1" ( + cdef unique_ptr[column] sha1( const table_view& input ) except +cudf_exception_handler - cdef unique_ptr[column] sha224 "cudf::hashing::sha224" ( + cdef unique_ptr[column] sha224( const table_view& input ) except +cudf_exception_handler - cdef unique_ptr[column] sha256 "cudf::hashing::sha256" ( + cdef unique_ptr[column] sha256( const table_view& input ) except +cudf_exception_handler - cdef unique_ptr[column] sha384 "cudf::hashing::sha384" ( + cdef unique_ptr[column] sha384( const table_view& input ) except +cudf_exception_handler - cdef unique_ptr[column] sha512 "cudf::hashing::sha512" ( + cdef unique_ptr[column] sha512( const table_view& input ) except +cudf_exception_handler - cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" ( + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed ) except +cudf_exception_handler - - cpdef enum class hash_id(int): - HASH_IDENTITY - HASH_MURMUR3 - HASH_SPARK_MURMUR3 - HASH_MD5 diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd index 66350b460ae..4b8c063a135 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd @@ -2,14 +2,12 @@ from libc.stdint cimport uint32_t, uint64_t -#from cudf._lib.cpp.hash cimport hash_id - from .column cimport Column from .table cimport Table cpdef Column murmurhash3_x86_32( - Table input, + Table input, uint32_t seed ) From 8ab4afafe722e43106cd4fb8a1182617f1448e2e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 5 Apr 2024 10:12:13 -0500 Subject: [PATCH 10/52] docs --- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 112 ++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index dd4bde01a37..2896f238277 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -25,6 +25,24 @@ cpdef Column murmurhash3_x86_32( Table input, uint32_t seed ): + """Computes the MurmurHash3 32-bit hash value of each row in the given table. + + For details, see :cpp:func:murmurhash3_x86_32. + + Parameters + ---------- + + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + + pylibcudf.Column + A column where each row is the hash of a row from the input + """ cdef unique_ptr[column] c_result with nogil: c_result = move( @@ -40,6 +58,24 @@ cpdef Table murmurhash3_x64_128( Table input, uint64_t seed ): + """Computes the MurmurHash3 64-bit hash value of each row in the given table. + + For details, see :cpp:func:murmurhash3_x64_128. + + Parameters + ---------- + + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + + pylibcudf.Table + A table of two UINT64 columns + """ cdef unique_ptr[table] c_result with nogil: c_result = move( @@ -75,30 +111,106 @@ cpdef Column md5(Table input): return Column.from_libcudf(move(c_result)) cpdef Column sha1(Table input): + """Computes the SHA-1 hash value of each row in the given table. + + For details, see :cpp:func:sha1. + + Parameters + ---------- + + input : Table + The table of columns to hash + + Returns + ------- + + pylibcudf.Column + A column where each row is the hash of a row from the input + """ cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_sha1(input.view())) return Column.from_libcudf(move(c_result)) cpdef Column sha224(Table input): + """Computes the SHA-224 hash value of each row in the given table. + + For details, see :cpp:func:sha224. + + Parameters + ---------- + + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_sha224(input.view())) return Column.from_libcudf(move(c_result)) cpdef Column sha256(Table input): + """Computes the SHA-256 hash value of each row in the given table. + + For details, see :cpp:func:256. + + Parameters + ---------- + + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_sha256(input.view())) return Column.from_libcudf(move(c_result)) cpdef Column sha384(Table input): + """Computes the SHA-384 hash value of each row in the given table. + + For details, see :cpp:func:sha384. + + Parameters + ---------- + + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_sha384(input.view())) return Column.from_libcudf(move(c_result)) cpdef Column sha512(Table input): + """Computes the SHA-512 hash value of each row in the given table. + + For details, see :cpp:func:sha512. + + Parameters + ---------- + + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_sha512(input.view())) From ccf64d4d34ed5ee5797b486797d21674bad41c9e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 5 Apr 2024 10:13:29 -0500 Subject: [PATCH 11/52] small lint --- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index 2896f238277..57081ec49d5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -54,6 +54,7 @@ cpdef Column murmurhash3_x86_32( return Column.from_libcudf(move(c_result)) + cpdef Table murmurhash3_x64_128( Table input, uint64_t seed @@ -132,6 +133,7 @@ cpdef Column sha1(Table input): c_result = move(cpp_sha1(input.view())) return Column.from_libcudf(move(c_result)) + cpdef Column sha224(Table input): """Computes the SHA-224 hash value of each row in the given table. @@ -153,6 +155,7 @@ cpdef Column sha224(Table input): c_result = move(cpp_sha224(input.view())) return Column.from_libcudf(move(c_result)) + cpdef Column sha256(Table input): """Computes the SHA-256 hash value of each row in the given table. @@ -174,6 +177,7 @@ cpdef Column sha256(Table input): c_result = move(cpp_sha256(input.view())) return Column.from_libcudf(move(c_result)) + cpdef Column sha384(Table input): """Computes the SHA-384 hash value of each row in the given table. @@ -195,6 +199,7 @@ cpdef Column sha384(Table input): c_result = move(cpp_sha384(input.view())) return Column.from_libcudf(move(c_result)) + cpdef Column sha512(Table input): """Computes the SHA-512 hash value of each row in the given table. From 9ce384a37d39b6aa720b67318d0fd10089ee49ac Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 5 Apr 2024 11:27:13 -0500 Subject: [PATCH 12/52] add DEFAULT_HASH_SEED from hpp --- python/cudf/cudf/_lib/cpp/hash.pxd | 3 +++ python/cudf/cudf/_lib/hash.pyx | 5 ++--- python/cudf/cudf/_lib/pylibcudf/hashing.pxd | 15 ++++++++------- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 7 ++++--- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index ea687de6969..68d7c2b44fc 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -50,3 +50,6 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input, const uint64_t seed ) except +cudf_exception_handler + +cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + cdef uint32_t DEFAULT_HASH_SEED diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 45d3c35aa19..b8ce3f0b0b4 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -22,7 +22,6 @@ from cudf._lib.pylibcudf.hashing cimport ( sha256, sha384, sha512, - xxhash_64, ) from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @@ -55,8 +54,8 @@ def hash(list source_columns, str method, int seed=0): ctbl = pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]) if method == "murmur3": return Column.from_pylibcudf(murmurhash3_x86_32(ctbl, seed)) - elif method == "xxhash64": - return Column.from_pylibcudf(xxhash_64(ctbl, seed)) +# elif method == "xxhash64": +# return Column.from_pylibcudf(xxhash_64(ctbl, seed)) elif method == "md5": return Column.from_pylibcudf(md5(ctbl)) elif method == "sha1": diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd index 4b8c063a135..2d070ddda69 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pxd @@ -8,12 +8,18 @@ from .table cimport Table cpdef Column murmurhash3_x86_32( Table input, - uint32_t seed + uint32_t seed=* ) cpdef Table murmurhash3_x64_128( Table input, - uint64_t seed + uint64_t seed=* +) + + +cpdef Column xxhash_64( + Table input, + uint64_t seed=* ) cpdef Column md5(Table input) @@ -22,8 +28,3 @@ cpdef Column sha224(Table input) cpdef Column sha256(Table input) cpdef Column sha384(Table input) cpdef Column sha512(Table input) - -cpdef Column xxhash_64( - Table input, - uint64_t seed -) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index 57081ec49d5..a72003682ec 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -5,6 +5,7 @@ from libcpp.utility cimport move from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.hash cimport ( + DEFAULT_HASH_SEED, md5 as cpp_md5, murmurhash3_x64_128 as cpp_murmurhash3_x64_128, murmurhash3_x86_32 as cpp_murmurhash3_x86_32, @@ -23,7 +24,7 @@ from .table cimport Table cpdef Column murmurhash3_x86_32( Table input, - uint32_t seed + uint32_t seed=DEFAULT_HASH_SEED ): """Computes the MurmurHash3 32-bit hash value of each row in the given table. @@ -57,7 +58,7 @@ cpdef Column murmurhash3_x86_32( cpdef Table murmurhash3_x64_128( Table input, - uint64_t seed + uint64_t seed=DEFAULT_HASH_SEED ): """Computes the MurmurHash3 64-bit hash value of each row in the given table. @@ -91,7 +92,7 @@ cpdef Table murmurhash3_x64_128( cpdef Column xxhash_64( Table input, - uint64_t seed + uint64_t seed=DEFAULT_HASH_SEED ): cdef unique_ptr[column] c_result with nogil: From bed579279e133c7eb8d60c8a3c0254549f087910 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 5 Apr 2024 13:51:55 -0500 Subject: [PATCH 13/52] fix xxhash_64_string test --- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 6c61b1240ab..ff1394d3fef 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -6,7 +6,7 @@ import pyarrow as pa import pytest import xxhash -from utils import assert_column_eq +from utils import assert_column_eq, assert_table_eq import cudf._lib.pylibcudf as plc @@ -137,6 +137,11 @@ def _applyfunc(x): ) plc_hasher = getattr(plc.hashing, method) + if isinstance(pa_input_column.type, (pa.ListType, pa.StructType)): + with pytest.raises(TypeError): + plc_hasher(plc_tbl) + return + expect = pa.Array.from_pandas( pa_input_column.to_pandas().apply(_applyfunc) ) @@ -153,7 +158,7 @@ def test_sha_list_struct_err(all_types_input_table, dtype, method): plc_tbl = plc.interop.from_arrow(err_types) plc_hasher = getattr(plc.hashing, method) - with pytest.raises(ValueError): + with pytest.raises(TypeError): plc_hasher(plc_tbl) @@ -201,7 +206,11 @@ def test_xxhash_64_string(xxhash_64_string_tbl): def hasher(x): return xxhash.xxh64(bytes(x, "utf-8")).intdigest() - expected = pa.from_pandas(xxhash_64_string_tbl.to_pandas().apply(hasher)) + expected = pa.Array.from_pandas( + plc.interop.to_arrow(xxhash_64_string_tbl) + .to_pandas()[""] + .apply(hasher) + ) got = plc.hashing.xxhash_64(xxhash_64_string_tbl, 0) assert_column_eq(got, expected) @@ -219,4 +228,4 @@ def test_xxhash64_decimal(xxhash_64_decimal_tbl): type=pa.uint64(), ) got = plc.hashing.xxhash_64(xxhash_64_decimal_tbl, 0) - assert_column_eq(got, expected) + assert_table_eq(got, expected) From 992cba1f2d231fbdba9bffa4a849725741a74d42 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 8 Apr 2024 11:17:09 -0500 Subject: [PATCH 14/52] raise for unimplemented hash test functions on the python side for now --- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index ff1394d3fef..7edbd5d8679 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -129,8 +129,21 @@ def _applyfunc(x): @pytest.mark.parametrize("method", METHODS) def test_hash_column(pa_input_column, method): def _applyfunc(x): - hasher = getattr(hashlib, method) - return hasher(str(x).encode()).hexdigest() + if isinstance(x, str): + hasher = getattr(hashlib, method) + return hasher(str(x).encode()).hexdigest() + elif isinstance(x, float): + raise NotImplementedError + elif isinstance(x, bool): + raise NotImplementedError + elif isinstance(x, int): + raise NotImplementedError + elif isinstance(x, list): + raise NotImplementedError + elif isinstance(x, dict): + raise NotImplementedError + else: + raise NotImplementedError("Unsupported type") plc_tbl = plc.interop.from_arrow( pa.Table.from_arrays([pa_input_column], names=["data"]) From 2977b63a45c11783b89e9391b3a9d0b470b5d138 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 10 Apr 2024 16:31:54 -0500 Subject: [PATCH 15/52] fix up some tests --- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 7edbd5d8679..6dd35d3130d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import hashlib +import struct import numpy as np import pyarrow as pa @@ -130,20 +131,16 @@ def _applyfunc(x): def test_hash_column(pa_input_column, method): def _applyfunc(x): if isinstance(x, str): - hasher = getattr(hashlib, method) - return hasher(str(x).encode()).hexdigest() + binary = str(x).encode() elif isinstance(x, float): - raise NotImplementedError + binary = struct.pack(" Date: Wed, 10 Apr 2024 17:15:01 -0500 Subject: [PATCH 16/52] separate md5 test --- cpp/src/hash/md5_hash.cu | 3 +- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 53 +++++++++++++------ 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index b34455905d9..d2d1503c0c2 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -301,7 +301,8 @@ std::unique_ptr md5(table_view const& input, } return md5_leaf_type_check(col.type()); }), - "Unsupported column type for hash function."); + "Unsupported column type for hash function.", + cudf::data_type_error); // Digest size in bytes auto constexpr digest_size = 32; diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 6dd35d3130d..e72e65cffed 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -127,21 +127,27 @@ def _applyfunc(x): return result -@pytest.mark.parametrize("method", METHODS) -def test_hash_column(pa_input_column, method): - def _applyfunc(x): - if isinstance(x, str): - binary = str(x).encode() - elif isinstance(x, float): - binary = struct.pack(" Date: Thu, 11 Apr 2024 09:17:50 -0500 Subject: [PATCH 17/52] cleanup --- python/cudf/cudf/pylibcudf_tests/test_hashing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index e72e65cffed..e75e2874029 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -139,7 +139,6 @@ def python_hash_value(x, method): elif isinstance(x, np.ndarray): binary = x.tobytes() else: - breakpoint() raise NotImplementedError return getattr(hashlib, method)(binary).hexdigest() From edcde7642eb43d1acdc423c06ec54bbbd6f3fb88 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 11 Apr 2024 09:19:09 -0500 Subject: [PATCH 18/52] more cleanup --- python/cudf/cudf/pylibcudf_tests/common/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 596cd2c92ae..6636ab9e5f8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -35,6 +35,7 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: plc_pa = plc_pa.combine_chunks() if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() + assert plc_pa.equals(pa_array) From 3a442cb2c2baaa05a21f7515cf2da3815d27760e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 12 Apr 2024 08:45:08 -0500 Subject: [PATCH 19/52] begin hashing tests --- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 1 + .../cudf/cudf/pylibcudf_tests/test_hashing.py | 36 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index a72003682ec..60afdcac79f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -21,6 +21,7 @@ from cudf._lib.cpp.table.table cimport table from .column cimport Column from .table cimport Table +LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( Table input, diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index e75e2874029..8f2e5f8bac2 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -3,6 +3,7 @@ import hashlib import struct +import mmh3 import numpy as np import pyarrow as pa import pytest @@ -140,7 +141,14 @@ def python_hash_value(x, method): binary = x.tobytes() else: raise NotImplementedError - return getattr(hashlib, method)(binary).hexdigest() + if method == 'murmurhash3_x86_32': + raise NotImplementedError + elif method == 'murmurhash3_x64_128': + hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + hasher.update(binary) + return hasher.digest() + else: + return getattr(hashlib, method)(binary).hexdigest() @pytest.mark.parametrize( @@ -261,3 +269,29 @@ def test_xxhash64_decimal(xxhash_64_decimal_tbl): ) got = plc.hashing.xxhash_64(xxhash_64_decimal_tbl, 0) assert_table_eq(got, expected) + +def test_murmurhash3_x86_32(pa_input_column): + plc_tbl = plc.interop.from_arrow( + pa.Table.from_arrays([pa_input_column], names=["data"]) + ) + got = plc.hashing.murmurhash3_x86_32(plc_tbl, 0) + expect = pa.Array.from_pandas( + pa_input_column.to_pandas().apply(python_hash_value, args=("murmurhash3_x86_32",)) + ) + assert_table_eq(got, expect) + + +def test_murmurhash3_x64_128(pa_input_column): + plc_tbl = plc.interop.from_arrow( + pa.Table.from_arrays([pa_input_column], names=["data"]) + ) + got = plc.hashing.murmurhash3_x64_128(plc_tbl, 0) + breakpoint() + expect = pa.Table.from_arrays( + pa.Array.from_pandas( + [pa_input_column.to_pandas().apply(python_hash_value, args=("murmurhash3_x64_128",))] + ), + names=["data"] + ) + + assert_table_eq(got, expect) From eef3616da658760ac65871a4c690edcb35cfb6d6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 16 Apr 2024 08:21:24 -0500 Subject: [PATCH 20/52] fix up murmurhash3_x64_128 test, list struct error sha test --- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 225 ++++-------------- 1 file changed, 41 insertions(+), 184 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 8f2e5f8bac2..2676021c77f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -5,6 +5,7 @@ import mmh3 import numpy as np +import pandas as pd import pyarrow as pa import pytest import xxhash @@ -17,117 +18,16 @@ @pytest.fixture(scope="module") -def xxhash_64_int_tbl(): - arrow_tbl = pa.Table.from_arrays( - [ - pa.array( - [ - -127, - -70000, - 0, - 200000, - 128, - np.iinfo("int32").max, - np.iinfo("int32").min, - np.iinfo("int32").min, - ], - type=pa.int32(), - ) - ], - names=["data"], - ) - return plc.interop.from_arrow(arrow_tbl) - - -@pytest.fixture(scope="module") -def xxhash_64_double_tbl(): - arrow_tbl = pa.Table.from_arrays( - [ - pa.array( - [ - -127.0, - -70000.125, - 128.5, - -0.0, - np.inf, - np.nan, - np.finfo("float64").max, - np.finfo("float64").min, - np.finfo("float64").min, - ], - type=pa.float32(), - ) - ], - names=["data"], - ) - return plc.interop.from_arrow(arrow_tbl) - - -@pytest.fixture(scope="module") -def xxhash_64_string_tbl(): - arrow_tbl = pa.Table.from_arrays( - [ - pa.array( - [ - "The", - "quick", - "brown fox", - "jumps over the lazy dog.", - "I am Jack's complete lack of null value", - "A very long (greater than 128 bytes/characters) to test a very long string. " - "2nd half of the very long string to verify the long string hashing happening.", - "Some multi-byte characters here: ééé", - "ééé", - "ééé ééé", - "ééé ééé ééé ééé", - "", - "!@#$%^&*(())", - "0123456789", - "{}|:<>?,./;[]=-", - ], - type=pa.string(), - ) - ], - names=["data"], - ) - return plc.interop.from_arrow(arrow_tbl) - - -@pytest.fixture(scope="module") -def xxhash_64_decimal_tbl(): - arrow_tbl = pa.Table.from_arrays( - [pa.array([0, 100, -100, 999999999, -999999999], type=pa.decimal(-3))], - names=["data"], - ) - return plc.interop.from_arrow(arrow_tbl) - - -# Full table hash -@pytest.fixture(scope="module") -def all_types_input_table(): +def list_struct_table(): data = pa.Table.from_pydict( { - "int": [1, 2, 3], - "float": [1.0, 2.0, 3.0], - "bool": [True, False, True], - "string": ["a", "b", "c"], - "list": [[1], [2], [3]], - "struct": [{"a": 1}, {"a": 2}, {"a": 3}], + "list": [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + "struct": [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}], } ) return data -def all_types_output_table(input, method): - def _applyfunc(x): - hasher = getattr(hashlib, method) - # TODO: not how libcudf computes row hash - return hasher(str(x).encode()).hexdigest() - - result = pa.Table.from_pandas(input.to_pandas().map(_applyfunc)) - return result - - def python_hash_value(x, method): if isinstance(x, str): binary = str(x).encode() @@ -141,12 +41,17 @@ def python_hash_value(x, method): binary = x.tobytes() else: raise NotImplementedError - if method == 'murmurhash3_x86_32': + if method == "murmurhash3_x86_32": raise NotImplementedError - elif method == 'murmurhash3_x64_128': + elif method == "murmurhash3_x64_128": hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) hasher.update(binary) - return hasher.digest() + # libcudf returns a tuple of two 64-bit integers + return hasher.utupledigest() + elif method == "xxhash_64": + return xxhash.xxh64( + binary, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() else: return getattr(hashlib, method)(binary).hexdigest() @@ -189,12 +94,26 @@ def test_hash_column_md5(pa_input_column): assert_column_eq(got, expect) +def test_hash_column_xxhash64(pa_input_column): + plc_tbl = plc.interop.from_arrow( + pa.Table.from_arrays([pa_input_column], names=["data"]) + ) + + expect = pa.Array.from_pandas( + pa_input_column.to_pandas().apply( + python_hash_value, args=("xxhash_64",) + ) + ) + got = plc.hashing.xxhash_64(plc_tbl, 0) + assert_column_eq(got, expect) + + @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512"] ) @pytest.mark.parametrize("dtype", ["list", "struct"]) -def test_sha_list_struct_err(all_types_input_table, dtype, method): - err_types = all_types_input_table.select([dtype]) +def test_sha_list_struct_err(list_struct_table, dtype, method): + err_types = list_struct_table.select([dtype]) plc_tbl = plc.interop.from_arrow(err_types) plc_hasher = getattr(plc.hashing, method) @@ -202,81 +121,15 @@ def test_sha_list_struct_err(all_types_input_table, dtype, method): plc_hasher(plc_tbl) -def test_xxhash_64_int(xxhash_64_int_tbl): - expected = pa.array( - [ - 4827426872506142937, - 13867166853951622683, - 4246796580750024372, - 17339819992360460003, - 7292178400482025765, - 2971168436322821236, - 9380524276503839603, - 9380524276503839603, - ], - type=pa.uint64(), - ) - got = plc.hashing.xxhash_64(xxhash_64_int_tbl, 0) - assert_column_eq(got, expected) - - -def test_xxhash_64_double(xxhash_64_double_tbl): - # see xxhash_64_test.cpp for details - expected = pa.array( - [ - 16892115221677838993, - 1686446903308179321, - 3803688792395291579, - 18250447068822614389, - 3511911086082166358, - 4558309869707674848, - 18031741628920313605, - 16838308782748609196, - 3127544388062992779, - 1692401401506680154, - 13770442912356326755, - ], - type=pa.uint64(), - ) - got = plc.hashing.xxhash_64(xxhash_64_double_tbl, 0) - assert_column_eq(got, expected) - - -def test_xxhash_64_string(xxhash_64_string_tbl): - def hasher(x): - return xxhash.xxh64(bytes(x, "utf-8")).intdigest() - - expected = pa.Array.from_pandas( - plc.interop.to_arrow(xxhash_64_string_tbl) - .to_pandas()[""] - .apply(hasher) - ) - got = plc.hashing.xxhash_64(xxhash_64_string_tbl, 0) - - assert_column_eq(got, expected) - - -def test_xxhash64_decimal(xxhash_64_decimal_tbl): - expected = pa.array( - [ - 4246796580750024372, - 5959467639951725378, - 4122185689695768261, - 3249245648192442585, - 8009575895491381648, - ], - type=pa.uint64(), - ) - got = plc.hashing.xxhash_64(xxhash_64_decimal_tbl, 0) - assert_table_eq(got, expected) - def test_murmurhash3_x86_32(pa_input_column): plc_tbl = plc.interop.from_arrow( pa.Table.from_arrays([pa_input_column], names=["data"]) ) got = plc.hashing.murmurhash3_x86_32(plc_tbl, 0) expect = pa.Array.from_pandas( - pa_input_column.to_pandas().apply(python_hash_value, args=("murmurhash3_x86_32",)) + pa_input_column.to_pandas().apply( + python_hash_value, args=("murmurhash3_x86_32",) + ) ) assert_table_eq(got, expect) @@ -286,12 +139,16 @@ def test_murmurhash3_x64_128(pa_input_column): pa.Table.from_arrays([pa_input_column], names=["data"]) ) got = plc.hashing.murmurhash3_x64_128(plc_tbl, 0) - breakpoint() - expect = pa.Table.from_arrays( - pa.Array.from_pandas( - [pa_input_column.to_pandas().apply(python_hash_value, args=("murmurhash3_x64_128",))] - ), - names=["data"] + tuples = pa_input_column.to_pandas().apply( + python_hash_value, args=("murmurhash3_x64_128",) + ) + expect = pa.Table.from_pandas( + pd.DataFrame( + { + 0: tuples.apply(lambda tup: np.uint64(tup[0])), + 1: tuples.apply(lambda tup: np.uint64(tup[1])), + } + ) ) assert_table_eq(got, expect) From ab5870da7685630c2e492133867cce1e34fedfee Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 16 Apr 2024 08:33:55 -0500 Subject: [PATCH 21/52] add mmh3_x86_32 tests that currently fail --- python/cudf/cudf/pylibcudf_tests/test_hashing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 2676021c77f..da9ad0e9c3f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -42,7 +42,10 @@ def python_hash_value(x, method): else: raise NotImplementedError if method == "murmurhash3_x86_32": - raise NotImplementedError + # mmh3.hash by default uses MurmurHash3_x86_32 + return mmh3.hash( + binary, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, signed=False + ) elif method == "murmurhash3_x64_128": hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) hasher.update(binary) @@ -131,7 +134,7 @@ def test_murmurhash3_x86_32(pa_input_column): python_hash_value, args=("murmurhash3_x86_32",) ) ) - assert_table_eq(got, expect) + assert_column_eq(got, expect) def test_murmurhash3_x64_128(pa_input_column): From 41c0ae67a899bca6d762b4b1301d8a0f587b939a Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 2 May 2024 14:11:31 -0700 Subject: [PATCH 22/52] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index 60afdcac79f..2e236df4df6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -29,7 +29,7 @@ cpdef Column murmurhash3_x86_32( ): """Computes the MurmurHash3 32-bit hash value of each row in the given table. - For details, see :cpp:func:murmurhash3_x86_32. + For details, see :cpp:func:`murmurhash3_x86_32`. Parameters ---------- @@ -63,7 +63,7 @@ cpdef Table murmurhash3_x64_128( ): """Computes the MurmurHash3 64-bit hash value of each row in the given table. - For details, see :cpp:func:murmurhash3_x64_128. + For details, see :cpp:func:`murmurhash3_x64_128`. Parameters ---------- @@ -116,7 +116,7 @@ cpdef Column md5(Table input): cpdef Column sha1(Table input): """Computes the SHA-1 hash value of each row in the given table. - For details, see :cpp:func:sha1. + For details, see :cpp:func:`sha1`. Parameters ---------- @@ -139,7 +139,7 @@ cpdef Column sha1(Table input): cpdef Column sha224(Table input): """Computes the SHA-224 hash value of each row in the given table. - For details, see :cpp:func:sha224. + For details, see :cpp:func:`sha224`. Parameters ---------- @@ -161,7 +161,7 @@ cpdef Column sha224(Table input): cpdef Column sha256(Table input): """Computes the SHA-256 hash value of each row in the given table. - For details, see :cpp:func:256. + For details, see :cpp:func:`sha256`. Parameters ---------- @@ -183,7 +183,7 @@ cpdef Column sha256(Table input): cpdef Column sha384(Table input): """Computes the SHA-384 hash value of each row in the given table. - For details, see :cpp:func:sha384. + For details, see :cpp:func:`sha384`. Parameters ---------- @@ -205,7 +205,7 @@ cpdef Column sha384(Table input): cpdef Column sha512(Table input): """Computes the SHA-512 hash value of each row in the given table. - For details, see :cpp:func:sha512. + For details, see :cpp:func:`sha512`. Parameters ---------- From 774b093951c83412682ff068f830552b31c2c731 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 2 May 2024 18:58:30 -0500 Subject: [PATCH 23/52] update cpp errors --- cpp/tests/hashing/sha1_test.cpp | 4 ++-- cpp/tests/hashing/sha224_test.cpp | 4 ++-- cpp/tests/hashing/sha256_test.cpp | 4 ++-- cpp/tests/hashing/sha384_test.cpp | 4 ++-- cpp/tests/hashing/sha512_test.cpp | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index c3d0fe7450a..428b5e8e749 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -137,7 +137,7 @@ TEST_F(SHA1HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error); } TEST_F(SHA1HashTest, StructsUnsupported) @@ -146,7 +146,7 @@ TEST_F(SHA1HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index def5e934177..58274c2b390 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -137,7 +137,7 @@ TEST_F(SHA224HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error); } TEST_F(SHA224HashTest, StructsUnsupported) @@ -146,7 +146,7 @@ TEST_F(SHA224HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 410a99edd77..d2d3a567674 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -138,7 +138,7 @@ TEST_F(SHA256HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error); } TEST_F(SHA256HashTest, StructsUnsupported) @@ -147,7 +147,7 @@ TEST_F(SHA256HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 810fbc82d8e..44603c6e09a 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -155,7 +155,7 @@ TEST_F(SHA384HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error); } TEST_F(SHA384HashTest, StructsUnsupported) @@ -164,7 +164,7 @@ TEST_F(SHA384HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 93caa16c1c4..2216e009dad 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -155,7 +155,7 @@ TEST_F(SHA512HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error); } TEST_F(SHA512HashTest, StructsUnsupported) @@ -164,7 +164,7 @@ TEST_F(SHA512HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error); } template From 2f131b306534fb382fc94d72dfe0235a1e18f178 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 2 May 2024 21:04:57 -0500 Subject: [PATCH 24/52] address some reviews --- python/cudf/cudf/_lib/hash.pyx | 7 ++-- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 34 +++++++++++++++++++ python/cudf/cudf/pylibcudf_tests/conftest.py | 24 ------------- .../cudf/cudf/pylibcudf_tests/test_copying.py | 19 ++++++++++- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 21 ++++++++++++ 5 files changed, 77 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index b8ce3f0b0b4..319cdf5dae3 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -7,9 +7,8 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector -from cudf._lib import pylibcudf - cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib cimport pylibcudf from cudf._lib.column cimport Column from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table @@ -51,7 +50,9 @@ def hash_partition(list source_columns, object columns_to_hash, @acquire_spill_lock() def hash(list source_columns, str method, int seed=0): - ctbl = pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]) + cdef pylibcudf.Table ctbl = pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ) if method == "murmur3": return Column.from_pylibcudf(murmurhash3_x86_32(ctbl, seed)) # elif method == "xxhash64": diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index 2e236df4df6..ad9a53698ef 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -95,6 +95,23 @@ cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED ): + """Computes the xxHash 64-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_64`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result with nogil: c_result = move( @@ -108,6 +125,23 @@ cpdef Column xxhash_64( cpdef Column md5(Table input): + """Computes the MD5 hash value of each row in the given table. + + For details, see :cpp:func:`md5`. + + Parameters + ---------- + + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the md5 hash of a row from the input + + """ + cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_md5(input.view())) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 69540c3d9ba..6d8284fb3db 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -11,8 +11,6 @@ from utils import DEFAULT_STRUCT_TESTING_TYPE -import cudf._lib.pylibcudf as plc - # This fixture defines the standard set of types that all tests should default to # running on. If there is a need for some tests to run on a different set of types, that @@ -31,25 +29,3 @@ ) def pa_type(request): return request.param - - -# TODO: Test nullable data -@pytest.fixture(scope="session") -def pa_input_column(pa_type): - if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([1, 2, 3], type=pa_type) - elif pa.types.is_string(pa_type): - return pa.array(["a", "b", "c"], type=pa_type) - elif pa.types.is_boolean(pa_type): - return pa.array([True, True, False], type=pa_type) - elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) - elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="session") -def input_column(pa_input_column): - return plc.interop.from_arrow(pa_input_column) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index a79aea3e12e..1d43b5fa894 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -18,7 +18,24 @@ from cudf._lib import pylibcudf as plc -@pytest.fixture(scope="module") +# TODO: Test nullable data +@pytest.fixture(scope="session") +def pa_input_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([1, 2, 3], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["a", "b", "c"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([True, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[1], [2], [3]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="session") def input_column(pa_input_column): return plc.interop.from_arrow(pa_input_column) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index da9ad0e9c3f..fb4ca7b734b 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -17,6 +17,27 @@ METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] +@pytest.fixture +def pa_input_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([1, 2, 3], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["a", "b", "c"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([True, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[1], [2], [3]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture() +def input_column(pa_input_column): + return plc.interop.from_arrow(pa_input_column) + + @pytest.fixture(scope="module") def list_struct_table(): data = pa.Table.from_pydict( From af6e59d0ae5cf83bb01acdc01fdfd03a36c353c1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 2 May 2024 21:09:29 -0500 Subject: [PATCH 25/52] uncomment xxhash_64 --- python/cudf/cudf/_lib/hash.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 319cdf5dae3..a684d8f430e 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -21,6 +21,7 @@ from cudf._lib.pylibcudf.hashing cimport ( sha256, sha384, sha512, + xxhash_64, ) from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @@ -55,8 +56,8 @@ def hash(list source_columns, str method, int seed=0): ) if method == "murmur3": return Column.from_pylibcudf(murmurhash3_x86_32(ctbl, seed)) -# elif method == "xxhash64": -# return Column.from_pylibcudf(xxhash_64(ctbl, seed)) + elif method == "xxhash64": + return Column.from_pylibcudf(xxhash_64(ctbl, seed)) elif method == "md5": return Column.from_pylibcudf(md5(ctbl)) elif method == "sha1": From 2e6743e9f8434056db7e2d55a7d7287d8fdcbe64 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 3 May 2024 06:37:01 -0700 Subject: [PATCH 26/52] add mmh3 to test_python_cudf --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-122_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + python/cudf/pyproject.toml | 1 + 4 files changed, 4 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 7a5fef9f25e..9a12b238834 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -48,6 +48,7 @@ dependencies: - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make +- mmh3 - moto>=4.0.8 - msgpack-python - myst-nb diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 48453e18bb0..315babe2499 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -47,6 +47,7 @@ dependencies: - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make +- mmh3 - moto>=4.0.8 - msgpack-python - myst-nb diff --git a/dependencies.yaml b/dependencies.yaml index 1508656471d..8f0b0460e16 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -619,6 +619,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - scipy + - mmh3 - output_types: conda packages: - aiobotocore>=2.2.0 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index fc3a243572f..cb74af47990 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -55,6 +55,7 @@ test = [ "cramjam", "fastavro>=0.22.9", "hypothesis", + "mmh3", "msgpack", "pytest-benchmark", "pytest-cases>=3.8.2", From 8d8bef99812b6da0fbf8e61b3866079960111754 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 3 May 2024 08:21:13 -0700 Subject: [PATCH 27/52] fix murmurhash3_x86_32 --- python/cudf/cudf/pylibcudf_tests/test_hashing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index fb4ca7b734b..4e2ed28b5b8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -63,10 +63,10 @@ def python_hash_value(x, method): else: raise NotImplementedError if method == "murmurhash3_x86_32": - # mmh3.hash by default uses MurmurHash3_x86_32 - return mmh3.hash( - binary, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, signed=False - ) + # reimplement libcudf hash combine for a single colum + seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + hashval = mmh3.hash(binary, seed) + return seed ^ (hashval + 0x9E3779B9 + (seed << 6) + (seed >> 2)) elif method == "murmurhash3_x64_128": hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) hasher.update(binary) @@ -153,7 +153,8 @@ def test_murmurhash3_x86_32(pa_input_column): expect = pa.Array.from_pandas( pa_input_column.to_pandas().apply( python_hash_value, args=("murmurhash3_x86_32",) - ) + ), + type=pa.uint32(), ) assert_column_eq(got, expect) From 5930c7aec1a3eee3c7c2f001cf117472cc2545db Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 3 May 2024 08:23:43 -0700 Subject: [PATCH 28/52] add xxhash testing dependency --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-122_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + python/cudf/pyproject.toml | 1 + 4 files changed, 4 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9a12b238834..76b21f373f3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -95,6 +95,7 @@ dependencies: - tokenizers==0.15.2 - transformers==4.38.1 - typing_extensions>=4.0.0 +- xxhash - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 315babe2499..437013f097c 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -93,6 +93,7 @@ dependencies: - tokenizers==0.15.2 - transformers==4.38.1 - typing_extensions>=4.0.0 +- xxhash - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master diff --git a/dependencies.yaml b/dependencies.yaml index 8f0b0460e16..f28dd672a4f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -620,6 +620,7 @@ dependencies: - pytest-cases>=3.8.2 - scipy - mmh3 + - xxhash - output_types: conda packages: - aiobotocore>=2.2.0 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index cb74af47990..41877bf402f 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -66,6 +66,7 @@ test = [ "tokenizers==0.15.2", "transformers==4.38.1", "tzdata", + "xxhash", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pandas-tests = [ "pandas[test, pyarrow, performance, computation, fss, excel, parquet, feather, hdf5, spss, html, xml, plot, output-formatting, clipboard, compression]", From cedc89fa5166a27f256118b8695c3a6028c55823 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 3 May 2024 09:21:30 -0700 Subject: [PATCH 29/52] depandasify --- .../cudf/cudf/pylibcudf_tests/test_hashing.py | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 4e2ed28b5b8..7150523d0f2 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -5,7 +5,6 @@ import mmh3 import numpy as np -import pandas as pd import pyarrow as pa import pytest import xxhash @@ -58,8 +57,8 @@ def python_hash_value(x, method): binary = x.to_bytes(1, byteorder="little", signed=True) elif isinstance(x, int): binary = x.to_bytes(8, byteorder="little", signed=True) - elif isinstance(x, np.ndarray): - binary = x.tobytes() + elif isinstance(x, list): + binary = np.array(x).tobytes() else: raise NotImplementedError if method == "murmurhash3_x86_32": @@ -94,8 +93,12 @@ def test_hash_column_sha(pa_input_column, method): plc_hasher(plc_tbl) return - expect = pa.Array.from_pandas( - pa_input_column.to_pandas().apply(python_hash_value, args=(method,)) + expect = pa.array( + [ + python_hash_value(val, method) + for val in pa_input_column.to_pylist() + ], + type=pa.string(), ) got = plc_hasher(plc_tbl) assert_column_eq(got, expect) @@ -111,8 +114,9 @@ def test_hash_column_md5(pa_input_column): plc.hashing.md5(plc_tbl) return - expect = pa.Array.from_pandas( - pa_input_column.to_pandas().apply(python_hash_value, args=("md5",)) + expect = pa.array( + [python_hash_value(val, "md5") for val in pa_input_column.to_pylist()], + type=pa.string(), ) got = plc.hashing.md5(plc_tbl) assert_column_eq(got, expect) @@ -123,10 +127,12 @@ def test_hash_column_xxhash64(pa_input_column): pa.Table.from_arrays([pa_input_column], names=["data"]) ) - expect = pa.Array.from_pandas( - pa_input_column.to_pandas().apply( - python_hash_value, args=("xxhash_64",) - ) + expect = pa.array( + [ + python_hash_value(val, "xxhash_64") + for val in pa_input_column.to_pylist() + ], + type=pa.uint64(), ) got = plc.hashing.xxhash_64(plc_tbl, 0) assert_column_eq(got, expect) @@ -150,10 +156,11 @@ def test_murmurhash3_x86_32(pa_input_column): pa.Table.from_arrays([pa_input_column], names=["data"]) ) got = plc.hashing.murmurhash3_x86_32(plc_tbl, 0) - expect = pa.Array.from_pandas( - pa_input_column.to_pandas().apply( - python_hash_value, args=("murmurhash3_x86_32",) - ), + expect = pa.array( + [ + python_hash_value(val, "murmurhash3_x86_32") + for val in pa_input_column.to_pylist() + ], type=pa.uint32(), ) assert_column_eq(got, expect) @@ -164,16 +171,16 @@ def test_murmurhash3_x64_128(pa_input_column): pa.Table.from_arrays([pa_input_column], names=["data"]) ) got = plc.hashing.murmurhash3_x64_128(plc_tbl, 0) - tuples = pa_input_column.to_pandas().apply( - python_hash_value, args=("murmurhash3_x64_128",) - ) - expect = pa.Table.from_pandas( - pd.DataFrame( - { - 0: tuples.apply(lambda tup: np.uint64(tup[0])), - 1: tuples.apply(lambda tup: np.uint64(tup[1])), - } - ) + tuples = [ + python_hash_value(val, "murmurhash3_x64_128") + for val in pa_input_column.to_pylist() + ] + expect = pa.Table.from_arrays( + [ + pa.array([np.uint64(t[0]) for t in tuples]), + pa.array([np.uint64(t[1]) for t in tuples]), + ], + names=["0", "1"], ) assert_table_eq(got, expect) From 751e5f3dd0e799a8db99a87e275f98e91888c0fa Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 May 2024 08:24:02 -0700 Subject: [PATCH 30/52] fix pylibcudf tests --- python/cudf/cudf/_lib/hash.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index c88dc5a04d6..3d7883a9106 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -8,7 +8,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types -from cudf._lib cimport pylibcudf +from cudf._lib import pylibcudf from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.partitioning cimport ( hash_partition as cpp_hash_partition, From 642b4446d029aacdc33ca588cef61c2bd617c392 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 May 2024 13:32:38 -0700 Subject: [PATCH 31/52] update dependencies --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 3c37b554093..5c9d39e6132 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -75,6 +75,7 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 +- python-xxhash - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-dask-dependency==24.6.* @@ -95,7 +96,6 @@ dependencies: - tokenizers==0.15.2 - transformers==4.39.3 - typing_extensions>=4.0.0 -- xxhash - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 02164a5f5b1..bbf233c0f2c 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -73,6 +73,7 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 +- python-xxhash - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-dask-dependency==24.6.* @@ -93,7 +94,6 @@ dependencies: - tokenizers==0.15.2 - transformers==4.39.3 - typing_extensions>=4.0.0 -- xxhash - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master diff --git a/dependencies.yaml b/dependencies.yaml index 738e0639603..ab5da357897 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -640,7 +640,6 @@ dependencies: - pytest-cases>=3.8.2 - scipy - mmh3 - - xxhash - output_types: conda packages: - aiobotocore>=2.2.0 @@ -649,12 +648,17 @@ dependencies: - msgpack-python - moto>=4.0.8 - s3fs>=2022.3.0 + - python-xxhash - output_types: pyproject packages: - msgpack - &tokenizers tokenizers==0.15.2 - &transformers transformers==4.39.3 - tzdata + - xxhash + - output_types: requirements + packages: + - xxhash specific: - output_types: conda matrices: From cbeb9f9a1717c81b0ffa2f820375caa79488a1ba Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 May 2024 14:58:27 -0700 Subject: [PATCH 32/52] linting --- python/cudf/cudf/_lib/pylibcudf/hashing.pyx | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx index e7b868d37ad..0b79831076d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/hashing.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/hashing.pyx @@ -33,7 +33,6 @@ cpdef Column murmurhash3_x86_32( Parameters ---------- - input : Table The table of columns to hash seed : uint32_t @@ -41,7 +40,6 @@ cpdef Column murmurhash3_x86_32( Returns ------- - pylibcudf.Column A column where each row is the hash of a row from the input """ @@ -67,7 +65,6 @@ cpdef Table murmurhash3_x64_128( Parameters ---------- - input : Table The table of columns to hash seed : uint32_t @@ -75,7 +72,6 @@ cpdef Table murmurhash3_x64_128( Returns ------- - pylibcudf.Table A table of two UINT64 columns """ @@ -131,7 +127,6 @@ cpdef Column md5(Table input): Parameters ---------- - input : Table The table of columns to hash @@ -154,13 +149,11 @@ cpdef Column sha1(Table input): Parameters ---------- - input : Table The table of columns to hash Returns ------- - pylibcudf.Column A column where each row is the hash of a row from the input """ @@ -177,7 +170,6 @@ cpdef Column sha224(Table input): Parameters ---------- - input : Table The table of columns to hash @@ -199,7 +191,6 @@ cpdef Column sha256(Table input): Parameters ---------- - input : Table The table of columns to hash @@ -221,7 +212,6 @@ cpdef Column sha384(Table input): Parameters ---------- - input : Table The table of columns to hash @@ -243,7 +233,6 @@ cpdef Column sha512(Table input): Parameters ---------- - input : Table The table of columns to hash From 174af9d52b8f68b2242b343e6702ea9e9641d586 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 May 2024 15:01:05 -0700 Subject: [PATCH 33/52] refactor --- python/cudf/cudf/pylibcudf_tests/test_hashing.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 7150523d0f2..258e01728f8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -48,6 +48,12 @@ def list_struct_table(): return data +def libcudf_mmh3_x86_32(binary): + seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + hashval = mmh3.hash(binary, seed) + return seed ^ (hashval + 0x9E3779B9 + (seed << 6) + (seed >> 2)) + + def python_hash_value(x, method): if isinstance(x, str): binary = str(x).encode() @@ -62,10 +68,7 @@ def python_hash_value(x, method): else: raise NotImplementedError if method == "murmurhash3_x86_32": - # reimplement libcudf hash combine for a single colum - seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - hashval = mmh3.hash(binary, seed) - return seed ^ (hashval + 0x9E3779B9 + (seed << 6) + (seed >> 2)) + return libcudf_mmh3_x86_32(binary) elif method == "murmurhash3_x64_128": hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) hasher.update(binary) From 37a91bfaf9ba4928a66d5db71fcb3c3d9c4ff3fb Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 7 Jul 2024 19:30:16 -0700 Subject: [PATCH 34/52] debug commit --- cpp/src/hash/murmurhash3_x86_32.cu | 2 ++ python/cudf/cudf/pylibcudf_tests/conftest.py | 12 ++++++++++-- python/cudf/cudf/pylibcudf_tests/test_hashing.py | 14 ++++++++++++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/cpp/src/hash/murmurhash3_x86_32.cu b/cpp/src/hash/murmurhash3_x86_32.cu index eac72f5d995..d1b2bf93bdd 100644 --- a/cpp/src/hash/murmurhash3_x86_32.cu +++ b/cpp/src/hash/murmurhash3_x86_32.cu @@ -49,6 +49,8 @@ std::unique_ptr murmurhash3_x86_32(table_view const& input, auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream); auto output_view = output->mutable_view(); + std::cout << "\n\n\n murmurhash \n\n\n" << std::endl; + // Compute the hash value for each row thrust::tabulate(rmm::exec_policy(stream), output_view.begin(), diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index e4760ea7ac8..d321bb55e67 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -16,14 +16,22 @@ from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES - -# This fixture defines the standard set of types that all tests should default to +def _type_to_str(typ): + if isinstance(typ, pa.ListType): + return f"list-{_type_to_str(typ.value_type)}" + elif isinstance(typ, pa.StructType): + return f"struct-{'-'.join([_type_to_str(typ.field(i).type) for i in range(typ.num_fields)])}" + else: + return str(typ) + +# This fixture defines [the standard set of types that all tests should default to # running on. If there is a need for some tests to run on a different set of types, that # type list fixture should also be defined below here if it is likely to be reused # across modules. Otherwise it may be defined on a per-module basis. @pytest.fixture( scope="session", params=DEFAULT_PA_TYPES, + ids=_type_to_str, ) def pa_type(request): return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_hashing.py b/python/cudf/cudf/pylibcudf_tests/test_hashing.py index 6dcc37dd6b2..e8630b3d612 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_hashing.py +++ b/python/cudf/cudf/pylibcudf_tests/test_hashing.py @@ -26,9 +26,15 @@ def pa_input_column(pa_type): return pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) + try: + return pa.array([[1], [2], [3]], type=pa_type) + except: + pytest.skip() elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + try: + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + except: + pytest.skip() raise ValueError("Unsupported type") @@ -130,6 +136,9 @@ def test_hash_column_xxhash64(pa_input_column): pa.Table.from_arrays([pa_input_column], names=["data"]) ) + + if isinstance(pa_input_column.type, (pa.ListType, pa.StructType)): + pytest.xfail() expect = pa.array( [ python_hash_value(val, "xxhash_64") @@ -138,6 +147,7 @@ def test_hash_column_xxhash64(pa_input_column): type=pa.uint64(), ) got = plc.hashing.xxhash_64(plc_tbl, 0) + assert_column_eq(got, expect) From b406b414732c9a37d70e69065f6cd9b79b26a146 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Oct 2024 09:56:44 -0700 Subject: [PATCH 35/52] small updates --- cpp/src/hash/murmurhash3_x86_32.cu | 2 -- python/pylibcudf/pylibcudf/tests/test_hashing.py | 14 +++----------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/cpp/src/hash/murmurhash3_x86_32.cu b/cpp/src/hash/murmurhash3_x86_32.cu index 3e1884192b6..dd7b19633be 100644 --- a/cpp/src/hash/murmurhash3_x86_32.cu +++ b/cpp/src/hash/murmurhash3_x86_32.cu @@ -49,8 +49,6 @@ std::unique_ptr murmurhash3_x86_32(table_view const& input, auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream); auto output_view = output->mutable_view(); - std::cout << "\n\n\n murmurhash \n\n\n" << std::endl; - // Compute the hash value for each row thrust::tabulate(rmm::exec_policy(stream), output_view.begin(), diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index e8630b3d612..19c10856b57 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -6,12 +6,11 @@ import mmh3 import numpy as np import pyarrow as pa +import pylibcudf as plc import pytest import xxhash from utils import assert_column_eq, assert_table_eq -import cudf._lib.pylibcudf as plc - SEED = 0 METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] @@ -26,15 +25,9 @@ def pa_input_column(pa_type): return pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - try: - return pa.array([[1], [2], [3]], type=pa_type) - except: - pytest.skip() + return pa.array([[1], [2], [3]], type=pa_type) elif pa.types.is_struct(pa_type): - try: - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - except: - pytest.skip() + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) raise ValueError("Unsupported type") @@ -136,7 +129,6 @@ def test_hash_column_xxhash64(pa_input_column): pa.Table.from_arrays([pa_input_column], names=["data"]) ) - if isinstance(pa_input_column.type, (pa.ListType, pa.StructType)): pytest.xfail() expect = pa.array( From d730b6f81bd3af9737b00e39dfd2072f358e8c23 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Oct 2024 08:59:01 -0700 Subject: [PATCH 36/52] refactor/pass, missing a few tests --- .../pylibcudf/pylibcudf/tests/test_hashing.py | 169 ++++++++---------- 1 file changed, 78 insertions(+), 91 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 19c10856b57..27d18814837 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -15,25 +15,45 @@ METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] +def scalar_to_binary(x): + if isinstance(x, str): + return x.encode() + elif isinstance(x, float): + return struct.pack("> 2)) + + +@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()]) +def scalar_type(request): + return request.param + + @pytest.fixture -def pa_input_column(pa_type): - if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([1, 2, 3], type=pa_type) - elif pa.types.is_string(pa_type): - return pa.array(["a", "b", "c"], type=pa_type) - elif pa.types.is_boolean(pa_type): - return pa.array([True, True, False], type=pa_type) - elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) - elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture() -def input_column(pa_input_column): - return plc.interop.from_arrow(pa_input_column) +def pa_scalar_input_column(scalar_type): + if pa.types.is_integer(scalar_type) or pa.types.is_floating(scalar_type): + return pa.array([1, 2, 3], type=scalar_type) + elif pa.types.is_string(scalar_type): + return pa.array(["a", "b", "c"], type=scalar_type) + elif pa.types.is_boolean(scalar_type): + return pa.array([True, True, False], type=scalar_type) + + +@pytest.fixture +def plc_scalar_input_tbl(pa_scalar_input_column): + return plc.interop.from_arrow( + pa.Table.from_arrays([pa_scalar_input_column], names=["data"]) + ) @pytest.fixture(scope="module") @@ -47,98 +67,65 @@ def list_struct_table(): return data -def libcudf_mmh3_x86_32(binary): - seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - hashval = mmh3.hash(binary, seed) - return seed ^ (hashval + 0x9E3779B9 + (seed << 6) + (seed >> 2)) - - def python_hash_value(x, method): - if isinstance(x, str): - binary = str(x).encode() - elif isinstance(x, float): - binary = struct.pack(" Date: Fri, 4 Oct 2024 09:26:42 -0700 Subject: [PATCH 37/52] extra test --- python/pylibcudf/pylibcudf/tests/test_hashing.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 27d18814837..3c024e2baae 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -112,9 +112,6 @@ def py_hasher(val): assert_column_eq(got, expect) -# TODO: sha and md5 struct/list errors - - def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -143,6 +140,14 @@ def test_sha_list_struct_err(list_struct_table, dtype, method): plc_hasher(plc_tbl) +def test_md5_struct_err(list_struct_table): + err_types = list_struct_table.select(["struct"]) + plc_tbl = plc.interop.from_arrow(err_types) + + with pytest.raises(TypeError): + plc.hashing.md5(plc_tbl) + + def test_murmurhash3_x86_32(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return libcudf_mmh3_x86_32(scalar_to_binary(val)) From 68c7a49f0caace226dfcc8a40e3318caa3ab42a6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Oct 2024 13:49:36 -0700 Subject: [PATCH 38/52] missing test --- python/pylibcudf/pylibcudf/tests/test_hashing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 3c024e2baae..faf3ab06a54 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -161,6 +161,9 @@ def py_hasher(val): assert_column_eq(got, expect) +# def test_murmurhash_x86_32_list_struct TODO + + def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) From cdd41db886090806b1667b39d399f51617df45d8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Oct 2024 06:29:13 -0700 Subject: [PATCH 39/52] prune moves --- python/pylibcudf/pylibcudf/hashing.pyx | 36 +++++++++++--------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index d4df2d67d17..401c844b23c 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -44,11 +44,9 @@ cpdef Column murmurhash3_x86_32( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_murmurhash3_x86_32( - input.view(), - seed - ) + c_result = cpp_murmurhash3_x86_32( + input.view(), + seed ) return Column.from_libcudf(move(c_result)) @@ -76,11 +74,9 @@ cpdef Table murmurhash3_x64_128( """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_murmurhash3_x64_128( - input.view(), - seed - ) + c_result = cpp_murmurhash3_x64_128( + input.view(), + seed ) return Table.from_libcudf(move(c_result)) @@ -109,11 +105,9 @@ cpdef Column xxhash_64( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_xxhash_64( - input.view(), - seed - ) + c_result = cpp_xxhash_64( + input.view(), + seed ) return Column.from_libcudf(move(c_result)) @@ -138,7 +132,7 @@ cpdef Column md5(Table input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_md5(input.view())) + c_result = cpp_md5(input.view()) return Column.from_libcudf(move(c_result)) cpdef Column sha1(Table input): @@ -158,7 +152,7 @@ cpdef Column sha1(Table input): """ cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_sha1(input.view())) + c_result = cpp_sha1(input.view()) return Column.from_libcudf(move(c_result)) @@ -179,7 +173,7 @@ cpdef Column sha224(Table input): """ cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_sha224(input.view())) + c_result = cpp_sha224(input.view()) return Column.from_libcudf(move(c_result)) @@ -200,7 +194,7 @@ cpdef Column sha256(Table input): """ cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_sha256(input.view())) + c_result = cpp_sha256(input.view()) return Column.from_libcudf(move(c_result)) @@ -221,7 +215,7 @@ cpdef Column sha384(Table input): """ cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_sha384(input.view())) + c_result = cpp_sha384(input.view()) return Column.from_libcudf(move(c_result)) @@ -242,5 +236,5 @@ cpdef Column sha512(Table input): """ cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_sha512(input.view())) + c_result = cpp_sha512(input.view()) return Column.from_libcudf(move(c_result)) From a6ded881d20f0e8907d813989ea574a9b0fe870e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 18 Oct 2024 06:36:00 -0700 Subject: [PATCH 40/52] fixes --- python/pylibcudf/pylibcudf/tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index 245e0ab51a8..45030ed2f02 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -16,6 +16,7 @@ from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES + def _type_to_str(typ): if isinstance(typ, pa.ListType): return f"list-{_type_to_str(typ.value_type)}" @@ -24,6 +25,7 @@ def _type_to_str(typ): else: return str(typ) + # This fixture defines [the standard set of types that all tests should default to # running on. If there is a need for some tests to run on a different set of types, that # type list fixture should also be defined below here if it is likely to be reused From 382c2dc3ff12c52b2c7580698ff7f243276a587d Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 22 Oct 2024 10:28:57 -0500 Subject: [PATCH 41/52] Update docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com> --- docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst index 8a783ac5564..6bd1fbd821b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst @@ -2,5 +2,5 @@ hashing ======= -.. automodule:: cudf._lib.pylibcudf.hashing +.. automodule:: pylibcudf.hashing :members: From a55048a1454dcf1015fdebe44904b830093ce0b1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 22 Oct 2024 15:03:05 -0500 Subject: [PATCH 42/52] Apply suggestions from code review Co-authored-by: Bradley Dice --- dependencies.yaml | 5 +---- python/pylibcudf/pylibcudf/hashing.pyx | 4 ++-- python/pylibcudf/pylibcudf/tests/conftest.py | 4 ++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 960ade0823e..cb983ca07d4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -838,16 +838,13 @@ dependencies: - moto>=4.0.8 - s3fs>=2022.3.0 - python-xxhash - - output_types: pyproject + - output_types: [pyproject, requirements] packages: - msgpack - &tokenizers tokenizers==0.15.2 - &transformers transformers==4.39.3 - tzdata - xxhash - - output_types: requirements - packages: - - xxhash specific: - output_types: [conda, requirements] matrices: diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 401c844b23c..9ea3d4d1bda 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -64,7 +64,7 @@ cpdef Table murmurhash3_x64_128( ---------- input : Table The table of columns to hash - seed : uint32_t + seed : uint64_t Optional seed value to use for the hash function Returns @@ -94,7 +94,7 @@ cpdef Column xxhash_64( ---------- input : Table The table of columns to hash - seed : uint32_t + seed : uint64_t Optional seed value to use for the hash function Returns diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index 45030ed2f02..cf1ee387791 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -19,9 +19,9 @@ def _type_to_str(typ): if isinstance(typ, pa.ListType): - return f"list-{_type_to_str(typ.value_type)}" + return f"list[{_type_to_str(typ.value_type)}]" elif isinstance(typ, pa.StructType): - return f"struct-{'-'.join([_type_to_str(typ.field(i).type) for i in range(typ.num_fields)])}" + return f"struct[{', '.join([_type_to_str(typ.field(i).type) for i in range(typ.num_fields)])}]" else: return str(typ) From 23cd5fede37dc50a8f0eae5efd5642b65f10661b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 22 Oct 2024 13:06:53 -0700 Subject: [PATCH 43/52] combine sha/md5 tests --- .../pylibcudf/pylibcudf/tests/test_hashing.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index faf3ab06a54..ad233a88568 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -84,9 +84,11 @@ def python_hash_value(x, method): @pytest.mark.parametrize( - "method", ["sha1", "sha224", "sha256", "sha384", "sha512"] + "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) -def test_hash_column_sha(pa_scalar_input_column, plc_scalar_input_tbl, method): +def test_hash_column_sha_md5( + pa_scalar_input_column, plc_scalar_input_tbl, method +): plc_hasher = getattr(plc.hashing, method) def py_hasher(val): @@ -100,18 +102,6 @@ def py_hasher(val): assert_column_eq(got, expect) -def test_hash_column_md5(pa_scalar_input_column, plc_scalar_input_tbl): - def py_hasher(val): - return hashlib.md5(scalar_to_binary(val)).hexdigest() - - expect = pa.array( - [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], - type=pa.string(), - ) - got = plc.hashing.md5(plc_scalar_input_tbl) - assert_column_eq(got, expect) - - def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( From 1a4cfadfa2b1cfee3e85c047c345e62cb7861f42 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 27 Oct 2024 06:42:19 -0700 Subject: [PATCH 44/52] struct and list tests, struct still fails --- .../pylibcudf/pylibcudf/tests/test_hashing.py | 65 ++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index ad233a88568..7fe864f1d61 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -28,10 +28,13 @@ def scalar_to_binary(x): raise NotImplementedError +def hash_combine_32(lhs, rhs): + return lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)) + def libcudf_mmh3_x86_32(binary): seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED hashval = mmh3.hash(binary, seed) - return seed ^ (hashval + 0x9E3779B9 + (seed << 6) + (seed >> 2)) + return hash_combine_32(seed, hashval) @pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()]) @@ -151,8 +154,66 @@ def py_hasher(val): assert_column_eq(got, expect) -# def test_murmurhash_x86_32_list_struct TODO +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_murmurhash3_x86_32_list(): + pa_tbl = pa.Table.from_pydict({"list": pa.array([[1, 2, 3], [4,5,6], [7,8,9]], type=pa.list_(pa.uint32()))}) + plc_tbl = plc.interop.from_arrow(pa_tbl) + + def hash_single_uint32(val, seed=0): + return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False) + + def uint_hash_combine_32(lhs, rhs): + lhs = np.uint32(lhs) + rhs = np.uint32(rhs) + return hash_combine_32(lhs, rhs) + + def hash_list(l): + hash_value = uint_hash_combine_32(0, hash_single_uint32(len(l))) + + for element in l: + hash_value = uint_hash_combine_32(hash_value, hash_single_uint32(element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)) + + final = uint_hash_combine_32(plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value) + return final + + expect = pa.array([hash_list(val) for val in pa_tbl["list"].to_pylist()], type=pa.uint32()) + got = plc.hashing.murmurhash3_x86_32(plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + assert_column_eq(got, expect) + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_murmurhash3_x86_32_struct(): + pa_tbl = pa.table( + { + 'struct': pa.array( + [ + {"a": 1, "b": 2} + ], + type=pa.struct([pa.field("a", pa.uint32()), pa.field("b", pa.uint32())]) + ) + } + ) + plc_tbl = plc.interop.from_arrow(pa_tbl) + + def hash_single_uint32(val, seed=0): + return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False) + + def uint_hash_combine_32(lhs, rhs): + lhs = np.uint32(lhs) + rhs = np.uint32(rhs) + return hash_combine_32(lhs, rhs) + + def hash_struct(s): + hash_value = uint_hash_combine_32(0, hash_single_uint32(len(s))) + + for key in s: + hash_value = uint_hash_combine_32(hash_value, hash_single_uint32(s[key], seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)) + final = uint_hash_combine_32(plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value) + return final + + expect = pa.array([hash_struct(val) for val in pa_tbl["struct"].to_pylist()], type=pa.uint32()) + got = plc.hashing.murmurhash3_x86_32(plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + assert_column_eq(got, expect) def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): From 4c37de985da120e231bd732158d1e2674a9c8188 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 27 Oct 2024 08:17:03 -0700 Subject: [PATCH 45/52] pass. --- .../pylibcudf/pylibcudf/tests/test_hashing.py | 104 ++++++++++++------ 1 file changed, 69 insertions(+), 35 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 7fe864f1d61..e541d27d7c1 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -28,9 +28,20 @@ def scalar_to_binary(x): raise NotImplementedError +def hash_single_uint32(val, seed=0): + return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False) + + def hash_combine_32(lhs, rhs): return lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)) + +def uint_hash_combine_32(lhs, rhs): + lhs = np.uint32(lhs) + rhs = np.uint32(rhs) + return hash_combine_32(lhs, rhs) + + def libcudf_mmh3_x86_32(binary): seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED hashval = mmh3.hash(binary, seed) @@ -156,65 +167,88 @@ def py_hasher(val): @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_murmurhash3_x86_32_list(): - pa_tbl = pa.Table.from_pydict({"list": pa.array([[1, 2, 3], [4,5,6], [7,8,9]], type=pa.list_(pa.uint32()))}) + pa_tbl = pa.Table.from_pydict( + { + "list": pa.array( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], type=pa.list_(pa.uint32()) + ) + } + ) plc_tbl = plc.interop.from_arrow(pa_tbl) - def hash_single_uint32(val, seed=0): - return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False) - - def uint_hash_combine_32(lhs, rhs): - lhs = np.uint32(lhs) - rhs = np.uint32(rhs) - return hash_combine_32(lhs, rhs) - - def hash_list(l): - hash_value = uint_hash_combine_32(0, hash_single_uint32(len(l))) + def hash_list(list_): + hash_value = uint_hash_combine_32(0, hash_single_uint32(len(list_))) - for element in l: - hash_value = uint_hash_combine_32(hash_value, hash_single_uint32(element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)) + for element in list_: + hash_value = uint_hash_combine_32( + hash_value, + hash_single_uint32( + element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ), + ) - final = uint_hash_combine_32(plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value) + final = uint_hash_combine_32( + plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value + ) return final - expect = pa.array([hash_list(val) for val in pa_tbl["list"].to_pylist()], type=pa.uint32()) - got = plc.hashing.murmurhash3_x86_32(plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + expect = pa.array( + [hash_list(val) for val in pa_tbl["list"].to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.murmurhash3_x86_32( + plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) assert_column_eq(got, expect) + @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_murmurhash3_x86_32_struct(): pa_tbl = pa.table( { - 'struct': pa.array( + "struct": pa.array( [ - {"a": 1, "b": 2} - ], - type=pa.struct([pa.field("a", pa.uint32()), pa.field("b", pa.uint32())]) + {"a": 1, "b": 2, "c": 3}, + {"a": 4, "b": 5, "c": 6}, + {"a": 7, "b": 8, "c": 9}, + ], + type=pa.struct( + [ + pa.field("a", pa.uint32()), + pa.field("b", pa.uint32(), pa.field("c", pa.uint32())), + ] + ), ) } ) plc_tbl = plc.interop.from_arrow(pa_tbl) - def hash_single_uint32(val, seed=0): - return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False) - - def uint_hash_combine_32(lhs, rhs): - lhs = np.uint32(lhs) - rhs = np.uint32(rhs) - return hash_combine_32(lhs, rhs) - def hash_struct(s): - hash_value = uint_hash_combine_32(0, hash_single_uint32(len(s))) + seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + keys = list(s.keys()) - for key in s: - hash_value = uint_hash_combine_32(hash_value, hash_single_uint32(s[key], seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)) + combined_hash = hash_single_uint32(s[keys[0]], seed=seed) + combined_hash = uint_hash_combine_32(0, combined_hash) + combined_hash = uint_hash_combine_32(seed, combined_hash) - final = uint_hash_combine_32(plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value) - return final + # Step 3: Process the remaining fields + for key in keys[1:]: + current_hash = hash_single_uint32(s[key], seed=seed) + combined_hash = uint_hash_combine_32(combined_hash, current_hash) + + return combined_hash - expect = pa.array([hash_struct(val) for val in pa_tbl["struct"].to_pylist()], type=pa.uint32()) - got = plc.hashing.murmurhash3_x86_32(plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + got = plc.hashing.murmurhash3_x86_32( + plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + expect = pa.array( + [hash_struct(val) for val in pa_tbl["struct"].to_pylist()], + type=pa.uint32(), + ) assert_column_eq(got, expect) + def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) From 542307412663999cd85b4abfd478d17207787168 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 27 Oct 2024 08:17:32 -0700 Subject: [PATCH 46/52] clean --- python/pylibcudf/pylibcudf/tests/test_hashing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index e541d27d7c1..bfc3497a4af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -231,7 +231,6 @@ def hash_struct(s): combined_hash = uint_hash_combine_32(0, combined_hash) combined_hash = uint_hash_combine_32(seed, combined_hash) - # Step 3: Process the remaining fields for key in keys[1:]: current_hash = hash_single_uint32(s[key], seed=seed) combined_hash = uint_hash_combine_32(combined_hash, current_hash) From 46b27a19eaeeea4001c28bb833f1cf07e52914e0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Oct 2024 05:16:47 -0700 Subject: [PATCH 47/52] style --- python/pylibcudf/pylibcudf/tests/test_hashing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index bfc3497a4af..9bef1235c09 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -6,11 +6,12 @@ import mmh3 import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest import xxhash from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + SEED = 0 METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] From 60e5c4c0fe0fb6538c7af2e0ecee9fb4537b86da Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:29:14 -0500 Subject: [PATCH 48/52] Update python/pylibcudf/pylibcudf/tests/conftest.py Co-authored-by: Bradley Dice --- python/pylibcudf/pylibcudf/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index 9aaffd532e3..5265e411c7f 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -22,7 +22,7 @@ def _type_to_str(typ): if isinstance(typ, pa.ListType): return f"list[{_type_to_str(typ.value_type)}]" elif isinstance(typ, pa.StructType): - return f"struct[{', '.join([_type_to_str(typ.field(i).type) for i in range(typ.num_fields)])}]" + return f"struct[{', '.join(_type_to_str(typ.field(i).type) for i in range(typ.num_fields))}]" else: return str(typ) From dcd38c6d0d25464bf6eb1b329d5578a948c19ca5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Oct 2024 15:28:19 -0700 Subject: [PATCH 49/52] update docstrings --- python/pylibcudf/pylibcudf/hashing.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 9ea3d4d1bda..f92fc2ef17a 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -28,7 +28,7 @@ cpdef Column murmurhash3_x86_32( ): """Computes the MurmurHash3 32-bit hash value of each row in the given table. - For details, see :cpp:func:`murmurhash3_x86_32`. + For details, see :cpp:func:`cudf::hashing::murmurhash3_x86_32`. Parameters ---------- @@ -58,7 +58,7 @@ cpdef Table murmurhash3_x64_128( ): """Computes the MurmurHash3 64-bit hash value of each row in the given table. - For details, see :cpp:func:`murmurhash3_x64_128`. + For details, see :cpp:func:`cudf::hashing::murmurhash3_x64_128`. Parameters ---------- @@ -88,7 +88,7 @@ cpdef Column xxhash_64( ): """Computes the xxHash 64-bit hash value of each row in the given table. - For details, see :cpp:func:`xxhash_64`. + For details, see :cpp:func:`cudf::hashing::xxhash_64`. Parameters ---------- @@ -116,7 +116,7 @@ cpdef Column xxhash_64( cpdef Column md5(Table input): """Computes the MD5 hash value of each row in the given table. - For details, see :cpp:func:`md5`. + For details, see :cpp:func:`cudf::hashing::md5`. Parameters ---------- @@ -138,7 +138,7 @@ cpdef Column md5(Table input): cpdef Column sha1(Table input): """Computes the SHA-1 hash value of each row in the given table. - For details, see :cpp:func:`sha1`. + For details, see :cpp:func:`cudf::hashing::sha1`. Parameters ---------- @@ -159,7 +159,7 @@ cpdef Column sha1(Table input): cpdef Column sha224(Table input): """Computes the SHA-224 hash value of each row in the given table. - For details, see :cpp:func:`sha224`. + For details, see :cpp:func:`cudf::hashing::sha224`. Parameters ---------- @@ -180,7 +180,7 @@ cpdef Column sha224(Table input): cpdef Column sha256(Table input): """Computes the SHA-256 hash value of each row in the given table. - For details, see :cpp:func:`sha256`. + For details, see :cpp:func:`cudf::hashing::sha256`. Parameters ---------- @@ -201,7 +201,7 @@ cpdef Column sha256(Table input): cpdef Column sha384(Table input): """Computes the SHA-384 hash value of each row in the given table. - For details, see :cpp:func:`sha384`. + For details, see :cpp:func:`cudf::hashing::sha384`. Parameters ---------- @@ -222,7 +222,7 @@ cpdef Column sha384(Table input): cpdef Column sha512(Table input): """Computes the SHA-512 hash value of each row in the given table. - For details, see :cpp:func:`sha512`. + For details, see :cpp:func:`cudf::hashing::sha512`. Parameters ---------- From 7f3157bece66534601adc8b829419ee13cab0a17 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 29 Oct 2024 19:35:39 -0700 Subject: [PATCH 50/52] enforce uint32 --- python/pylibcudf/pylibcudf/tests/test_hashing.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 9bef1235c09..83fb50fa4ef 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -34,13 +34,11 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)) + return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) def uint_hash_combine_32(lhs, rhs): - lhs = np.uint32(lhs) - rhs = np.uint32(rhs) - return hash_combine_32(lhs, rhs) + return hash_combine_32(np.uint32(lhs), np.uint32(rhs)) def libcudf_mmh3_x86_32(binary): From 08b88184c379bda53652252aed2216b681c7bf2a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 30 Oct 2024 12:31:45 -0700 Subject: [PATCH 51/52] adjust doxygen tags --- cpp/include/cudf/hashing.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 0c5327edb91..75790885aeb 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -22,12 +22,6 @@ namespace CUDF_EXPORT cudf { -/** - * @addtogroup column_hash - * @{ - * @file - */ - /** * @brief Type of hash value * @@ -42,6 +36,12 @@ static constexpr uint32_t DEFAULT_HASH_SEED = 0; //! Hash APIs namespace hashing { +/** + * @addtogroup column_hash + * @{ + * @file + */ + /** * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table * @@ -183,7 +183,8 @@ std::unique_ptr xxhash_64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** @} */ // end of group + } // namespace hashing -/** @} */ // end of group } // namespace CUDF_EXPORT cudf From d0234d4ba5a6552ab8936fb8b660e3bdbdf1d3c4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 30 Oct 2024 18:04:36 -0700 Subject: [PATCH 52/52] doc fixes --- cpp/include/cudf/hashing.hpp | 3 ++- python/pylibcudf/pylibcudf/hashing.pyx | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 75790885aeb..307a52cd242 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -24,12 +24,13 @@ namespace CUDF_EXPORT cudf { /** * @brief Type of hash value - * + * @ingroup column_hash */ using hash_value_type = uint32_t; /** * @brief The default seed value for hash functions + * @ingroup column_hash */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index f92fc2ef17a..9ea3d4d1bda 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -28,7 +28,7 @@ cpdef Column murmurhash3_x86_32( ): """Computes the MurmurHash3 32-bit hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::murmurhash3_x86_32`. + For details, see :cpp:func:`murmurhash3_x86_32`. Parameters ---------- @@ -58,7 +58,7 @@ cpdef Table murmurhash3_x64_128( ): """Computes the MurmurHash3 64-bit hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::murmurhash3_x64_128`. + For details, see :cpp:func:`murmurhash3_x64_128`. Parameters ---------- @@ -88,7 +88,7 @@ cpdef Column xxhash_64( ): """Computes the xxHash 64-bit hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::xxhash_64`. + For details, see :cpp:func:`xxhash_64`. Parameters ---------- @@ -116,7 +116,7 @@ cpdef Column xxhash_64( cpdef Column md5(Table input): """Computes the MD5 hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::md5`. + For details, see :cpp:func:`md5`. Parameters ---------- @@ -138,7 +138,7 @@ cpdef Column md5(Table input): cpdef Column sha1(Table input): """Computes the SHA-1 hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::sha1`. + For details, see :cpp:func:`sha1`. Parameters ---------- @@ -159,7 +159,7 @@ cpdef Column sha1(Table input): cpdef Column sha224(Table input): """Computes the SHA-224 hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::sha224`. + For details, see :cpp:func:`sha224`. Parameters ---------- @@ -180,7 +180,7 @@ cpdef Column sha224(Table input): cpdef Column sha256(Table input): """Computes the SHA-256 hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::sha256`. + For details, see :cpp:func:`sha256`. Parameters ---------- @@ -201,7 +201,7 @@ cpdef Column sha256(Table input): cpdef Column sha384(Table input): """Computes the SHA-384 hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::sha384`. + For details, see :cpp:func:`sha384`. Parameters ---------- @@ -222,7 +222,7 @@ cpdef Column sha384(Table input): cpdef Column sha512(Table input): """Computes the SHA-512 hash value of each row in the given table. - For details, see :cpp:func:`cudf::hashing::sha512`. + For details, see :cpp:func:`sha512`. Parameters ----------