diff --git a/polars_hash/Cargo.toml b/polars_hash/Cargo.toml index 42b8ea0..dd5734b 100644 --- a/polars_hash/Cargo.toml +++ b/polars_hash/Cargo.toml @@ -19,8 +19,10 @@ sha1 = { version = "0.10.6" } sha2 = { version = "0.10.8" } sha3 = { version = "0.10.8" } blake3 = { version = "1.5.4" } -md5 = {version = "0.7.0"} +md5 = { version = "0.7.0" } h3o = { version = "0.6.4" } +xxhash-rust = { version = "0.8.12", features = ["xxh32", "xxh64"] } +mur3 = { version = "0.1.0" } [target.'cfg(target_os = "linux")'.dependencies] diff --git a/polars_hash/Makefile b/polars_hash/Makefile index 3a1d80e..813a8e3 100644 --- a/polars_hash/Makefile +++ b/polars_hash/Makefile @@ -21,6 +21,10 @@ format: venv ## Run formatters unset CONDA_PREFIX && \ source venv/bin/activate && ruff format . && cargo fmt +test: venv install ## Run pytests + unset CONDA_PREFIX && \ + source venv/bin/activate && pytest tests/ + clean: ## Clean venv and clean cargo -@rm -r venv -@cd polars_hash && cargo clean diff --git a/polars_hash/polars_hash/__init__.py b/polars_hash/polars_hash/__init__.py index a309c9f..45417c3 100644 --- a/polars_hash/polars_hash/__init__.py +++ b/polars_hash/polars_hash/__init__.py @@ -148,6 +148,46 @@ def md5(self) -> pl.Expr: is_elementwise=True, ) + def murmur32(self, *, seed: int = 0) -> pl.Expr: + """Takes Utf8 as input and returns uint32 hash with murmur32.""" + return register_plugin_function( + plugin_path=Path(__file__).parent, + function_name="murmur32", + args=self._expr, + is_elementwise=True, + kwargs={"seed": seed}, + ) + + def murmur128(self, *, seed: int = 0) -> pl.Expr: + """Takes Utf8 as input and returns binary hash with murmur128.""" + return register_plugin_function( + plugin_path=Path(__file__).parent, + function_name="murmur128", + args=self._expr, + is_elementwise=True, + kwargs={"seed": seed}, + ) + + def xxhash32(self, *, seed: int = 0) -> pl.Expr: + """Takes Utf8 as input and returns uint32 hash with xxhash32.""" + return register_plugin_function( + plugin_path=Path(__file__).parent, + function_name="xxhash32", + args=self._expr, + is_elementwise=True, + kwargs={"seed": seed}, + ) + + def xxhash64(self, *, seed: int = 0) -> pl.Expr: + """Takes Utf8 as input and returns uint64 hash with xxhash64.""" + return register_plugin_function( + plugin_path=Path(__file__).parent, + function_name="xxhash64", + args=self._expr, + is_elementwise=True, + kwargs={"seed": seed}, + ) + @pl.api.register_expr_namespace("geohash") class GeoHashingNameSpace: diff --git a/polars_hash/src/expressions.rs b/polars_hash/src/expressions.rs index 59ba4e3..a0d19c6 100644 --- a/polars_hash/src/expressions.rs +++ b/polars_hash/src/expressions.rs @@ -1,6 +1,8 @@ use crate::geohashers::{geohash_decoder, geohash_encoder, geohash_neighbors}; use crate::h3::h3_encoder; +use crate::murmurhash_hashers::*; use crate::sha_hashers::*; +use crate::xxhash_hashers::*; use polars::{ chunked_array::ops::arity::{ try_binary_elementwise, try_ternary_elementwise, unary_elementwise, @@ -13,10 +15,21 @@ use polars_core::datatypes::{ Field, }; use pyo3_polars::derive::polars_expr; +use serde::Deserialize; use std::fmt::Write; use std::{str, string}; use wyhash::wyhash as real_wyhash; +#[derive(Deserialize)] +struct SeedKwargs32bit { + seed: u32, +} + +#[derive(Deserialize)] +struct SeedKwargs64bit { + seed: u64, +} + pub fn blake3_hash_str(value: &str, output: &mut string::String) { let hash = blake3::hash(value.as_bytes()); write!(output, "{}", hash).unwrap() @@ -294,3 +307,39 @@ fn ghash_neighbors(inputs: &[Series]) -> PolarsResult { Ok(geohash_neighbors(ca)?.into_series()) } + +#[polars_expr(output_type=UInt32)] +fn murmur32(inputs: &[Series], kwargs: SeedKwargs32bit) -> PolarsResult { + let seeded_hash_function = |v| murmurhash3_32(v, kwargs.seed); + + let ca = inputs[0].str()?; + let out: ChunkedArray = unary_elementwise(ca, seeded_hash_function); + Ok(out.into_series()) +} + +#[polars_expr(output_type=Binary)] +fn murmur128(inputs: &[Series], kwargs: SeedKwargs32bit) -> PolarsResult { + let seeded_hash_function = |v| murmurhash3_128(v, kwargs.seed); + + let ca = inputs[0].str()?; + let out: ChunkedArray = unary_elementwise(ca, seeded_hash_function); + Ok(out.into_series()) +} + +#[polars_expr(output_type=UInt32)] +fn xxhash32(inputs: &[Series], kwargs: SeedKwargs32bit) -> PolarsResult { + let seeded_hash_function = |v| xxhash_32(v, kwargs.seed); + + let ca = inputs[0].str()?; + let out: ChunkedArray = unary_elementwise(ca, seeded_hash_function); + Ok(out.into_series()) +} + +#[polars_expr(output_type=UInt64)] +fn xxhash64(inputs: &[Series], kwargs: SeedKwargs64bit) -> PolarsResult { + let seeded_hash_function = |v| xxhash_64(v, kwargs.seed); + + let ca = inputs[0].str()?; + let out: ChunkedArray = unary_elementwise(ca, seeded_hash_function); + Ok(out.into_series()) +} diff --git a/polars_hash/src/lib.rs b/polars_hash/src/lib.rs index deceb87..216e813 100644 --- a/polars_hash/src/lib.rs +++ b/polars_hash/src/lib.rs @@ -1,7 +1,10 @@ mod expressions; mod geohashers; mod h3; +mod murmurhash_hashers; mod sha_hashers; +mod xxhash_hashers; + use pyo3::types::PyModule; use pyo3::{pymodule, Bound, PyResult, Python}; use pyo3_polars::PolarsAllocator; diff --git a/polars_hash/src/murmurhash_hashers.rs b/polars_hash/src/murmurhash_hashers.rs new file mode 100644 index 0000000..cd14f72 --- /dev/null +++ b/polars_hash/src/murmurhash_hashers.rs @@ -0,0 +1,17 @@ +use mur3::murmurhash3_x64_128; +use mur3::murmurhash3_x86_32; +pub fn murmurhash3_32(value: Option<&str>, seed: u32) -> Option { + value.map(|v| murmurhash3_x86_32(v.as_bytes(), seed)) +} + +pub fn murmurhash3_128(value: Option<&str>, seed: u32) -> Option> { + value.map(|v| { + let mut result = Vec::new(); + let hash = murmurhash3_x64_128(v.as_bytes(), seed); + + result.extend_from_slice(hash.0.to_le_bytes().as_ref()); + result.extend_from_slice(hash.1.to_le_bytes().as_ref()); + + result + }) +} diff --git a/polars_hash/src/xxhash_hashers.rs b/polars_hash/src/xxhash_hashers.rs new file mode 100644 index 0000000..be5d17b --- /dev/null +++ b/polars_hash/src/xxhash_hashers.rs @@ -0,0 +1,10 @@ +use xxhash_rust::xxh32::xxh32; +use xxhash_rust::xxh64::xxh64; + +pub fn xxhash_32(value: Option<&str>, seed: u32) -> Option { + value.map(|v| xxh32(v.as_bytes(), seed)) +} + +pub fn xxhash_64(value: Option<&str>, seed: u64) -> Option { + value.map(|v| xxh64(v.as_bytes(), seed)) +} diff --git a/polars_hash/tests/test_hash.py b/polars_hash/tests/test_hash.py index 4394c93..c7f168f 100644 --- a/polars_hash/tests/test_hash.py +++ b/polars_hash/tests/test_hash.py @@ -1,4 +1,6 @@ import polars as pl +import pytest +from polars.exceptions import ComputeError from polars.testing import assert_frame_equal import polars_hash as plh # noqa: F401 @@ -200,3 +202,172 @@ def test_geohash_13(): ] ) assert_frame_equal(result, expected) + + +def test_murmurhash32(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(pl.col("literal").nchash.murmur32()) # type: ignore + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + 3531928679, + None, + 0, + ], + dtype=pl.UInt32, + ), + ] + ) + + assert_frame_equal(result, expected) + + +def test_murmurhash32_seeded(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(plh.col("literal").nchash.murmur32(seed=42)) + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + 259561949, + None, + 142593372, + ], + dtype=pl.UInt32, + ), + ] + ) + + assert_frame_equal(result, expected) + + +def test_murmurhash128(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(plh.col("literal").nchash.murmur128()) + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + b"\x98,\xf3\x9e\x1c\x1a\xa5]\x1b\x07\x97\x16\x07l\x8de", + None, + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + ], + dtype=pl.Binary, + ), + ] + ) + + assert_frame_equal(result, expected) + + +def test_xxhash32(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(pl.col("literal").nchash.xxhash32()) # type: ignore + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + 1605956417, + None, + 46947589, + ], + dtype=pl.UInt32, + ), + ] + ) + + assert_frame_equal(result, expected) + + +def test_xxhash64(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(pl.col("literal").nchash.xxhash64()) # type: ignore + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + 5654987600477331689, + None, + 17241709254077376921, + ], + dtype=pl.UInt64, + ), + ] + ) + + assert_frame_equal(result, expected) + + +def test_big(): + df = ( + pl.DataFrame({"a": ["asdfasdf" * 1_000_000]}) + .with_columns(pl.col("a").str.split("")) + .explode("a") + ) + print(df.select(plh.col("a").nchash.xxhash64())) + + +def test_xxhash32_seeded(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(pl.col("literal").nchash.xxhash32(seed=42)) # type: ignore + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + 1544934469, + None, + 3586027192, + ], + dtype=pl.UInt32, + ), + ] + ) + + assert_frame_equal(result, expected) + + +def test_xxhash64_seeded(): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + result = df.select(pl.col("literal").nchash.xxhash64(seed=42)) # type: ignore + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + [ + 17477110538672341566, + None, + 11002672306508523268, + ], + dtype=pl.UInt64, + ), + ] + ) + + assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("hash_fn_expr"), + [ + plh.col("literal").nchash.xxhash32(seed=None), # type: ignore + ], +) +def test_forced_missing_seed_errors(hash_fn_expr): + df = pl.DataFrame({"literal": ["hello_world", None, ""]}) + + with pytest.raises(ComputeError, match="expected u32"): + df.select(hash_fn_expr)