Skip to content

Commit

Permalink
add xxhash and murmurhash3 (#30)
Browse files Browse the repository at this point in the history
* more things

* mm128 returns bytes

* restructure

* lint

* allow seeding all new funcs

* cleanup

* swap hashing crates

* restructure
  • Loading branch information
moritzwilksch authored Nov 27, 2024
1 parent fb1b24e commit e71b15e
Show file tree
Hide file tree
Showing 8 changed files with 297 additions and 1 deletion.
4 changes: 3 additions & 1 deletion polars_hash/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ sha1 = { version = "0.10.6" }
sha2 = { version = "0.10.8" }
sha3 = { version = "0.10.8" }
blake3 = { version = "1.5.4" }
md5 = {version = "0.7.0"}
md5 = { version = "0.7.0" }
h3o = { version = "0.6.4" }
xxhash-rust = { version = "0.8.12", features = ["xxh32", "xxh64"] }
mur3 = { version = "0.1.0" }


[target.'cfg(target_os = "linux")'.dependencies]
Expand Down
4 changes: 4 additions & 0 deletions polars_hash/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ format: venv ## Run formatters
unset CONDA_PREFIX && \
source venv/bin/activate && ruff format . && cargo fmt

# Runs pytest against tests/ after activating the project virtualenv.
# `venv` and `install` are prerequisites, so they are brought up to date first.
# CONDA_PREFIX is unset before the venv is activated.
test: venv install ## Run pytests
unset CONDA_PREFIX && \
source venv/bin/activate && pytest tests/

clean: ## Clean venv and clean cargo
-@rm -r venv
-@cd polars_hash && cargo clean
Expand Down
40 changes: 40 additions & 0 deletions polars_hash/polars_hash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,46 @@ def md5(self) -> pl.Expr:
is_elementwise=True,
)

def murmur32(self, *, seed: int = 0) -> pl.Expr:
    """Hash each Utf8 value to a uint32 using murmur32.

    The keyword-only ``seed`` (default 0) is forwarded to the plugin as a kwarg.
    """
    plugin_dir = Path(__file__).parent
    plugin_kwargs = dict(seed=seed)
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="murmur32",
        args=self._expr,
        kwargs=plugin_kwargs,
        is_elementwise=True,
    )

def murmur128(self, *, seed: int = 0) -> pl.Expr:
    """Hash each Utf8 value to a binary digest using murmur128.

    The keyword-only ``seed`` (default 0) is forwarded to the plugin as a kwarg.
    """
    plugin_dir = Path(__file__).parent
    plugin_kwargs = dict(seed=seed)
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="murmur128",
        args=self._expr,
        kwargs=plugin_kwargs,
        is_elementwise=True,
    )

def xxhash32(self, *, seed: int = 0) -> pl.Expr:
    """Hash each Utf8 value to a uint32 using xxhash32.

    The keyword-only ``seed`` (default 0) is forwarded to the plugin as a kwarg.
    """
    plugin_dir = Path(__file__).parent
    plugin_kwargs = dict(seed=seed)
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="xxhash32",
        args=self._expr,
        kwargs=plugin_kwargs,
        is_elementwise=True,
    )

def xxhash64(self, *, seed: int = 0) -> pl.Expr:
    """Hash each Utf8 value to a uint64 using xxhash64.

    The keyword-only ``seed`` (default 0) is forwarded to the plugin as a kwarg.
    """
    plugin_dir = Path(__file__).parent
    plugin_kwargs = dict(seed=seed)
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="xxhash64",
        args=self._expr,
        kwargs=plugin_kwargs,
        is_elementwise=True,
    )


@pl.api.register_expr_namespace("geohash")
class GeoHashingNameSpace:
Expand Down
49 changes: 49 additions & 0 deletions polars_hash/src/expressions.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use crate::geohashers::{geohash_decoder, geohash_encoder, geohash_neighbors};
use crate::h3::h3_encoder;
use crate::murmurhash_hashers::*;
use crate::sha_hashers::*;
use crate::xxhash_hashers::*;
use polars::{
chunked_array::ops::arity::{
try_binary_elementwise, try_ternary_elementwise, unary_elementwise,
Expand All @@ -13,10 +15,21 @@ use polars_core::datatypes::{
Field,
};
use pyo3_polars::derive::polars_expr;
use serde::Deserialize;
use std::fmt::Write;
use std::{str, string};
use wyhash::wyhash as real_wyhash;

/// Keyword arguments for the expressions that take a 32-bit seed
/// (`murmur32`, `murmur128` and `xxhash32` in this file).
#[derive(Deserialize)]
struct SeedKwargs32bit {
/// Seed handed to the underlying hash function; must deserialize as a `u32`.
seed: u32,
}

/// Keyword arguments for the expressions that take a 64-bit seed
/// (`xxhash64` in this file).
#[derive(Deserialize)]
struct SeedKwargs64bit {
/// Seed handed to the underlying hash function; must deserialize as a `u64`.
seed: u64,
}

pub fn blake3_hash_str(value: &str, output: &mut string::String) {
let hash = blake3::hash(value.as_bytes());
write!(output, "{}", hash).unwrap()
Expand Down Expand Up @@ -294,3 +307,39 @@ fn ghash_neighbors(inputs: &[Series]) -> PolarsResult<Series> {

Ok(geohash_neighbors(ca)?.into_series())
}

/// Polars expression: hashes the string column in `inputs[0]` with
/// `murmurhash3_32`, using the seed from `kwargs`, into a `UInt32` series.
///
/// Returns an error if the first input is not a string column.
#[polars_expr(output_type=UInt32)]
fn murmur32(inputs: &[Series], kwargs: SeedKwargs32bit) -> PolarsResult<Series> {
    let seed = kwargs.seed;
    let strings = inputs[0].str()?;

    let hashed: ChunkedArray<UInt32Type> =
        unary_elementwise(strings, |value| murmurhash3_32(value, seed));
    Ok(hashed.into_series())
}

/// Polars expression: hashes the string column in `inputs[0]` with
/// `murmurhash3_128`, using the seed from `kwargs`, into a `Binary` series.
///
/// Returns an error if the first input is not a string column.
#[polars_expr(output_type=Binary)]
fn murmur128(inputs: &[Series], kwargs: SeedKwargs32bit) -> PolarsResult<Series> {
    let seed = kwargs.seed;
    let strings = inputs[0].str()?;

    let hashed: ChunkedArray<BinaryType> =
        unary_elementwise(strings, |value| murmurhash3_128(value, seed));
    Ok(hashed.into_series())
}

/// Polars expression: hashes the string column in `inputs[0]` with
/// `xxhash_32`, using the seed from `kwargs`, into a `UInt32` series.
///
/// Returns an error if the first input is not a string column.
#[polars_expr(output_type=UInt32)]
fn xxhash32(inputs: &[Series], kwargs: SeedKwargs32bit) -> PolarsResult<Series> {
    let seed = kwargs.seed;
    let strings = inputs[0].str()?;

    let hashed: ChunkedArray<UInt32Type> =
        unary_elementwise(strings, |value| xxhash_32(value, seed));
    Ok(hashed.into_series())
}

/// Polars expression: hashes the string column in `inputs[0]` with
/// `xxhash_64`, using the seed from `kwargs`, into a `UInt64` series.
///
/// Returns an error if the first input is not a string column.
#[polars_expr(output_type=UInt64)]
fn xxhash64(inputs: &[Series], kwargs: SeedKwargs64bit) -> PolarsResult<Series> {
    let seed = kwargs.seed;
    let strings = inputs[0].str()?;

    let hashed: ChunkedArray<UInt64Type> =
        unary_elementwise(strings, |value| xxhash_64(value, seed));
    Ok(hashed.into_series())
}
3 changes: 3 additions & 0 deletions polars_hash/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
mod expressions;
mod geohashers;
mod h3;
mod murmurhash_hashers;
mod sha_hashers;
mod xxhash_hashers;

use pyo3::types::PyModule;
use pyo3::{pymodule, Bound, PyResult, Python};
use pyo3_polars::PolarsAllocator;
Expand Down
17 changes: 17 additions & 0 deletions polars_hash/src/murmurhash_hashers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use mur3::murmurhash3_x64_128;
use mur3::murmurhash3_x86_32;
/// Hashes the UTF-8 bytes of `value` with `murmurhash3_x86_32` and `seed`.
///
/// A `None` input yields `None`.
pub fn murmurhash3_32(value: Option<&str>, seed: u32) -> Option<u32> {
    let text = value?;
    Some(murmurhash3_x86_32(text.as_bytes(), seed))
}

/// Hashes the UTF-8 bytes of `value` with `murmurhash3_x64_128` and `seed`.
///
/// The result is the little-endian bytes of the first tuple element followed
/// by the little-endian bytes of the second. A `None` input yields `None`.
pub fn murmurhash3_128(value: Option<&str>, seed: u32) -> Option<Vec<u8>> {
    value.map(|v| {
        let hash = murmurhash3_x64_128(v.as_bytes(), seed);

        // The digest size is fixed, so reserve it once instead of letting the
        // vector regrow between the two appends on every row.
        let mut result = Vec::with_capacity(16);
        result.extend_from_slice(&hash.0.to_le_bytes());
        result.extend_from_slice(&hash.1.to_le_bytes());

        result
    })
}
10 changes: 10 additions & 0 deletions polars_hash/src/xxhash_hashers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
use xxhash_rust::xxh32::xxh32;
use xxhash_rust::xxh64::xxh64;

/// Hashes the UTF-8 bytes of `value` with `xxh32` and `seed`.
///
/// A `None` input yields `None`.
pub fn xxhash_32(value: Option<&str>, seed: u32) -> Option<u32> {
    let text = value?;
    Some(xxh32(text.as_bytes(), seed))
}

/// Hashes the UTF-8 bytes of `value` with `xxh64` and `seed`.
///
/// A `None` input yields `None`.
pub fn xxhash_64(value: Option<&str>, seed: u64) -> Option<u64> {
    let text = value?;
    Some(xxh64(text.as_bytes(), seed))
}
171 changes: 171 additions & 0 deletions polars_hash/tests/test_hash.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import polars as pl
import pytest
from polars.exceptions import ComputeError
from polars.testing import assert_frame_equal

import polars_hash as plh # noqa: F401
Expand Down Expand Up @@ -200,3 +202,172 @@ def test_geohash_13():
]
)
assert_frame_equal(result, expected)


def test_murmurhash32():
    """murmur32 with the default seed yields UInt32 hashes and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(pl.col("literal").nchash.murmur32())  # type: ignore

    expected_values = [3531928679, None, 0]
    expected = pl.Series("literal", expected_values, dtype=pl.UInt32).to_frame()

    assert_frame_equal(hashed, expected)


def test_murmurhash32_seeded():
    """murmur32 with seed=42 yields the seeded UInt32 hashes and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(plh.col("literal").nchash.murmur32(seed=42))

    expected_values = [259561949, None, 142593372]
    expected = pl.Series("literal", expected_values, dtype=pl.UInt32).to_frame()

    assert_frame_equal(hashed, expected)


def test_murmurhash128():
    """murmur128 with the default seed yields Binary digests and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(plh.col("literal").nchash.murmur128())

    expected_values = [
        b"\x98,\xf3\x9e\x1c\x1a\xa5]\x1b\x07\x97\x16\x07l\x8de",
        None,
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
    ]
    expected = pl.Series("literal", expected_values, dtype=pl.Binary).to_frame()

    assert_frame_equal(hashed, expected)


def test_xxhash32():
    """xxhash32 with the default seed yields UInt32 hashes and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(pl.col("literal").nchash.xxhash32())  # type: ignore

    expected_values = [1605956417, None, 46947589]
    expected = pl.Series("literal", expected_values, dtype=pl.UInt32).to_frame()

    assert_frame_equal(hashed, expected)


def test_xxhash64():
    """xxhash64 with the default seed yields UInt64 hashes and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(pl.col("literal").nchash.xxhash64())  # type: ignore

    expected_values = [5654987600477331689, None, 17241709254077376921]
    expected = pl.Series("literal", expected_values, dtype=pl.UInt64).to_frame()

    assert_frame_equal(hashed, expected)


def test_big():
    """Smoke-test xxhash64 on a large exploded column.

    The original only printed the result, so it could never fail on a wrong
    output; the assertions below pin row count, dtype and absence of nulls.
    """
    df = (
        pl.DataFrame({"a": ["asdfasdf" * 1_000_000]})
        .with_columns(pl.col("a").str.split(""))
        .explode("a")
    )
    result = df.select(plh.col("a").nchash.xxhash64())

    # The expression is elementwise, so the output must have one row per input row.
    assert result.height == df.height
    assert result["a"].dtype == pl.UInt64
    # No input value is null, so no hash may be null either.
    assert result["a"].null_count() == 0


def test_xxhash32_seeded():
    """xxhash32 with seed=42 yields the seeded UInt32 hashes and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(pl.col("literal").nchash.xxhash32(seed=42))  # type: ignore

    expected_values = [1544934469, None, 3586027192]
    expected = pl.Series("literal", expected_values, dtype=pl.UInt32).to_frame()

    assert_frame_equal(hashed, expected)


def test_xxhash64_seeded():
    """xxhash64 with seed=42 yields the seeded UInt64 hashes and keeps nulls."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(pl.col("literal").nchash.xxhash64(seed=42))  # type: ignore

    expected_values = [17477110538672341566, None, 11002672306508523268]
    expected = pl.Series("literal", expected_values, dtype=pl.UInt64).to_frame()

    assert_frame_equal(hashed, expected)


@pytest.mark.parametrize(
    "hash_fn_expr",
    [
        plh.col("literal").nchash.xxhash32(seed=None),  # type: ignore
    ],
)
def test_forced_missing_seed_errors(hash_fn_expr):
    """Passing seed=None explicitly must raise a ComputeError mentioning u32."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})

    with pytest.raises(ComputeError, match="expected u32"):
        frame.select(hash_fn_expr)

0 comments on commit e71b15e

Please sign in to comment.