Auto-convert Java values (arrays/scalars) to numpy ones and convert DH nulls based on the annotations of the parameters of a Py UDF #4502

Merged (18 commits) on Dec 1, 2023
Changes from 12 commits
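As a rough sketch of what this change enables (illustrative only, not code from this PR): a Python UDF whose parameters carry numpy type annotations receives numpy scalars/arrays instead of raw Java values, and Deephaven nulls are handled according to those annotations. The table expressions and the UDF name below are made up; empty_table, update, and group_by are the standard Deephaven Python API.

import numpy as np
import numpy.typing as npt
from deephaven import empty_table

def vec_sum(xs: npt.NDArray[np.int64]) -> np.int64:
    # The grouped Java long[] column arrives as a numpy int64 array per the annotation.
    return np.sum(xs)

t = empty_table(10).update(["X = ii"]).group_by().update(["Total = vec_sum(X)"])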
PyCallableWrapperJpyImpl.java
@@ -29,12 +29,13 @@ public class PyCallableWrapperJpyImpl implements PyCallableWrapper {
private static final Map<Character, Class<?>> numpyType2JavaClass = new HashMap<>();

static {
numpyType2JavaClass.put('b', byte.class);
numpyType2JavaClass.put('h', short.class);
numpyType2JavaClass.put('H', char.class);
numpyType2JavaClass.put('i', int.class);
numpyType2JavaClass.put('l', long.class);
numpyType2JavaClass.put('f', float.class);
numpyType2JavaClass.put('d', double.class);
numpyType2JavaClass.put('?', boolean.class);
numpyType2JavaClass.put('U', String.class);
numpyType2JavaClass.put('M', Instant.class);
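For reference (an illustrative check, not part of the diff): the map keys above are numpy dtype character codes, which can be inspected from Python; 'H' is numpy's unsigned 16-bit integer, the representation used here for Java char.

import numpy as np

# Print the numpy dtype behind each character code used in numpyType2JavaClass.
for code in "bhHilfd?UM":
    print(code, np.dtype(code))  # e.g. 'H' -> uint16, 'M' -> datetime64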
@@ -133,23 +134,21 @@ private void prepareSignature() {
pyCallable
+ " has multiple signatures; this is not currently supported for numba vectorized/guvectorized functions");
}
signature = params.get(0).getStringValue();
unwrapped = pyCallable;
// since vectorization doesn't support array type parameters, don't flag numba guvectorized as vectorized
numbaVectorized = isNumbaVectorized;
vectorized = isNumbaVectorized;
} else if (pyCallable.hasAttribute("dh_vectorized")) {
signature = pyCallable.getAttribute("signature").toString();
unwrapped = pyCallable.getAttribute("callable");
numbaVectorized = false;
vectorized = true;
} else {
signature = dh_table_module.call("_encode_signature", pyCallable).toString();
unwrapped = pyCallable;
numbaVectorized = false;
vectorized = false;
}
pyUdfDecoratedCallable = dh_table_module.call("_py_udf", unwrapped);
signature = pyUdfDecoratedCallable.getAttribute("signature").toString();
}

@Override
61 changes: 49 additions & 12 deletions py/server/deephaven/dtypes.py
@@ -102,6 +102,8 @@ def __call__(self, *args, **kwargs):
"""Double-precision floating-point number type"""
string = DType(j_name="java.lang.String", qst_type=_JQstType.stringType(), np_type=np.str_)
"""String type"""
Character = DType(j_name="java.lang.Character")
"""Character type"""
BigDecimal = DType(j_name="java.math.BigDecimal")
"""Java BigDecimal type"""
StringSet = DType(j_name="io.deephaven.stringset.StringSet")
@@ -188,6 +190,20 @@ def __call__(self, *args, **kwargs):
}


_J_ARRAY_NP_TYPE_MAP = {
boolean_array.j_type: np.dtype("?"),
byte_array.j_type: np.dtype("b"),
char_array.j_type: np.dtype("uint16"),
short_array.j_type: np.dtype("h"),
int32_array.j_type: np.dtype("i"),
long_array.j_type: np.dtype("l"),
float32_array.j_type: np.dtype("f"),
double_array.j_type: np.dtype("d"),
string_array.j_type: np.dtype("U"),
instant_array.j_type: np.dtype("datetime64[ns]"),
}
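A quick spot-check of the numpy side of this mapping (illustrative, not part of the module):

import numpy as np

assert np.dtype("uint16").itemsize == 2        # 16 bits, matching Java char
assert np.dtype("datetime64[ns]").kind == "M"  # nanosecond timestamps, matching java.time.Instant
assert np.dtype("b") == np.int8                # signed byte, matching Java byte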


def null_remap(dtype: DType) -> Callable[[Any], Any]:
""" Creates a null value remap function for the provided DType.

@@ -325,8 +341,19 @@ def from_np_dtype(np_dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]) -
return PyObject


_NUMPY_INT_TYPE_CODES = {"b", "h", "H", "i", "l"}
_NUMPY_FLOATING_TYPE_CODES = {"f", "d"}


def _is_py_null(x: Any) -> bool:
"""Checks if the value is a Python null value, i.e. None or NaN, or Pandas.NA."""
if x is None:
return True

try:
return bool(pd.isna(x))
except (TypeError, ValueError):
return False
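Illustrative expectations for the helper above (the try/except exists because pd.isna returns an array for list/array inputs, and bool() on that array raises):

import pandas as pd

print(_is_py_null(None))           # True
print(_is_py_null(float("nan")))   # True
print(_is_py_null(pd.NA))          # True
print(_is_py_null(3))              # False
print(_is_py_null([1, 2]))         # False -- pd.isna returns an array; bool() raises and is caught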


def _scalar(x: Any, dtype: DType) -> Any:
@@ -336,12 +363,14 @@ def _scalar(x: Any, dtype: DType) -> Any:

# NULL_BOOL will appear in Java as a byte value which causes a cast error. We just let JPY convert it to Java null
# and the engine has casting logic to handle it.
if _is_py_null(x) and dtype not in (bool_, char) and _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
return _PRIMITIVE_DTYPE_NULL_MAP[dtype]

try:
if hasattr(x, "dtype"):
if x.dtype.char == 'H':  # np.uint16 maps to Java char
return Character(int(x))
elif x.dtype.char in _NUMPY_INT_TYPE_CODES:
return int(x)
elif x.dtype.char in _NUMPY_FLOATING_TYPE_CODES:
return float(x)
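On the 'H' branch above (illustrative commentary): numpy has no 16-bit character type, so Java char values surface as np.uint16, and a uint16 scalar is boxed into a java.lang.Character via the Character DType rather than widened to a Java int. The numpy side can be checked standalone:

import numpy as np

x = np.uint16(65)
print(x.dtype.char)   # 'H'
print(chr(int(x)))    # 'A' -- the code point the Java Character would carry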
@@ -382,20 +411,32 @@ def _component_np_dtype_char(t: type) -> Optional[str]:
if isinstance(t, _GenericAlias) and issubclass(t.__origin__, Sequence):
component_type = t.__args__[0]

if not component_type:
component_type = _np_ndarray_component_type(t)

if component_type:
return _np_dtype_char(component_type)
else:
return None


def _np_ndarray_component_type(t: type) -> Optional[type]:
"""Returns the numpy ndarray component type if the type is a numpy ndarray, otherwise return None."""

# Py3.8: npt.NDArray can be used in Py 3.8 as a generic alias, but a specific alias (e.g. npt.NDArray[np.int64])
# is an instance of a private class of np, yet we don't have a choice but to use it. And when npt.NDArray is used,
# the 1st argument is typing.Any, the 2nd argument is another generic alias of which the 1st argument is the
# component type
component_type = None
if sys.version_info.major == 3 and sys.version_info.minor == 8:
if isinstance(t, np._typing._generic_alias._GenericAlias) and t.__origin__ == np.ndarray:
component_type = t.__args__[1].__args__[0]

# Py3.9+, np.ndarray as a generic alias is only supported in Python 3.9+, also npt.NDArray is still available but a
# specific alias (e.g. npt.NDArray[np.int64]) now is an instance of typing.GenericAlias.
# when npt.NDArray is used, the 1st argument is typing.Any, the 2nd argument is another generic alias of which
# the 1st argument is the component type
# when np.ndarray is used, the 1st argument is the component type
if not component_type and sys.version_info.major == 3 and sys.version_info.minor > 8:
import types
if isinstance(t, types.GenericAlias) and (issubclass(t.__origin__, Sequence) or t.__origin__ == np.ndarray):
nargs = len(t.__args__)
@@ -406,8 +447,4 @@ def _component_np_dtype_char(t: type) -> Optional[str]:
a1 = t.__args__[1]
if a0 == typing.Any and isinstance(a1, types.GenericAlias):
component_type = a1.__args__[0]

return component_type
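Illustrative expectations for the two helpers above, assuming Python 3.9+ (expected results shown as comments, not taken from the diff):

import typing
import numpy as np
import numpy.typing as npt

print(_component_np_dtype_char(typing.Sequence[np.int32]))  # expected: 'i'
print(_component_np_dtype_char(npt.NDArray[np.int32]))      # expected: 'i'
print(_component_np_dtype_char(np.ndarray))                 # expected: None -- no component type information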
120 changes: 116 additions & 4 deletions py/server/deephaven/jcompat.py
@@ -5,12 +5,29 @@
""" This module provides Java compatibility support including convenience functions to create some widely used Java
data structures from corresponding Python ones in order to be able to call Java methods. """

from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Tuple, Literal

import jpy
import numpy as np
import pandas as pd

from deephaven import dtypes, DHError
from deephaven._wrapper import unwrap, wrap_j_object
from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP, _J_ARRAY_NP_TYPE_MAP

_NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE
_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")

_DH_PANDAS_NULLABLE_TYPE_MAP: Dict[DType, pd.api.extensions.ExtensionDtype] = {
dtypes.bool_: pd.BooleanDtype,
dtypes.byte: pd.Int8Dtype,
dtypes.short: pd.Int16Dtype,
dtypes.char: pd.UInt16Dtype,
dtypes.int32: pd.Int32Dtype,
dtypes.int64: pd.Int64Dtype,
dtypes.float32: pd.Float32Dtype,
dtypes.float64: pd.Float64Dtype,
}


def is_java_type(obj: Any) -> bool:
@@ -181,11 +198,106 @@ def to_sequence(v: Union[T, Sequence[T]] = None, wrapped: bool = False) -> Seque
return ()
if wrapped:
if not isinstance(v, Sequence) or isinstance(v, str):
return (v,)
else:
return tuple(v)

if not isinstance(v, Sequence) or isinstance(v, str):
return (unwrap(v),)
else:
return tuple((unwrap(o) for o in v))


def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool = False, type_promotion: bool = True) -> \
np.ndarray:
""" Produces a numpy array from the DType and given Java array.

Args:
dtype (DType): The dtype of the array
j_array (jpy.JType): The Java array to convert
conv_null (bool): If True, convert Deephaven nulls to the default value for the dtype
type_promotion (bool): Used when conv_null is True. When True, Java integer arrays are promoted to np.float64
if they contain Deephaven nulls, and those nulls are converted to np.nan. When False, an exception is raised
if the Java array contains Deephaven nulls. Has no effect on Java floating-point arrays. Defaults to True.

Returns:
np.ndarray: The numpy array

Raises:
DHError
"""
if dtype.is_primitive:
np_array = np.frombuffer(j_array, dtype.np_type)
elif dtype == dtypes.Instant:
longs = _JPrimitiveArrayConversionUtility.translateArrayInstantToLong(j_array)
np_long_array = np.frombuffer(longs, np.int64)
np_array = np_long_array.view(dtype.np_type)
elif dtype == dtypes.bool_:
bytes_ = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(j_array)
np_array = np.frombuffer(bytes_, dtype.np_type)
elif dtype == dtypes.string:
np_array = np.array(j_array, dtypes.string.np_type)
elif dtype.np_type is not np.object_:
try:
np_array = np.frombuffer(j_array, dtype.np_type)
except:
np_array = np.array(j_array, np.object_)
else:
np_array = np.array(j_array, np.object_)

if conv_null:
if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
if dtype in (dtypes.float32, dtypes.float64):
np_array = np.copy(np_array)
np_array[np_array == dh_null] = np.nan
else:
if dtype is dtypes.bool_: # promote boolean to float64
np_array = np.frombuffer(np_array, np.byte)

if any(np_array[np_array == dh_null]):
if not type_promotion:
raise DHError(f"Java array contains Deephaven nulls for dtype {dtype} that numpy array of the "
f"equivalent type doesn't support")
np_array = np_array.astype(np.float64)
np_array[np_array == dh_null] = np.nan
else:
if dtype is dtypes.bool_:
np_array = np.frombuffer(np_array, np.bool_)
return np_array

return np_array
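The null handling for integer arrays can be illustrated standalone (this is not Deephaven code; the sentinel below is Java Long.MIN_VALUE, which Deephaven uses as NULL_LONG):

import numpy as np

NULL_LONG = -(2**63)  # Deephaven's null sentinel for 64-bit integers (Long.MIN_VALUE)
np_array = np.array([10, NULL_LONG, 30], dtype=np.int64)

if (np_array == NULL_LONG).any():
    # Promote to float64 so nulls can be represented as NaN, mirroring type_promotion=True above.
    np_array = np_array.astype(np.float64)
    np_array[np_array == NULL_LONG] = np.nan

print(np_array)  # [10. nan 30.]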


def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd.Series:
"""Produce a copy of the specified Java array as a pandas.Series object.

Args:
dtype (DType): the data type of the Java array
j_array (jpy.JType): the Java array
conv_null (bool): whether to check for Deephaven nulls in the data and automatically replace them with
pd.NA.

Returns:
a pandas Series

Raises:
DHError
"""
if conv_null and dtype == dtypes.bool_:
j_array = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(j_array)
np_array = np.frombuffer(j_array, dtype=np.byte)
s = pd.Series(data=np_array, dtype=pd.Int8Dtype(), copy=False)
s.mask(s == _NULL_BOOLEAN_AS_BYTE, inplace=True)
return s.astype(pd.BooleanDtype(), copy=False)

np_array = _j_array_to_numpy_array(dtype, j_array, conv_null=False)
if conv_null and (nv := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype)) is not None:
pd_ex_dtype = _DH_PANDAS_NULLABLE_TYPE_MAP.get(dtype)
s = pd.Series(data=np_array, dtype=pd_ex_dtype(), copy=False)
s.mask(s == nv, inplace=True)
else:
s = pd.Series(data=np_array, copy=False)

return s
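The sentinel-to-pd.NA masking used above can also be illustrated standalone (not Deephaven code; NULL_INT below is Java Integer.MIN_VALUE, Deephaven's 32-bit null sentinel):

import numpy as np
import pandas as pd

NULL_INT = -(2**31)  # Deephaven NULL_INT sentinel (Integer.MIN_VALUE)
np_array = np.array([1, NULL_INT, 3], dtype=np.int32)

s = pd.Series(data=np_array, dtype=pd.Int32Dtype(), copy=False)
s = s.mask(s == NULL_INT)  # sentinel values become pd.NA in the nullable Int32 dtype
print(s.tolist())          # [1, <NA>, 3]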
29 changes: 5 additions & 24 deletions py/server/deephaven/numpy.py
@@ -8,13 +8,13 @@

import jpy
import numpy as np

from deephaven import DHError, dtypes, new_table
from deephaven.column import Column, InputColumn
from deephaven.dtypes import DType
from deephaven.jcompat import _j_array_to_numpy_array
from deephaven.table import Table

_JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers")


Expand All @@ -25,28 +25,9 @@ def _to_column_name(name: str) -> str:


def column_to_numpy_array(col_def: Column, j_array: jpy.JType) -> np.ndarray:
""" Produces a numpy array from the given Java array and the Table column definition. """
""" Produces a numpy array from the given Java array and the Table column definition."""
try:
return _j_array_to_numpy_array(col_def.data_type, j_array)
except DHError:
raise
except Exception as e:
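For context, column_to_numpy_array is the building block behind deephaven.numpy.to_numpy. A typical call looks like the sketch below (requires a running Deephaven session; the table expressions are made up for illustration):

from deephaven import empty_table
from deephaven.numpy import to_numpy

t = empty_table(5).update(["X = i", "Y = i * 2"])
arr = to_numpy(t, ["X", "Y"])  # 2-D array built column by column via column_to_numpy_array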