From e3bf58b12c564ed6e8c5069ba40fa8fe6a96784a Mon Sep 17 00:00:00 2001
From: jianfengmao
Date: Fri, 24 Nov 2023 18:01:36 -0700
Subject: [PATCH] Add Java char support, fix a vectorization issue

Also add more test cases
---
 .../table/impl/lang/QueryLanguageParser.java  |  7 +-
 .../engine/util/PyCallableWrapperJpyImpl.java |  5 +-
 py/server/deephaven/dtypes.py                 | 25 ++++--
 py/server/deephaven/jcompat.py                | 31 -------
 py/server/deephaven/numpy.py                  |  5 +-
 py/server/deephaven/table.py                  | 16 ++--
 py/server/tests/test_udf_numpy_args.py        | 89 ++++++++++++++++---
 ...lues.py => test_udf_return_java_values.py} |  4 +-
 8 files changed, 111 insertions(+), 71 deletions(-)
 rename py/server/tests/{test_pyfunc_return_java_values.py => test_udf_return_java_values.py} (99%)

diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/lang/QueryLanguageParser.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/lang/QueryLanguageParser.java
index 71addc2fdaa..888278ebe86 100644
--- a/engine/table/src/main/java/io/deephaven/engine/table/impl/lang/QueryLanguageParser.java
+++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/lang/QueryLanguageParser.java
@@ -2687,12 +2687,7 @@ private void checkVectorizability(@NotNull final MethodCallExpr n,
             }
         }
 
-        List<Class<?>> paramTypes = pyCallableWrapper.getParamTypes();
-        if (paramTypes.size() != expressions.length) {
-            // note vectorization doesn't handle Python variadic arguments
-            throw new PythonCallVectorizationFailure("Python function argument count mismatch: " + n + " "
-                    + paramTypes.size() + " vs. " + expressions.length);
-        }
+
     }
 
     private void prepareVectorizationArgs(MethodCallExpr n, QueryScope queryScope, Expression[] expressions,
diff --git a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java
index 18262f8e7f0..42bd3c8aab3 100644
--- a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java
+++ b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java
@@ -29,12 +29,13 @@ public class PyCallableWrapperJpyImpl implements PyCallableWrapper {
     private static final Map<Character, Class<?>> numpyType2JavaClass = new HashMap<>();
 
     static {
+        numpyType2JavaClass.put('b', byte.class);
+        numpyType2JavaClass.put('h', short.class);
+        numpyType2JavaClass.put('H', char.class);
         numpyType2JavaClass.put('i', int.class);
         numpyType2JavaClass.put('l', long.class);
-        numpyType2JavaClass.put('h', short.class);
         numpyType2JavaClass.put('f', float.class);
         numpyType2JavaClass.put('d', double.class);
-        numpyType2JavaClass.put('b', byte.class);
         numpyType2JavaClass.put('?', boolean.class);
         numpyType2JavaClass.put('U', String.class);
         numpyType2JavaClass.put('M', Instant.class);
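Illustrative note (not part of the patch): the single-character keys in `numpyType2JavaClass` are numpy `dtype.char` codes, and `'H'` (np.uint16) is the new code that now maps to Java `char`. A quick sanity check in plain numpy, with the platform-dependent int/long codes (`'i'`/`'l'`) left out:

```python
import numpy as np

# numpy one-character type codes used as keys in numpyType2JavaClass
assert np.dtype(np.int8).char == "b"     # -> byte
assert np.dtype(np.int16).char == "h"    # -> short
assert np.dtype(np.uint16).char == "H"   # -> char (the new mapping)
assert np.dtype(np.float32).char == "f"  # -> float
assert np.dtype(np.float64).char == "d"  # -> double
assert np.dtype(np.bool_).char == "?"    # -> boolean
```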
"b"] -_NUMPY_FLOATING_TYPE_CODES = ["f", "d"] +_NUMPY_INT_TYPE_CODES = {"b", "h", "H", "i", "l"} +_NUMPY_FLOATING_TYPE_CODES = {"f", "d"} + + +def _is_py_null(x: Any) -> bool: + """Checks if the value is a Python null value, i.e. None or NaN, or Pandas.NA.""" + if x is None: + return True + + try: + return bool(pd.isna(x)) + except (TypeError, ValueError): + return False def _scalar(x: Any, dtype: DType) -> Any: @@ -350,12 +363,14 @@ def _scalar(x: Any, dtype: DType) -> Any: # NULL_BOOL will appear in Java as a byte value which causes a cast error. We just let JPY converts it to Java null # and the engine has casting logic to handle it. - if x is None and dtype != bool_ and _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): + if _is_py_null(x) and dtype not in (bool_, char) and _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): return _PRIMITIVE_DTYPE_NULL_MAP[dtype] try: if hasattr(x, "dtype"): - if x.dtype.char in _NUMPY_INT_TYPE_CODES: + if x.dtype.char == 'H': # np.uint16 maps to Java char + return Character(int(x)) + elif x.dtype.char in _NUMPY_INT_TYPE_CODES: return int(x) elif x.dtype.char in _NUMPY_FLOATING_TYPE_CODES: return float(x) @@ -368,8 +383,6 @@ def _scalar(x: Any, dtype: DType) -> Any: elif x.dtype.char == 'M': from deephaven.time import to_j_instant return to_j_instant(x) - elif x.dtype.char == 'H': # np.uint16 - return jpy.get_type("java.lang.Character")(int(x)) elif isinstance(x, (datetime.datetime, pd.Timestamp)): from deephaven.time import to_j_instant return to_j_instant(x) diff --git a/py/server/deephaven/jcompat.py b/py/server/deephaven/jcompat.py index 4c8fa4aeda9..f2cc80b9213 100644 --- a/py/server/deephaven/jcompat.py +++ b/py/server/deephaven/jcompat.py @@ -283,34 +283,3 @@ def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd. s = pd.Series(data=np_array, copy=False) return s - - -def _convert_udf_args(args: Tuple[Any], fn_signature: str, null_value: Literal[np.nan, pd.NA, None]) -> List[Any]: - converted_args = [] - for arg, np_dtype_char in zip(args, fn_signature): - if np_dtype_char == 'O': - converted_args.append(arg) - elif src_np_dtype := _J_ARRAY_NP_TYPE_MAP.get(type(arg)): - # array types - np_dtype = np.dtype(np_dtype_char) - if src_np_dtype != np_dtype and np_dtype != np.object_: - raise DHError(f"Cannot convert Java array of type {src_np_dtype} to numpy array of type {np_dtype}") - dtype = dtypes.from_np_dtype(np_dtype) - if null_value is pd.NA: - converted_args.append(_j_array_to_series(dtype, arg, conv_null=True)) - else: # np.nan or None - converted_args.append(_j_array_to_numpy_array(dtype, arg, conv_null=bool(null_value))) - else: # scalar type or array types that don't need conversion - try: - np_dtype = np.dtype(np_dtype_char) - except TypeError: - converted_args.append(arg) - else: - dtype = dtypes.from_np_dtype(np_dtype) - if dtype is dtypes.bool_: - converted_args.append(null_value if arg is None else arg) - elif dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): - converted_args.append(null_value if arg == dh_null else arg) - else: - converted_args.append(arg) - return converted_args diff --git a/py/server/deephaven/numpy.py b/py/server/deephaven/numpy.py index 2b8fd215aca..4a972c27367 100644 --- a/py/server/deephaven/numpy.py +++ b/py/server/deephaven/numpy.py @@ -4,7 +4,6 @@ """ This module supports the conversion between Deephaven tables and numpy arrays. 
""" import re -from functools import wraps from typing import List import jpy @@ -13,8 +12,8 @@ from deephaven import DHError, dtypes, new_table from deephaven.column import Column, InputColumn from deephaven.dtypes import DType -from deephaven.jcompat import _j_array_to_numpy_array, _convert_udf_args -from deephaven.table import Table, _encode_signature +from deephaven.jcompat import _j_array_to_numpy_array +from deephaven.table import Table _JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers") diff --git a/py/server/deephaven/table.py b/py/server/deephaven/table.py index 9eae28557f5..fee44bb6841 100644 --- a/py/server/deephaven/table.py +++ b/py/server/deephaven/table.py @@ -30,14 +30,14 @@ from deephaven.agg import Aggregation from deephaven.column import Column, ColumnType from deephaven.filters import Filter, and_, or_ -from deephaven.jcompat import j_unary_operator, j_binary_operator, j_map_to_dict, j_hashmap, _convert_udf_args, \ - _j_array_to_numpy_array +from deephaven.jcompat import j_unary_operator, j_binary_operator, j_map_to_dict, j_hashmap, _j_array_to_numpy_array from deephaven.jcompat import to_sequence, j_array_list from deephaven.time import to_np_datetime64 from deephaven.update_graph import auto_locking_ctx, UpdateGraph from deephaven.updateby import UpdateByOperation from deephaven.dtypes import _BUILDABLE_ARRAY_DTYPE_MAP, _scalar, _np_dtype_char, _component_np_dtype_char, DType, \ - _np_ndarray_component_type, _J_ARRAY_NP_TYPE_MAP, _PRIMITIVE_DTYPE_NULL_MAP + _np_ndarray_component_type, _J_ARRAY_NP_TYPE_MAP, _PRIMITIVE_DTYPE_NULL_MAP, _NUMPY_INT_TYPE_CODES, \ + _NUMPY_FLOATING_TYPE_CODES # Table _J_Table = jpy.get_type("io.deephaven.engine.table.Table") @@ -368,7 +368,7 @@ def _j_py_script_session() -> _JPythonScriptSession: return None -_SUPPORTED_NP_TYPE_CODES = {"b", "h", "i", "l", "f", "d", "?", "U", "M", "O"} +_SUPPORTED_NP_TYPE_CODES = {"b", "h", "H", "i", "l", "f", "d", "?", "U", "M", "O"} @dataclass @@ -444,7 +444,7 @@ def _parse_type_no_nested(annotation, p_annotation, t): p_annotation.is_array = True if tc in {"N", "O", "?", "U", "M"}: p_annotation.is_none_legal = True - if tc in {"b", "h", "i", "l"}: + if tc in {"b", "h", "H", "i", "l"}: if p_annotation.int_char and p_annotation.int_char != tc: raise DHError(message=f"ambiguity detected: multiple integer types in annotation: {annotation}") p_annotation.int_char = tc @@ -499,9 +499,9 @@ def _parse_numba_signature(fn: Union[numba.np.ufunc.gufunc.GUFunc, numba.np.ufun for p in params: pa = ParsedAnnotation() pa.encoded_types.add(p) - if p in {"b", "h", "i", "l"}: + if p in _NUMPY_INT_TYPE_CODES: pa.int_char = p - if p in {"f", "d"}: + if p in _NUMPY_FLOATING_TYPE_CODES: pa.floating_char = p p_annotations.append(pa) else: # GUFunc @@ -517,7 +517,7 @@ def _parse_numba_signature(fn: Union[numba.np.ufunc.gufunc.GUFunc, numba.np.ufun pa.is_array = True else: pa.encoded_types.add(p) - if p in {"b", "h", "i", "l"}: + if p in {"b", "h", "H", "i", "l"}: pa.int_char = p if p in {"f", "d"}: pa.floating_char = p diff --git a/py/server/tests/test_udf_numpy_args.py b/py/server/tests/test_udf_numpy_args.py index ee40f8bbc5c..a0d5da1ae30 100644 --- a/py/server/tests/test_udf_numpy_args.py +++ b/py/server/tests/test_udf_numpy_args.py @@ -14,30 +14,33 @@ from tests.testbase import BaseTestCase _J_TYPE_NULL_MAP = { - "double": "NULL_DOUBLE", - "float": "NULL_FLOAT", + "byte": "NULL_BYTE", + "short": "NULL_SHORT", + "char": "NULL_CHAR", "int": "NULL_INT", "long": "NULL_LONG", - "short": 
"NULL_SHORT", - "byte": "NULL_BYTE", + "float": "NULL_FLOAT", + "double": "NULL_DOUBLE", } _J_TYPE_NP_DTYPE_MAP = { - "double": "np.float64", - "float": "np.float32", + "byte": "np.int8", + "short": "np.int16", + "char": "np.uint16", "int": "np.int32", "long": "np.int64", - "short": "np.int16", - "byte": "np.int8", + "float": "np.float32", + "double": "np.float64", } _J_TYPE_J_ARRAY_TYPE_MAP = { - "double": double_array, - "float": float32_array, + "byte": int8_array, + "short": int16_array, + "char": char_array, "int": int32_array, "long": long_array, - "short": int16_array, - "byte": int8_array, + "float": float32_array, + "double": double_array, } @@ -254,7 +257,19 @@ def f51(col1, col2: Optional[np.ndarray[np.int32]]) -> bool: with self.assertRaises(DHError) as cm: t = t.update(["X1 = f51(X, null)"]) - def test_str_bool_datetime(self): + t = empty_table(10).update(["X = i % 3", "Y = i"]).group_by("X") + + def f6(*args: np.int32, col2: np.ndarray[np.int32]) -> bool: + return np.nanmean(col2) == np.mean(col2) + with self.assertRaises(DHError) as cm: + t1 = t.update(["X1 = f6(X, Y)"]) + self.assertIn("missing 1 required keyword-only argument", str(cm.exception)) + + with self.assertRaises(DHError) as cm: + t1 = t.update(["X1 = f6(X, Y=null)"]) + self.assertIn("not compatible with annotation", str(cm.exception)) + + def test_str_bool_datetime_array(self): with self.subTest("str"): def f1(p1: np.ndarray[str], p2=None) -> bool: return bool(len(p1)) @@ -304,6 +319,54 @@ def f31(p1: Optional[np.ndarray[bool]], p2=None) -> bool: t2 = t.update(["X1 = f31(null, Y)"]) self.assertEqual(3, t2.to_string("X1").count("false")) + def test_str_bool_datetime_scalar(self): + with self.subTest("str"): + def f1(p1: str, p2=None) -> bool: + return p1 is None + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? `deephaven`: null"]) + t1 = t.update(["X1 = f1(Y)"]) + self.assertEqual(t1.columns[2].data_type, dtypes.bool_) + self.assertEqual(5, t1.to_string().count("true")) + + def f11(p1: Union[str, None], p2=None) -> bool: + return p1 is None + t2 = t.update(["X1 = f11(Y)"]) + self.assertEqual(5, t2.to_string().count("false")) + + with self.subTest("datetime"): + def f2(p1: np.datetime64, p2=None) -> bool: + return p1 is None + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? now() : null"]) + t1 = t.update(["X1 = f2(Y)"]) + self.assertEqual(t1.columns[2].data_type, dtypes.bool_) + self.assertEqual(5, t1.to_string().count("true")) + + def f21(p1: Union[np.datetime64, None], p2=None) -> bool: + return p1 is None + t2 = t.update(["X1 = f21(Y)"]) + self.assertEqual(5, t2.to_string().count("false")) + + with self.subTest("boolean"): + def f3(p1: np.bool_, p2=None) -> bool: + return p1 is None + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : null"]) + t1 = t.update(["X1 = f3(Y)"]) + self.assertEqual(t1.columns[2].data_type, dtypes.bool_) + self.assertEqual(5, t1.to_string("X1").count("false")) + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? 
diff --git a/py/server/tests/test_pyfunc_return_java_values.py b/py/server/tests/test_udf_return_java_values.py
similarity index 99%
rename from py/server/tests/test_pyfunc_return_java_values.py
rename to py/server/tests/test_udf_return_java_values.py
index 87bf99ede84..1a7c78e3aa9 100644
--- a/py/server/tests/test_pyfunc_return_java_values.py
+++ b/py/server/tests/test_udf_return_java_values.py
@@ -23,7 +23,7 @@
     dtypes.byte: "np.int8",
     dtypes.bool_: "np.bool_",
     dtypes.string: "np.str_",
-    # dtypes.char: "np.uint16",
+    dtypes.char: "np.uint16",
 }
 
 
@@ -52,7 +52,7 @@ def test_array_return(self):
             "np.float64": dtypes.double_array,
             "bool": dtypes.boolean_array,
             "np.str_": dtypes.string_array,
-            # "np.uint16": dtypes.char_array,
+            "np.uint16": dtypes.char_array,
         }
         container_types = ["List", "Tuple", "list", "tuple", "Sequence", "np.ndarray"]
         for component_type, dh_dtype in component_types.items():
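Illustrative sketch (not part of the patch, assumes a running Deephaven session and a numpy recent enough for `np.ndarray[...]` annotations, which the tests above already use): returning a numpy `uint16` array from a UDF should now yield a `char[]` column. `first_letters` is a made-up example function.

```python
import numpy as np
from deephaven import empty_table, dtypes

def first_letters(n) -> np.ndarray[np.uint16]:
    # one char[] cell per row
    return np.arange(ord("A"), ord("A") + n, dtype=np.uint16)

t = empty_table(3).update(["N = i + 1", "L = first_letters(N)"])
assert t.columns[1].data_type == dtypes.char_array
```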