From 9df7c023a12e90071fb9cb3b083313146f4186b6 Mon Sep 17 00:00:00 2001
From: jianfengmao
Date: Sun, 12 Nov 2023 10:00:40 -0700
Subject: [PATCH 1/3] change default dtype_backend for to_pandas

---
 py/server/deephaven/pandas.py        |  7 ++++---
 py/server/tests/test_learn_gather.py |  2 +-
 py/server/tests/test_pandas.py       | 16 ++++++++--------
 py/server/tests/test_parquet.py      |  4 ++--
 py/server/tests/test_table.py        |  2 +-
 5 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index 14e74ece14c..4dbf7e37420 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -112,7 +112,8 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri
 }
 
 
-def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = None,
+def to_pandas(table: Table, cols: List[str] = None,
+              dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = "numpy_nullable",
               conv_null: bool = True) -> pd.DataFrame:
     """Produces a pandas DataFrame from a table.
 
@@ -125,8 +126,8 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None,
         cols (List[str]): the source column names, default is None which means include all columns
         dtype_backend (str): Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
             nullable dtypes are used for all dtypes that have a nullable implementation when “numpy_nullable” is set,
-            pyarrow is used for all dtypes if “pyarrow” is set. default is None, meaning Numpy backed DataFrames with
-            no nullable dtypes.
+            pyarrow is used for all dtypes if “pyarrow” is set. None means Numpy backed DataFrames with no nullable
+            dtypes. Default is "numpy_nullable".
         conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
             automatically replace them with pd.NA. default is True.
 
diff --git a/py/server/tests/test_learn_gather.py b/py/server/tests/test_learn_gather.py
index 64a316be3f7..f0caff9d059 100644
--- a/py/server/tests/test_learn_gather.py
+++ b/py/server/tests/test_learn_gather.py
@@ -141,7 +141,7 @@ def base_test(self, source, model, np_dtype):
         gatherer_colmajor = lambda rowset, colset: gather.table_to_numpy_2d(rowset, colset,
                                                                             gather.MemoryLayout.COLUMN_MAJOR, np_dtype)
 
-        array_from_table = to_pandas(source, conv_null=False).values
+        array_from_table = to_pandas(source, dtype_backend=None, conv_null=False).values
 
         gathered_rowmajor = gatherer_rowmajor(rows, cols)
         gathered_colmajor = gatherer_colmajor(rows, cols)
diff --git a/py/server/tests/test_pandas.py b/py/server/tests/test_pandas.py
index f896c23d853..2e78809f85a 100644
--- a/py/server/tests/test_pandas.py
+++ b/py/server/tests/test_pandas.py
@@ -54,7 +54,7 @@ def tearDown(self) -> None:
         super().tearDown()
 
     def test_to_pandas_no_conv_null(self):
-        df = to_pandas(self.test_table, conv_null=False)
+        df = to_pandas(self.test_table, dtype_backend=None, conv_null=False)
         self.assertEqual(len(df.columns), len(self.test_table.columns))
         self.assertEqual(df.size, 2 * len(self.test_table.columns))
         df_series = [df[col] for col in list(df.columns)]
@@ -70,7 +70,7 @@ def test_to_pandas_remaps(self):
         prepared_table = self.test_table.update(
             formulas=["Long = isNull(Long_) ? Double.NaN : Long_"])
 
-        df = to_pandas(prepared_table, cols=["Boolean", "Long"], conv_null=False)
+        df = to_pandas(prepared_table, cols=["Boolean", "Long"], dtype_backend=None, conv_null=False)
         self.assertEqual(df['Long'].dtype, np.float64)
         self.assertEqual(df['Boolean'].values.dtype, np.bool_)
 
@@ -88,12 +88,12 @@ def test_vector_column(self):
         test_table = test_table.group_by(["String"])
 
         df = to_pandas(test_table, cols=["String", "Doubles"])
-        self.assertEqual(df['String'].dtype, np.object_)
+        self.assertEqual(df['String'].dtype, pd.StringDtype())
         self.assertEqual(df['Doubles'].dtype, np.object_)
 
         double_series = df['Doubles']
-        self.assertEqual([1.0, 2.0], list(double_series[0].toArray()))
-        self.assertEqual([4.0, 8.0, 16.0], list(double_series[1].toArray()))
+        self.assertEqual([1.0, 2.0], list(double_series[0]))
+        self.assertEqual([4.0, 8.0, 16.0], list(double_series[1]))
 
     def test_invalid_col_name(self):
         with self.assertRaises(DHError) as cm:
@@ -114,7 +114,7 @@ def test_to_table(self):
             double_col(name="Double", data=[1.01, -1.01]),
         ]
         test_table = new_table(cols=input_cols)
-        df = to_pandas(test_table, conv_null=False)
+        df = to_pandas(test_table, dtype_backend=None, conv_null=False)
         table_from_df = to_table(df)
         self.assert_table_equals(table_from_df, test_table)
 
@@ -123,7 +123,7 @@ def test_to_table_boolean_with_none(self):
         table_with_null_bool = new_table(cols=input_cols)
         prepared_table = table_with_null_bool.update(
             formulas=["Boolean = isNull(Boolean) ? (byte)NULL_BYTE : (Boolean == true ? 1: 0)"])
-        df = to_pandas(prepared_table, conv_null=False)
+        df = to_pandas(prepared_table, dtype_backend=None, conv_null=False)
         table_from_df = to_table(df)
         self.assert_table_equals(table_from_df, prepared_table)
 
@@ -159,7 +159,7 @@ def test_round_trip_with_nulls(self):
             pyobj_col(name="PyObj", data=[CustomClass(1, "1"), None]),
         ]
         test_table = new_table(cols=input_cols)
-        df = to_pandas(test_table)
+        df = to_pandas(test_table, dtype_backend=None)
         self.assertEqual(len(df.columns), len(test_table.columns))
         self.assertEqual(df.size, 2 * len(test_table.columns))
         test_table2 = to_table(df)
diff --git a/py/server/tests/test_parquet.py b/py/server/tests/test_parquet.py
index b2cd933740b..777c2668f45 100644
--- a/py/server/tests/test_parquet.py
+++ b/py/server/tests/test_parquet.py
@@ -347,7 +347,7 @@ def test_dates_and_time(self):
         from_disk = read('data_from_dh.parquet')
         self.assert_table_equals(dh_table, from_disk)
 
-        df_from_disk = to_pandas(from_disk)
+        df_from_disk = to_pandas(from_disk, dtype_backend=None)
         if pandas.__version__.split('.')[0] == "1":
             df_from_pandas = pandas.read_parquet("data_from_dh.parquet", use_nullable_dtypes=True)
         else:
@@ -384,7 +384,7 @@ def time_test_helper(pa_table, new_schema, dest):
         # Write the provided pyarrow table type-casted to the new schema
         pyarrow.parquet.write_table(pa_table.cast(new_schema), dest)
         from_disk = read(dest)
-        df_from_disk = to_pandas(from_disk)
+        df_from_disk = to_pandas(from_disk, dtype_backend=None)
         original_df = pa_table.to_pandas()
         # Compare the dataframes as strings
         self.assertTrue((df_from_disk.astype(str) == original_df.astype(str)).all().values.all())
diff --git a/py/server/tests/test_table.py b/py/server/tests/test_table.py
index bc82fa2022b..816c4e74f99 100644
--- a/py/server/tests/test_table.py
+++ b/py/server/tests/test_table.py
@@ -676,7 +676,7 @@ def verify_layout_hint(t: Table, layout_hint_str: str):
             self.assertIn("RuntimeError", cm.exception.compact_traceback)
 
     def verify_table_data(self, t: Table, expected: List[Any], assert_not_in: bool = False):
-        t_data = to_pandas(t).values.flatten()
+        t_data = to_pandas(t, dtype_backend=None).values.flatten()
         for s in expected:
             if assert_not_in:
                 self.assertNotIn(s, t_data)

From 7d468f49da37a5a4bcac3f6ced89be645fdcda8e Mon Sep 17 00:00:00 2001
From: jianfengmao
Date: Tue, 14 Nov 2023 10:40:32 -0700
Subject: [PATCH 2/3] Improve docstrings and add comments in test code

---
 py/server/deephaven/pandas.py   | 7 ++++---
 py/server/tests/test_parquet.py | 3 +++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index 4dbf7e37420..236ac2aeb1f 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -124,11 +124,12 @@ def to_pandas(table: Table, cols: List[str] = None,
     Args:
         table (Table): the source table
         cols (List[str]): the source column names, default is None which means include all columns
-        dtype_backend (str): Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
+        dtype_backend (str): which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
             nullable dtypes are used for all dtypes that have a nullable implementation when “numpy_nullable” is set,
             pyarrow is used for all dtypes if “pyarrow” is set. None means Numpy backed DataFrames with no nullable
-            dtypes. Default is "numpy_nullable".
-        conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
+            dtypes. Both "numpy_nullable" and "pyarrow" automatically converts Deephaven nulls and enable the correct
+            mapping of Java String. default is "numpy_nullable".
+        conv_null (bool): when dtype_backend is not set, whether to check for Deephaven nulls in the data and
             automatically replace them with pd.NA. default is True.
 
     Returns:
diff --git a/py/server/tests/test_parquet.py b/py/server/tests/test_parquet.py
index 777c2668f45..36c70515000 100644
--- a/py/server/tests/test_parquet.py
+++ b/py/server/tests/test_parquet.py
@@ -347,6 +347,7 @@ def test_dates_and_time(self):
         from_disk = read('data_from_dh.parquet')
         self.assert_table_equals(dh_table, from_disk)
 
+        # TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
         df_from_disk = to_pandas(from_disk, dtype_backend=None)
         if pandas.__version__.split('.')[0] == "1":
             df_from_pandas = pandas.read_parquet("data_from_dh.parquet", use_nullable_dtypes=True)
         else:
@@ -384,6 +385,8 @@ def time_test_helper(pa_table, new_schema, dest):
         # Write the provided pyarrow table type-casted to the new schema
         pyarrow.parquet.write_table(pa_table.cast(new_schema), dest)
         from_disk = read(dest)
+
+        # TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
         df_from_disk = to_pandas(from_disk, dtype_backend=None)
         original_df = pa_table.to_pandas()
         # Compare the dataframes as strings
         self.assertTrue((df_from_disk.astype(str) == original_df.astype(str)).all().values.all())

From 040028cc9f163ca9033141f272c92a10e44a9aaa Mon Sep 17 00:00:00 2001
From: jianfengmao
Date: Tue, 14 Nov 2023 12:38:13 -0700
Subject: [PATCH 3/3] Accept suggested changes to docstrings

---
 py/server/deephaven/pandas.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index 236ac2aeb1f..c14ae1f9ca7 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -127,8 +127,9 @@ def to_pandas(table: Table, cols: List[str] = None,
         dtype_backend (str): which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
             nullable dtypes are used for all dtypes that have a nullable implementation when “numpy_nullable” is set,
             pyarrow is used for all dtypes if “pyarrow” is set. None means Numpy backed DataFrames with no nullable
-            dtypes. Both "numpy_nullable" and "pyarrow" automatically converts Deephaven nulls and enable the correct
-            mapping of Java String. default is "numpy_nullable".
+            dtypes. Both "numpy_nullable" and "pyarrow" automatically convert Deephaven nulls to Pandas NA and enable
+            Pandas extension types. Extension types are needed to support types beyond NumPy's type system. Extension
+            types support operations such as properly mapping Java Strings to Python strings. default is "numpy_nullable".
         conv_null (bool): when dtype_backend is not set, whether to check for Deephaven nulls in the data and
             automatically replace them with pd.NA. default is True.
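
For reviewers, a minimal usage sketch of the behavior change, not taken from the patches themselves. It assumes a
running Deephaven session; new_table, the column factories, and NULL_INT come from the standard deephaven packages
that the tests above already exercise:

    from deephaven import new_table
    from deephaven.column import int_col, string_col
    from deephaven.constants import NULL_INT
    from deephaven.pandas import to_pandas

    t = new_table([
        int_col("X", [1, NULL_INT]),  # NULL_INT is the Deephaven null sentinel for 32-bit ints
        string_col("S", ["a", "b"]),
    ])

    # New default, dtype_backend="numpy_nullable": Deephaven nulls arrive as
    # pd.NA and columns use pandas extension dtypes (e.g. Int32, string).
    df = to_pandas(t)

    # Previous default, still available: plain NumPy-backed arrays with no
    # nullable dtypes; conv_null=False additionally skips the Deephaven-null
    # check, so sentinel values pass through unchanged.
    df_np = to_pandas(t, dtype_backend=None, conv_null=False)

    # pyarrow-backed extension dtypes are the third option.
    df_pa = to_pandas(t, dtype_backend="pyarrow")

With dtype_backend=None and conv_null=False, the sentinel in "X" is returned as-is rather than replaced with pd.NA,
which is why the tests updated above pass dtype_backend=None wherever they compare raw NumPy values.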