Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change default dtype_backend for to_pandas #4815

Merged
merged 3 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions py/server/deephaven/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri
}


def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = None,
def to_pandas(table: Table, cols: List[str] = None,
dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = "numpy_nullable",
conv_null: bool = True) -> pd.DataFrame:
"""Produces a pandas DataFrame from a table.

Expand All @@ -123,11 +124,13 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None,
Args:
table (Table): the source table
cols (List[str]): the source column names, default is None which means include all columns
dtype_backend (str): Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
dtype_backend (str): which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
nullable dtypes are used for all dtypes that have a nullable implementation when “numpy_nullable” is set,
pyarrow is used for all dtypes if “pyarrow” is set. default is None, meaning Numpy backed DataFrames with
no nullable dtypes.
conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
pyarrow is used for all dtypes if “pyarrow” is set. None means NumPy-backed DataFrames with no nullable
dtypes. Both "numpy_nullable" and "pyarrow" automatically convert Deephaven nulls to pandas NA and enable
pandas extension types, which are needed to support types beyond NumPy's type system, for example to
properly map Java Strings to Python strings. default is "numpy_nullable".
conv_null (bool): when dtype_backend is None, whether to check for Deephaven nulls in the data and
automatically replace them with pd.NA. default is True.

Returns:
Expand Down
2 changes: 1 addition & 1 deletion py/server/tests/test_learn_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def base_test(self, source, model, np_dtype):
gatherer_colmajor = lambda rowset, colset: gather.table_to_numpy_2d(rowset, colset,
gather.MemoryLayout.COLUMN_MAJOR, np_dtype)

array_from_table = to_pandas(source, conv_null=False).values
array_from_table = to_pandas(source, dtype_backend=None, conv_null=False).values

gathered_rowmajor = gatherer_rowmajor(rows, cols)
gathered_colmajor = gatherer_colmajor(rows, cols)
Expand Down
16 changes: 8 additions & 8 deletions py/server/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def tearDown(self) -> None:
super().tearDown()

def test_to_pandas_no_conv_null(self):
df = to_pandas(self.test_table, conv_null=False)
df = to_pandas(self.test_table, dtype_backend=None, conv_null=False)
self.assertEqual(len(df.columns), len(self.test_table.columns))
self.assertEqual(df.size, 2 * len(self.test_table.columns))
df_series = [df[col] for col in list(df.columns)]
Expand All @@ -70,7 +70,7 @@ def test_to_pandas_remaps(self):
prepared_table = self.test_table.update(
formulas=["Long = isNull(Long_) ? Double.NaN : Long_"])

df = to_pandas(prepared_table, cols=["Boolean", "Long"], conv_null=False)
df = to_pandas(prepared_table, cols=["Boolean", "Long"], dtype_backend=None, conv_null=False)
self.assertEqual(df['Long'].dtype, np.float64)
self.assertEqual(df['Boolean'].values.dtype, np.bool_)

Expand All @@ -88,12 +88,12 @@ def test_vector_column(self):

test_table = test_table.group_by(["String"])
df = to_pandas(test_table, cols=["String", "Doubles"])
self.assertEqual(df['String'].dtype, np.object_)
self.assertEqual(df['String'].dtype, pd.StringDtype())
self.assertEqual(df['Doubles'].dtype, np.object_)

double_series = df['Doubles']
self.assertEqual([1.0, 2.0], list(double_series[0].toArray()))
self.assertEqual([4.0, 8.0, 16.0], list(double_series[1].toArray()))
self.assertEqual([1.0, 2.0], list(double_series[0]))
self.assertEqual([4.0, 8.0, 16.0], list(double_series[1]))

def test_invalid_col_name(self):
with self.assertRaises(DHError) as cm:
Expand All @@ -114,7 +114,7 @@ def test_to_table(self):
double_col(name="Double", data=[1.01, -1.01]),
]
test_table = new_table(cols=input_cols)
df = to_pandas(test_table, conv_null=False)
df = to_pandas(test_table, dtype_backend=None, conv_null=False)
table_from_df = to_table(df)
self.assert_table_equals(table_from_df, test_table)

Expand All @@ -123,7 +123,7 @@ def test_to_table_boolean_with_none(self):
table_with_null_bool = new_table(cols=input_cols)
prepared_table = table_with_null_bool.update(
formulas=["Boolean = isNull(Boolean) ? (byte)NULL_BYTE : (Boolean == true ? 1: 0)"])
df = to_pandas(prepared_table, conv_null=False)
df = to_pandas(prepared_table, dtype_backend=None, conv_null=False)
table_from_df = to_table(df)
self.assert_table_equals(table_from_df, prepared_table)

Expand Down Expand Up @@ -159,7 +159,7 @@ def test_round_trip_with_nulls(self):
pyobj_col(name="PyObj", data=[CustomClass(1, "1"), None]),
]
test_table = new_table(cols=input_cols)
df = to_pandas(test_table)
df = to_pandas(test_table, dtype_backend=None)
self.assertEqual(len(df.columns), len(test_table.columns))
self.assertEqual(df.size, 2 * len(test_table.columns))
test_table2 = to_table(df)
Expand Down
7 changes: 5 additions & 2 deletions py/server/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,8 @@ def test_dates_and_time(self):
from_disk = read('data_from_dh.parquet')
self.assert_table_equals(dh_table, from_disk)

df_from_disk = to_pandas(from_disk)
# TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
df_from_disk = to_pandas(from_disk, dtype_backend=None)
if pandas.__version__.split('.')[0] == "1":
df_from_pandas = pandas.read_parquet("data_from_dh.parquet", use_nullable_dtypes=True)
else:
Expand Down Expand Up @@ -384,7 +385,9 @@ def time_test_helper(pa_table, new_schema, dest):
# Write the provided pyarrow table type-casted to the new schema
pyarrow.parquet.write_table(pa_table.cast(new_schema), dest)
from_disk = read(dest)
df_from_disk = to_pandas(from_disk)

# TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
df_from_disk = to_pandas(from_disk, dtype_backend=None)
original_df = pa_table.to_pandas()
# Compare the dataframes as strings
self.assertTrue((df_from_disk.astype(str) == original_df.astype(str)).all().values.all())
jmao-denver marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
2 changes: 1 addition & 1 deletion py/server/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ def verify_layout_hint(t: Table, layout_hint_str: str):
self.assertIn("RuntimeError", cm.exception.compact_traceback)

def verify_table_data(self, t: Table, expected: List[Any], assert_not_in: bool = False):
t_data = to_pandas(t).values.flatten()
t_data = to_pandas(t, dtype_backend=None).values.flatten()
for s in expected:
if assert_not_in:
self.assertNotIn(s, t_data)
Expand Down
Loading