From fba354c887fc261b8b21760fdda14536d26c1d68 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Sep 2023 16:41:56 -0700 Subject: [PATCH 1/3] Fix empty column construction when dtype is object --- python/cudf/cudf/core/column/column.py | 18 ++++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 5 +---- python/cudf/cudf/tests/test_index.py | 24 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9dde17a1045..a1f1fe2db36 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2433,6 +2433,24 @@ def as_column( from_pandas=True if nan_as_null is None else nan_as_null, ) + if ( + isinstance(pyarrow_array, pa.NullArray) + and pa_type is None + and dtype is None + ): + if getattr(arbitrary, "dtype", None) == cudf.dtype( + "object" + ): + # pa.array constructor returns a NullArray + # for empty arrays, instead of a StringArray. + # This issue is only specific to this dtype, + # all other dtypes, result in their corresponding + # arrow array creation. 
+ dtype = cudf.dtype("str") + pyarrow_array = pyarrow_array.cast( + np_to_pa_dtype(dtype) + ) + if ( isinstance(arbitrary, pd.Index) and arbitrary.dtype == cudf.dtype("object") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3c84cfe48c4..44d0b9249d0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7256,10 +7256,7 @@ def test_dataframe_keys(df): def test_series_keys(ps): gds = cudf.from_pandas(ps) - if len(ps) == 0 and not isinstance(ps.index, pd.RangeIndex): - assert_eq(ps.keys().astype("float64"), gds.keys()) - else: - assert_eq(ps.keys(), gds.keys()) + assert_eq(ps.keys(), gds.keys()) @pytest_unmark_spilling diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 359b3c519de..c32ceb22383 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -26,6 +26,7 @@ SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, + ALL_TYPES, _create_pandas_series, assert_column_memory_eq, assert_column_memory_ne, @@ -2698,3 +2699,26 @@ def test_index_getitem_time_duration(dtype): assert gidx[i] is pidx[i] else: assert_eq(gidx[i], pidx[i]) + + +@pytest.mark.parametrize("dtype", ALL_TYPES) +def test_index_empty_from_pandas(request, dtype): + request.node.add_marker( + pytest.mark.xfail( + condition=not PANDAS_GE_200 + and dtype + in { + "datetime64[ms]", + "datetime64[s]", + "datetime64[us]", + "timedelta64[ms]", + "timedelta64[s]", + "timedelta64[us]", + }, + reason="Fixed in pandas-2.0", + ) + ) + pidx = pd.Index([], dtype=dtype) + gidx = cudf.from_pandas(pidx) + + assert_eq(pidx, gidx) From 2811ca9471ecd1263f47ee901b695090a78a5cf1 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Sep 2023 08:12:19 -0700 Subject: [PATCH 2/3] Merge conditional --- python/cudf/cudf/core/column/column.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git 
a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 21325d07bfe..84d6d55dd6a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2437,19 +2437,16 @@ def as_column( isinstance(pyarrow_array, pa.NullArray) and pa_type is None and dtype is None + and getattr(arbitrary, "dtype", None) + == cudf.dtype("object") ): - if getattr(arbitrary, "dtype", None) == cudf.dtype( - "object" - ): - # pa.array constructor returns a NullArray - # for empty arrays, instead of a StringArray. - # This issue is only specific to this dtype, - # all other dtypes, result in their corresponding - # arrow array creation. - dtype = cudf.dtype("str") - pyarrow_array = pyarrow_array.cast( - np_to_pa_dtype(dtype) - ) + # pa.array constructor returns a NullArray + # for empty arrays, instead of a StringArray. + # This issue is specific to this dtype; + # all other dtypes result in their corresponding + # arrow array creation. + dtype = cudf.dtype("str") + pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype)) if ( isinstance(arbitrary, pd.Index) From 79298e881393b9804d2ff59e02743c1ba19bba2c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Sep 2023 10:40:24 -0500 Subject: [PATCH 3/3] isort --- python/cudf/cudf/tests/test_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index df41747c3c1..528cb99bfa0 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -20,13 +20,13 @@ as_index, ) from cudf.testing._utils import ( + ALL_TYPES, FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, - ALL_TYPES, _create_pandas_series, assert_column_memory_eq, assert_column_memory_ne,