From 1e8d4b74c092070d8e71773c4d357090b03d2597 Mon Sep 17 00:00:00 2001 From: bretttully Date: Thu, 14 Nov 2024 09:25:51 +1000 Subject: [PATCH 1/6] Reorder to_pandas extension dtype mapping Addresses https://github.com/pandas-dev/pandas/issues/53011 `types_mapper` always had the highest priority, as it overrode what was set before. However, switching the logical ordering means that we don't need to call `_pandas_api.pandas_dtype(dtype)` when using the pyarrow backend, which resolves the issue of complex `dtype` with `list` or `struct`. --- python/pyarrow/pandas_compat.py | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 7fbde36bc23e9..5a930a41f0300 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -848,6 +848,25 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): if _pandas_api.extension_dtype is None: return ext_columns + # use the specified mapping of built-in arrow types to pandas dtypes + if types_mapper: + for field in table.schema: + typ = field.type + pandas_dtype = types_mapper(typ) + if pandas_dtype is not None: + ext_columns[field.name] = pandas_dtype + + # infer from extension type in the schema + for field in table.schema: + typ = field.type + if field.name not in ext_columns and isinstance(typ, pa.BaseExtensionType): + try: + pandas_dtype = typ.to_pandas_dtype() + except NotImplementedError: + pass + else: + ext_columns[field.name] = pandas_dtype + # infer the extension columns from the pandas metadata for col_meta in columns_metadata: try: @@ -856,7 +875,7 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): name = col_meta['name'] dtype = col_meta['numpy_type'] - if dtype not in _pandas_supported_numpy_types: + if name not in ext_columns and dtype not in _pandas_supported_numpy_types: # pandas_dtype is expensive, so avoid doing this for types # that are
certainly numpy dtypes pandas_dtype = _pandas_api.pandas_dtype(dtype) @@ -864,25 +883,6 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): if hasattr(pandas_dtype, "__from_arrow__"): ext_columns[name] = pandas_dtype - # infer from extension type in the schema - for field in table.schema: - typ = field.type - if isinstance(typ, pa.BaseExtensionType): - try: - pandas_dtype = typ.to_pandas_dtype() - except NotImplementedError: - pass - else: - ext_columns[field.name] = pandas_dtype - - # use the specified mapping of built-in arrow types to pandas dtypes - if types_mapper: - for field in table.schema: - typ = field.type - pandas_dtype = types_mapper(typ) - if pandas_dtype is not None: - ext_columns[field.name] = pandas_dtype - return ext_columns From cb870689572ad00f1e80e663433d3971dc212862 Mon Sep 17 00:00:00 2001 From: bretttully Date: Thu, 14 Nov 2024 09:56:36 +1000 Subject: [PATCH 2/6] Added test --- python/pyarrow/tests/test_pandas.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 178a073ed59dc..6c92a0b46f365 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -17,6 +17,7 @@ import gc import decimal +import io import json import multiprocessing as mp import sys @@ -4411,6 +4412,31 @@ def test_to_pandas_extension_dtypes_mapping(): assert isinstance(result['a'].dtype, pd.PeriodDtype) + +def test_to_pandas_extension_dtypes_mapping_complex_type(): + pa_type = pa.struct( + [ + pa.field("bar", pa.bool_(), nullable=False), + pa.field("baz", pa.float32(), nullable=True), + ], + ) + pd_type = pd.ArrowDtype(pa_type) + schema = pa.schema([pa.field("foo", pa_type)]) + df0 = pd.DataFrame( + [ + {"foo": {"bar": True, "baz": np.float32(1)}}, + {"foo": {"bar": True, "baz": None}}, + ], + ).astype({"foo": pd_type}) + + # Round trip df0 into df1 + with io.BytesIO() as stream: + df0.to_parquet(stream, 
schema=schema) + stream.seek(0) + df1 = pd.read_parquet(stream, dtype_backend="pyarrow") + pd.testing.assert_frame_equal(df0, df1) + + def test_array_to_pandas(): if Version(pd.__version__) < Version("1.1"): pytest.skip("ExtensionDtype to_pandas method missing") From 1c23076fb37f8903762c7416db12e8e453e23366 Mon Sep 17 00:00:00 2001 From: bretttully Date: Fri, 15 Nov 2024 15:23:08 +1100 Subject: [PATCH 3/6] GH-39914 better testing based on PR feedback --- python/pyarrow/tests/test_pandas.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 6c92a0b46f365..3b0ea1a81026c 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -17,7 +17,6 @@ import gc import decimal -import io import json import multiprocessing as mp import sys @@ -4430,10 +4429,8 @@ def test_to_pandas_extension_dtypes_mapping_complex_type(): ).astype({"foo": pd_type}) # Round trip df0 into df1 - with io.BytesIO() as stream: - df0.to_parquet(stream, schema=schema) - stream.seek(0) - df1 = pd.read_parquet(stream, dtype_backend="pyarrow") + table = pa.Table.from_pandas(df0, schema=schema) + df1 = table.to_pandas(types_mapper=pd.ArrowDtype) pd.testing.assert_frame_equal(df0, df1) From 436a9ebd9e6f40f84932e65499710ec738866a52 Mon Sep 17 00:00:00 2001 From: bretttully Date: Wed, 20 Nov 2024 13:29:42 +1100 Subject: [PATCH 4/6] GH-39914 fix test for old pandas --- python/pyarrow/tests/test_pandas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 3b0ea1a81026c..ab21d98dac708 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4411,8 +4411,11 @@ def test_to_pandas_extension_dtypes_mapping(): assert isinstance(result['a'].dtype, pd.PeriodDtype) - + def test_to_pandas_extension_dtypes_mapping_complex_type(): + if 
Version(pd.__version__) < Version("1.5.2"): + pytest.skip("Test relies on pd.ArrowDtype") + # https://github.com/apache/arrow/pull/44720 pa_type = pa.struct( [ pa.field("bar", pa.bool_(), nullable=False), From e3b9892e5663f4888bc79c38b9d36bbefcdaf2b4 Mon Sep 17 00:00:00 2001 From: bretttully Date: Wed, 20 Nov 2024 13:30:40 +1100 Subject: [PATCH 5/6] GH-39914 mistake in last --- python/pyarrow/tests/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index ab21d98dac708..31569d12934e4 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4413,9 +4413,9 @@ def test_to_pandas_extension_dtypes_mapping(): def test_to_pandas_extension_dtypes_mapping_complex_type(): + # https://github.com/apache/arrow/pull/44720 if Version(pd.__version__) < Version("1.5.2"): pytest.skip("Test relies on pd.ArrowDtype") - # https://github.com/apache/arrow/pull/44720 pa_type = pa.struct( [ pa.field("bar", pa.bool_(), nullable=False), From 662c744fbf94b598c915a1e8556dc815aa6c1336 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Nov 2024 08:53:20 +0100 Subject: [PATCH 6/6] Update python/pyarrow/tests/test_pandas.py --- python/pyarrow/tests/test_pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 31569d12934e4..1186f87b0322a 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4411,7 +4411,6 @@ def test_to_pandas_extension_dtypes_mapping(): assert isinstance(result['a'].dtype, pd.PeriodDtype) - def test_to_pandas_extension_dtypes_mapping_complex_type(): # https://github.com/apache/arrow/pull/44720 if Version(pd.__version__) < Version("1.5.2"):