Skip to content

Commit

Permalink
BUG (string dtype): fix inplace mutation with copy=False in ensure_st…
Browse files Browse the repository at this point in the history
…ring_array (pandas-dev#59756)

* BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array

* update
  • Loading branch information
jorisvandenbossche committed Oct 10, 2024
1 parent bf47ce6 commit 74c6fac
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 6 deletions.
19 changes: 14 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
Whether to ensure that a new array is returned. When True, a new array
is always returned. When False, a new array is only returned when needed
to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
Expand Down Expand Up @@ -765,10 +767,17 @@ cpdef ndarray[object] ensure_string_array(

result = np.asarray(arr, dtype="object")

if copy and (result is arr or np.shares_memory(arr, result)):
# GH#54654
result = result.copy()
elif not copy and result is arr:
if result is arr or np.may_share_memory(arr, result):
# if np.asarray(..) did not make a copy of the input arr, we still need
# to do that to avoid mutating the input array
# GH#54654: share_memory check is needed for rare cases where np.asarray
# returns a new object without making a copy of the actual data
if copy:
result = result.copy()
else:
already_copied = False
elif not copy and not result.flags.writeable:
# Weird edge case where result is a view
already_copied = False

if issubclass(arr.dtype.type, np.str_):
Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ def test_astype_string_and_object_update_original(
tm.assert_frame_equal(df2, df_orig)


def test_astype_string_copy_on_pickle_roundrip():
def test_astype_str_copy_on_pickle_roundrip():
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
Expand All @@ -144,6 +145,25 @@ def test_astype_string_copy_on_pickle_roundrip():
tm.assert_series_equal(base, base_copy)


def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter read-only array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy._values.flags.writeable = False
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


def test_astype_dict_dtypes(using_copy_on_write):
df = DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pickle

import numpy as np
import pytest

Expand Down Expand Up @@ -283,3 +285,15 @@ def test_no_default_pickle():
# GH#40397
obj = tm.round_trip_pickle(lib.no_default)
assert obj is lib.no_default


def test_ensure_string_array_copy():
# ensure the original array is not modified in case of copy=False with
# pickle-roundtripped object dtype array
# https://github.com/pandas-dev/pandas/issues/54654
arr = np.array(["a", None], dtype=object)
arr = pickle.loads(pickle.dumps(arr))
result = lib.ensure_string_array(arr, copy=False)
assert not np.shares_memory(arr, result)
assert arr[1] is None
assert result[1] is np.nan

0 comments on commit 74c6fac

Please sign in to comment.