Skip to content

Commit

Permalink
[backport 2.3.x] BUG (string dtype): replace with non-string to fall …
Browse files Browse the repository at this point in the history
…back to object dtype (#60285) (#60292)

* BUG (string dtype): replace with non-string to fall back to object dtype (#60285)

Co-authored-by: Matthew Roeschke <[email protected]>
(cherry picked from commit 938832b)

* updates for 2.3

* fix inplace modification for 2.3.x branch with python storage
  • Loading branch information
jorisvandenbossche authored Nov 14, 2024
1 parent 2054463 commit 54b47df
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 59 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-

Interval
^^^^^^^^
Expand Down
43 changes: 25 additions & 18 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]:

return arr, self.dtype.na_value

def __setitem__(self, key, value) -> None:
value = extract_array(value, extract_numpy=True)
if isinstance(value, type(self)):
# extract_array doesn't extract NumpyExtensionArray subclasses
value = value._ndarray

key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")

# validate new items
if scalar_value:
def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
if lib.is_scalar(value):
if isna(value):
value = self.dtype.na_value
elif not isinstance(value, str):
Expand All @@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None:
"instead."
)
else:
value = extract_array(value, extract_numpy=True)
if not is_array_like(value):
value = np.asarray(value, dtype=object)
elif isinstance(value.dtype, type(self.dtype)):
return value
else:
# cast categories and friends to arrays to see if values are
# compatible, compatibility with arrow backed strings
Expand All @@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None:
"Invalid value for dtype 'str'. Value should be a "
"string or missing value (or array of those)."
)
return value

mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = self.dtype.na_value
def __setitem__(self, key, value) -> None:
value = self._maybe_convert_setitem_value(value)

key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")

if not scalar_value:
if value.dtype == self.dtype:
value = value._ndarray
else:
value = np.asarray(value)
mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = self.dtype.na_value

super().__setitem__(key, value)

Expand Down
7 changes: 7 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1754,6 +1754,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
except (ValueError, TypeError):
return False

if dtype == "string":
try:
arr._maybe_convert_setitem_value(element) # type: ignore[union-attr]
return True
except (ValueError, TypeError):
return False

# This is technically incorrect, but maintains the behavior of
# ExtensionBlock._can_hold_element
return True
Expand Down
40 changes: 32 additions & 8 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
ABCNumpyExtensionArray,
ABCSeries,
)
from pandas.core.dtypes.inference import is_re
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
Expand Down Expand Up @@ -115,6 +116,7 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.computation import expressions
Expand Down Expand Up @@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
# Up/Down-casting

@final
def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
def coerce_to_target_dtype(
self, other, warn_on_upcast: bool = False, using_cow: bool = False
) -> Block:
"""
coerce the current block to a dtype compat for other
we will return a block, possibly object, and not raise
Expand Down Expand Up @@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
f"{self.values.dtype}. Please report a bug at "
"https://github.com/pandas-dev/pandas/issues."
)
return self.astype(new_dtype, copy=False)
copy = False
if (
not using_cow
and isinstance(self.dtype, StringDtype)
and self.dtype.storage == "python"
):
copy = True
return self.astype(new_dtype, copy=copy, using_cow=using_cow)

@final
def _maybe_downcast(
Expand Down Expand Up @@ -879,7 +890,7 @@ def replace(
else:
return [self] if inplace else [self.copy()]

elif self._can_hold_element(value):
elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)):
# TODO(CoW): Maybe split here as well into columns where mask has True
# and rest?
blk = self._maybe_copy(using_cow, inplace)
Expand Down Expand Up @@ -926,12 +937,13 @@ def replace(
if value is None or value is NA:
blk = self.astype(np.dtype(object))
else:
blk = self.coerce_to_target_dtype(value)
blk = self.coerce_to_target_dtype(value, using_cow=using_cow)
return blk.replace(
to_replace=to_replace,
value=value,
inplace=True,
mask=mask,
using_cow=using_cow,
)

else:
Expand Down Expand Up @@ -980,16 +992,26 @@ def _replace_regex(
-------
List[Block]
"""
if not self._can_hold_element(to_replace):
if not is_re(to_replace) and not self._can_hold_element(to_replace):
# i.e. only if self.is_object is True, but could in principle include a
# String ExtensionBlock
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]

rx = re.compile(to_replace)
if is_re(to_replace) and self.dtype not in [object, "string"]:
# only object or string dtype can hold strings, and a regex object
# will only match strings
return [self.copy(deep=False)]

block = self._maybe_copy(using_cow, inplace)
if not (
self._can_hold_element(value) or (self.dtype == "string" and is_re(value))
):
block = self.astype(np.dtype(object))
else:
block = self._maybe_copy(using_cow, inplace)

rx = re.compile(to_replace)

replace_regex(block.values, rx, value, mask)

Expand Down Expand Up @@ -1048,7 +1070,9 @@ def replace_list(

# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
(x, y)
for x, y in zip(src_list, dest_list)
if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x)))
]
if not len(pairs):
if using_cow:
Expand Down
9 changes: 0 additions & 9 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ def test_regex_replace_dict_nested_non_first_character(
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_dict_nested_gh4115(self):
df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
Expand Down Expand Up @@ -556,7 +555,6 @@ def test_replace_series_dict(self):
result = df.replace(s, df.mean())
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_convert(self):
# gh 3907
df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
Expand Down Expand Up @@ -932,7 +930,6 @@ def test_replace_input_formats_listlike(self):
with pytest.raises(ValueError, match=msg):
df.replace(to_rep, values[1:])

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_input_formats_scalar(self):
df = DataFrame(
{"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
Expand Down Expand Up @@ -961,7 +958,6 @@ def test_replace_limit(self):
# TODO
pass

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_dict_no_regex(self):
answer = Series(
{
Expand All @@ -985,7 +981,6 @@ def test_replace_dict_no_regex(self):
result = answer.replace(weights)
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_series_no_regex(self):
answer = Series(
{
Expand Down Expand Up @@ -1104,7 +1099,6 @@ def test_replace_swapping_bug(self, using_infer_string):
expect = DataFrame({"a": ["Y", "N", "Y"]})
tm.assert_frame_equal(res, expect)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_period(self):
d = {
"fname": {
Expand Down Expand Up @@ -1141,7 +1135,6 @@ def test_replace_period(self):
result = df.replace(d)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_datetime(self):
d = {
"fname": {
Expand Down Expand Up @@ -1367,7 +1360,6 @@ def test_replace_commutative(self, df, to_replace, exp):
result = df.replace(to_replace)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
@pytest.mark.parametrize(
"replacer",
[
Expand Down Expand Up @@ -1644,7 +1636,6 @@ def test_regex_replace_scalar(
expected.loc[expected["a"] == ".", "a"] = expected_replace_val
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
@pytest.mark.parametrize("regex", [False, True])
def test_replace_regex_dtype_frame(self, regex):
# GH-48644
Expand Down
18 changes: 5 additions & 13 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,24 +886,16 @@ def test_index_where(self, obj, key, expected, warn, val):
mask = np.zeros(obj.shape, dtype=bool)
mask[key] = True

if obj.dtype == "string" and not (isinstance(val, str) or isna(val)):
with pytest.raises(TypeError, match="Invalid value"):
Index(obj, dtype=obj.dtype).where(~mask, val)
else:
res = Index(obj, dtype=obj.dtype).where(~mask, val)
expected_idx = Index(expected, dtype=expected.dtype)
tm.assert_index_equal(res, expected_idx)
res = Index(obj, dtype=obj.dtype).where(~mask, val)
expected_idx = Index(expected, dtype=expected.dtype)
tm.assert_index_equal(res, expected_idx)

def test_index_putmask(self, obj, key, expected, warn, val):
mask = np.zeros(obj.shape, dtype=bool)
mask[key] = True

if obj.dtype == "string" and not (isinstance(val, str) or isna(val)):
with pytest.raises(TypeError, match="Invalid value"):
Index(obj, dtype=obj.dtype).putmask(mask, val)
else:
res = Index(obj, dtype=obj.dtype).putmask(mask, val)
tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))
res = Index(obj, dtype=obj.dtype).putmask(mask, val)
tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))


@pytest.mark.parametrize(
Expand Down
16 changes: 6 additions & 10 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,10 +403,6 @@ def test_replace_categorical(self, categorical, numeric, using_infer_string):
ser = pd.Series(categorical)
msg = "Downcasting behavior in `replace`"
msg = "with CategoricalDtype is deprecated"
if using_infer_string:
with pytest.raises(TypeError, match="Invalid value"):
ser.replace({"A": 1, "B": 2})
return
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace({"A": 1, "B": 2})
expected = pd.Series(numeric).astype("category")
Expand Down Expand Up @@ -745,13 +741,13 @@ def test_replace_regex_dtype_series(self, regex):
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("regex", [False, True])
def test_replace_regex_dtype_series_string(self, regex, using_infer_string):
if not using_infer_string:
# then this is object dtype which is already tested above
return
def test_replace_regex_dtype_series_string(self, regex):
series = pd.Series(["0"], dtype="str")
with pytest.raises(TypeError, match="Invalid value"):
series.replace(to_replace="0", value=1, regex=regex)
expected = pd.Series([1], dtype="int64")
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = series.replace(to_replace="0", value=1, regex=regex)
tm.assert_series_equal(result, expected)

def test_replace_different_int_types(self, any_int_numpy_dtype):
# GH#45311
Expand Down

0 comments on commit 54b47df

Please sign in to comment.