Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[backport 2.3.x] TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295) #60331

Draft
wants to merge 1 commit into
base: 2.3.x
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,6 @@ def re_replacer(s):
if mask is None:
values[:] = f(values)
else:
if values.ndim != mask.ndim:
mask = np.broadcast_to(mask, values.shape)
values[mask] = f(values[mask])
7 changes: 7 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2139,6 +2139,13 @@ def where(
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
# TestSetitemFloatIntervalWithIntIntervalValues
blk = self.coerce_to_target_dtype(orig_other)
if (
self.ndim == 2
and isinstance(orig_cond, np.ndarray)
and orig_cond.ndim == 1
and not is_1d_only_ea_dtype(blk.dtype)
):
orig_cond = orig_cond[:, None]
nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
return self._maybe_downcast(
nbs, downcast=_downcast, using_cow=using_cow, caller="where"
Expand Down
37 changes: 12 additions & 25 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

from pandas import (
Expand Down Expand Up @@ -91,8 +89,6 @@ def test_fillna_datetime(self, datetime_frame):
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna(5, method="ffill")

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fillna_mixed_type(self, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
Expand Down Expand Up @@ -126,26 +122,24 @@ def test_fillna_empty(self, using_copy_on_write):
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)

def test_fillna_different_dtype(self, using_infer_string):
def test_fillna_different_dtype(self):
# with different dtype (GH#3386)
df = DataFrame(
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
)

if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna({2: "foo"})
else:
msg = "Downcasting object dtype arrays"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna({2: "foo"})
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
# column is originally float (all-NaN) -> filling with string gives object dtype
# expected[2] = expected[2].astype("object")
tm.assert_frame_equal(result, expected)

if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
return_value = df.fillna({2: "foo"}, inplace=True)
else:
msg = "Downcasting object dtype arrays"
with tm.assert_produces_warning(FutureWarning, match=msg):
return_value = df.fillna({2: "foo"}, inplace=True)
tm.assert_frame_equal(df, expected)
assert return_value is None
Expand Down Expand Up @@ -390,6 +384,7 @@ def test_fillna_dtype_conversion(self, using_infer_string):
result = df.fillna("nan")
else:
result = df.fillna("nan")
# expected = DataFrame("nan", dtype="object", index=range(3),columns=["A", "B"])
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -665,18 +660,10 @@ def test_fillna_col_reordering(self):
filled = df.fillna(method="ffill")
assert df.columns.tolist() == filled.columns.tolist()

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fill_corner(self, float_frame, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan

filled = float_string_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
del float_string_frame["foo"]

float_frame.reindex(columns=[]).fillna(value=0)
def test_fill_empty(self, float_frame):
df = float_frame.reindex(columns=[])
result = df.fillna(value=0)
tm.assert_frame_equal(result, df)

def test_fillna_downcast_dict(self):
# GH#40809
Expand Down
68 changes: 43 additions & 25 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]:


class TestDataFrameReplace:
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_inplace(self, datetime_frame, float_string_frame):
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
Expand All @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan

result = float_string_frame.replace(np.nan, 0)
expected = float_string_frame.fillna(value=0)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=0)
tm.assert_frame_equal(result, expected)

tsframe = datetime_frame.copy()
Expand Down Expand Up @@ -298,20 +297,22 @@ def test_regex_replace_dict_nested_non_first_character(
tm.assert_frame_equal(result, expected)

def test_regex_replace_dict_nested_gh4115(self):
df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
df = DataFrame(
{"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2}
)
expected = DataFrame({"Type": Series([0, 1, 0, 0, 1], dtype=object), "tmp": 2})
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.replace({"Type": {"Q": 0, "T": 1}})

tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_list_to_scalar(self, mix_abc):
df = DataFrame(mix_abc)
expec = DataFrame(
{
"a": mix_abc["a"],
"b": np.array([np.nan] * 4),
"b": Series([np.nan] * 4, dtype="str"),
"c": [np.nan, np.nan, np.nan, "d"],
}
)
Expand All @@ -334,7 +335,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc):
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_str_to_numeric(self, mix_abc):
# what happens when you try to replace a numeric value with a regex?
df = DataFrame(mix_abc)
Expand All @@ -346,11 +346,12 @@ def test_regex_replace_str_to_numeric(self, mix_abc):
return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True)
assert return_value is None
expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]})
# TODO(infer_string)
expec["c"] = expec["c"].astype(object)
tm.assert_frame_equal(res, expec)
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_regex_list_to_numeric(self, mix_abc):
df = DataFrame(mix_abc)
res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
Expand Down Expand Up @@ -566,21 +567,28 @@ def test_replace_convert(self):
res = rep.dtypes
tm.assert_series_equal(expec, res)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_mixed(self, float_string_frame):
mf = float_string_frame
mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan

result = float_string_frame.replace(np.nan, -18)
expected = float_string_frame.fillna(value=-18)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=-18)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame)
expected2 = float_string_frame.copy()
expected2["foo"] = expected2["foo"].astype(object)
tm.assert_frame_equal(result.replace(-18, np.nan), expected2)

result = float_string_frame.replace(np.nan, -1e8)
expected = float_string_frame.fillna(value=-1e8)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=-1e8)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame)
expected2 = float_string_frame.copy()
expected2["foo"] = expected2["foo"].astype(object)
tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2)

def test_replace_mixed_int_block_upcasting(self):
# int block upcasting
Expand Down Expand Up @@ -641,7 +649,7 @@ def test_replace_mixed2(self, using_infer_string):

expected = DataFrame(
{
"A": Series(["foo", "bar"]),
"A": Series(["foo", "bar"], dtype="object"),
"B": Series([0, "foo"], dtype="object"),
}
)
Expand Down Expand Up @@ -958,15 +966,16 @@ def test_replace_limit(self):
# TODO
pass

def test_replace_dict_no_regex(self):
def test_replace_dict_no_regex(self, any_string_dtype):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
},
dtype=any_string_dtype,
)
weights = {
"Agree": 4,
Expand All @@ -981,15 +990,16 @@ def test_replace_dict_no_regex(self):
result = answer.replace(weights)
tm.assert_series_equal(result, expected)

def test_replace_series_no_regex(self):
def test_replace_series_no_regex(self, any_string_dtype):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
},
dtype=any_string_dtype,
)
weights = Series(
{
Expand Down Expand Up @@ -1087,16 +1097,15 @@ def test_nested_dict_overlapping_keys_replace_str(self):
expected = df.replace({"a": dict(zip(astr, bstr))})
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_swapping_bug(self, using_infer_string):
def test_replace_swapping_bug(self):
df = DataFrame({"a": [True, False, True]})
res = df.replace({"a": {True: "Y", False: "N"}})
expect = DataFrame({"a": ["Y", "N", "Y"]})
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
tm.assert_frame_equal(res, expect)

df = DataFrame({"a": [0, 1, 0]})
res = df.replace({"a": {0: "Y", 1: "N"}})
expect = DataFrame({"a": ["Y", "N", "Y"]})
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
tm.assert_frame_equal(res, expect)

def test_replace_period(self):
Expand Down Expand Up @@ -1372,7 +1381,7 @@ def test_replace_commutative(self, df, to_replace, exp):
)
def test_replace_replacer_dtype(self, replacer):
# GH26632
df = DataFrame(["a"])
df = DataFrame(["a"], dtype=object)
msg = "Downcasting behavior in `replace` "
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.replace({"a": replacer, "b": replacer})
Expand Down Expand Up @@ -1489,6 +1498,7 @@ def test_replace_value_category_type(self):
input_df = input_df.replace("obj1", "obj9")
result = input_df.replace("cat2", "catX")

result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"})
tm.assert_frame_equal(result, expected)

def test_replace_dict_category_type(self):
Expand Down Expand Up @@ -1650,6 +1660,14 @@ def test_replace_regex_dtype_frame(self, regex):
expected_df2 = DataFrame({"A": [1], "B": ["1"]})
with tm.assert_produces_warning(FutureWarning, match=msg):
result_df2 = df2.replace(to_replace="0", value=1, regex=regex)

if regex:
# TODO(infer_string): both string columns get cast to object,
# although the cast is only needed for column A
expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=object)
else:
expected_df2 = DataFrame({"A": Series([1], dtype=object), "B": ["1"]})
result_df2 = df2.replace(to_replace="0", value=1, regex=regex)
tm.assert_frame_equal(result_df2, expected_df2)

def test_replace_with_value_also_being_replaced(self):
Expand Down
Loading