BUG: Fix XGBoost/LightGBM test execution and scikit-learn compatibility #846

Merged · 11 commits · Feb 9, 2025
2 changes: 1 addition & 1 deletion .github/workflows/python.yaml
@@ -383,7 +383,7 @@ jobs:
          pytest --timeout=1500 \
            -W ignore::PendingDeprecationWarning \
            --cov-config=setup.cfg --cov-report=xml --cov=xorbits \
-           xorbits/xgboost/ xorbits/lightgbm/
+           xorbits/_mars/learn/contrib/xgboost/ xorbits/_mars/learn/contrib/lightgbm/
elif [ "$MODULE" == "jax" ]; then
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/fuse/tests/test_runtime_fusion.py
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/
12 changes: 8 additions & 4 deletions python/xorbits/_mars/learn/preprocessing/_data.py
@@ -215,7 +215,7 @@ def partial_fit(self, X, y=None, session=None, run_kwargs=None):
            reset=first_pass,
            estimator=self,
            dtype=FLOAT_DTYPES,
-           force_all_finite="allow-nan",
+           ensure_all_finite="allow-nan",
        )

if np.isnan(X.shape[0]): # pragma: no cover
@@ -267,7 +267,7 @@ def transform(self, X, session=None, run_kwargs=None):
            X,
            copy=self.copy,
            dtype=FLOAT_DTYPES,
-           force_all_finite="allow-nan",
+           ensure_all_finite="allow-nan",
            reset=False,
        )

@@ -293,7 +293,7 @@ def inverse_transform(self, X, session=None, run_kwargs=None):
        check_is_fitted(self)

        X = check_array(
-           X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
+           X, copy=self.copy, dtype=FLOAT_DTYPES, ensure_all_finite="allow-nan"
        )

X -= self.min_
@@ -382,7 +382,11 @@ def minmax_scale(
    # Unlike the scaler object, this function allows 1d input.
    # If copy is required, it will be done inside the scaler object.
    X = check_array(
-       X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
+       X,
+       copy=False,
+       ensure_2d=False,
+       dtype=FLOAT_DTYPES,
+       ensure_all_finite="allow-nan",
    )
original_ndim = X.ndim

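Note: scikit-learn 1.6 deprecated `check_array`'s `force_all_finite` keyword in favor of `ensure_all_finite`, and this PR renames the keyword throughout Mars's copy of the validation code. For code that has to run against both old and new scikit-learn releases, a minimal compatibility shim could look like the following (a hypothetical helper, not part of this PR):

    import sklearn
    from packaging.version import Version

    def finite_check_kwarg(value="allow-nan"):
        # Pick whichever keyword the installed scikit-learn understands:
        # "ensure_all_finite" from 1.6 on, "force_all_finite" before that.
        key = (
            "ensure_all_finite"
            if Version(sklearn.__version__) >= Version("1.6")
            else "force_all_finite"
        )
        return {key: value}

    # Usage: check_array(X, dtype=FLOAT_DTYPES, **finite_check_kwarg())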
2 changes: 1 addition & 1 deletion python/xorbits/_mars/learn/utils/tests/test_validation.py
@@ -81,7 +81,7 @@ def test_check_array(setup):

    for X, dtype, order, copy in product(Xs, dtypes, orders, copy_flags):
        X_checked = check_array(
-           X, dtype=dtype, order=order, copy=copy, force_all_finite=False
+           X, dtype=dtype, order=order, copy=copy, ensure_all_finite=False
        )
if dtype is not None:
assert X_checked.dtype == dtype
32 changes: 16 additions & 16 deletions python/xorbits/_mars/learn/utils/validation.py
@@ -171,7 +171,7 @@ def _ensure_no_complex_data(array):


def _ensure_sparse_format(
-   spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse
+   spmatrix, accept_sparse, dtype, copy, ensure_all_finite, accept_large_sparse
):
"""Convert a sparse matrix to a given format.

@@ -196,7 +196,7 @@ def _ensure_sparse_format(
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

-   force_all_finite : boolean or 'allow-nan', (default=True)
+   ensure_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in X. The possibilities
        are:
@@ -254,9 +254,9 @@
        # force copy
        spmatrix = spmatrix.copy()

-   if force_all_finite:
+   if ensure_all_finite:
        spmatrix = assert_all_finite(
-           spmatrix, allow_nan=force_all_finite == "allow-nan", check_only=False
+           spmatrix, allow_nan=ensure_all_finite == "allow-nan", check_only=False
        )

return spmatrix
@@ -269,7 +269,7 @@ def check_array(
dtype="numeric",
order=None,
copy=False,
force_all_finite=True,
ensure_all_finite=True,
ensure_2d=True,
allow_nd=False,
ensure_min_samples=1,
@@ -316,7 +316,7 @@
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

-   force_all_finite : boolean or 'allow-nan', (default=True)
+   ensure_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in tensor. The
        possibilities are:
@@ -377,10 +377,10 @@
        # list of accepted types.
        dtype = dtype[0]

-   if force_all_finite not in (True, False, "allow-nan"):
+   if ensure_all_finite not in (True, False, "allow-nan"):
        raise ValueError(
-           'force_all_finite should be a bool or "allow-nan"'
-           f". Got {force_all_finite!r} instead"
+           'ensure_all_finite should be a bool or "allow-nan"'
+           f". Got {ensure_all_finite!r} instead"
        )

    if estimator is not None:
@@ -400,7 +400,7 @@
            accept_sparse=accept_sparse,
            dtype=dtype,
            copy=copy,
-           force_all_finite=force_all_finite,
+           ensure_all_finite=ensure_all_finite,
            accept_large_sparse=accept_large_sparse,
        )
else:
@@ -460,9 +460,9 @@
"Found array with dim %d. %s expected <= 2."
% (array.ndim, estimator_name)
)
if force_all_finite:
if ensure_all_finite:
array = _assert_all_finite(
array, allow_nan=force_all_finite == "allow-nan", check_only=False
array, allow_nan=ensure_all_finite == "allow-nan", check_only=False
)

if ensure_min_samples > 0:
@@ -497,7 +497,7 @@ def check_X_y(
dtype="numeric",
order=None,
copy=False,
force_all_finite=True,
ensure_all_finite=True,
ensure_2d=True,
allow_nd=False,
multi_output=False,
@@ -548,7 +548,7 @@
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

-   force_all_finite : boolean or 'allow-nan', (default=True)
+   ensure_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in X. This parameter
        does not influence whether y can have np.inf or np.nan values.
        The possibilities are:
@@ -606,15 +606,15 @@
        dtype=dtype,
        order=order,
        copy=copy,
-       force_all_finite=force_all_finite,
+       ensure_all_finite=ensure_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )
    if multi_output:
-       y = check_array(y, True, force_all_finite=True, ensure_2d=False, dtype=None)
+       y = check_array(y, True, ensure_all_finite=True, ensure_2d=False, dtype=None)
    else:
        y = column_or_1d(y, warn=True)
        y = _assert_all_finite(y, check_only=False)
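Note: the accepted values for `ensure_all_finite` keep the semantics of the old `force_all_finite` flag: `True` rejects both NaN and inf, `"allow-nan"` tolerates NaN but still rejects inf, and `False` skips the finiteness check entirely. A minimal sketch against scikit-learn's own `check_array` (>= 1.6), which this module mirrors:

    import numpy as np
    from sklearn.utils import check_array

    X = np.array([[1.0, np.nan], [2.0, 3.0]])

    check_array(X, ensure_all_finite="allow-nan")  # passes: NaN tolerated
    check_array(X, ensure_all_finite=False)        # passes: no check at all
    # check_array(X, ensure_all_finite=True)       # raises ValueError: NaN present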
37 changes: 22 additions & 15 deletions python/xorbits/_mars/utils.py
@@ -40,6 +40,7 @@
import weakref
import zlib
from abc import ABC
+from collections import OrderedDict
from contextlib import contextmanager
from types import ModuleType, TracebackType
from typing import (
@@ -774,28 +775,34 @@ def merge_chunks(chunk_results: List[Tuple[Tuple[int], Any]]) -> Any:
        for r in chunk_results:
            result.extend(r[1])
        return result
-   elif type(v) is dict:
+   elif isinstance(v, (dict, OrderedDict)):
        # TODO(codingl2k1) : We should register a merge handler for each output type.
-       result = {}
+       result = type(v)()
+       chunk_results = [(k, v) for k, v in chunk_results if v]
+       if len(chunk_results) == 1:
+           return chunk_results[0][1]
+
        for r in chunk_results:
            d = r[1]
-           if not result:
-               if not all(
-                   type(key) is str and type(value) is list for key, value in d.items()
-               ):
-                   raise TypeError(
-                       "only support merge dict with type Dict[str, List]."
-                   )
-               result.update(d)
-           else:
-               if d.keys() != result.keys():
-                   raise TypeError(f"unsupported merge dict with different keys.")
-               for key, value in d.items():
-                   result[key].extend(value)
+           for key, value in d.items():
+               if key not in result:
+                   if isinstance(value, dict):
+                       result[key] = type(value)()
+                   else:
+                       result[key] = value
+               else:
+                   if isinstance(value, dict):
+                       result[key].update(value)
+                   elif isinstance(value, (list, tuple)):
+                       if isinstance(result[key], (list, tuple)):
+                           result[key].extend(value)
+                       else:
+                           result[key] = [result[key], *value]
+                   else:
+                       if isinstance(result[key], (list, tuple)):
+                           result[key].append(value)
+                       else:
+                           result[key] = value
        return result
    elif isinstance(v, pa.Table):
        result = [r[1] for r in chunk_results]
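Note: the rewritten dict branch no longer requires every chunk to be a `Dict[str, List]` with identical keys; nested dicts are merged key-wise, lists and tuples are concatenated, and scalars overwrite. This is presumably what allows structures like XGBoost's `evals_result` (an `OrderedDict` of per-dataset metric dicts) to be merged across chunks. A simplified, hypothetical sketch of the common case (it copies the first occurrence of a value directly instead of reproducing the diff's type-preserving initialization):

    chunk_results = [
        ((0,), {"rows": [1, 2], "meta": {"n": 2}}),
        ((1,), {"rows": [3], "meta": {"m": 1}}),
    ]

    merged = {}
    for _, d in chunk_results:
        for key, value in d.items():
            if key not in merged:
                merged[key] = value              # first occurrence: take as-is
            elif isinstance(value, dict):
                merged[key].update(value)        # nested dicts merge key-wise
            elif isinstance(value, (list, tuple)):
                merged[key].extend(value)        # sequences are concatenated
            else:
                merged[key] = value              # scalars overwrite

    assert merged == {"rows": [1, 2, 3], "meta": {"n": 2, "m": 1}}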
@@ -49,7 +49,7 @@ def test_split_arg_required():
    ],
)
def test_from_huggingface_execute(setup, path):
-   xds = from_huggingface(path, split="train")
+   xds = from_huggingface(path, split="train", trust_remote_code=True)
    xds.execute()
    ds = xds.fetch()
    ds_expected = datasets.load_dataset(path, split="train")
@@ -83,7 +83,7 @@ def test_from_huggingface_file_lock(setup):
    ],
)
def test_to_dataframe_execute(setup, path):
-   xds = from_huggingface(path, split="train")
+   xds = from_huggingface(path, split="train", trust_remote_code=True)
    mdf = xds.to_dataframe()
    mdf.execute()
    df = mdf.fetch()
@@ -103,7 +103,7 @@ def add_prefix(x):
x["text"] = "xorbits: " + x["text"]
return x

xds = from_huggingface(path, split="train")
xds = from_huggingface(path, split="train", trust_remote_code=True)
xds = xds.map(add_prefix)
xds.execute()
ds = xds.fetch()
@@ -119,7 +119,7 @@ def add_prefix(x):
    ],
)
def test_rechunk_execute(setup, path):
-   xds = from_huggingface(path, split="train")
+   xds = from_huggingface(path, split="train", trust_remote_code=True)
    xds = xds.rechunk(2)
    xds.execute()
    ds = xds.fetch()
@@ -158,7 +158,7 @@ def test_export(setup):
    tmp_dir = Path(tempfile.gettempdir())
    export_dir = tmp_dir.joinpath("test_export")
    shutil.rmtree(export_dir, ignore_errors=True)
-   db = from_huggingface("cifar10", split="train")
+   db = from_huggingface("cifar10", split="train", trust_remote_code=True)
    # Test invalid export dir
    Path(export_dir).touch()
    with pytest.raises(Exception, match="dir"):
@@ -184,7 +184,7 @@ def test_export(setup):
    assert len(data_arrow_files) == data_meta_info["num_files"]
    assert info["num_rows"] == data_meta_info["num_rows"]

-   db = from_huggingface("imdb", split="train")
+   db = from_huggingface("imdb", split="train", trust_remote_code=True)
    db.export(
        export_dir,
        column_groups={"my_text": ["text"], "my_label": ["label"]},
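Note: recent releases of Hugging Face `datasets` default `trust_remote_code` to `False`, so script-based datasets fail to load unless the caller opts in explicitly; passing `trust_remote_code=True` through `from_huggingface` is what these test updates do. A minimal sketch (the exact release at which the default changed is an assumption here, and `datasets` 3.x removed loading-script execution altogether):

    import datasets

    # Explicit opt-in is required for datasets that ship a loading script.
    ds = datasets.load_dataset("imdb", split="train", trust_remote_code=True)
    print(len(ds))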