BUG: Fix XGBoost/LightGBM test execution and scikit-learn compatibility #846

Merged · 11 commits · Feb 9, 2025
2 changes: 1 addition & 1 deletion .github/workflows/python.yaml
@@ -383,7 +383,7 @@ jobs:
          pytest --timeout=1500 \
            -W ignore::PendingDeprecationWarning \
            --cov-config=setup.cfg --cov-report=xml --cov=xorbits \
-           xorbits/xgboost/ xorbits/lightgbm/
+           xorbits/_mars/learn/contrib/xgboost/ xorbits/_mars/learn/contrib/lightgbm/
elif [ "$MODULE" == "jax" ]; then
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/fuse/tests/test_runtime_fusion.py
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/
12 changes: 8 additions & 4 deletions python/xorbits/_mars/learn/preprocessing/_data.py
@@ -215,7 +215,7 @@ def partial_fit(self, X, y=None, session=None, run_kwargs=None):
            reset=first_pass,
            estimator=self,
            dtype=FLOAT_DTYPES,
-           force_all_finite="allow-nan",
+           ensure_all_finite="allow-nan",
        )

if np.isnan(X.shape[0]): # pragma: no cover
@@ -267,7 +267,7 @@ def transform(self, X, session=None, run_kwargs=None):
            X,
            copy=self.copy,
            dtype=FLOAT_DTYPES,
-           force_all_finite="allow-nan",
+           ensure_all_finite="allow-nan",
            reset=False,
        )

@@ -293,7 +293,7 @@ def inverse_transform(self, X, session=None, run_kwargs=None):
        check_is_fitted(self)

        X = check_array(
-           X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
+           X, copy=self.copy, dtype=FLOAT_DTYPES, ensure_all_finite="allow-nan"
        )

X -= self.min_
@@ -382,7 +382,11 @@ def minmax_scale(
    # Unlike the scaler object, this function allows 1d input.
    # If copy is required, it will be done inside the scaler object.
    X = check_array(
-       X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
+       X,
+       copy=False,
+       ensure_2d=False,
+       dtype=FLOAT_DTYPES,
+       ensure_all_finite="allow-nan",
    )
original_ndim = X.ndim

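Note: scikit-learn 1.6 deprecated `check_array`'s `force_all_finite` keyword in favor of `ensure_all_finite`, and this PR renames the keyword throughout Mars's copy of the validation code. For code that has to run against both old and new scikit-learn releases, a minimal compatibility shim could look like the following (a hypothetical helper, not part of this PR):

    import sklearn
    from packaging.version import Version

    def finite_check_kwarg(value="allow-nan"):
        # Pick whichever keyword the installed scikit-learn understands:
        # "ensure_all_finite" from 1.6 on, "force_all_finite" before that.
        key = (
            "ensure_all_finite"
            if Version(sklearn.__version__) >= Version("1.6")
            else "force_all_finite"
        )
        return {key: value}

    # Usage: check_array(X, dtype=FLOAT_DTYPES, **finite_check_kwarg())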
2 changes: 1 addition & 1 deletion python/xorbits/_mars/learn/utils/tests/test_validation.py
@@ -81,7 +81,7 @@ def test_check_array(setup):

    for X, dtype, order, copy in product(Xs, dtypes, orders, copy_flags):
        X_checked = check_array(
-           X, dtype=dtype, order=order, copy=copy, force_all_finite=False
+           X, dtype=dtype, order=order, copy=copy, ensure_all_finite=False
        )
if dtype is not None:
assert X_checked.dtype == dtype
32 changes: 16 additions & 16 deletions python/xorbits/_mars/learn/utils/validation.py
@@ -171,7 +171,7 @@ def _ensure_no_complex_data(array):


def _ensure_sparse_format(
-   spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse
+   spmatrix, accept_sparse, dtype, copy, ensure_all_finite, accept_large_sparse
):
"""Convert a sparse matrix to a given format.

@@ -196,7 +196,7 @@ def _ensure_sparse_format(
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

-   force_all_finite : boolean or 'allow-nan', (default=True)
+   ensure_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in X. The possibilities
        are:
@@ -254,9 +254,9 @@
        # force copy
        spmatrix = spmatrix.copy()

-   if force_all_finite:
+   if ensure_all_finite:
        spmatrix = assert_all_finite(
-           spmatrix, allow_nan=force_all_finite == "allow-nan", check_only=False
+           spmatrix, allow_nan=ensure_all_finite == "allow-nan", check_only=False
        )

return spmatrix
@@ -269,7 +269,7 @@ def check_array(
dtype="numeric",
order=None,
copy=False,
force_all_finite=True,
ensure_all_finite=True,
ensure_2d=True,
allow_nd=False,
ensure_min_samples=1,
@@ -316,7 +316,7 @@
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

-   force_all_finite : boolean or 'allow-nan', (default=True)
+   ensure_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in tensor. The
        possibilities are:
@@ -377,10 +377,10 @@
        # list of accepted types.
        dtype = dtype[0]

-   if force_all_finite not in (True, False, "allow-nan"):
+   if ensure_all_finite not in (True, False, "allow-nan"):
        raise ValueError(
-           'force_all_finite should be a bool or "allow-nan"'
-           f". Got {force_all_finite!r} instead"
+           'ensure_all_finite should be a bool or "allow-nan"'
+           f". Got {ensure_all_finite!r} instead"
        )

    if estimator is not None:
@@ -400,7 +400,7 @@
            accept_sparse=accept_sparse,
            dtype=dtype,
            copy=copy,
-           force_all_finite=force_all_finite,
+           ensure_all_finite=ensure_all_finite,
            accept_large_sparse=accept_large_sparse,
        )
else:
@@ -460,9 +460,9 @@
"Found array with dim %d. %s expected <= 2."
% (array.ndim, estimator_name)
)
if force_all_finite:
if ensure_all_finite:
array = _assert_all_finite(
array, allow_nan=force_all_finite == "allow-nan", check_only=False
array, allow_nan=ensure_all_finite == "allow-nan", check_only=False
)

if ensure_min_samples > 0:
@@ -497,7 +497,7 @@ def check_X_y(
dtype="numeric",
order=None,
copy=False,
force_all_finite=True,
ensure_all_finite=True,
ensure_2d=True,
allow_nd=False,
multi_output=False,
@@ -548,7 +548,7 @@
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

-   force_all_finite : boolean or 'allow-nan', (default=True)
+   ensure_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in X. This parameter
        does not influence whether y can have np.inf or np.nan values.
        The possibilities are:
@@ -606,15 +606,15 @@
        dtype=dtype,
        order=order,
        copy=copy,
-       force_all_finite=force_all_finite,
+       ensure_all_finite=ensure_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )
    if multi_output:
-       y = check_array(y, True, force_all_finite=True, ensure_2d=False, dtype=None)
+       y = check_array(y, True, ensure_all_finite=True, ensure_2d=False, dtype=None)
    else:
        y = column_or_1d(y, warn=True)
        y = _assert_all_finite(y, check_only=False)
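Note: the accepted values for `ensure_all_finite` keep the semantics of the old `force_all_finite` flag: `True` rejects both NaN and inf, `"allow-nan"` tolerates NaN but still rejects inf, and `False` skips the finiteness check entirely. A minimal sketch against scikit-learn's own `check_array` (>= 1.6), which this module mirrors:

    import numpy as np
    from sklearn.utils import check_array

    X = np.array([[1.0, np.nan], [2.0, 3.0]])

    check_array(X, ensure_all_finite="allow-nan")  # passes: NaN tolerated
    check_array(X, ensure_all_finite=False)        # passes: no check at all
    # check_array(X, ensure_all_finite=True)       # raises ValueError: NaN present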
37 changes: 22 additions & 15 deletions python/xorbits/_mars/utils.py
@@ -40,6 +40,7 @@
import weakref
import zlib
from abc import ABC
+from collections import OrderedDict
from contextlib import contextmanager
from types import ModuleType, TracebackType
from typing import (
@@ -774,28 +775,34 @@ def merge_chunks(chunk_results: List[Tuple[Tuple[int], Any]]) -> Any:
        for r in chunk_results:
            result.extend(r[1])
        return result
-   elif type(v) is dict:
+   elif isinstance(v, (dict, OrderedDict)):
        # TODO(codingl2k1) : We should register a merge handler for each output type.
-       result = {}
+       result = type(v)()
+       chunk_results = [(k, v) for k, v in chunk_results if v]
+       if len(chunk_results) == 1:
+           return chunk_results[0][1]
+
        for r in chunk_results:
            d = r[1]
-           if not result:
-               if not all(
-                   type(key) is str and type(value) is list for key, value in d.items()
-               ):
-                   raise TypeError(
-                       "only support merge dict with type Dict[str, List]."
-                   )
-               result.update(d)
-           else:
-               if d.keys() != result.keys():
-                   raise TypeError(f"unsupported merge dict with different keys.")
-               for key, value in d.items():
-                   result[key].extend(value)
+           for key, value in d.items():
+               if key not in result:
+                   if isinstance(value, dict):
+                       result[key] = type(value)()
+                   else:
+                       result[key] = value
+               else:
+                   if isinstance(value, dict):
+                       result[key].update(value)
+                   elif isinstance(value, (list, tuple)):
+                       if isinstance(result[key], (list, tuple)):
+                           result[key].extend(value)
+                       else:
+                           result[key] = [result[key], *value]
+                   else:
+                       if isinstance(result[key], (list, tuple)):
+                           result[key].append(value)
+                       else:
+                           result[key] = value
        return result
    elif isinstance(v, pa.Table):
        result = [r[1] for r in chunk_results]
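Note: the rewritten dict branch no longer requires every chunk to be a `Dict[str, List]` with identical keys; nested dicts are merged key-wise, lists and tuples are concatenated, and scalars overwrite. This is presumably what allows structures like XGBoost's `evals_result` (an `OrderedDict` of per-dataset metric dicts) to be merged across chunks. A simplified, hypothetical sketch of the common case (it copies the first occurrence of a value directly instead of reproducing the diff's type-preserving initialization):

    chunk_results = [
        ((0,), {"rows": [1, 2], "meta": {"n": 2}}),
        ((1,), {"rows": [3], "meta": {"m": 1}}),
    ]

    merged = {}
    for _, d in chunk_results:
        for key, value in d.items():
            if key not in merged:
                merged[key] = value              # first occurrence: take as-is
            elif isinstance(value, dict):
                merged[key].update(value)        # nested dicts merge key-wise
            elif isinstance(value, (list, tuple)):
                merged[key].extend(value)        # sequences are concatenated
            else:
                merged[key] = value              # scalars overwrite

    assert merged == {"rows": [1, 2, 3], "meta": {"n": 2, "m": 1}}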
@@ -49,7 +49,7 @@ def test_split_arg_required():
    ],
)
def test_from_huggingface_execute(setup, path):
-   xds = from_huggingface(path, split="train")
+   xds = from_huggingface(path, split="train", trust_remote_code=True)
    xds.execute()
    ds = xds.fetch()
    ds_expected = datasets.load_dataset(path, split="train")
@@ -83,7 +83,7 @@ def test_from_huggingface_file_lock(setup):
    ],
)
def test_to_dataframe_execute(setup, path):
-   xds = from_huggingface(path, split="train")
+   xds = from_huggingface(path, split="train", trust_remote_code=True)
    mdf = xds.to_dataframe()
    mdf.execute()
    df = mdf.fetch()
@@ -103,7 +103,7 @@ def add_prefix(x):
x["text"] = "xorbits: " + x["text"]
return x

xds = from_huggingface(path, split="train")
xds = from_huggingface(path, split="train", trust_remote_code=True)
xds = xds.map(add_prefix)
xds.execute()
ds = xds.fetch()
@@ -119,7 +119,7 @@ def add_prefix(x):
    ],
)
def test_rechunk_execute(setup, path):
-   xds = from_huggingface(path, split="train")
+   xds = from_huggingface(path, split="train", trust_remote_code=True)
    xds = xds.rechunk(2)
    xds.execute()
    ds = xds.fetch()
@@ -158,7 +158,7 @@ def test_export(setup):
    tmp_dir = Path(tempfile.gettempdir())
    export_dir = tmp_dir.joinpath("test_export")
    shutil.rmtree(export_dir, ignore_errors=True)
-   db = from_huggingface("cifar10", split="train")
+   db = from_huggingface("cifar10", split="train", trust_remote_code=True)
    # Test invalid export dir
    Path(export_dir).touch()
    with pytest.raises(Exception, match="dir"):
@@ -184,7 +184,7 @@ def test_export(setup):
    assert len(data_arrow_files) == data_meta_info["num_files"]
    assert info["num_rows"] == data_meta_info["num_rows"]

-   db = from_huggingface("imdb", split="train")
+   db = from_huggingface("imdb", split="train", trust_remote_code=True)
    db.export(
        export_dir,
        column_groups={"my_text": ["text"], "my_label": ["label"]},
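Note: recent releases of Hugging Face `datasets` default `trust_remote_code` to `False`, so script-based datasets fail to load unless the caller opts in explicitly; passing `trust_remote_code=True` through `from_huggingface` is what these test updates do. A minimal sketch (the exact release at which the default changed is an assumption here, and `datasets` 3.x removed loading-script execution altogether):

    import datasets

    # Explicit opt-in is required for datasets that ship a loading script.
    ds = datasets.load_dataset("imdb", split="train", trust_remote_code=True)
    print(len(ds))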