Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch on batching by default and leave data on host when possible for UMAP #6219

Draft
wants to merge 6 commits into
base: branch-25.02
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 144 additions & 91 deletions python/cuml/cuml/manifold/umap.pyx

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,18 @@
from sklearn.manifold import trustworthiness


# Module-wide pytest markers: pytest applies every mark listed in
# ``pytestmark`` to each test collected from this module.
#
# Both entries silence warnings that announce upcoming changes to default
# values (`data_on_host` and `nnd_n_clusters`, both scheduled for 25.06), so
# tests relying on the current defaults are not affected by them.
# The text after ``ignore:`` is matched against the start of the warning
# message.
pytestmark = [
pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
),
pytest.mark.filterwarnings(
"ignore:The default value of `nnd_n_clusters` "
"will change from 1 to 10 in 25.06."
),
]


@pytest.fixture(scope="module")
def manifold_data():
X, _ = make_swiss_roll(n_samples=100, noise=0.05, random_state=42)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -133,6 +133,10 @@ def test_unsupervised_neighbors(transformer, estimator, classification_data):
pipeline.fit(X_train)


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
def test_umap_with_logistic_regression(classification_data):
X_train, X_test, y_train, y_test = classification_data
# Create pipeline with UMAP for dimensionality reduction and logistic regression
Expand Down
6 changes: 5 additions & 1 deletion python/cuml/cuml/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -214,6 +214,10 @@ def test_mro(model):
###############################################################################


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
@pytest.mark.parametrize("model_name", list(models.keys()))
# ignore random forest float64 warnings
@pytest.mark.filterwarnings("ignore:To use pickling or GPU-based")
Expand Down
18 changes: 17 additions & 1 deletion python/cuml/cuml/tests/test_device_selection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -598,6 +598,10 @@ def test_train_cpu_infer_cpu(test_data):
assert_func(cuml_output, test_data)


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
def test_train_gpu_infer_cpu(test_data):
cuEstimator = test_data["cuEstimator"]

Expand Down Expand Up @@ -635,6 +639,10 @@ def test_train_cpu_infer_gpu(test_data):
assert_func(cuml_output, test_data)


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
def test_train_gpu_infer_gpu(test_data):
cuEstimator = test_data["cuEstimator"]
if cuEstimator is UMAP and IS_ARM:
Expand All @@ -653,6 +661,10 @@ def test_train_gpu_infer_gpu(test_data):
assert_func(cuml_output, test_data)


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
def test_pickle_interop(tmp_path, test_data):
pickle_filepath = tmp_path / "model.pickle"

Expand Down Expand Up @@ -843,6 +855,10 @@ def test_ridge_methods(train_device, infer_device):
assert ref_output - tol <= output <= ref_output + tol


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
@pytest.mark.parametrize("device", ["cpu", "gpu"])
@pytest.mark.skipif(
IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441"
Expand Down
6 changes: 5 additions & 1 deletion python/cuml/cuml/tests/test_input_estimators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -104,6 +104,10 @@ def make_dataset(dtype, nrows, ncols, ninfo):
###############################################################################


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
@pytest.mark.parametrize("model_name", models.keys())
@pytest.mark.parametrize("dtype", test_dtypes_all)
def test_estimators_all_dtypes(model_name, dtype):
Expand Down
6 changes: 5 additions & 1 deletion python/cuml/cuml/tests/test_internals_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,6 +43,10 @@ def on_train_end(self, embeddings):
self.train_event = True


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
@pytest.mark.parametrize("n_components", [2, 4, 8])
def test_internals_api(n_components):
callback = CustomCallback()
Expand Down
6 changes: 5 additions & 1 deletion python/cuml/cuml/tests/test_pickle.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -334,6 +334,10 @@ def assert_model(pickled_model, X_test):
pickle_save_load(tmpdir, create_mod, assert_model)


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("keys", umap_model.keys())
def test_umap_pickle(tmpdir, datatype, keys):
Expand Down
6 changes: 5 additions & 1 deletion python/cuml/cuml/tests/test_public_methods_attributes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -90,6 +90,10 @@
]


@pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
)
@pytest.mark.parametrize("estimator_name", estimators)
def test_UniversalBase_estimators(estimator_name):
# importing dynamically will also implicitly test that cuML
Expand Down
43 changes: 43 additions & 0 deletions python/cuml/cuml/tests/test_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,53 @@
if not IS_ARM:
import umap

# Module-wide pytest markers: pytest applies every mark listed in
# ``pytestmark`` to each test collected from this module.
#
# Both entries silence warnings that announce upcoming changes to default
# values (`data_on_host` and `nnd_n_clusters`, both scheduled for 25.06).
# The text after ``ignore:`` is matched against the start of the warning
# message.
# NOTE(review): tests that assert on these warnings with ``pytest.warns``
# are expected to still capture them, since ``pytest.warns`` records all
# warnings inside its block - confirm against the pytest docs.
pytestmark = [
pytest.mark.filterwarnings(
"ignore:The default value of `data_on_host` "
'will change from False to "auto" in 25.06'
),
pytest.mark.filterwarnings(
"ignore:The default value of `nnd_n_clusters` "
"will change from 1 to 10 in 25.06."
),
]

dataset_names = ["iris", "digits", "wine", "blobs"]


def test_new_data_on_host_default():
    """Check the deprecation cycle for the default value of ``data_on_host``.

    Calling ``fit`` or ``fit_transform`` without ``data_on_host`` must emit a
    ``FutureWarning`` announcing the upcoming default. Passing the argument
    explicitly (``True`` or ``False``) must not emit that warning.
    """
    # Imported locally so the test does not depend on the module's import
    # block already providing these standard-library modules.
    import re
    import warnings

    # Escaped because both ``pytest.warns(match=...)`` and
    # ``warnings.filterwarnings(message=...)`` treat the text as a regular
    # expression, and the message contains a literal ``.``.
    future_default_msg = re.escape(
        "The default value of `data_on_host` "
        'will change from False to "auto" in 25.06'
    )

    data, _ = make_blobs(
        # Make the data big enough so that we can have it on the host
        n_samples=50_000 + 1,
        n_features=10,
        centers=5,
        random_state=0,
    )
    u = cuUMAP()

    # Each entry point gets its own ``pytest.warns`` block: with a shared
    # block the test would pass as soon as only one of the two calls warned.
    with pytest.warns(FutureWarning, match=future_default_msg):
        u.fit(data)
    with pytest.warns(FutureWarning, match=future_default_msg):
        u.fit_transform(data)

    # No warnings when value is explicitly set. Escalate exactly this
    # warning to an error so that a regression fails the test instead of
    # being silently swallowed by the module-level "ignore" filter.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "error", message=future_default_msg, category=FutureWarning
        )
        u.fit(data, data_on_host=True)
        u.fit_transform(data, data_on_host=True)
        u.fit(data, data_on_host=False)
        u.fit_transform(data, data_on_host=False)

    # TODO: also check that no warning is raised when the data is sparse.
    # The sketch below is disabled because it currently crashes with a CUDA
    # memory error (cause unknown; possibly too many rows):
    #
    #     data = scipy_sparse.csr_matrix(data)
    #     u = cuUMAP()
    #     u.fit_transform(data)


@pytest.mark.parametrize(
"nrows", [unit_param(500), quality_param(5000), stress_param(500000)]
)
Expand Down
Loading