From 3ea1918b2d254c903ed87ec71581e912a75627a8 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Wed, 6 Dec 2023 17:01:52 +0700 Subject: [PATCH 01/11] Modify the distance calculating function in Silhouette coefficient (related to an issue raised in #1468). --- river/metrics/silhouette.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/river/metrics/silhouette.py b/river/metrics/silhouette.py index d3c6d0a27d..3c0bb3d05c 100644 --- a/river/metrics/silhouette.py +++ b/river/metrics/silhouette.py @@ -47,7 +47,7 @@ class Silhouette(metrics.base.ClusteringMetric): ... metric.update(x, y_pred, k_means.centers) >>> metric - Silhouette: 0.568058 + Silhouette: 0.32145 References ---------- @@ -65,18 +65,18 @@ def __init__(self): @staticmethod def _find_distance_second_closest_center(centers, x): - distances = {i: math.sqrt(utils.math.minkowski_distance(centers[i], x, 2)) for i in centers} + distances = {i: utils.math.minkowski_distance(centers[i], x, 2) for i in centers} return sorted(distances.values())[-2] def update(self, x, y_pred, centers, w=1.0): - distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2)) + distance_closest_centroid = utils.math.minkowski_distance(centers[y_pred], x, 2) self._sum_distance_closest_centroid += distance_closest_centroid distance_second_closest_centroid = self._find_distance_second_closest_center(centers, x) self._sum_distance_second_closest_centroid += distance_second_closest_centroid def revert(self, x, y_pred, centers, w=1.0): - distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2)) + distance_closest_centroid = utils.math.minkowski_distance(centers[y_pred], x, 2) self._sum_distance_closest_centroid -= distance_closest_centroid distance_second_closest_centroid = self._find_distance_second_closest_center(centers, x) From 930fe0219a8806d5009fab47340440f185380e94 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Wed, 6 Dec 2023 17:14:56 +0700 
Subject: [PATCH 02/11] Modify the distance calculating function in DenStream (relating to an issue mentioned in #1468). --- river/cluster/denstream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/river/cluster/denstream.py b/river/cluster/denstream.py index 4d060795de..5100b30e2b 100644 --- a/river/cluster/denstream.py +++ b/river/cluster/denstream.py @@ -120,16 +120,16 @@ class DenStream(base.Clusterer): ... denstream.learn_one(x) >>> denstream.predict_one({0: -1, 1: -2}) - 0 + 1 >>> denstream.predict_one({0: 5, 1: 4}) - 1 + 2 >>> denstream.predict_one({0: 1, 1: 1}) 0 >>> denstream.n_clusters - 2 + 3 """ @@ -183,7 +183,7 @@ def centers(self): @staticmethod def _distance(point_a, point_b): - return math.sqrt(utils.math.minkowski_distance(point_a, point_b, 2)) + return utils.math.minkowski_distance(point_a, point_b, 2) def _get_closest_cluster_key(self, point, clusters): min_distance = math.inf From e4ebcd04e87aac678591f84818df326c8fbf2480 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Thu, 7 Dec 2023 00:12:55 +0700 Subject: [PATCH 03/11] Change the inserted value of the shared density when KeyError occurs to 1 instead of 0. 
--- river/cluster/dbstream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/cluster/dbstream.py b/river/cluster/dbstream.py index e098f0de2e..ce58515578 100644 --- a/river/cluster/dbstream.py +++ b/river/cluster/dbstream.py @@ -227,10 +227,10 @@ def _update(self, x): self.s_t[i][j] = self._time_stamp except KeyError: try: - self.s[i][j] = 0 + self.s[i][j] = 1 self.s_t[i][j] = self._time_stamp except KeyError: - self.s[i] = {j: 0} + self.s[i] = {j: 1} self.s_t[i] = {j: self._time_stamp} # prevent collapsing clusters From c4a86761c832cee03f0996437ce854cb3e6ebbc2 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Sat, 9 Dec 2023 01:55:58 +0700 Subject: [PATCH 04/11] Removing entries within shared density and associated timestamp when removing a micro-cluster (related to an inquiry from #1468) --- river/cluster/dbstream.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/river/cluster/dbstream.py b/river/cluster/dbstream.py index ce58515578..0b4408271f 100644 --- a/river/cluster/dbstream.py +++ b/river/cluster/dbstream.py @@ -266,6 +266,15 @@ def _cleanup(self): if micro_cluster_i.weight * value < weight_weak: micro_clusters.pop(i) + self.s.pop(i, None) + self.s_t.pop(i, None) + # Since self.s and self.s_t always have the same keys and are arranged in ascending orders + for j in self.s: + if j < i: + self.s[j].pop(i, None) + self.s_t[j].pop(i, None) + else: + break # Update microclusters self._micro_clusters = micro_clusters From 80757e3924f94e1249881bf82dd4222cf7001dba Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Tue, 12 Dec 2023 16:23:58 +0700 Subject: [PATCH 05/11] Modify the first test within test_dbstream.py file --- river/cluster/test_dbstream.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index 255c71e7a3..a28c589399 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -5,7 
+5,7 @@ from river.cluster import DBSTREAM -def build_dbstream(fading_factor=0.001, intersection_factor=0.05): +def build_dbstream(fading_factor=0.01, intersection_factor=0.05): return DBSTREAM( fading_factor=fading_factor, clustering_threshold=1, @@ -31,27 +31,44 @@ def test_cluster_formation_and_cleanup(): X = [ {1: 1}, + {1: 2}, {1: 3}, {1: 3}, {1: 3}, {1: 5}, {1: 7}, {1: 9}, + {1: 10}, {1: 11}, {1: 11}, + {1: 12}, {1: 13}, {1: 11}, {1: 15}, + {1: 15}, + {1: 16}, + {1: 17}, + {1: 17}, {1: 17}, ] for x in X: dbstream.learn_one(x) - assert len(dbstream._micro_clusters) == 3 - assert_micro_cluster_properties(dbstream.micro_clusters[1], center={1: 3}, last_update=3) - assert_micro_cluster_properties(dbstream.micro_clusters[5], center={1: 11}, last_update=10) - assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 17}, last_update=12) + assert len(dbstream._micro_clusters) == 4 + assert_micro_cluster_properties(dbstream.micro_clusters[2], center={1: 3}, last_update=4) + assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 11}, last_update=13) + assert_micro_cluster_properties(dbstream.micro_clusters[8], center={1: 15}, last_update=15) + assert_micro_cluster_properties(dbstream.micro_clusters[10], center={1: 17}, last_update=19) + + assert dbstream.predict_one({1: 2.0}) == 0 + assert dbstream.predict_one({1: 13.0}) == 1 + assert dbstream.predict_one({1: 13 + 1e-10}) == 2 + assert dbstream.predict_one({1: 16 - 1e-10}) == 2 + assert dbstream.predict_one({1: 18}) == 3 + + assert len(dbstream._clusters) == 4 + assert dbstream.s == dbstream.s_t == {} def test_with_two_micro_clusters(): From 0a5a1c0b71078ebb10247ebc267a1098b87d72ed Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Tue, 12 Dec 2023 17:34:09 +0700 Subject: [PATCH 06/11] Update the second test within test_dbstream.py file --- river/cluster/test_dbstream.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/river/cluster/test_dbstream.py 
b/river/cluster/test_dbstream.py index a28c589399..1359cc7745 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -76,24 +76,21 @@ def test_with_two_micro_clusters(): add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) - # Points in the middle of first and second micro-clusters - for _ in range(5): - dbstream.learn_one({1: 2, 2: 2}) - assert len(dbstream._micro_clusters) == 2 + assert len(dbstream.micro_clusters) == 2 assert_micro_cluster_properties( - dbstream.micro_clusters[0], center={1: 1.597322, 2: 1.597322}, last_update=56 + dbstream.micro_clusters[0], center={1: 2.137623, 2: 2.137623}, last_update=51 ) assert_micro_cluster_properties( - dbstream.micro_clusters[1], center={1: 2.402677, 2: 2.402677}, last_update=56 + dbstream.micro_clusters[1], center={1: 2.914910, 2: 2.914910}, last_update=51 ) - assert dbstream.s == {0: {1: 3.995844478090532}} - assert dbstream.s_t == {0: {1: 56}} + assert dbstream.s == {0: {1: 23.033438964246173}} + assert dbstream.s_t == {0: {1: 51}} dbstream._recluster() assert len(dbstream.clusters) == 1 - assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.003033, 2: 2.003033}) + assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.415239, 2: 2.415239}) def test_density_graph_with_three_micro_clusters(): From 2fd8bc4e44e1c59800e1fe83f9a88811bab4bd6c Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Thu, 14 Dec 2023 16:13:20 +0700 Subject: [PATCH 07/11] Add test for DBSTREAM with synthetic data --- river/cluster/test_dbstream.py | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index 1359cc7745..9d780b7d2b 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -157,3 +157,48 @@ def 
test_density_graph_with_removed_microcluster(): dbstream._recluster() assert len(dbstream.clusters) == 1 assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231}) + + +def test_dbstream_synthetic_sklearn(): + centers = [(-10, -10), (-5, -5), (0, 0), (5, 5), (10, 10)] + cluster_std = [0.6] * 5 + + # Create a dataset with 15000 data points with 5 centers and cluster SD of 0.6 each + X, y = make_blobs(n_samples=15_000, + cluster_std=cluster_std, + centers=centers, + n_features=2, + random_state=42) + + dbstream = DBSTREAM( + clustering_threshold=2, + fading_factor=0.05, + intersection_factor=0.1, + cleanup_interval=1.0, + minimum_weight=1.0, + ) + + # Use VBeta as the metric to investigate the performance of DBSTREAM + v_beta = metrics.VBeta(beta=1.0) + + for x, y_true in stream.iter_array(X, y): + dbstream.learn_one(x) + y_pred = dbstream.predict_one(x) + v_beta.update(y_true, y_pred) + + assert len(dbstream._micro_clusters) == 12 + assert round(v_beta.get(), 4) == 0.9816 + + assert dbstream.s.keys() == dbstream.s_t.keys() + + dbstream._recluster() + + # Check that the resulted cluster centers are close to the expected centers + dbstream_expected_centers = {0: {0: 10, 1: 10}, + 1: {0: -5, 1: -5}, + 2: {0: 0, 1: 0}, + 3: {0: 5, 1: 5}, + 4: {0: -10, 1: -10}} + + for i in dbstream.centers.keys(): + assert utils.math.minkowski_distance(dbstream.centers[i], dbstream_expected_centers[i], 2) < 0.2 From 3b59cdeab52957def8a395907e19a32cd1f860a6 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Thu, 14 Dec 2023 17:32:36 +0700 Subject: [PATCH 08/11] Update test_density_graph_with_removed_microcluster function in DBSTREAM testing --- river/cluster/test_dbstream.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index 9d780b7d2b..caf5a5724d 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -129,7 +129,8 @@ def 
test_density_graph_with_three_micro_clusters(): def test_density_graph_with_removed_microcluster(): - dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3) + dbstream = build_dbstream(fading_factor=0.1, + intersection_factor=0.3) add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) @@ -137,26 +138,28 @@ def test_density_graph_with_removed_microcluster(): for _ in range(5): dbstream.learn_one({1: 2, 2: 2}) - add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25) + add_cluster(dbstream, initial_point={1: 3.5, 2: 3.5}, move_towards={1: 2.9, 2: 2.9}, times=25) + # Points in the middle of second and third micro-clusters for _ in range(4): - dbstream.learn_one({1: 3, 2: 3}) + dbstream.learn_one({1: 2.6, 2: 2.6}) assert len(dbstream._micro_clusters) == 2 assert_micro_cluster_properties( - dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86 + dbstream.micro_clusters[0], center={1: 2.023498, 2: 2.023498}, last_update=86 ) assert_micro_cluster_properties( - dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86 + dbstream.micro_clusters[1], center={1: 2.766543, 2: 2.766543}, last_update=86 ) - assert dbstream.s[0] == pytest.approx({1: 3.615835}) - assert dbstream.s[1] == pytest.approx({2: 2.803583}) - assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}} + assert dbstream.s == {0: {1: 4.702391097045977}} + assert dbstream.s_t == {0: {1: 86}} dbstream._recluster() assert len(dbstream.clusters) == 1 - assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231}) + assert_micro_cluster_properties( + dbstream.clusters[0], center={1: 2.560647, 2: 2.560647} + ) def test_dbstream_synthetic_sklearn(): From ad436f46d5924a7b302231428d87706cc7070f0b Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Thu, 14 Dec 2023 17:51:26 +0700 Subject: 
[PATCH 09/11] Modify test_density_graph_with_three_micro_clusters in DBSTREAM test file --- river/cluster/test_dbstream.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index caf5a5724d..795aa4d86f 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -2,7 +2,10 @@ import pytest +from river import stream, utils, metrics + from river.cluster import DBSTREAM +from sklearn.datasets import make_blobs def build_dbstream(fading_factor=0.01, intersection_factor=0.05): @@ -102,30 +105,33 @@ def test_density_graph_with_three_micro_clusters(): for _ in range(5): dbstream.learn_one({1: 2, 2: 2}) + assert dbstream.s == {0: {1: 23.033438964246173}} + assert dbstream.s_t == {0: {1: 51}} + add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25) # Points in the middle of second and third micro-clusters for _ in range(4): dbstream.learn_one({1: 3, 2: 3}) assert len(dbstream._micro_clusters) == 3 - assert_micro_cluster_properties( - dbstream.micro_clusters[0], center={1: 1.597322, 2: 1.597322}, last_update=56 + dbstream.micro_clusters[0], center={1: 2.0, 2: 2.0}, last_update=56 ) assert_micro_cluster_properties( - dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86 + dbstream.micro_clusters[1], center={1: 3.0, 2: 3.0}, last_update=86 ) assert_micro_cluster_properties( - dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86 + dbstream.micro_clusters[2], center={1: 3.982141, 2: 3.982141}, last_update=82 ) - assert dbstream.s[0] == pytest.approx({1: 3.995844}) - assert dbstream.s[1] == pytest.approx({2: 2.997921}) - assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}} + assert dbstream.s[0] == pytest.approx({1: 23.033439}) + assert dbstream.s[1] == pytest.approx({2: 23.033439}) + assert dbstream.s_t == {0: {1: 51}, 1: {2: 82}} dbstream._recluster() assert 
len(dbstream.clusters) == 1 - assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.489894, 2: 2.489894}) + print(dbstream.clusters[0].center) + assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.800788, 2: 2.800788}) def test_density_graph_with_removed_microcluster(): From 7233a750b50604f72f9d69e75904c0142ddb048a Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo Date: Thu, 14 Dec 2023 17:58:43 +0700 Subject: [PATCH 10/11] Update test_dbstream.py file after pre commit checks. --- river/cluster/test_dbstream.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index 795aa4d86f..6b5e3776f4 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -1,11 +1,10 @@ from __future__ import annotations import pytest +from sklearn.datasets import make_blobs -from river import stream, utils, metrics - +from river import metrics, stream, utils from river.cluster import DBSTREAM -from sklearn.datasets import make_blobs def build_dbstream(fading_factor=0.01, intersection_factor=0.05): From e16ce2b562c51ca27e6cbfab4e6b36fa3068de48 Mon Sep 17 00:00:00 2001 From: Matti Bispham <149883863+mbispham@users.noreply.github.com> Date: Tue, 2 Jan 2024 02:26:51 +0900 Subject: [PATCH 11/11] Add license to riverml.xyz (#1480) --- docs/.pages | 1 + docs/license/.pages | 2 ++ docs/license/license.md | 3 +++ 3 files changed, 6 insertions(+) create mode 100644 docs/license/.pages create mode 100644 docs/license/license.md diff --git a/docs/.pages b/docs/.pages index 9f7b7e5360..c72bb6ee96 100644 --- a/docs/.pages +++ b/docs/.pages @@ -6,3 +6,4 @@ nav: - faq - releases - benchmarks + - license diff --git a/docs/license/.pages b/docs/license/.pages new file mode 100644 index 0000000000..319cebe79a --- /dev/null +++ b/docs/license/.pages @@ -0,0 +1,2 @@ +title: License 📝 + diff --git a/docs/license/license.md b/docs/license/license.md new file mode 100644 index 
0000000000..87980bb6fe --- /dev/null +++ b/docs/license/license.md @@ -0,0 +1,3 @@ +# License + +River is free and open-source software licensed under the [3-clause BSD license](https://github.com/online-ml/river/blob/main/LICENSE). \ No newline at end of file