Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/online-ml/river
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxHalford committed Jan 20, 2024
2 parents 790b3e6 + e16ce2b commit 451a815
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 41 deletions.
1 change: 1 addition & 0 deletions docs/.pages
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ nav:
- faq
- releases
- benchmarks
- license
2 changes: 2 additions & 0 deletions docs/license/.pages
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
title: License 📝

3 changes: 3 additions & 0 deletions docs/license/license.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# License

River is free and open-source software licensed under the [3-clause BSD license](https://github.com/online-ml/river/blob/main/LICENSE).
13 changes: 11 additions & 2 deletions river/cluster/dbstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,10 @@ def _update(self, x):
self.s_t[i][j] = self._time_stamp
except KeyError:
try:
self.s[i][j] = 0
self.s[i][j] = 1
self.s_t[i][j] = self._time_stamp
except KeyError:
self.s[i] = {j: 0}
self.s[i] = {j: 1}
self.s_t[i] = {j: self._time_stamp}

# prevent collapsing clusters
Expand Down Expand Up @@ -266,6 +266,15 @@ def _cleanup(self):

if micro_cluster_i.weight * value < weight_weak:
micro_clusters.pop(i)
self.s.pop(i, None)
self.s_t.pop(i, None)
# Since self.s and self.s_t always have the same keys and are arranged in ascending orders
for j in self.s:
if j < i:
self.s[j].pop(i, None)
self.s_t[j].pop(i, None)
else:
break

# Update microclusters
self._micro_clusters = micro_clusters
Expand Down
8 changes: 4 additions & 4 deletions river/cluster/denstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,16 +120,16 @@ class DenStream(base.Clusterer):
... denstream.learn_one(x)
>>> denstream.predict_one({0: -1, 1: -2})
0
1
>>> denstream.predict_one({0: 5, 1: 4})
1
2
>>> denstream.predict_one({0: 1, 1: 1})
0
>>> denstream.n_clusters
2
3
"""

Expand Down Expand Up @@ -183,7 +183,7 @@ def centers(self):

@staticmethod
def _distance(point_a, point_b):
return math.sqrt(utils.math.minkowski_distance(point_a, point_b, 2))
return utils.math.minkowski_distance(point_a, point_b, 2)

def _get_closest_cluster_key(self, point, clusters):
min_distance = math.inf
Expand Down
129 changes: 98 additions & 31 deletions river/cluster/test_dbstream.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations

import pytest
from sklearn.datasets import make_blobs

from river import metrics, stream, utils
from river.cluster import DBSTREAM


def build_dbstream(fading_factor=0.001, intersection_factor=0.05):
def build_dbstream(fading_factor=0.01, intersection_factor=0.05):
return DBSTREAM(
fading_factor=fading_factor,
clustering_threshold=1,
Expand All @@ -31,52 +33,66 @@ def test_cluster_formation_and_cleanup():

X = [
{1: 1},
{1: 2},
{1: 3},
{1: 3},
{1: 3},
{1: 5},
{1: 7},
{1: 9},
{1: 10},
{1: 11},
{1: 11},
{1: 12},
{1: 13},
{1: 11},
{1: 15},
{1: 15},
{1: 16},
{1: 17},
{1: 17},
{1: 17},
]

for x in X:
dbstream.learn_one(x)

assert len(dbstream._micro_clusters) == 3
assert_micro_cluster_properties(dbstream.micro_clusters[1], center={1: 3}, last_update=3)
assert_micro_cluster_properties(dbstream.micro_clusters[5], center={1: 11}, last_update=10)
assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 17}, last_update=12)
assert len(dbstream._micro_clusters) == 4
assert_micro_cluster_properties(dbstream.micro_clusters[2], center={1: 3}, last_update=4)
assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 11}, last_update=13)
assert_micro_cluster_properties(dbstream.micro_clusters[8], center={1: 15}, last_update=15)
assert_micro_cluster_properties(dbstream.micro_clusters[10], center={1: 17}, last_update=19)

assert dbstream.predict_one({1: 2.0}) == 0
assert dbstream.predict_one({1: 13.0}) == 1
assert dbstream.predict_one({1: 13 + 1e-10}) == 2
assert dbstream.predict_one({1: 16 - 1e-10}) == 2
assert dbstream.predict_one({1: 18}) == 3

assert len(dbstream._clusters) == 4
assert dbstream.s == dbstream.s_t == {}


def test_with_two_micro_clusters():
dbstream = build_dbstream()

add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
# Points in the middle of first and second micro-clusters
for _ in range(5):
dbstream.learn_one({1: 2, 2: 2})

assert len(dbstream._micro_clusters) == 2
assert len(dbstream.micro_clusters) == 2
assert_micro_cluster_properties(
dbstream.micro_clusters[0], center={1: 1.597322, 2: 1.597322}, last_update=56
dbstream.micro_clusters[0], center={1: 2.137623, 2: 2.137623}, last_update=51
)
assert_micro_cluster_properties(
dbstream.micro_clusters[1], center={1: 2.402677, 2: 2.402677}, last_update=56
dbstream.micro_clusters[1], center={1: 2.914910, 2: 2.914910}, last_update=51
)

assert dbstream.s == {0: {1: 3.995844478090532}}
assert dbstream.s_t == {0: {1: 56}}
assert dbstream.s == {0: {1: 23.033438964246173}}
assert dbstream.s_t == {0: {1: 51}}

dbstream._recluster()
assert len(dbstream.clusters) == 1
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.003033, 2: 2.003033})
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.415239, 2: 2.415239})


def test_density_graph_with_three_micro_clusters():
Expand All @@ -88,58 +104,109 @@ def test_density_graph_with_three_micro_clusters():
for _ in range(5):
dbstream.learn_one({1: 2, 2: 2})

assert dbstream.s == {0: {1: 23.033438964246173}}
assert dbstream.s_t == {0: {1: 51}}

add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25)
# Points in the middle of second and third micro-clusters
for _ in range(4):
dbstream.learn_one({1: 3, 2: 3})

assert len(dbstream._micro_clusters) == 3

assert_micro_cluster_properties(
dbstream.micro_clusters[0], center={1: 1.597322, 2: 1.597322}, last_update=56
dbstream.micro_clusters[0], center={1: 2.0, 2: 2.0}, last_update=56
)
assert_micro_cluster_properties(
dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86
dbstream.micro_clusters[1], center={1: 3.0, 2: 3.0}, last_update=86
)
assert_micro_cluster_properties(
dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86
dbstream.micro_clusters[2], center={1: 3.982141, 2: 3.982141}, last_update=82
)

assert dbstream.s[0] == pytest.approx({1: 3.995844})
assert dbstream.s[1] == pytest.approx({2: 2.997921})
assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}}
assert dbstream.s[0] == pytest.approx({1: 23.033439})
assert dbstream.s[1] == pytest.approx({2: 23.033439})
assert dbstream.s_t == {0: {1: 51}, 1: {2: 82}}

dbstream._recluster()
assert len(dbstream.clusters) == 1
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.489894, 2: 2.489894})
print(dbstream.clusters[0].center)
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.800788, 2: 2.800788})


def test_density_graph_with_removed_microcluster():
dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3)
dbstream = build_dbstream(fading_factor=0.1,
intersection_factor=0.3)

add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
# Points in the middle of first and second micro-clusters
for _ in range(5):
dbstream.learn_one({1: 2, 2: 2})

add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25)
add_cluster(dbstream, initial_point={1: 3.5, 2: 3.5}, move_towards={1: 2.9, 2: 2.9}, times=25)

# Points in the middle of second and third micro-clusters
for _ in range(4):
dbstream.learn_one({1: 3, 2: 3})
dbstream.learn_one({1: 2.6, 2: 2.6})

assert len(dbstream._micro_clusters) == 2
assert_micro_cluster_properties(
dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86
dbstream.micro_clusters[0], center={1: 2.023498, 2: 2.023498}, last_update=86
)
assert_micro_cluster_properties(
dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86
dbstream.micro_clusters[1], center={1: 2.766543, 2: 2.766543}, last_update=86
)

assert dbstream.s[0] == pytest.approx({1: 3.615835})
assert dbstream.s[1] == pytest.approx({2: 2.803583})
assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}}
assert dbstream.s == {0: {1: 4.702391097045977}}
assert dbstream.s_t == {0: {1: 86}}

dbstream._recluster()
assert len(dbstream.clusters) == 1
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231})
assert_micro_cluster_properties(
dbstream.clusters[0], center={1: 2.560647, 2: 2.560647}
)


def test_dbstream_synthetic_sklearn():
centers = [(-10, -10), (-5, -5), (0, 0), (5, 5), (10, 10)]
cluster_std = [0.6] * 5

# Create a dataset with 15000 data points with 5 centers and cluster SD of 0.6 each
X, y = make_blobs(n_samples=15_000,
cluster_std=cluster_std,
centers=centers,
n_features=2,
random_state=42)

dbstream = DBSTREAM(
clustering_threshold=2,
fading_factor=0.05,
intersection_factor=0.1,
cleanup_interval=1.0,
minimum_weight=1.0,
)

# Use VBeta as the metric to investigate the performance of DBSTREAM
v_beta = metrics.VBeta(beta=1.0)

for x, y_true in stream.iter_array(X, y):
dbstream.learn_one(x)
y_pred = dbstream.predict_one(x)
v_beta.update(y_true, y_pred)

assert len(dbstream._micro_clusters) == 12
assert round(v_beta.get(), 4) == 0.9816

assert dbstream.s.keys() == dbstream.s_t.keys()

dbstream._recluster()

# Check that the resulted cluster centers are close to the expected centers
dbstream_expected_centers = {0: {0: 10, 1: 10},
1: {0: -5, 1: -5},
2: {0: 0, 1: 0},
3: {0: 5, 1: 5},
4: {0: -10, 1: -10}}

for i in dbstream.centers.keys():
assert utils.math.minkowski_distance(dbstream.centers[i], dbstream_expected_centers[i], 2) < 0.2
8 changes: 4 additions & 4 deletions river/metrics/silhouette.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class Silhouette(metrics.base.ClusteringMetric):
... metric.update(x, y_pred, k_means.centers)
>>> metric
Silhouette: 0.568058
Silhouette: 0.32145
References
----------
Expand All @@ -65,18 +65,18 @@ def __init__(self):

@staticmethod
def _find_distance_second_closest_center(centers, x):
distances = {i: math.sqrt(utils.math.minkowski_distance(centers[i], x, 2)) for i in centers}
distances = {i: utils.math.minkowski_distance(centers[i], x, 2) for i in centers}
return sorted(distances.values())[-2]

def update(self, x, y_pred, centers, w=1.0):
distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2))
distance_closest_centroid = utils.math.minkowski_distance(centers[y_pred], x, 2)
self._sum_distance_closest_centroid += distance_closest_centroid

distance_second_closest_centroid = self._find_distance_second_closest_center(centers, x)
self._sum_distance_second_closest_centroid += distance_second_closest_centroid

def revert(self, x, y_pred, centers, w=1.0):
distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2))
distance_closest_centroid = utils.math.minkowski_distance(centers[y_pred], x, 2)
self._sum_distance_closest_centroid -= distance_closest_centroid

distance_second_closest_centroid = self._find_distance_second_closest_center(centers, x)
Expand Down

0 comments on commit 451a815

Please sign in to comment.