From 424cc384fe0c626e9adfb082d44ae298af248303 Mon Sep 17 00:00:00 2001 From: Donatas Povilaitis Date: Sun, 15 Oct 2023 20:13:22 +0300 Subject: [PATCH] dbstream: fix adjacency matrix building (#1340) * dbstream: fix adj matrix building * Update unreleased.md * Update docs/releases/unreleased.md Co-authored-by: Max Halford --------- Co-authored-by: Max Halford --- docs/releases/unreleased.md | 1 + river/cluster/dbstream.py | 29 +++++++----- river/cluster/test_dbstream.py | 85 ++++++++++++++++++++++------------ 3 files changed, 73 insertions(+), 42 deletions(-) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index bee7de13e4..79f2c12327 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -16,6 +16,7 @@ River's mini-batch methods now support pandas v2. In particular, River conforms - `cluster_is_up_to_date` is set to `True` at the end of the `self._recluster()` function. - Shared density graph update timestamps are initialized with the current timestamp value - `neighbour_neighbours` are appended correctly to the `seed_set` when generating cluster labels + - When building weighted adjacency matrix the algorithm accounts for possibly orphaned entries in shared density graph ## datasets diff --git a/river/cluster/dbstream.py b/river/cluster/dbstream.py index d7b942e486..10966aa284 100644 --- a/river/cluster/dbstream.py +++ b/river/cluster/dbstream.py @@ -287,18 +287,23 @@ def _generate_weighted_adjacency_matrix(self): weighted_adjacency_matrix = {} for i in list(self.s.keys()): for j in list(self.s[i].keys()): - if ( - self._micro_clusters[i].weight >= self.minimum_weight - and self._micro_clusters[j].weight >= self.minimum_weight - ): - value = self.s[i][j] / ( - (self._micro_clusters[i].weight + self._micro_clusters[j].weight) / 2 - ) - if value > self.intersection_factor: - try: - weighted_adjacency_matrix[i][j] = value - except KeyError: - weighted_adjacency_matrix[i] = {j: value} + try: + if ( + self._micro_clusters[i].weight <= self.minimum_weight + or self._micro_clusters[j].weight <= self.minimum_weight + ): + continue + except KeyError: + continue + + value = self.s[i][j] / ( + (self._micro_clusters[i].weight + self._micro_clusters[j].weight) / 2 + ) + if value > self.intersection_factor: + try: + weighted_adjacency_matrix[i][j] = value + except KeyError: + weighted_adjacency_matrix[i] = {j: value} return weighted_adjacency_matrix diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index c49089d415..255c71e7a3 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -5,20 +5,30 @@ from river.cluster import DBSTREAM -@pytest.fixture -def dbstream(): +def build_dbstream(fading_factor=0.001, intersection_factor=0.05): return DBSTREAM( - fading_factor=0.001, clustering_threshold=1, cleanup_interval=1, intersection_factor=0.05 + fading_factor=fading_factor, + clustering_threshold=1, + cleanup_interval=1, + intersection_factor=intersection_factor, ) +def add_cluster(dbstream, initial_point, move_towards, times=1): + dbstream.learn_one(initial_point) + for _ in range(times): + dbstream.learn_one(move_towards) + + def assert_micro_cluster_properties(cluster, center, last_update=None): assert cluster.center == pytest.approx(center) if last_update is not None: assert cluster.last_update == last_update -def test_cluster_formation_and_cleanup(dbstream: DBSTREAM): +def test_cluster_formation_and_cleanup(): + dbstream = build_dbstream() + X = [ {1: 1}, {1: 3}, @@ -44,18 +54,12 @@ def test_cluster_formation_and_cleanup(dbstream: DBSTREAM): assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 17}, last_update=12) -def test_with_two_micro_clusters(dbstream: DBSTREAM): - # First micro-cluster - dbstream.learn_one({1: 1, 2: 1}) - for _ in range(25): - dbstream.learn_one({1: 1.7, 2: 1.7}) - - # Second micro-cluster - dbstream.learn_one({1: 3, 2: 3}) - for _ in range(25): - dbstream.learn_one({1: 2.3, 2: 2.3}) +def test_with_two_micro_clusters(): + dbstream = build_dbstream() - # Points in the middle of two micro-clusters + add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) + add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) + # Points in the middle of first and second micro-clusters for _ in range(5): dbstream.learn_one({1: 2, 2: 2}) @@ -75,26 +79,16 @@ def test_with_two_micro_clusters(dbstream: DBSTREAM): assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.003033, 2: 2.003033}) -def test_density_graph_with_three_micro_clusters(dbstream: DBSTREAM): - # First micro-cluster - dbstream.learn_one({1: 1, 2: 1}) - for _ in range(25): - dbstream.learn_one({1: 1.7, 2: 1.7}) - - # Second micro-cluster - dbstream.learn_one({1: 3, 2: 3}) - for _ in range(25): - dbstream.learn_one({1: 2.3, 2: 2.3}) +def test_density_graph_with_three_micro_clusters(): + dbstream = build_dbstream() + add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) + add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) # Points in the middle of first and second micro-clusters for _ in range(5): dbstream.learn_one({1: 2, 2: 2}) - # Third micro-cluster - dbstream.learn_one({1: 4, 2: 4}) - for _ in range(25): - dbstream.learn_one({1: 3.3, 2: 3.3}) - + add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25) # Points in the middle of second and third micro-clusters for _ in range(4): dbstream.learn_one({1: 3, 2: 3}) @@ -118,3 +112,34 @@ def test_density_graph_with_three_micro_clusters(dbstream: DBSTREAM): dbstream._recluster() assert len(dbstream.clusters) == 1 assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.489894, 2: 2.489894}) + + +def test_density_graph_with_removed_microcluster(): + dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3) + + add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) + add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) + # Points in the middle of first and second micro-clusters + for _ in range(5): + dbstream.learn_one({1: 2, 2: 2}) + + add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25) + # Points in the middle of second and third micro-clusters + for _ in range(4): + dbstream.learn_one({1: 3, 2: 3}) + + assert len(dbstream._micro_clusters) == 2 + assert_micro_cluster_properties( + dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86 + ) + assert_micro_cluster_properties( + dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86 + ) + + assert dbstream.s[0] == pytest.approx({1: 3.615835}) + assert dbstream.s[1] == pytest.approx({2: 2.803583}) + assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}} + + dbstream._recluster() + assert len(dbstream.clusters) == 1 + assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231})