diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
index 31c69c7ce2..9a6f0ba2d3 100644
--- a/docs/releases/unreleased.md
+++ b/docs/releases/unreleased.md
@@ -8,6 +8,10 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
 - Made `score_one` method of `anomaly.LocalOutlierFactor` stateless
 - Defined default score for uninitialized detector
 
+## covariance
+
+- Added `_from_state` method to `covariance.EmpiricalCovariance` to warm start from previous knowledge.
+
 ## clustering
 
 - Add fixes to `cluster.DBSTREAM` algorithm, including:
@@ -22,13 +26,13 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
 
 - Added `datasets.WebTraffic`, which is a dataset that counts the occurrences of events on a website. It is a multi-output regression dataset with two outputs.
 
-## forest
+## drift
 
-- Simplify inner the structures of `forest.ARFClassifier` and `forest.ARFRegressor` by removing redundant class hierarchy. Simplify how concept drift logging can be accessed in individual trees and in the forest as a whole.
+- Add `drift.NoDrift` to allow disabling the drift detection capabilities of models. This detector does nothing and always returns `False` when asked whether a concept drift was detected.
 
-## covariance
+## forest
 
-- Added `_from_state` method to `covariance.EmpiricalCovariance` to warm start from previous knowledge.
+- Simplify the inner structures of `forest.ARFClassifier` and `forest.ARFRegressor` by removing a redundant class hierarchy. Simplify how concept drift logging can be accessed in individual trees and in the forest as a whole.
 
 ## proba
 
@@ -37,6 +41,7 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
 ## tree
 
 - Fix a bug in `tree.splitter.NominalSplitterClassif` that generated a mismatch between the number of existing tree branches and the number of tracked branches.
+- Fix a bug in `tree.ExtremelyFastDecisionTreeClassifier` where the split re-evaluation failed when the current branch's feature was not available as a split option. The fix also enables the tree to pre-prune a leaf via the tie-breaking mechanism.
 
 ## utils
 
diff --git a/river/drift/__init__.py b/river/drift/__init__.py
index 70ba3746c1..b434cd7a5b 100644
--- a/river/drift/__init__.py
+++ b/river/drift/__init__.py
@@ -12,6 +12,7 @@
 from .adwin import ADWIN
 from .dummy import DummyDriftDetector
 from .kswin import KSWIN
+from .no_drift import NoDrift
 from .page_hinkley import PageHinkley
 from .retrain import DriftRetrainingClassifier
 
@@ -22,6 +23,7 @@
     "DriftRetrainingClassifier",
     "DummyDriftDetector",
     "KSWIN",
+    "NoDrift",
     "PageHinkley",
     "PeriodicTrigger",
 ]
diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
new file mode 100644
index 0000000000..c5173aca04
--- /dev/null
+++ b/river/drift/no_drift.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from river import base
+from river.base.drift_detector import DriftDetector
+
+
+class NoDrift(base.DriftDetector):
+    """Dummy class used to turn off concept drift detection capabilities of adaptive models.
+
+    It always signals that no concept drift was detected.
+
+    Examples
+    --------
+
+    >>> from river import drift
+    >>> from river import evaluate
+    >>> from river import forest
+    >>> from river import metrics
+    >>> from river.datasets import synth
+
+    >>> dataset = synth.ConceptDriftStream(
+    ...     seed=8,
+    ...     position=500,
+    ...     width=40,
+    ... ).take(700)
+
+    We can turn off the warning detection capabilities of Adaptive Random Forest (ARF) or
+    other similar models. Thus, the base models will reset immediately after identifying a drift,
+    bypassing the background model building phase:
+
+    >>> adaptive_model = forest.ARFClassifier(
+    ...     leaf_prediction="mc",
+    ...     warning_detector=drift.NoDrift(),
+    ...     seed=8
+    ... )
+
+    We can also turn off the concept drift handling capabilities completely:
+
+    >>> stationary_model = forest.ARFClassifier(
+    ...     leaf_prediction="mc",
+    ...     warning_detector=drift.NoDrift(),
+    ...     drift_detector=drift.NoDrift(),
+    ...     seed=8
+    ... )
+
+    Let's put that to the test:
+
+    >>> for x, y in dataset:
+    ...     adaptive_model = adaptive_model.learn_one(x, y)
+    ...     stationary_model = stationary_model.learn_one(x, y)
+
+    The adaptive model:
+
+    >>> adaptive_model.n_drifts_detected()
+    2
+
+    >>> adaptive_model.n_warnings_detected()
+    0
+
+    The stationary one:
+
+    >>> stationary_model.n_drifts_detected()
+    0
+
+    >>> stationary_model.n_warnings_detected()
+    0
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def update(self, x: int | float) -> DriftDetector:
+        return self
+
+    @property
+    def drift_detected(self):
+        return False
diff --git a/river/forest/adaptive_random_forest.py b/river/forest/adaptive_random_forest.py
index 8e7a9c6645..c96d83e231 100644
--- a/river/forest/adaptive_random_forest.py
+++ b/river/forest/adaptive_random_forest.py
@@ -10,7 +10,7 @@
 import numpy as np
 
 from river import base, metrics, stats
-from river.drift import ADWIN
+from river.drift import ADWIN, NoDrift
 from river.tree.hoeffding_tree_classifier import HoeffdingTreeClassifier
 from river.tree.hoeffding_tree_regressor import HoeffdingTreeRegressor
 from river.tree.nodes.arf_htc_nodes import (
@@ -32,8 +32,8 @@ def __init__(
         n_models: int,
         max_features: bool | str | int,
         lambda_value: int,
-        drift_detector: base.DriftDetector | None,
-        warning_detector: base.DriftDetector | None,
+        drift_detector: base.DriftDetector,
+        warning_detector: base.DriftDetector,
         metric: metrics.base.MultiClassMetric | metrics.base.RegressionMetric,
         disable_weighted_vote,
         seed,
@@ -50,20 +50,21 @@ def __init__(
 
         self._rng = random.Random(self.seed)
 
-        self._warning_detectors: list[base.DriftDetector] = (
-            None  # type: ignore
-            if self.warning_detector is None
-            else [self.warning_detector.clone() for _ in range(self.n_models)]
-        )
-        self._drift_detectors: list[base.DriftDetector] = (
-            None  # type: ignore
-            if self.drift_detector is None
-            else [self.drift_detector.clone() for _ in range(self.n_models)]
-        )
+        self._warning_detectors: list[base.DriftDetector]
+        self._warning_detection_disabled = True
+        if not isinstance(self.warning_detector, NoDrift):
+            self._warning_detectors = [self.warning_detector.clone() for _ in range(self.n_models)]
+            self._warning_detection_disabled = False
+
+        self._drift_detectors: list[base.DriftDetector]
+        self._drift_detection_disabled = True
+        if not isinstance(self.drift_detector, NoDrift):
+            self._drift_detectors = [self.drift_detector.clone() for _ in range(self.n_models)]
+            self._drift_detection_disabled = False
 
         # The background models
         self._background: list[BaseTreeClassifier | BaseTreeRegressor | None] = (
-            None if self.warning_detector is None else [None] * self.n_models  # type: ignore
+            None if self._warning_detection_disabled else [None] * self.n_models  # type: ignore
         )
 
         # Performance metrics used for weighted voting/aggregation
@@ -71,10 +72,10 @@ def __init__(
 
         # Drift and warning logging
         self._warning_tracker: dict = (
-            collections.defaultdict(int) if self.warning_detector is not None else None  # type: ignore
+            collections.defaultdict(int) if not self._warning_detection_disabled else None  # type: ignore
         )
         self._drift_tracker: dict = (
-            collections.defaultdict(int) if self.drift_detector is not None else None  # type: ignore
+            collections.defaultdict(int) if not self._drift_detection_disabled else None  # type: ignore
         )
 
     @property
@@ -101,12 +102,10 @@ def _drift_detector_input(
     def _new_base_model(self) -> BaseTreeClassifier | BaseTreeRegressor:
         raise NotImplementedError
 
-    def n_warnings_detected(self, tree_id: int | None = None) -> int | None:
+    def n_warnings_detected(self, tree_id: int | None = None) -> int:
         """Get the total number of concept drift warnings detected, or the number on an
         individual tree basis (optionally).
 
-        If warning detection is disabled, will return `None`.
-
         Parameters
         ----------
         tree_id
@@ -119,20 +118,18 @@ def n_warnings_detected(self, tree_id: int | None = None) -> int:
 
         """
 
-        if self.warning_detector is None:
-            return None
+        if self._warning_detection_disabled:
+            return 0
 
         if tree_id is None:
             return sum(self._warning_tracker.values())
 
         return self._warning_tracker[tree_id]
 
-    def n_drifts_detected(self, tree_id: int | None = None) -> int | None:
+    def n_drifts_detected(self, tree_id: int | None = None) -> int:
         """Get the total number of concept drifts detected, or such number on an individual
         tree basis (optionally).
 
-        If drift detection is disabled, will return `None`.
-
         Parameters
         ----------
         tree_id
@@ -145,8 +142,8 @@ def n_drifts_detected(self, tree_id: int | None = None) -> int:
 
         """
 
-        if self.drift_detector is None:
-            return None
+        if self._drift_detection_disabled:
+            return 0
 
         if tree_id is None:
             return sum(self._drift_tracker.values())
@@ -171,13 +168,13 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
             k = poisson(rate=self.lambda_value, rng=self._rng)
             if k > 0:
-                if self.warning_detector is not None and self._background[i] is not None:
+                if not self._warning_detection_disabled and self._background[i] is not None:
                     self._background[i].learn_one(x=x, y=y, sample_weight=k)  # type: ignore
 
                 model.learn_one(x=x, y=y, sample_weight=k)
 
                 drift_input = None
-                if self.drift_detector is not None and self.warning_detector is not None:
+                if not self._warning_detection_disabled:
                     drift_input = self._drift_detector_input(i, y, y_pred)
                     self._warning_detectors[i].update(drift_input)
 
@@ -189,7 +186,7 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
                         # Update warning tracker
                         self._warning_tracker[i] += 1
 
-                if self.drift_detector is not None:
+                if not self._drift_detection_disabled:
                     drift_input = (
                         drift_input
                         if drift_input is not None
@@ -198,7 +195,7 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
                     self._drift_detectors[i].update(drift_input)
 
                     if self._drift_detectors[i].drift_detected:
-                        if self.warning_detector is not None and self._background[i] is not None:
+                        if not self._warning_detection_disabled and self._background[i] is not None:
                             self.data[i] = self._background[i]
                             self._background[i] = None
                             self._warning_detectors[i] = self.warning_detector.clone()
diff --git a/river/tree/extremely_fast_decision_tree.py b/river/tree/extremely_fast_decision_tree.py
index 944505a65b..9b948e356b 100755
--- a/river/tree/extremely_fast_decision_tree.py
+++ b/river/tree/extremely_fast_decision_tree.py
@@ -451,59 +451,67 @@ def _reevaluate_best_split(self, node, parent, branch_index, **kwargs):
 
                     # Manage memory
                     self._enforce_size_limit()
-
-                elif (
-                    x_best.merit - x_current.merit > hoeffding_bound or hoeffding_bound < self.tau
-                ) and (id_current != id_best):
-                    # Create a new branch
-                    branch = self._branch_selector(x_best.numerical_feature, x_best.multiway_split)
-                    leaves = tuple(
-                        self._new_leaf(initial_stats, parent=node)
-                        for initial_stats in x_best.children_stats
-                    )
-
-                    new_split = x_best.assemble(branch, node.stats, node.depth, *leaves, **kwargs)
-                    # Update weights in new_split
-                    new_split.last_split_reevaluation_at = node.total_weight
-
-                    n_active = n_inactive = 0
-                    for leaf in node.iter_leaves():
-                        if leaf.is_active():
-                            n_active += 1
+                elif x_current is not None:
+                    if (
+                        x_best.merit - x_current.merit > hoeffding_bound
+                        or hoeffding_bound < self.tau
+                    ) and (id_current != id_best):
+                        # Create a new branch
+                        branch = self._branch_selector(
+                            x_best.numerical_feature, x_best.multiway_split
+                        )
+                        leaves = tuple(
+                            self._new_leaf(initial_stats, parent=node)
+                            for initial_stats in x_best.children_stats
+                        )
+
+                        new_split = x_best.assemble(
+                            branch, node.stats, node.depth, *leaves, **kwargs
+                        )
+                        # Update weights in new_split
+                        new_split.last_split_reevaluation_at = node.total_weight
+
+                        n_active = n_inactive = 0
+                        for leaf in node.iter_leaves():
+                            if leaf.is_active():
+                                n_active += 1
+                            else:
-                        else:
-                            n_inactive += 1
-
-                    self._n_active_leaves -= n_active
-                    self._n_inactive_leaves -= n_inactive
-                    self._n_active_leaves += len(leaves)
-
-                    if parent is None:
-                        # Root case : replace the root node by a new split node
-                        self._root = new_split
-                    else:
-                        parent.children[branch_index] = new_split
-
-                    stop_flag = True
-
-                    # Manage memory
-                    self._enforce_size_limit()
-
-                elif (
-                    x_best.merit - x_current.merit > hoeffding_bound or hoeffding_bound < self.tau
-                ) and (id_current == id_best):
-                    branch = self._branch_selector(x_best.numerical_feature, x_best.multiway_split)
-                    # Change the branch but keep the existing children nodes
-                    new_split = x_best.assemble(
-                        branch, node.stats, node.depth, *tuple(node.children), **kwargs
-                    )
-                    # Update weights in new_split
-                    new_split.last_split_reevaluation_at = node.total_weight
-
-                    if parent is None:
-                        # Root case : replace the root node by a new split node
-                        self._root = new_split
-                    else:
-                        parent.children[branch_index] = new_split
+                                n_inactive += 1
+
+                        self._n_active_leaves -= n_active
+                        self._n_inactive_leaves -= n_inactive
+                        self._n_active_leaves += len(leaves)
+
+                        if parent is None:
+                            # Root case : replace the root node by a new split node
+                            self._root = new_split
+                        else:
+                            parent.children[branch_index] = new_split
+
+                        stop_flag = True
+
+                        # Manage memory
+                        self._enforce_size_limit()
+
+                    elif (
+                        x_best.merit - x_current.merit > hoeffding_bound
+                        or hoeffding_bound < self.tau
+                    ) and (id_current == id_best):
+                        branch = self._branch_selector(
+                            x_best.numerical_feature, x_best.multiway_split
+                        )
+                        # Change the branch but keep the existing children nodes
+                        new_split = x_best.assemble(
+                            branch, node.stats, node.depth, *tuple(node.children), **kwargs
+                        )
+                        # Update weights in new_split
+                        new_split.last_split_reevaluation_at = node.total_weight
+
+                        if parent is None:
+                            # Root case : replace the root node by a new split node
+                            self._root = new_split
+                        else:
+                            parent.children[branch_index] = new_split
 
         return stop_flag
 
@@ -551,7 +559,12 @@ def _attempt_to_split(self, node, parent, branch_index, **kwargs):
                 node.total_weight,
             )
 
-            if x_best.merit - x_null.merit > hoeffding_bound or hoeffding_bound < self.tau:
+            if x_best.feature is None:
+                # Pre-pruning - null wins
+                node.deactivate()
+                self._n_inactive_leaves += 1
+                self._n_active_leaves -= 1
+            elif x_best.merit - x_null.merit > hoeffding_bound or hoeffding_bound < self.tau:
                 # Create a new branch
                 branch = self._branch_selector(x_best.numerical_feature, x_best.multiway_split)
                 leaves = tuple(
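
A minimal usage sketch (not part of the patch) of the contract the new detector honors, based only on the `NoDrift` implementation added above:

    from river import drift

    detector = drift.NoDrift()
    for x in (0.0, 1.0, 42.0):
        detector = detector.update(x)  # update is a no-op that returns the detector itself
        assert detector.drift_detected is False  # always False: detection is disabled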
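
The detector-disabling logic added to `BaseForest.__init__` reduces to a sentinel check on the detector's type. The following sketch illustrates that pattern; `MyEnsemble` and its constructor arguments are hypothetical stand-ins, not part of the patch:

    from river import base, drift

    class MyEnsemble:
        def __init__(self, detector: base.DriftDetector, n_models: int):
            # NoDrift acts as a sentinel: when it is passed in, no per-member
            # detectors are created and every detection branch is skipped.
            self._detection_disabled = isinstance(detector, drift.NoDrift)
            if not self._detection_disabled:
                self._detectors = [detector.clone() for _ in range(n_models)]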