From c2d1ade256482ce7544ab33e03c30f23f345ed22 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Tue, 23 Apr 2024 18:26:45 -0300 Subject: [PATCH 01/16] add first last with out-of-date docs --- river/tree/__init__.py | 2 + river/tree/last_classifier.py | 391 +++++++++++++++++++++++++++++++++ river/tree/nodes/last_nodes.py | 263 ++++++++++++++++++++++ 3 files changed, 656 insertions(+) create mode 100644 river/tree/last_classifier.py create mode 100644 river/tree/nodes/last_nodes.py diff --git a/river/tree/__init__.py b/river/tree/__init__.py index 5166c553fb..c738f37798 100755 --- a/river/tree/__init__.py +++ b/river/tree/__init__.py @@ -58,6 +58,7 @@ from .hoeffding_tree_classifier import HoeffdingTreeClassifier from .hoeffding_tree_regressor import HoeffdingTreeRegressor from .isoup_tree_regressor import iSOUPTreeRegressor +from .last_classifier import LASTClassifier from .stochastic_gradient_tree import SGTClassifier, SGTRegressor __all__ = [ @@ -69,6 +70,7 @@ "HoeffdingTreeRegressor", "HoeffdingAdaptiveTreeRegressor", "iSOUPTreeRegressor", + "LASTClassifier", "SGTClassifier", "SGTRegressor", ] diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py new file mode 100644 index 0000000000..e2168008b7 --- /dev/null +++ b/river/tree/last_classifier.py @@ -0,0 +1,391 @@ +from __future__ import annotations + +from river import base, drift + +from .hoeffding_tree import HoeffdingTree +from .nodes.branch import DTBranch +from .nodes.last_nodes import LeafMajorityClassWithDetector, LeafNaiveBayesWithDetector, LeafNaiveBayesAdaptiveWithDetector +from .nodes.leaf import HTLeaf +from .split_criterion import GiniSplitCriterion, HellingerDistanceCriterion, InfoGainSplitCriterion +from .splitter import GaussianSplitter, Splitter + + +class LASTClassifier(HoeffdingTree, base.Classifier): + """Local Adaptive Streaming Tree classifier. + + Parameters + ---------- + max_depth + The maximum depth a tree can reach. If `None`, the tree will grow indefinitely. 
+ split_criterion + Split criterion to use.
+ - 'gini' - Gini
+ - 'info_gain' - Information Gain
+ - 'hellinger' - Hellinger Distance
+ leaf_prediction + Prediction mechanism used at leaves.
+ - 'mc' - Majority Class
+ - 'nb' - Naive Bayes
+ - 'nba' - Naive Bayes Adaptive
+ nb_threshold + Number of instances a leaf should observe before allowing Naive Bayes. + nominal_attributes + List of Nominal attributes identifiers. If empty, then assume that all numeric + attributes should be treated as continuous. + splitter + The Splitter or Attribute Observer (AO) used to monitor the class statistics of numeric + features and perform splits. Splitters are available in the `tree.splitter` module. + Different splitters are available for classification and regression tasks. Classification + and regression splitters can be distinguished by their property `is_target_class`. + This is an advanced option. Special care must be taken when choosing different splitters. + By default, `tree.splitter.GaussianSplitter` is used if `splitter` is `None`. + binary_split + If True, only allow binary splits. + min_branch_fraction + The minimum percentage of observed data required for branches resulting from split + candidates. To validate a split candidate, at least two resulting branches must have + a percentage of samples greater than `min_branch_fraction`. This criterion prevents + unnecessary splits when the majority of instances are concentrated in a single branch. + max_share_to_split + Only perform a split in a leaf if the proportion of elements in the majority class is + smaller than this parameter value. This parameter avoids performing splits when most + of the data belongs to a single class. + max_size + The max size of the tree, in Megabytes (MB). + memory_estimate_period + Interval (number of processed instances) between memory consumption checks. + stop_mem_management + If True, stop growing as soon as memory limit is hit. + remove_poor_attrs + If True, disable poor attributes to reduce memory usage. + merit_preprune + If True, enable merit-based tree pre-pruning. + + Notes + ----- + Local Adaptive Streaming Tree [^1] (LAST) is an incremental decision tree with + adaptive splitting mechanisms. 
At each leaf, LAST maintains a change detector, + that in case of a change detection, it performs a split. + + + + + References + ---------- + + [^1]: Daniel Nowak Assis, Jean Paul Barddal, and Fabrício Enembreck. + Just Change on Change: Adaptive Splitting Time for Decision Trees in + Data Stream Classification . In Proceedings of ACM SAC Conference (SAC’24). + + [^2]: G. Hulten, L. Spencer, and P. Domingos. Mining time-changing data streams. + In KDD’01, pages 97–106, San Francisco, CA, 2001. ACM Press. + + [^3]: Albert Bifet, Geoff Holmes, Richard Kirkby, Bernhard Pfahringer. + MOA: Massive Online Analysis; Journal of Machine Learning Research 11: 1601-1604, 2010. + + Examples + -------- + + >>> from river.datasets import synth + >>> from river import evaluate + >>> from river import metrics + >>> from river import tree + + >>> gen = synth.Agrawal(classification_function=0, seed=42) + >>> # Take 1000 instances from the infinite data generator + >>> dataset = iter(gen.take(1000)) + + >>> model = tree.HoeffdingTreeClassifier( + ... grace_period=100, + ... delta=1e-5, + ... nominal_attributes=['elevel', 'car', 'zipcode'] + ... 
) + + >>> metric = metrics.Accuracy() + + >>> evaluate.progressive_val_score(dataset, model, metric) + Accuracy: 84.58% + """ + + _GINI_SPLIT = "gini" + _INFO_GAIN_SPLIT = "info_gain" + _HELLINGER_SPLIT = "hellinger" + _VALID_SPLIT_CRITERIA = [_GINI_SPLIT, _INFO_GAIN_SPLIT, _HELLINGER_SPLIT] + + _MAJORITY_CLASS = "mc" + _NAIVE_BAYES = "nb" + _NAIVE_BAYES_ADAPTIVE = "nba" + _VALID_LEAF_PREDICTION = [_MAJORITY_CLASS, _NAIVE_BAYES, _NAIVE_BAYES_ADAPTIVE] + + def __init__( + self, + max_depth: int | None = None, + split_criterion: str = "info_gain", + leaf_prediction: str = "nba", + change_detector:base.DriftDetector = drift.ADWIN(), + nb_threshold: int = 0, + nominal_attributes: list | None = None, + splitter: Splitter | None = None, + binary_split: bool = False, + min_branch_fraction: float = 0.01, + max_share_to_split: float = 0.99, + max_size: float = 100.0, + memory_estimate_period: int = 1000000, + stop_mem_management: bool = False, + remove_poor_attrs: bool = False, + merit_preprune: bool = True, + ): + super().__init__( + max_depth=max_depth, + binary_split=binary_split, + max_size=max_size, + memory_estimate_period=memory_estimate_period, + stop_mem_management=stop_mem_management, + remove_poor_attrs=remove_poor_attrs, + merit_preprune=merit_preprune, + ) + self.split_criterion = split_criterion + self.change_detector = change_detector + self.leaf_prediction = leaf_prediction + self.nb_threshold = nb_threshold + self.nominal_attributes = nominal_attributes + + if splitter is None: + self.splitter = GaussianSplitter() + else: + if not splitter.is_target_class: + raise ValueError("The chosen splitter cannot be used in classification tasks.") + self.splitter = splitter # type: ignore + + self.min_branch_fraction = min_branch_fraction + self.max_share_to_split = max_share_to_split + + # To keep track of the observed classes + self.classes: set = set() + + @property + def _mutable_attributes(self): + return {"grace_period", "delta", "tau"} + + 
@HoeffdingTree.split_criterion.setter # type: ignore + def split_criterion(self, split_criterion): + if split_criterion not in self._VALID_SPLIT_CRITERIA: + print( + "Invalid split_criterion option {}', will use default '{}'".format( + split_criterion, self._INFO_GAIN_SPLIT + ) + ) + self._split_criterion = self._INFO_GAIN_SPLIT + else: + self._split_criterion = split_criterion + + @HoeffdingTree.leaf_prediction.setter # type: ignore + def leaf_prediction(self, leaf_prediction): + if leaf_prediction not in self._VALID_LEAF_PREDICTION: + print( + "Invalid leaf_prediction option {}', will use default '{}'".format( + leaf_prediction, self._NAIVE_BAYES_ADAPTIVE + ) + ) + self._leaf_prediction = self._NAIVE_BAYES_ADAPTIVE + else: + self._leaf_prediction = leaf_prediction + + def _new_leaf(self, initial_stats=None, parent=None): + if initial_stats is None: + initial_stats = {} + if parent is None: + depth = 0 + else: + depth = parent.depth + 1 + + if self._leaf_prediction == self._MAJORITY_CLASS: + return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + elif self._leaf_prediction == self._NAIVE_BAYES: + return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + else: # Naives Bayes Adaptive (default) + return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + + def _new_split_criterion(self): + if self._split_criterion == self._GINI_SPLIT: + split_criterion = GiniSplitCriterion(self.min_branch_fraction) + elif self._split_criterion == self._INFO_GAIN_SPLIT: + split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) + elif self._split_criterion == self._HELLINGER_SPLIT: + split_criterion = HellingerDistanceCriterion(self.min_branch_fraction) + else: + split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) + + return split_criterion + + def _attempt_to_split(self, leaf: HTLeaf, parent: DTBranch, 
parent_branch: int, **kwargs): + """Attempt to split a leaf. + + If the samples seen so far are not from the same class then: + + 1. Find split candidates and select the top 2. + 2. Compute the Hoeffding bound. + 3. If the difference between the top 2 split candidates is larger than the Hoeffding bound: + 3.1 Replace the leaf node by a split node (branch node). + 3.2 Add a new leaf node on each branch of the new split node. + 3.3 Update tree's metrics + + Optional: Disable poor attributes. Depends on the tree's configuration. + + Parameters + ---------- + leaf + The leaf to evaluate. + parent + The leaf's parent. + parent_branch + Parent leaf's branch index. + kwargs + Other parameters passed to the new branch. + """ + if not leaf.observed_class_distribution_is_pure(): # type: ignore + split_criterion = self._new_split_criterion() + + best_split_suggestions = leaf.best_split_suggestions(split_criterion, self) + should_split = False + if len(best_split_suggestions) < 2: + should_split = len(best_split_suggestions) > 0 + else: + best_suggestion = max(best_split_suggestions) + should_split = best_suggestion.merit > 0.0 + if self.remove_poor_attrs: + poor_atts = set() + # Add any poor attribute to set + for suggestion in best_split_suggestions: + poor_atts.add(suggestion.feature) + for poor_att in poor_atts: + leaf.disable_attribute(poor_att) + if should_split: + split_decision = max(best_split_suggestions) + if split_decision.feature is None: + # Pre-pruning - null wins + leaf.deactivate() + self._n_inactive_leaves += 1 + self._n_active_leaves -= 1 + else: + branch = self._branch_selector( + split_decision.numerical_feature, split_decision.multiway_split + ) + leaves = tuple( + self._new_leaf(initial_stats, parent=leaf) + for initial_stats in split_decision.children_stats # type: ignore + ) + + new_split = split_decision.assemble( + branch, leaf.stats, leaf.depth, *leaves, **kwargs + ) + + self._n_active_leaves -= 1 + self._n_active_leaves += len(leaves) + if parent 
is None: + self._root = new_split + else: + parent.children[parent_branch] = new_split + + # Manage memory + self._enforce_size_limit() + + def learn_one(self, x, y, *, w=1.0): + """Train the model on instance x and corresponding target y. + + Parameters + ---------- + x + Instance attributes. + y + Class label for sample x. + w + Sample weight. + + Notes + ----- + Training tasks: + + * If the tree is empty, create a leaf node as the root. + * If the tree is already initialized, find the corresponding leaf for + the instance and update the leaf node statistics. + * If growth is allowed and the number of instances that the leaf has + observed between split attempts exceed the grace period then attempt + to split. + """ + + # Updates the set of observed classes + self.classes.add(y) + + self._train_weight_seen_by_model += w + + if self._root is None: + self._root = self._new_leaf() + self._n_active_leaves = 1 + + p_node = None + node = None + if isinstance(self._root, DTBranch): + path = iter(self._root.walk(x, until_leaf=False)) + while True: + aux = next(path, None) + if aux is None: + break + p_node = node + node = aux + else: + node = self._root + + if isinstance(node, HTLeaf): + node.learn_one(x, y, w=w, tree=self) + if self._growth_allowed and node.is_active(): + if node.depth >= self.max_depth: # Max depth reached + node.deactivate() + self._n_active_leaves -= 1 + self._n_inactive_leaves += 1 + else: + weight_seen = node.total_weight + if node.change_detector.drift_detected: + p_branch = p_node.branch_no(x) if isinstance(p_node, DTBranch) else None + self._attempt_to_split(node, p_node, p_branch) + node.last_split_attempt_at = weight_seen + else: + while True: + # Split node encountered a previously unseen categorical value (in a multi-way + # test), so there is no branch to sort the instance to + if node.max_branches() == -1 and node.feature in x: + # Create a new branch to the new categorical value + leaf = self._new_leaf(parent=node) + 
node.add_child(x[node.feature], leaf) + self._n_active_leaves += 1 + node = leaf + # The split feature is missing in the instance. Hence, we pass the new example + # to the most traversed path in the current subtree + else: + _, node = node.most_common_path() + # And we keep trying to reach a leaf + if isinstance(node, DTBranch): + node = node.traverse(x, until_leaf=False) + # Once a leaf is reached, the traversal can stop + if isinstance(node, HTLeaf): + break + # Learn from the sample + node.learn_one(x, y, w=w, tree=self) + + if self._train_weight_seen_by_model % self.memory_estimate_period == 0: + self._estimate_model_size() + + def predict_proba_one(self, x): + proba = {c: 0.0 for c in sorted(self.classes)} + if self._root is not None: + if isinstance(self._root, DTBranch): + leaf = self._root.traverse(x, until_leaf=True) + else: + leaf = self._root + + proba.update(leaf.prediction(x, tree=self)) + return proba + + @property + def _multiclass(self): + return True diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py new file mode 100644 index 0000000000..1fa2f72e9f --- /dev/null +++ b/river/tree/nodes/last_nodes.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +from river.tree.utils import BranchFactory +from river.utils.norm import normalize_values_in_dict + +from ..splitter.nominal_splitter_classif import NominalSplitterClassif +from ..utils import do_naive_bayes_prediction, round_sig_fig +from .leaf import HTLeaf + + +class LeafMajorityClassWithDetector(HTLeaf): + """Leaf that always predicts the majority class. + + Parameters + ---------- + stats + Initial class observations. + depth + The depth of the node. + splitter + The numeric attribute observer algorithm used to monitor target statistics + and perform split attempts. + kwargs + Other parameters passed to the learning node. 
+ """ + + def __init__(self, stats, depth, splitter,change_detector, **kwargs): + super().__init__(stats, depth, splitter, **kwargs) + self.change_detector = change_detector + + @staticmethod + def new_nominal_splitter(): + return NominalSplitterClassif() + + def learn_one(self, x, y, *, w=1, tree=None): + self.update_stats(y, w) + #TODO : monitor change distribution + if self.is_active(): + mc_pred = self.prediction(x) + detector_input = (max(mc_pred, key=mc_pred.get) != y) + self.change_detector.update(detector_input) + self.update_splitters(x, y, w, tree.nominal_attributes) + + + def update_stats(self, y, w): + try: + self.stats[y] += w + except KeyError: + self.stats[y] = w + + def prediction(self, x, *, tree=None): + return normalize_values_in_dict(self.stats, inplace=False) + + @property + def total_weight(self): + """Calculate the total weight seen by the node. + + Returns + ------- + Total weight seen. + + """ + return sum(self.stats.values()) if self.stats else 0 + + def best_split_suggestions(self, criterion, tree) -> list[BranchFactory]: + maj_class = max(self.stats.values()) + # Only perform split attempts when the majority class does not dominate + # the amount of observed instances + if maj_class and maj_class / self.total_weight > tree.max_share_to_split: + return [BranchFactory()] + + return super().best_split_suggestions(criterion, tree) + + def calculate_promise(self): + """Calculate how likely a node is going to be split. + + A node with a (close to) pure class distribution will less likely be split. + + Returns + ------- + A small value indicates that the node has seen more samples of a + given class than the other classes. + + """ + total_seen = sum(self.stats.values()) + if total_seen > 0: + return total_seen - max(self.stats.values()) + else: + return 0 + + def observed_class_distribution_is_pure(self): + """Check if observed class distribution is pure, i.e. if all samples + belong to the same class. 
+ + Returns + ------- + True if observed number of classes is less than 2, False otherwise. + """ + count = 0 + for weight in self.stats.values(): + if weight != 0: + count += 1 + if count == 2: # No need to count beyond this point + break + return count < 2 + + def __repr__(self): + if not self.stats: + return "" + + text = f"Class {max(self.stats, key=self.stats.get)}:" + for label, proba in sorted(normalize_values_in_dict(self.stats, inplace=False).items()): + text += f"\n\tP({label}) = {round_sig_fig(proba)}" + + return text + + def deactivate(self): + self.change_detector = None + super().deactivate() + + +class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): + """Leaf that uses Naive Bayes models. + + Parameters + ---------- + stats + Initial class observations. + depth + The depth of the node. + splitter + The numeric attribute observer algorithm used to monitor target statistics + and perform split attempts. + kwargs + Other parameters passed to the learning node. + """ + + def __init__(self, stats, depth, splitter,change_detector, **kwargs): + super().__init__(stats, depth, splitter,change_detector,**kwargs) + + def learn_one(self, x, y, *, w=1, tree=None): + self.update_stats(y, w) + #TODO : monitor change distribution + if self.is_active(): + nb_pred = self.prediction(x) + detector_input = (max(nb_pred, key=nb_pred.get) == y) + self.change_detector.update(detector_input) + self.update_splitters(x, y, w, tree.nominal_attributes) + + def prediction(self, x, *, tree=None): + if self.is_active() and self.total_weight >= tree.nb_threshold: + return do_naive_bayes_prediction(x, self.stats, self.splitters) + else: + return super().prediction(x) + + def disable_attribute(self, att_index): + """Disable an attribute observer. + + Disabled in Nodes using Naive Bayes, since poor attributes are used in + Naive Bayes calculation. + + Parameters + ---------- + att_index + Attribute index. 
+ """ + pass + + +class LeafNaiveBayesAdaptiveWithDetector(LeafMajorityClassWithDetector): + """Learning node that uses Adaptive Naive Bayes models. + + Parameters + ---------- + stats + Initial class observations. + depth + The depth of the node. + splitter + The numeric attribute observer algorithm used to monitor target statistics + and perform split attempts. + kwargs + Other parameters passed to the learning node. + """ + + def __init__(self, stats, depth, splitter, change_detector, **kwargs): + super().__init__(stats, depth, splitter, change_detector, **kwargs) + self._mc_correct_weight = 0.0 + self._nb_correct_weight = 0.0 + + def learn_one(self, x, y, *, w=1.0, tree=None): + """Update the node with the provided instance. + + Parameters + ---------- + x + Instance attributes for updating the node. + y + Instance class. + w + The instance's weight. + tree + The Hoeffding Tree to update. + + """ + detector_input_mc = 1 + detector_input_nb = 1 + if self.is_active(): + mc_pred = super().prediction(x) + # Empty node (assume the majority class will be the best option) or majority + # class prediction is correct + if len(self.stats) == 0 or max(mc_pred, key=mc_pred.get) == y: + self._mc_correct_weight += w + detector_input_mc = 0 + nb_pred = do_naive_bayes_prediction(x, self.stats, self.splitters) + if len(nb_pred) > 0 and max(nb_pred, key=nb_pred.get) == y: + self._nb_correct_weight += w + detector_input_nb = 0 + + self.update_stats(y, w) + #TODO : monitor change distribution + if self.is_active(): + if self._nb_correct_weight >= self._mc_correct_weight: + self.change_detector.update(detector_input_nb) + else: + self.change_detector.update(detector_input_mc) + self.update_splitters(x, y, w, tree.nominal_attributes) + + + + + def prediction(self, x, *, tree=None): + """Get the probabilities per class for a given instance. + + Parameters + ---------- + x + Instance attributes. + tree + Hoeffding Tree. + + Returns + ------- + Class votes for the given instance. 
+ + """ + if self.is_active() and self._nb_correct_weight >= self._mc_correct_weight: + return do_naive_bayes_prediction(x, self.stats, self.splitters) + else: + return super().prediction(x) + + def disable_attribute(self, att_index): + """Disable an attribute observer. + + Disabled in Nodes using Naive Bayes, since poor attributes are used in + Naive Bayes calculation. + + Parameters + ---------- + att_index + Attribute index. + """ + pass From 099c19da6d396d5d61d598c646928a88f12e1281 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Thu, 9 May 2024 15:11:55 -0300 Subject: [PATCH 02/16] update LAST to detect change in the data distribution + iter_arff none class Docs are also updated --- river/stream/iter_arff.py | 9 ++- .../hoeffding_adaptive_tree_classifier.py | 5 +- river/tree/last_classifier.py | 60 ++++++++++--------- river/tree/nodes/last_nodes.py | 44 ++++++++------ river/tree/split_criterion/base.py | 15 +++++ .../split_criterion/gini_split_criterion.py | 4 ++ .../hellinger_distance_criterion.py | 3 + .../info_gain_split_criterion.py | 3 + 8 files changed, 95 insertions(+), 48 deletions(-) diff --git a/river/stream/iter_arff.py b/river/stream/iter_arff.py index 4464e3f74a..6c91aec40a 100644 --- a/river/stream/iter_arff.py +++ b/river/stream/iter_arff.py @@ -176,7 +176,7 @@ def iter_arff( x = { name: cast(val) if cast else val for name, cast, val in zip(names, casts, r.rstrip().split(",")) - if val != "?" + if val != "?" 
and val != '' } # Handle target @@ -185,8 +185,11 @@ def iter_arff( if isinstance(target, list): y = {name: x.pop(name, 0) for name in target} else: - y = x.pop(target) if target else None - + try: + y = x.pop(target) if target else None + except KeyError: + y = None + yield x, y # Close the file if we opened it diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 290d34dad1..56faf3498e 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -124,12 +124,15 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): ... nb_threshold=10, ... seed=0 ... ) + + >>> dataset = iter(gen.take(3000)) + + >>> model = tree.LASTClassifier() >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) Accuracy: 91.49% - """ def __init__( diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index e2168008b7..06da8d9b31 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -27,6 +27,11 @@ class LASTClassifier(HoeffdingTree, base.Classifier): - 'mc' - Majority Class
- 'nb' - Naive Bayes
- 'nba' - Naive Bayes Adaptive
+ change_detector + Change detector that will be created at each leaf of the tree. + track_error + If True, the change detector will have binary inputs for error predictions, + otherwise the input will be the split criteria. nb_threshold Number of instances a leaf should observe before allowing Naive Bayes. nominal_attributes @@ -65,7 +70,8 @@ class LASTClassifier(HoeffdingTree, base.Classifier): ----- Local Adaptive Streaming Tree [^1] (LAST) is an incremental decision tree with adaptive splitting mechanisms. At each leaf, LAST maintains a change detector, - that in case of a change detection, it performs a split. + that in case of a change detection in error or the data distribution of the leaf, + it performs a split. @@ -76,35 +82,20 @@ class LASTClassifier(HoeffdingTree, base.Classifier): [^1]: Daniel Nowak Assis, Jean Paul Barddal, and Fabrício Enembreck. Just Change on Change: Adaptive Splitting Time for Decision Trees in Data Stream Classification . In Proceedings of ACM SAC Conference (SAC’24). - - [^2]: G. Hulten, L. Spencer, and P. Domingos. Mining time-changing data streams. - In KDD’01, pages 97–106, San Francisco, CA, 2001. ACM Press. - - [^3]: Albert Bifet, Geoff Holmes, Richard Kirkby, Bernhard Pfahringer. - MOA: Massive Online Analysis; Journal of Machine Learning Research 11: 1601-1604, 2010. Examples -------- + >>> from river.datasets import synth >>> from river.datasets import synth >>> from river import evaluate >>> from river import metrics >>> from river import tree - >>> gen = synth.Agrawal(classification_function=0, seed=42) - >>> # Take 1000 instances from the infinite data generator - >>> dataset = iter(gen.take(1000)) - - >>> model = tree.HoeffdingTreeClassifier( - ... grace_period=100, - ... delta=1e-5, - ... nominal_attributes=['elevel', 'car', 'zipcode'] - ... 
) - - >>> metric = metrics.Accuracy() - - >>> evaluate.progressive_val_score(dataset, model, metric) - Accuracy: 84.58% + >>> gen = synth.ConceptDriftStream(stream=synth.SEA(seed=42, variant=0), + ... drift_stream=synth.SEA(seed=42, variant=1), + ... seed=1, position=500, width=50) + Accuracy: 91.60% """ _GINI_SPLIT = "gini" @@ -123,6 +114,7 @@ def __init__( split_criterion: str = "info_gain", leaf_prediction: str = "nba", change_detector:base.DriftDetector = drift.ADWIN(), + track_error : bool = True, nb_threshold: int = 0, nominal_attributes: list | None = None, splitter: Splitter | None = None, @@ -146,6 +138,7 @@ def __init__( ) self.split_criterion = split_criterion self.change_detector = change_detector + self.track_error = track_error self.leaf_prediction = leaf_prediction self.nb_threshold = nb_threshold self.nominal_attributes = nominal_attributes @@ -199,12 +192,22 @@ def _new_leaf(self, initial_stats=None, parent=None): else: depth = parent.depth + 1 - if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) - elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) - else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + if not self.track_error: + if self._leaf_prediction == self._MAJORITY_CLASS: + return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + elif self._leaf_prediction == self._NAIVE_BAYES: + return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + else: # Naives Bayes Adaptive (default) + return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + else: + split_criterion = self._new_split_criterion() + if 
self._leaf_prediction == self._MAJORITY_CLASS: + return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + elif self._leaf_prediction == self._NAIVE_BAYES: + return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + else: # Naives Bayes Adaptive (default) + return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + def _new_split_criterion(self): if self._split_criterion == self._GINI_SPLIT: @@ -212,6 +215,9 @@ def _new_split_criterion(self): elif self._split_criterion == self._INFO_GAIN_SPLIT: split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) elif self._split_criterion == self._HELLINGER_SPLIT: + if not self.track_error: + raise ValueError("The Heillinger distance cannot estimate the purity of a single distribution.\ + Use another split criterion or set track_error to True") split_criterion = HellingerDistanceCriterion(self.min_branch_fraction) else: split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index 1fa2f72e9f..564c9bcccf 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -24,9 +24,10 @@ class LeafMajorityClassWithDetector(HTLeaf): Other parameters passed to the learning node. 
""" - def __init__(self, stats, depth, splitter,change_detector, **kwargs): + def __init__(self, stats, depth, splitter,change_detector, split_criterion = None, **kwargs): super().__init__(stats, depth, splitter, **kwargs) self.change_detector = change_detector + self.split_criterion = split_criterion #if None, the change detector will have binary inputs @staticmethod def new_nominal_splitter(): @@ -34,11 +35,14 @@ def new_nominal_splitter(): def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) - #TODO : monitor change distribution if self.is_active(): - mc_pred = self.prediction(x) - detector_input = (max(mc_pred, key=mc_pred.get) != y) - self.change_detector.update(detector_input) + if self.split_criterion is None: + mc_pred = self.prediction(x) + detector_input = (max(mc_pred, key=mc_pred.get) != y) + self.change_detector.update(detector_input) + else: + detector_input = self.split_criterion.purity(self.stats) + self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) @@ -135,16 +139,19 @@ class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): Other parameters passed to the learning node. 
""" - def __init__(self, stats, depth, splitter,change_detector, **kwargs): - super().__init__(stats, depth, splitter,change_detector,**kwargs) + def __init__(self, stats, depth, splitter,change_detector, split_criterion = None, **kwargs): + super().__init__(stats, depth, splitter,change_detector,split_criterion,**kwargs) def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) - #TODO : monitor change distribution if self.is_active(): - nb_pred = self.prediction(x) - detector_input = (max(nb_pred, key=nb_pred.get) == y) - self.change_detector.update(detector_input) + if self.split_criterion is None: + nb_pred = self.prediction(x) + detector_input = (max(nb_pred, key=nb_pred.get) == y) + self.change_detector.update(detector_input) + else: + detector_input = self.split_criterion.purity(self.stats) + self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) def prediction(self, x, *, tree=None): @@ -183,8 +190,8 @@ class LeafNaiveBayesAdaptiveWithDetector(LeafMajorityClassWithDetector): Other parameters passed to the learning node. 
""" - def __init__(self, stats, depth, splitter, change_detector, **kwargs): - super().__init__(stats, depth, splitter, change_detector, **kwargs) + def __init__(self, stats, depth, splitter, change_detector,split_criterion = None, **kwargs): + super().__init__(stats, depth, splitter, change_detector, split_criterion,**kwargs) self._mc_correct_weight = 0.0 self._nb_correct_weight = 0.0 @@ -218,12 +225,15 @@ def learn_one(self, x, y, *, w=1.0, tree=None): detector_input_nb = 0 self.update_stats(y, w) - #TODO : monitor change distribution if self.is_active(): - if self._nb_correct_weight >= self._mc_correct_weight: - self.change_detector.update(detector_input_nb) + if self.split_criterion is None: + if self._nb_correct_weight >= self._mc_correct_weight: + self.change_detector.update(detector_input_nb) + else: + self.change_detector.update(detector_input_mc) else: - self.change_detector.update(detector_input_mc) + detector_input = self.split_criterion.purity(self.stats) + self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) diff --git a/river/tree/split_criterion/base.py b/river/tree/split_criterion/base.py index d329bf6d58..ae09efd397 100644 --- a/river/tree/split_criterion/base.py +++ b/river/tree/split_criterion/base.py @@ -31,6 +31,21 @@ def merit_of_split(self, pre_split_dist, post_split_dist): ------- Value of the merit of splitting """ + + @abc.abstractmethod + def purity(self, dist): + """Compute how pure (how close the distribution is to have only a single class) + the distribution is. + + Parameters + ---------- + dist + The data distribution. 
+ + Returns + ------- + Value of purity of the distribution according to the splitting merit + """ @staticmethod @abc.abstractmethod diff --git a/river/tree/split_criterion/gini_split_criterion.py b/river/tree/split_criterion/gini_split_criterion.py index 9b4c6e3187..abf49ca783 100644 --- a/river/tree/split_criterion/gini_split_criterion.py +++ b/river/tree/split_criterion/gini_split_criterion.py @@ -27,6 +27,10 @@ def merit_of_split(self, pre_split_dist, post_split_dist): post_split_dist[i], dist_weights[i] ) return 1.0 - gini + + + def purity(self, dist): + return self.compute_gini(dist, sum(dist.values())) @staticmethod def compute_gini(dist, dist_sum_of_weights): diff --git a/river/tree/split_criterion/hellinger_distance_criterion.py b/river/tree/split_criterion/hellinger_distance_criterion.py index 5ad379b6a9..2f7662f34f 100644 --- a/river/tree/split_criterion/hellinger_distance_criterion.py +++ b/river/tree/split_criterion/hellinger_distance_criterion.py @@ -28,6 +28,9 @@ def merit_of_split(self, pre_split_dist, post_split_dist): return -math.inf return self.compute_hellinger(post_split_dist) + def purity(self, dist): + raise ValueError("The Heillinger distance is for 2 or more sets of data.") + @staticmethod def compute_hellinger(dist): try: diff --git a/river/tree/split_criterion/info_gain_split_criterion.py b/river/tree/split_criterion/info_gain_split_criterion.py index 0863e6ac84..f8ced1b03d 100644 --- a/river/tree/split_criterion/info_gain_split_criterion.py +++ b/river/tree/split_criterion/info_gain_split_criterion.py @@ -38,6 +38,9 @@ def compute_entropy(self, dist): return self._compute_entropy_dict(dist) elif isinstance(dist, list): return self._compute_entropy_list(dist) + + def purity(self, dist): + return self.compute_entropy(dist) @staticmethod def _compute_entropy_dict(dist): From 0468d66985b0fec3bbd26a1cf33dc155db80a896 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Tue, 14 May 2024 14:29:15 -0300 Subject: [PATCH 03/16] update docs --- 
river/tree/last_classifier.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 06da8d9b31..f346c16083 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -94,8 +94,15 @@ class LASTClassifier(HoeffdingTree, base.Classifier): >>> gen = synth.ConceptDriftStream(stream=synth.SEA(seed=42, variant=0), ... drift_stream=synth.SEA(seed=42, variant=1), - ... seed=1, position=500, width=50) - Accuracy: 91.60% + ... seed=1, position=1500, width=50) + >>> dataset = iter(gen.take(3000)) + + >>> model = tree.LASTClassifier() + + >>> metric = metrics.Accuracy() + + >>> evaluate.progressive_val_score(dataset, model, metric) + Accuracy: 92.50% """ _GINI_SPLIT = "gini" From a3944294ba80abfd7624c0f26cf11bc9b4a9dff9 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Sat, 25 May 2024 19:22:58 -0300 Subject: [PATCH 04/16] Update hoeffding_adaptive_tree_classifier.py --- river/tree/hoeffding_adaptive_tree_classifier.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 56faf3498e..086f80897e 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -124,10 +124,6 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): ... nb_threshold=10, ... seed=0 ... 
) - - >>> dataset = iter(gen.take(3000)) - - >>> model = tree.LASTClassifier() >>> metric = metrics.Accuracy() From 40618215a405ea146c4e7d9d2971c7e78359d6cf Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Thu, 5 Sep 2024 09:23:18 -0300 Subject: [PATCH 05/16] changes after tests --- river/tree/last_classifier.py | 20 +++++++++---------- river/tree/nodes/last_nodes.py | 1 - .../variance_ratio_split_criterion.py | 3 +++ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index f346c16083..270e96fd8c 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -16,7 +16,7 @@ class LASTClassifier(HoeffdingTree, base.Classifier): Parameters ---------- max_depth - The maximum depth a tree can reach. If `None`, the tree will grow indefinitely. + The maximum depth a tree can reach. If `None`, the tree will grow until the system recursion limit. split_criterion Split criterion to use.
- 'gini' - Gini
@@ -28,7 +28,7 @@ class LASTClassifier(HoeffdingTree, base.Classifier): - 'nb' - Naive Bayes
- 'nba' - Naive Bayes Adaptive
change_detector - Change detector that will be created at each leaf of the tree. + Change detector that will be created at each leaf of the tree. track_error If True, the change detector will have binary inputs for error predictions, otherwise the input will be the split criteria. @@ -120,7 +120,7 @@ def __init__( max_depth: int | None = None, split_criterion: str = "info_gain", leaf_prediction: str = "nba", - change_detector:base.DriftDetector = drift.ADWIN(), + change_detector:base.DriftDetector| None = None, track_error : bool = True, nb_threshold: int = 0, nominal_attributes: list | None = None, @@ -144,7 +144,7 @@ def __init__( merit_preprune=merit_preprune, ) self.split_criterion = split_criterion - self.change_detector = change_detector + self.change_detector = change_detector if change_detector is not None else drift.ADWIN() self.track_error = track_error self.leaf_prediction = leaf_prediction self.nb_threshold = nb_threshold @@ -165,7 +165,7 @@ def __init__( @property def _mutable_attributes(self): - return {"grace_period", "delta", "tau"} + return {} @HoeffdingTree.split_criterion.setter # type: ignore def split_criterion(self, split_criterion): @@ -236,9 +236,8 @@ def _attempt_to_split(self, leaf: HTLeaf, parent: DTBranch, parent_branch: int, If the samples seen so far are not from the same class then: - 1. Find split candidates and select the top 2. - 2. Compute the Hoeffding bound. - 3. If the difference between the top 2 split candidates is larger than the Hoeffding bound: + 1. Find split candidates and select the top 1. + 2. If the top1 is greater than zero: 3.1 Replace the leaf node by a split node (branch node). 3.2 Add a new leaf node on each branch of the new split node. 3.3 Update tree's metrics @@ -322,8 +321,8 @@ def learn_one(self, x, y, *, w=1.0): * If the tree is empty, create a leaf node as the root. * If the tree is already initialized, find the corresponding leaf for the instance and update the leaf node statistics. 
- * If growth is allowed and the number of instances that the leaf has - observed between split attempts exceed the grace period then attempt + * Update the leaf change detector with (1 if the tree misclassified the instance, or 0 if it correctly classified) or the data distribution purity + * If growth is allowed, then attempt to split. """ @@ -358,6 +357,7 @@ def learn_one(self, x, y, *, w=1.0): self._n_inactive_leaves += 1 else: weight_seen = node.total_weight + #check if the change detector triggered a change if node.change_detector.drift_detected: p_branch = p_node.branch_no(x) if isinstance(p_node, DTBranch) else None self._attempt_to_split(node, p_node, p_branch) diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index 564c9bcccf..88761213c5 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -119,7 +119,6 @@ def __repr__(self): return text def deactivate(self): - self.change_detector = None super().deactivate() diff --git a/river/tree/split_criterion/variance_ratio_split_criterion.py b/river/tree/split_criterion/variance_ratio_split_criterion.py index c51df25a14..3f921da140 100644 --- a/river/tree/split_criterion/variance_ratio_split_criterion.py +++ b/river/tree/split_criterion/variance_ratio_split_criterion.py @@ -34,6 +34,9 @@ def merit_of_split(self, pre_split_dist, post_split_dist): vr -= (n_i / n) * (self.compute_var(post_split_dist[i]) / var) return vr + def purity(self, dist): + return self.compute_var(dist) + @staticmethod def compute_var(dist): return dist.get() From 313b742f7e6a110af044cf1b4e394c9afe4f38c5 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Thu, 5 Sep 2024 19:11:15 -0300 Subject: [PATCH 06/16] solving inheritance and small fixes --- .../hoeffding_adaptive_tree_classifier.py | 1 + river/tree/last_classifier.py | 84 +++--------------- river/tree/nodes/last_nodes.py | 87 +------------------ 3 files changed, 18 insertions(+), 154 deletions(-) diff --git
a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 7f1a7fa334..7693fdeff9 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -129,6 +129,7 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) + Accuracy: 91.49% """ diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 270e96fd8c..1366a66901 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -2,7 +2,7 @@ from river import base, drift -from .hoeffding_tree import HoeffdingTree +from .hoeffding_tree_classifier import HoeffdingTreeClassifier from .nodes.branch import DTBranch from .nodes.last_nodes import LeafMajorityClassWithDetector, LeafNaiveBayesWithDetector, LeafNaiveBayesAdaptiveWithDetector from .nodes.leaf import HTLeaf @@ -10,7 +10,7 @@ from .splitter import GaussianSplitter, Splitter -class LASTClassifier(HoeffdingTree, base.Classifier): +class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): """Local Adaptive Streaming Tree classifier. Parameters @@ -69,12 +69,8 @@ class LASTClassifier(HoeffdingTree, base.Classifier): Notes ----- Local Adaptive Streaming Tree [^1] (LAST) is an incremental decision tree with - adaptive splitting mechanisms. At each leaf, LAST maintains a change detector, - that in case of a change detection in error or the data distribution of the leaf, - it performs a split. - - - + adaptive splitting mechanisms. LAST maintains a change detector at each leaf and splits + this node if a change is detected in the error or the leaf's data distribution.
References ---------- @@ -86,7 +82,6 @@ class LASTClassifier(HoeffdingTree, base.Classifier): Examples -------- - >>> from river.datasets import synth >>> from river.datasets import synth >>> from river import evaluate >>> from river import metrics @@ -105,16 +100,6 @@ class LASTClassifier(HoeffdingTree, base.Classifier): Accuracy: 92.50% """ - _GINI_SPLIT = "gini" - _INFO_GAIN_SPLIT = "info_gain" - _HELLINGER_SPLIT = "hellinger" - _VALID_SPLIT_CRITERIA = [_GINI_SPLIT, _INFO_GAIN_SPLIT, _HELLINGER_SPLIT] - - _MAJORITY_CLASS = "mc" - _NAIVE_BAYES = "nb" - _NAIVE_BAYES_ADAPTIVE = "nba" - _VALID_LEAF_PREDICTION = [_MAJORITY_CLASS, _NAIVE_BAYES, _NAIVE_BAYES_ADAPTIVE] - def __init__( self, max_depth: int | None = None, @@ -135,30 +120,26 @@ def __init__( merit_preprune: bool = True, ): super().__init__( + grace_period=None, max_depth=max_depth, + split_criterion=split_criterion, + delta=None, + tau=None, + leaf_prediction=leaf_prediction, + nb_threshold = nb_threshold, binary_split=binary_split, max_size=max_size, memory_estimate_period=memory_estimate_period, stop_mem_management=stop_mem_management, remove_poor_attrs=remove_poor_attrs, merit_preprune=merit_preprune, + nominal_attributes = nominal_attributes, + splitter = splitter, + min_branch_fraction = min_branch_fraction, + max_share_to_split = max_share_to_split, ) - self.split_criterion = split_criterion self.change_detector = change_detector if change_detector is not None else drift.ADWIN() self.track_error = track_error - self.leaf_prediction = leaf_prediction - self.nb_threshold = nb_threshold - self.nominal_attributes = nominal_attributes - - if splitter is None: - self.splitter = GaussianSplitter() - else: - if not splitter.is_target_class: - raise ValueError("The chosen splitter cannot be used in classification tasks.") - self.splitter = splitter # type: ignore - - self.min_branch_fraction = min_branch_fraction - self.max_share_to_split = max_share_to_split # To keep track of the observed classes 
self.classes: set = set() @@ -167,29 +148,6 @@ def __init__( def _mutable_attributes(self): return {} - @HoeffdingTree.split_criterion.setter # type: ignore - def split_criterion(self, split_criterion): - if split_criterion not in self._VALID_SPLIT_CRITERIA: - print( - "Invalid split_criterion option {}', will use default '{}'".format( - split_criterion, self._INFO_GAIN_SPLIT - ) - ) - self._split_criterion = self._INFO_GAIN_SPLIT - else: - self._split_criterion = split_criterion - - @HoeffdingTree.leaf_prediction.setter # type: ignore - def leaf_prediction(self, leaf_prediction): - if leaf_prediction not in self._VALID_LEAF_PREDICTION: - print( - "Invalid leaf_prediction option {}', will use default '{}'".format( - leaf_prediction, self._NAIVE_BAYES_ADAPTIVE - ) - ) - self._leaf_prediction = self._NAIVE_BAYES_ADAPTIVE - else: - self._leaf_prediction = leaf_prediction def _new_leaf(self, initial_stats=None, parent=None): if initial_stats is None: @@ -388,17 +346,3 @@ def learn_one(self, x, y, *, w=1.0): if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() - def predict_proba_one(self, x): - proba = {c: 0.0 for c in sorted(self.classes)} - if self._root is not None: - if isinstance(self._root, DTBranch): - leaf = self._root.traverse(x, until_leaf=True) - else: - leaf = self._root - - proba.update(leaf.prediction(x, tree=self)) - return proba - - @property - def _multiclass(self): - return True diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index 88761213c5..7c178c86c8 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -5,10 +5,10 @@ from ..splitter.nominal_splitter_classif import NominalSplitterClassif from ..utils import do_naive_bayes_prediction, round_sig_fig -from .leaf import HTLeaf +from .htc_nodes import LeafMajorityClass -class LeafMajorityClassWithDetector(HTLeaf): +class LeafMajorityClassWithDetector(LeafMajorityClass): """Leaf that always predicts 
the majority class. Parameters @@ -28,10 +28,6 @@ def __init__(self, stats, depth, splitter,change_detector, split_criterion = Non super().__init__(stats, depth, splitter, **kwargs) self.change_detector = change_detector self.split_criterion = split_criterion #if None, the change detector will have binary inputs - - @staticmethod - def new_nominal_splitter(): - return NominalSplitterClassif() def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) @@ -45,83 +41,6 @@ def learn_one(self, x, y, *, w=1, tree=None): self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) - - def update_stats(self, y, w): - try: - self.stats[y] += w - except KeyError: - self.stats[y] = w - - def prediction(self, x, *, tree=None): - return normalize_values_in_dict(self.stats, inplace=False) - - @property - def total_weight(self): - """Calculate the total weight seen by the node. - - Returns - ------- - Total weight seen. - - """ - return sum(self.stats.values()) if self.stats else 0 - - def best_split_suggestions(self, criterion, tree) -> list[BranchFactory]: - maj_class = max(self.stats.values()) - # Only perform split attempts when the majority class does not dominate - # the amount of observed instances - if maj_class and maj_class / self.total_weight > tree.max_share_to_split: - return [BranchFactory()] - - return super().best_split_suggestions(criterion, tree) - - def calculate_promise(self): - """Calculate how likely a node is going to be split. - - A node with a (close to) pure class distribution will less likely be split. - - Returns - ------- - A small value indicates that the node has seen more samples of a - given class than the other classes. - - """ - total_seen = sum(self.stats.values()) - if total_seen > 0: - return total_seen - max(self.stats.values()) - else: - return 0 - - def observed_class_distribution_is_pure(self): - """Check if observed class distribution is pure, i.e. 
if all samples - belong to the same class. - - Returns - ------- - True if observed number of classes is less than 2, False otherwise. - """ - count = 0 - for weight in self.stats.values(): - if weight != 0: - count += 1 - if count == 2: # No need to count beyond this point - break - return count < 2 - - def __repr__(self): - if not self.stats: - return "" - - text = f"Class {max(self.stats, key=self.stats.get)}:" - for label, proba in sorted(normalize_values_in_dict(self.stats, inplace=False).items()): - text += f"\n\tP({label}) = {round_sig_fig(proba)}" - - return text - - def deactivate(self): - super().deactivate() - - class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): """Leaf that uses Naive Bayes models. @@ -269,4 +188,4 @@ def disable_attribute(self, att_index): att_index Attribute index. """ - pass + pass \ No newline at end of file From fca62549de2fcb1e7bff3229d119143eea62c66f Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Thu, 5 Sep 2024 22:08:16 -0300 Subject: [PATCH 07/16] tests + current_merit method --- river/stream/iter_arff.py | 4 +- .../hoeffding_adaptive_tree_classifier.py | 2 +- river/tree/last_classifier.py | 80 +++++++++++++------ river/tree/nodes/last_nodes.py | 40 +++++----- river/tree/split_criterion/base.py | 9 +-- .../split_criterion/gini_split_criterion.py | 5 +- .../hellinger_distance_criterion.py | 2 +- .../info_gain_split_criterion.py | 4 +- .../variance_ratio_split_criterion.py | 4 +- 9 files changed, 86 insertions(+), 64 deletions(-) diff --git a/river/stream/iter_arff.py b/river/stream/iter_arff.py index 6c91aec40a..f9eec46ded 100644 --- a/river/stream/iter_arff.py +++ b/river/stream/iter_arff.py @@ -176,7 +176,7 @@ def iter_arff( x = { name: cast(val) if cast else val for name, cast, val in zip(names, casts, r.rstrip().split(",")) - if val != "?" and val != '' + if val != "?" 
and val != "" } # Handle target @@ -189,7 +189,7 @@ def iter_arff( y = x.pop(target) if target else None except KeyError: y = None - + yield x, y # Close the file if we opened it diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 7693fdeff9..ce40900ffc 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -129,7 +129,7 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) - + Accuracy: 91.49% """ diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 1366a66901..91b271adb1 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -4,10 +4,14 @@ from .hoeffding_tree_classifier import HoeffdingTreeClassifier from .nodes.branch import DTBranch -from .nodes.last_nodes import LeafMajorityClassWithDetector, LeafNaiveBayesWithDetector, LeafNaiveBayesAdaptiveWithDetector +from .nodes.last_nodes import ( + LeafMajorityClassWithDetector, + LeafNaiveBayesAdaptiveWithDetector, + LeafNaiveBayesWithDetector, +) from .nodes.leaf import HTLeaf from .split_criterion import GiniSplitCriterion, HellingerDistanceCriterion, InfoGainSplitCriterion -from .splitter import GaussianSplitter, Splitter +from .splitter import Splitter class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): @@ -28,9 +32,9 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): - 'nb' - Naive Bayes
- 'nba' - Naive Bayes Adaptive
change_detector - Change detector that will be created at each leaf of the tree. + Change detector that will be created at each leaf of the tree. track_error - If True, the change detector will have binary inputs for error predictions, + If True, the change detector will have binary inputs for error predictions, otherwise the input will be the split criteria. nb_threshold Number of instances a leaf should observe before allowing Naive Bayes. @@ -97,6 +101,7 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) + Accuracy: 92.50% """ @@ -105,8 +110,8 @@ def __init__( max_depth: int | None = None, split_criterion: str = "info_gain", leaf_prediction: str = "nba", - change_detector:base.DriftDetector| None = None, - track_error : bool = True, + change_detector: base.DriftDetector | None = None, + track_error: bool = True, nb_threshold: int = 0, nominal_attributes: list | None = None, splitter: Splitter | None = None, @@ -120,23 +125,23 @@ def __init__( merit_preprune: bool = True, ): super().__init__( - grace_period=None, + grace_period=1, #no usage max_depth=max_depth, split_criterion=split_criterion, - delta=None, - tau=None, + delta=1., #no usage + tau=1, #no usage leaf_prediction=leaf_prediction, - nb_threshold = nb_threshold, + nb_threshold=nb_threshold, binary_split=binary_split, max_size=max_size, memory_estimate_period=memory_estimate_period, stop_mem_management=stop_mem_management, remove_poor_attrs=remove_poor_attrs, merit_preprune=merit_preprune, - nominal_attributes = nominal_attributes, - splitter = splitter, - min_branch_fraction = min_branch_fraction, - max_share_to_split = max_share_to_split, + nominal_attributes=nominal_attributes, + splitter=splitter, + min_branch_fraction=min_branch_fraction, + max_share_to_split=max_share_to_split, ) self.change_detector = change_detector if change_detector is not None else drift.ADWIN() self.track_error = 
track_error @@ -148,7 +153,6 @@ def __init__( def _mutable_attributes(self): return {} - def _new_leaf(self, initial_stats=None, parent=None): if initial_stats is None: initial_stats = {} @@ -159,20 +163,43 @@ def _new_leaf(self, initial_stats=None, parent=None): if not self.track_error: if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + return LeafMajorityClassWithDetector( + initial_stats, depth, self.splitter, self.change_detector.clone() + ) elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + return LeafNaiveBayesWithDetector( + initial_stats, depth, self.splitter, self.change_detector.clone() + ) else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + return LeafNaiveBayesAdaptiveWithDetector( + initial_stats, depth, self.splitter, self.change_detector.clone() + ) else: split_criterion = self._new_split_criterion() if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + return LeafMajorityClassWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion, + ) elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + return LeafNaiveBayesWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion, + ) else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) - + return LeafNaiveBayesAdaptiveWithDetector( + initial_stats, + depth, + 
self.splitter, + self.change_detector.clone(), + split_criterion, + ) def _new_split_criterion(self): if self._split_criterion == self._GINI_SPLIT: @@ -181,8 +208,10 @@ def _new_split_criterion(self): split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) elif self._split_criterion == self._HELLINGER_SPLIT: if not self.track_error: - raise ValueError("The Heillinger distance cannot estimate the purity of a single distribution.\ - Use another split criterion or set track_error to True") + raise ValueError( + "The Heillinger distance cannot estimate the purity of a single distribution.\ + Use another split criterion or set track_error to True" + ) split_criterion = HellingerDistanceCriterion(self.min_branch_fraction) else: split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) @@ -315,7 +344,7 @@ def learn_one(self, x, y, *, w=1.0): self._n_inactive_leaves += 1 else: weight_seen = node.total_weight - #check if the change detector triggered a change + # check if the change detector triggered a change if node.change_detector.drift_detected: p_branch = p_node.branch_no(x) if isinstance(p_node, DTBranch) else None self._attempt_to_split(node, p_node, p_branch) @@ -345,4 +374,3 @@ def learn_one(self, x, y, *, w=1.0): if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() - diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index 7c178c86c8..5ee89fb431 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -1,10 +1,6 @@ from __future__ import annotations -from river.tree.utils import BranchFactory -from river.utils.norm import normalize_values_in_dict - -from ..splitter.nominal_splitter_classif import NominalSplitterClassif -from ..utils import do_naive_bayes_prediction, round_sig_fig +from ..utils import do_naive_bayes_prediction from .htc_nodes import LeafMajorityClass @@ -24,23 +20,26 @@ class LeafMajorityClassWithDetector(LeafMajorityClass): Other 
parameters passed to the learning node. """ - def __init__(self, stats, depth, splitter,change_detector, split_criterion = None, **kwargs): + def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): super().__init__(stats, depth, splitter, **kwargs) self.change_detector = change_detector - self.split_criterion = split_criterion #if None, the change detector will have binary inputs - + self.split_criterion = ( + split_criterion # if None, the change detector will have binary inputs + ) + def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) if self.is_active(): if self.split_criterion is None: mc_pred = self.prediction(x) - detector_input = (max(mc_pred, key=mc_pred.get) != y) + detector_input = max(mc_pred, key=mc_pred.get) != y self.change_detector.update(detector_input) else: - detector_input = self.split_criterion.purity(self.stats) + detector_input = self.split_criterion.current_merit(self.stats) self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) + class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): """Leaf that uses Naive Bayes models. @@ -57,18 +56,18 @@ class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): Other parameters passed to the learning node. 
""" - def __init__(self, stats, depth, splitter,change_detector, split_criterion = None, **kwargs): - super().__init__(stats, depth, splitter,change_detector,split_criterion,**kwargs) - + def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): + super().__init__(stats, depth, splitter, change_detector, split_criterion, **kwargs) + def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) if self.is_active(): if self.split_criterion is None: nb_pred = self.prediction(x) - detector_input = (max(nb_pred, key=nb_pred.get) == y) + detector_input = max(nb_pred, key=nb_pred.get) == y self.change_detector.update(detector_input) else: - detector_input = self.split_criterion.purity(self.stats) + detector_input = self.split_criterion.current_merit(self.stats) self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) @@ -108,8 +107,8 @@ class LeafNaiveBayesAdaptiveWithDetector(LeafMajorityClassWithDetector): Other parameters passed to the learning node. """ - def __init__(self, stats, depth, splitter, change_detector,split_criterion = None, **kwargs): - super().__init__(stats, depth, splitter, change_detector, split_criterion,**kwargs) + def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): + super().__init__(stats, depth, splitter, change_detector, split_criterion, **kwargs) self._mc_correct_weight = 0.0 self._nb_correct_weight = 0.0 @@ -150,13 +149,10 @@ def learn_one(self, x, y, *, w=1.0, tree=None): else: self.change_detector.update(detector_input_mc) else: - detector_input = self.split_criterion.purity(self.stats) + detector_input = self.split_criterion.current_merit(self.stats) self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) - - - def prediction(self, x, *, tree=None): """Get the probabilities per class for a given instance. 
@@ -188,4 +184,4 @@ def disable_attribute(self, att_index): att_index Attribute index. """ - pass \ No newline at end of file + pass diff --git a/river/tree/split_criterion/base.py b/river/tree/split_criterion/base.py index ae09efd397..84da388b4b 100644 --- a/river/tree/split_criterion/base.py +++ b/river/tree/split_criterion/base.py @@ -31,11 +31,10 @@ def merit_of_split(self, pre_split_dist, post_split_dist): ------- Value of the merit of splitting """ - + @abc.abstractmethod - def purity(self, dist): - """Compute how pure (how close the distribution is to have only a single class) - the distribution is. + def current_merit(self, dist): + """Compute the merit of the distribution. Parameters ---------- @@ -44,7 +43,7 @@ def purity(self, dist): Returns ------- - Value of purity of the distribution according to the splitting merit + Value of merit of the distribution according to the splitting criterion """ @staticmethod diff --git a/river/tree/split_criterion/gini_split_criterion.py b/river/tree/split_criterion/gini_split_criterion.py index abf49ca783..1a71c1b651 100644 --- a/river/tree/split_criterion/gini_split_criterion.py +++ b/river/tree/split_criterion/gini_split_criterion.py @@ -27,9 +27,8 @@ def merit_of_split(self, pre_split_dist, post_split_dist): post_split_dist[i], dist_weights[i] ) return 1.0 - gini - - - def purity(self, dist): + + def current_merit(self, dist): return self.compute_gini(dist, sum(dist.values())) @staticmethod diff --git a/river/tree/split_criterion/hellinger_distance_criterion.py b/river/tree/split_criterion/hellinger_distance_criterion.py index 2f7662f34f..236564f9bd 100644 --- a/river/tree/split_criterion/hellinger_distance_criterion.py +++ b/river/tree/split_criterion/hellinger_distance_criterion.py @@ -28,7 +28,7 @@ def merit_of_split(self, pre_split_dist, post_split_dist): return -math.inf return self.compute_hellinger(post_split_dist) - def purity(self, dist): + def current_merit(self, dist): raise ValueError("The Heillinger 
distance is for 2 or more sets of data.") @staticmethod diff --git a/river/tree/split_criterion/info_gain_split_criterion.py b/river/tree/split_criterion/info_gain_split_criterion.py index f8ced1b03d..b112ff829d 100644 --- a/river/tree/split_criterion/info_gain_split_criterion.py +++ b/river/tree/split_criterion/info_gain_split_criterion.py @@ -38,8 +38,8 @@ def compute_entropy(self, dist): return self._compute_entropy_dict(dist) elif isinstance(dist, list): return self._compute_entropy_list(dist) - - def purity(self, dist): + + def current_merit(self, dist): return self.compute_entropy(dist) @staticmethod diff --git a/river/tree/split_criterion/variance_ratio_split_criterion.py b/river/tree/split_criterion/variance_ratio_split_criterion.py index 3f921da140..dfdff8ea55 100644 --- a/river/tree/split_criterion/variance_ratio_split_criterion.py +++ b/river/tree/split_criterion/variance_ratio_split_criterion.py @@ -34,9 +34,9 @@ def merit_of_split(self, pre_split_dist, post_split_dist): vr -= (n_i / n) * (self.compute_var(post_split_dist[i]) / var) return vr - def purity(self, dist): + def current_merit(self, dist): return self.compute_var(dist) - + @staticmethod def compute_var(dist): return dist.get() From 5d4af5aaa1f79174b0ab6c9b47309935e8ac51c7 Mon Sep 17 00:00:00 2001 From: Saulo Martiello Mastelini Date: Fri, 6 Sep 2024 11:24:53 -0300 Subject: [PATCH 08/16] Update river/tree/hoeffding_adaptive_tree_classifier.py --- river/tree/hoeffding_adaptive_tree_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index ce40900ffc..2ffbf898c4 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -131,6 +131,7 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): >>> evaluate.progressive_val_score(dataset, model, metric) Accuracy: 91.49% + """ def __init__( From 
b9f376d42fcf8124ce57f03f69695ff48ee34302 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Fri, 6 Sep 2024 14:43:17 -0300 Subject: [PATCH 09/16] update docs --- river/tree/last_classifier.py | 24 ++++++++++++------------ river/tree/nodes/last_nodes.py | 22 ++++++++++++++++++++-- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 91b271adb1..0d72c3db0e 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -15,7 +15,11 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): - """Local Adaptive Streaming Tree classifier. + """Local Adaptive Streaming Tree Classifier. + + Local Adaptive Streaming Tree [^1] (LAST) is an incremental decision tree with + adaptive splitting mechanisms. LAST maintains a change detector at each leaf and splits + this node if a change is detected in the error or the leaf`s data distribution. Parameters ---------- @@ -70,12 +74,6 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): merit_preprune If True, enable merit-based tree pre-pruning. - Notes - ----- - Local Adaptive Streaming Tree [^1] (LAST) is an incremental decision tree with - adaptive splitting mechanisms. LAST maintains a change detector at each leaf and splits - this node if a change is detected in the error or the leaf`s data distribution. 
- References ---------- @@ -103,6 +101,7 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): >>> evaluate.progressive_val_score(dataset, model, metric) Accuracy: 92.50% + """ def __init__( @@ -125,11 +124,11 @@ def __init__( merit_preprune: bool = True, ): super().__init__( - grace_period=1, #no usage + grace_period=1, # no usage max_depth=max_depth, split_criterion=split_criterion, - delta=1., #no usage - tau=1, #no usage + delta=1.0, # no usage + tau=1, # no usage leaf_prediction=leaf_prediction, nb_threshold=nb_threshold, binary_split=binary_split, @@ -308,8 +307,9 @@ def learn_one(self, x, y, *, w=1.0): * If the tree is empty, create a leaf node as the root. * If the tree is already initialized, find the corresponding leaf for the instance and update the leaf node statistics. - * Update the leaf change detector with (1 if the tree misclassified the instance, or 0 if it correctly classified) or the data distribution purity - * If growth is allowed and the then attempt + * Update the leaf change detector with (1 if the tree misclassified the instance, + or 0 if it correctly classified) or the data distribution purity + * If growth is allowed, then attempt to split. """ diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index 5ee89fb431..e1ac181dbe 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -16,6 +16,12 @@ class LeafMajorityClassWithDetector(LeafMajorityClass): splitter The numeric attribute observer algorithm used to monitor target statistics and perform split attempts. + change_detector + Change detector that monitors the leaf error rate or class distribution and + determines when the leaf will split. + split_criterion + Split criterion used in the tree for updating the change detector if it + monitors the class distribution. kwargs Other parameters passed to the learning node. 
""" @@ -52,6 +58,12 @@ class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): splitter The numeric attribute observer algorithm used to monitor target statistics and perform split attempts. + change_detector + Change detector that monitors the leaf error rate or class distribution and + determines when the leaf will split. + split_criterion + Split criterion used in the tree for updating the change detector if it + monitors the class distribution. kwargs Other parameters passed to the learning node. """ @@ -103,6 +115,12 @@ class LeafNaiveBayesAdaptiveWithDetector(LeafMajorityClassWithDetector): splitter The numeric attribute observer algorithm used to monitor target statistics and perform split attempts. + change_detector + Change detector that monitors the leaf error rate or class distribution and + determines when the leaf will split. + split_criterion + Split criterion used in the tree for updating the change detector if it + monitors the class distribution. kwargs Other parameters passed to the learning node. """ @@ -124,7 +142,7 @@ def learn_one(self, x, y, *, w=1.0, tree=None): w The instance's weight. tree - The Hoeffding Tree to update. + The tree to update. """ detector_input_mc = 1 @@ -161,7 +179,7 @@ def prediction(self, x, *, tree=None): x Instance attributes. tree - Hoeffding Tree. + LAST Tree. 
Returns ------- From beaa51b1f2842535857dfaa2ee2b799f1ff1dbf3 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Fri, 6 Sep 2024 16:03:41 -0300 Subject: [PATCH 10/16] change docs --- docs/releases/unreleased.md | 6 ++++++ river/tree/last_classifier.py | 2 +- river/tree/nodes/last_nodes.py | 1 + .../intra_cluster_variance_reduction_split_criterion.py | 3 +++ .../split_criterion/variance_reduction_split_criterion.py | 3 +++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index dfb7dd4b19..e046c2a61e 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -17,3 +17,9 @@ ## tree - Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` will stop the trees from growing when they reach the system recursion limit. + +-Added `LASTClassifier` (Local Adaptive Streaming Tree Classifier) + +## stream + +- `stream.iter_arff` now supports blank values (treated as missing values). \ No newline at end of file diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 0d72c3db0e..5bcb923045 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -309,7 +309,7 @@ def learn_one(self, x, y, *, w=1.0): the instance and update the leaf node statistics. * Update the leaf change detector with (1 if the tree misclassified the instance, or 0 if it correctly classified) or the data distribution purity - * If growth is allowed, then attempt + * If growth is allowed then attempt to split. 
""" diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index e1ac181dbe..f130fe17ac 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -29,6 +29,7 @@ class LeafMajorityClassWithDetector(LeafMajorityClass): def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): super().__init__(stats, depth, splitter, **kwargs) self.change_detector = change_detector + # change this in future PR's by acessing the tree parameter in the leaf self.split_criterion = ( split_criterion # if None, the change detector will have binary inputs ) diff --git a/river/tree/split_criterion/intra_cluster_variance_reduction_split_criterion.py b/river/tree/split_criterion/intra_cluster_variance_reduction_split_criterion.py index 1436e817e2..0a1af74773 100644 --- a/river/tree/split_criterion/intra_cluster_variance_reduction_split_criterion.py +++ b/river/tree/split_criterion/intra_cluster_variance_reduction_split_criterion.py @@ -27,6 +27,9 @@ def merit_of_split(self, pre_split_dist, post_split_dist): icvr -= n_i / n * self.compute_var(dist) return icvr + def current_merit(self, dist): + return self.compute_var(dist) + @staticmethod def compute_var(dist): icvr = [vr.get() for vr in dist.values()] diff --git a/river/tree/split_criterion/variance_reduction_split_criterion.py b/river/tree/split_criterion/variance_reduction_split_criterion.py index f52cfa7bd3..147ead573b 100644 --- a/river/tree/split_criterion/variance_reduction_split_criterion.py +++ b/river/tree/split_criterion/variance_reduction_split_criterion.py @@ -35,6 +35,9 @@ def merit_of_split(self, pre_split_dist, post_split_dist): vr -= n_i / n * self.compute_var(post_split_dist[i]) return vr + def current_merit(self, dist): + return self.compute_var(dist) + @staticmethod def compute_var(dist): return dist.get() From cb0c133fc0c411bb2d426365f828cd92cacfc9d6 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Fri, 6 Sep 2024 16:08:00 -0300 Subject: 
[PATCH 11/16] change last --- river/tree/last_classifier.py | 65 ++++++++++++++--------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 5bcb923045..fe2fd41585 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -100,7 +100,7 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): >>> evaluate.progressive_val_score(dataset, model, metric) - Accuracy: 92.50% + Accuracy: 91.10% """ @@ -160,45 +160,30 @@ def _new_leaf(self, initial_stats=None, parent=None): else: depth = parent.depth + 1 - if not self.track_error: - if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector( - initial_stats, depth, self.splitter, self.change_detector.clone() - ) - elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector( - initial_stats, depth, self.splitter, self.change_detector.clone() - ) - else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector( - initial_stats, depth, self.splitter, self.change_detector.clone() - ) - else: - split_criterion = self._new_split_criterion() - if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector( - initial_stats, - depth, - self.splitter, - self.change_detector.clone(), - split_criterion, - ) - elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector( - initial_stats, - depth, - self.splitter, - self.change_detector.clone(), - split_criterion, - ) - else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector( - initial_stats, - depth, - self.splitter, - self.change_detector.clone(), - split_criterion, - ) + if self._leaf_prediction == self._MAJORITY_CLASS: + return LeafMajorityClassWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion=self._new_split_criterion() if not self.track_error else 
None, + ) + elif self._leaf_prediction == self._NAIVE_BAYES: + return LeafNaiveBayesWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion=self._new_split_criterion() if not self.track_error else None, + ) + else: # Naive Bayes Adaptive (default) + return LeafNaiveBayesAdaptiveWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion=self._new_split_criterion() if not self.track_error else None, + ) def _new_split_criterion(self): if self._split_criterion == self._GINI_SPLIT: From 77e7d82b190fbf1f706e5a00a017520115167361 Mon Sep 17 00:00:00 2001 From: Saulo Martiello Mastelini Date: Fri, 6 Sep 2024 16:14:45 -0300 Subject: [PATCH 12/16] Update docs/releases/unreleased.md --- docs/releases/unreleased.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index e046c2a61e..e0536f435b 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -18,7 +18,7 @@ - Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` will stop the trees from growing when they reach the system recursion limit. --Added `LASTClassifier` (Local Adaptive Streaming Tree Classifier) +- Added `tree.LASTClassifier` (Local Adaptive Streaming Tree Classifier).
## stream From b30cbdc5685ec84adea3326337ce5f4e029adba6 Mon Sep 17 00:00:00 2001 From: Saulo Martiello Mastelini Date: Fri, 6 Sep 2024 16:15:52 -0300 Subject: [PATCH 13/16] Update river/tree/hoeffding_adaptive_tree_classifier.py --- river/tree/hoeffding_adaptive_tree_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 2ffbf898c4..c825fa5f38 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -129,7 +129,6 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) - Accuracy: 91.49% """ From 892eb41b50e4e3c1cd56baa9f0ef347d4299a1c7 Mon Sep 17 00:00:00 2001 From: Saulo Martiello Mastelini Date: Fri, 6 Sep 2024 16:19:29 -0300 Subject: [PATCH 14/16] Update river/tree/last_classifier.py --- river/tree/last_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index fe2fd41585..349d82160f 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -99,7 +99,6 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) - Accuracy: 91.10% """ From b338c922236d635a294396d5a581f83f2deadd13 Mon Sep 17 00:00:00 2001 From: Daniel Nowak Date: Fri, 6 Sep 2024 16:42:09 -0300 Subject: [PATCH 15/16] add disclaimer --- river/tree/last_classifier.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 349d82160f..ff51bb647b 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -21,6 +21,9 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): adaptive splitting mechanisms.
LAST maintains a change detector at each leaf and splits this node if a change is detected in the error or the leaf`s data distribution. + LAST is still not suitable as classifiers of an ensemble due to the change detectors. + The authors in [^1] are working in a version of LAST that overcomes this. + Parameters ---------- max_depth From 46312acfeeba0eb974cebcc7ec78d59b12fa9894 Mon Sep 17 00:00:00 2001 From: Saulo Martiello Mastelini Date: Fri, 6 Sep 2024 16:45:51 -0300 Subject: [PATCH 16/16] Update river/tree/last_classifier.py --- river/tree/last_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index ff51bb647b..391ba83668 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -21,8 +21,8 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): adaptive splitting mechanisms. LAST maintains a change detector at each leaf and splits this node if a change is detected in the error or the leaf`s data distribution. - LAST is still not suitable as classifiers of an ensemble due to the change detectors. - The authors in [^1] are working in a version of LAST that overcomes this. + LAST is still not suitable for use as a base classifier in ensembles due to the change detectors. + The authors in [^1] are working on a version of LAST that overcomes this limitation. Parameters ----------