Merge branch 'main' into feature/expected-categories

online-ml · Nov 5, 2024 · 4cd961e · 4cd961e
2 parents fcf01f1 + e958fd8
commit 4cd961e
Show file tree

Hide file tree

Showing 166 changed files with 1,366 additions and 660 deletions.
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -122,7 +122,7 @@ jobs:
     needs: [build_wheels, build_sdist]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4.1.7
         with:
           name: artifact
           path: dist

diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml
@@ -15,7 +15,7 @@ jobs:
       - name: Build River
         uses: ./.github/actions/install-env
         with:
-          python-version: "3.12.3"
+          python-version: "3.12"
 
       - name: Install extra Ubuntu dependencies
         run: sudo apt-get install graphviz pandoc

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,26 +1,28 @@
 files: river
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.2.0
+    rev: v4.4.0
     hooks:
       - id: check-json
       - id: check-yaml
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-      - id: mixed-line-ending
 
-  - repo: local
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.5.7
     hooks:
+      # Run the linter.
       - id: ruff
-        name: ruff
-        language: python
-        types: [python]
-        entry: ruff
-        args: 
-          - --fix
+        types_or: [python, pyi, jupyter]
+        args: [--fix]
+      # Run the formatter.
+      - id: ruff-format
+        types_or: [python, pyi, jupyter]
 
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.1.1"
+    hooks:
       - id: mypy
-        name: mypy
-        language: python
-        types: [python]
-        entry: mypy --implicit-optional
+        args:
+          - "--config-file=pyproject.toml"
+          - "--python-version=3.11"
+          - "--implicit-optional"
diff --git a/docs/introduction/why-use-river.md b/docs/introduction/why-use-river.md
@@ -14,4 +14,4 @@ River supports different machine learning tasks, including regression, classific
 
 ## User experience
 
-River is not the only library allowing you to do online machine learning. But it might just the simplest one to use in the Python ecosystem. River plays nicely with Python dictionaries, therefore making it easy to use in the context of web applications where JSON payloads are aplenty.
+River is not the only library allowing you to do online machine learning. But it might just be the simplest one to use in the Python ecosystem. River plays nicely with Python dictionaries, therefore making it easy to use in the context of web applications where JSON payloads are aplenty.
diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
@@ -1,6 +1,7 @@
 # Unreleased
 
 - The units used in River have been corrected to be based on powers of 2 (KiB, MiB). This only changes the display, the behaviour is unchanged.
+- The methods `learn_one`, `learn_many`, `update`, `revert`, and `append` now return `None`.
 
 ## cluster
 
@@ -9,9 +10,22 @@
 - Add `render_ascii` in `cluster.ODAC` to render the hierarchical cluster's structure in text format.
 - Work with `stats.Var` in `cluster.ODAC` when cluster has only one time series.
 
+## drift
+
+- Make `drift.ADWIN` comply with the reference MOA implementation.
+
+## stats
+
+- Removed the unexported class `stats.CentralMoments`.
+
 ## tree
 
 - Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` will stop the trees from growing when they reach the system recursion limit.
+- Added `tree.LASTClassifier` (Local Adaptive Streaming Tree Classifier).
+
+## stream
+
+- `stream.iter_arff` now supports blank values (treated as missing values).
 
 ## preprocessing
 

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -85,7 +85,6 @@ plugins:
 
 extra_javascript:
   - javascripts/config.js
-  - https://polyfill.io/v3/polyfill.min.js?features=es6
   - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
   - https://cdn.jsdelivr.net/npm/vega@5
   - https://cdn.jsdelivr.net/npm/vega-lite@5

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,10 +37,10 @@ pandas = "^2.1"
 graphviz = "^0.20.1"
 gymnasium = "^0.29.0"
 matplotlib = "^3.0.2"
-mypy = "^1.6.1"
+mypy = "^1.11.1"
 pre-commit = "^3.5.0"
 pytest = "^7.4.2"
-ruff = "^0.1.1"
+ruff = "^0.4.10"
 scikit-learn = "^1.3.1"
 sqlalchemy = "^2.0.22"
 sympy = "^1.10.1"
@@ -117,20 +117,37 @@ markers = [
 ]
 
 [tool.ruff]
-select = ["E", "F", "I", "UP"]  # https://beta.ruff.rs/docs/rules/
 line-length = 100
 target-version = 'py310'
+extend-include = ["*.ipynb"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # isort
+    "I",
+]
 ignore = ["E501"]
+fixable = ["ALL"]
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 required-imports = ["from __future__ import annotations"]
 
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+
 [tool.mypy]
 files = "river"
+strict = true
 
 [[tool.mypy.overrides]]
 module = [
-    "river.*",
     "mmh3.*",
     "numpy.*",
     "sklearn.*",
@@ -141,6 +158,65 @@ module = [
     "vaex.*",
     "torch.*",
     "sqlalchemy.*",
-    "requests.*"
+    "requests.*",
+    "gymnasium.*",
+    "sympy.*",
+    "polars.*"
 ]
 ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+# Disable strict mode for all non fully-typed modules
+module = [
+    "river.base.*",
+    "river.metrics.*",
+    "river.utils.*",
+    "river.stats.*",
+    "river.optim.*",
+    "river.datasets.*",
+    "river.tree.*",
+    "river.preprocessing.*",
+    "river.stream.*",
+    "river.linear_model.*",
+    "river.evaluate.*",
+    "river.drift.*",
+    "river.compose.*",
+    "river.bandit.*",
+    "river.cluster.*",
+    "river.anomaly.*",
+    "river.time_series.*",
+    "river.feature_extraction.*",
+    "river.ensemble.*",
+    "river.proba.*",
+    "river.multioutput.*",
+    "river.naive_bayes.*",
+    "river.checks.*",
+    "river.rules.*",
+    "river.model_selection.*",
+    "river.forest.*",
+    "river.neighbors.*",
+    "river.sketch.*",
+    "river.facto.*",
+    "river.covariance.*",
+    "river.compat.*",
+    "river.multiclass.*",
+    "river.reco.*",
+    "river.imblearn.*",
+    "river.feature_selection.*",
+    "river.misc.*",
+    "river.active.*",
+    "river.conf.*",
+    "river.neural_net.*",
+    "river.test_estimators",
+    "river.dummy",
+]
+# The strict option is global, the checks must be disabled one by one
+warn_unused_ignores = false
+check_untyped_defs = false
+allow_subclassing_any = true
+allow_any_generics = true
+allow_untyped_calls = true
+allow_incomplete_defs = true
+allow_untyped_defs = true
+implicit_reexport = true
+warn_return_any = false
diff --git a/river/__init__.py b/river/__init__.py
@@ -5,6 +5,7 @@
 memory, or simply when it isn't available all at once. river's API is heavily inspired from that of
 scikit-learn, enough so that users who are familiar with scikit-learn should feel right at home.
 """
+
 from __future__ import annotations
 
 from .__version__ import __version__  # noqa: F401
diff --git a/river/active/__init__.py b/river/active/__init__.py
@@ -1,4 +1,5 @@
 """Online active learning."""
+
 from __future__ import annotations
 
 from . import base

diff --git a/river/active/base.py b/river/active/base.py
@@ -30,8 +30,7 @@ def _wrapped_model(self):
         return self.classifier
 
     @abc.abstractmethod
-    def _ask_for_label(self, x, y_pred) -> bool:
-        ...
+    def _ask_for_label(self, x, y_pred) -> bool: ...
 
     def predict_proba_one(self, x, **kwargs):
         """Predict the probability of each label for `x` and indicate whether a label is needed.

diff --git a/river/active/entropy.py b/river/active/entropy.py
@@ -63,9 +63,7 @@ class EntropySampler(ActiveLearningClassifier):
 
     """
 
-    def __init__(
-        self, classifier: base.Classifier, discount_factor: float = 3, seed=None
-    ):
+    def __init__(self, classifier: base.Classifier, discount_factor: float = 3, seed=None):
         super().__init__(classifier, seed=seed)
         self.discount_factor = discount_factor
 

diff --git a/river/anomaly/base.py b/river/anomaly/base.py
@@ -15,7 +15,7 @@ def _supervised(self):
         return False
 
     @abc.abstractmethod
-    def learn_one(self, x: dict):
+    def learn_one(self, x: dict) -> None:
         """Update the model.
 
         Parameters
@@ -48,7 +48,7 @@ class SupervisedAnomalyDetector(base.Estimator):
     """A supervised anomaly detector."""
 
     @abc.abstractmethod
-    def learn_one(self, x: dict, y: base.typing.Target):
+    def learn_one(self, x: dict, y: base.typing.Target) -> None:
         """Update the model.
 
         Parameters
@@ -137,7 +137,7 @@ def score_one(self, *args, **kwargs):
         """
         return self.anomaly_detector.score_one(*args, **kwargs)
 
-    def learn_one(self, *args, **learn_kwargs):
+    def learn_one(self, *args, **learn_kwargs) -> None:
         """Update the anomaly filter and the underlying anomaly detector.
 
         Parameters

diff --git a/river/anomaly/filter.py b/river/anomaly/filter.py
@@ -86,9 +86,7 @@ class ThresholdFilter(anomaly.base.AnomalyFilter):
 
     """
 
-    def __init__(
-        self, anomaly_detector, threshold: float, protect_anomaly_detector=True
-    ):
+    def __init__(self, anomaly_detector, threshold: float, protect_anomaly_detector=True):
         super().__init__(
             anomaly_detector=anomaly_detector,
             protect_anomaly_detector=protect_anomaly_detector,
@@ -188,7 +186,6 @@ def _unit_test_params(cls):
         from river import preprocessing
 
         yield {
-            "anomaly_detector": preprocessing.StandardScaler()
-            | anomaly.OneClassSVM(nu=0.2),
+            "anomaly_detector": preprocessing.StandardScaler() | anomaly.OneClassSVM(nu=0.2),
             "q": 0.995,
         }
diff --git a/river/anomaly/pad.py b/river/anomaly/pad.py
@@ -80,7 +80,7 @@ class PredictiveAnomalyDetection(anomaly.base.SupervisedAnomalyDetector):
 
     >>> for t, (x, y) in enumerate(datasets.AirlinePassengers()):
     ...     score = PAD.score_one(None, y)
-    ...     PAD = PAD.learn_one(None, y)
+    ...     PAD.learn_one(None, y)
     ...     scores.append(score)
 
     >>> print(scores[-1])
@@ -100,7 +100,6 @@ def __init__(
         n_std: float = 3.0,
         warmup_period: int = 0,
     ):
-
         self.predictive_model = (
             predictive_model
             if predictive_model is not None
@@ -123,9 +122,7 @@ def learn_one(self, x: dict | None, y: base.typing.Target | float):
         self.iter += 1
 
         # Check whether the model is a time-series forecasting or regression/classification model
-        if isinstance(
-            self.predictive_model, time_series.base.Forecaster
-        ) and isinstance(y, float):
+        if isinstance(self.predictive_model, time_series.base.Forecaster) and isinstance(y, float):
             # When there's no data point as dict of features, the target will be passed
             # to the forecaster as an exogenous variable.
             if not x:
@@ -134,7 +131,6 @@ def learn_one(self, x: dict | None, y: base.typing.Target | float):
                 self.predictive_model.learn_one(y=y, x=x)
         else:
             self.predictive_model.learn_one(x=x, y=y)
-        return self
 
     def score_one(self, x: dict, y: base.typing.Target):
         # Return the predicted value of x from the predictive model, first by checking whether

diff --git a/river/api.py b/river/api.py
@@ -1,4 +1,5 @@
 """River API module."""
+
 from __future__ import annotations
 
 from . import (

diff --git a/river/bandit/__init__.py b/river/bandit/__init__.py
@@ -5,6 +5,7 @@
 (see `model_selection.BanditRegressor`).
 
 """
+
 from __future__ import annotations
 
 from . import base, datasets, envs

diff --git a/river/bandit/base.py b/river/bandit/base.py
@@ -65,8 +65,7 @@ def __post_init__(self):
             )
 
     @abc.abstractmethod
-    def _pull(self, arm_ids: list[ArmID]) -> ArmID:
-        ...
+    def _pull(self, arm_ids: list[ArmID]) -> ArmID: ...
 
     def pull(self, arm_ids: list[ArmID]) -> ArmID:
         """Pull arm(s).

diff --git a/river/bandit/bayes_ucb.py b/river/bandit/bayes_ucb.py
@@ -63,9 +63,9 @@ class BayesUCB(bandit.base.Policy):
 
     def __init__(self, reward_obj=None, burn_in=0, seed: int | None = None):
         super().__init__(reward_obj, burn_in)
-        self._posteriors: collections.defaultdict[
-            bandit.base.ArmID, proba.Beta
-        ] = collections.defaultdict(proba.Beta)
+        self._posteriors: collections.defaultdict[bandit.base.ArmID, proba.Beta] = (
+            collections.defaultdict(proba.Beta)
+        )
         self.seed = seed
         self._rng = random.Random(seed)
 

diff --git a/river/bandit/envs/__init__.py b/river/bandit/envs/__init__.py
@@ -15,13 +15,13 @@
 
     RIVER_NAMESPACE = "river_bandits"
 
-    if (env_id := f"{RIVER_NAMESPACE}/CandyCaneContest-v0") not in gym.envs.registry:
+    if (env_id := f"{RIVER_NAMESPACE}/CandyCaneContest-v0") not in gym.envs.registration.registry:
         gym.envs.registration.register(
             id=env_id,
             entry_point="river.bandit.envs:CandyCaneContest",
             max_episode_steps=CandyCaneContest.n_steps,
         )
-    if (env_id := f"{RIVER_NAMESPACE}/KArmedTestbed-v0") not in gym.envs.registry:
+    if (env_id := f"{RIVER_NAMESPACE}/KArmedTestbed-v0") not in gym.envs.registration.registry:
         gym.envs.registration.register(
             id=env_id,
             entry_point="river.bandit.envs:KArmedTestbed",
Original file line number	Diff line number	Diff line change
Expand Up		@@ -14,4 +14,4 @@ River supports different machine learning tasks, including regression, classific

		## User experience

		River is not the only library allowing you to do online machine learning. But it might just the simplest one to use in the Python ecosystem. River plays nicely with Python dictionaries, therefore making it easy to use in the context of web applications where JSON payloads are aplenty.
		River is not the only library allowing you to do online machine learning. But it might just be the simplest one to use in the Python ecosystem. River plays nicely with Python dictionaries, therefore making it easy to use in the context of web applications where JSON payloads are aplenty.
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,7 @@ @@
     (see `model_selection.BanditRegressor`).
     """
     from __future__ import annotations
     from . import base, datasets, envs
@@ Expand Down @@