From 0e75d35fc3e1c0a479f40c4d093353eecff52949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile=20Royer?= Date: Fri, 19 Jul 2024 17:44:08 +0200 Subject: [PATCH 1/4] Replace btdtri with betaincinv The function scipy.special.btdtri was deprecated in SciPy 1.12, and will be removed in SciPy 1.14. scipy.special.betaincinv should be a drop-in replacement. --- river/bandit/bayes_ucb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/bandit/bayes_ucb.py b/river/bandit/bayes_ucb.py index c60b6353ea..b7868fe38d 100644 --- a/river/bandit/bayes_ucb.py +++ b/river/bandit/bayes_ucb.py @@ -79,7 +79,7 @@ def compute_index(self, arm_id): """the p-th quantile of the beta distribution for the arm""" p = 1 - 1 / (self._n + 1) posterior = self._posteriors[arm_id] - return scipy.special.btdtri(posterior.alpha, posterior.beta, p) + return scipy.special.betaincinv(posterior.alpha, posterior.beta, p) def update(self, arm_id, *reward_args, **reward_kwargs): """Rewrite update function""" From 9b2c89054fcdebeba84d0ed2914c92886b1c2576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile=20Royer?= Date: Fri, 19 Jul 2024 17:53:48 +0200 Subject: [PATCH 2/4] Use plain datetime.now() in test datetime.datetime.utcnow() is deprecated because it is too much of a footgun. Since we don't need timezone-aware objects, now() has the same behaviour (apart from the local time offset from UTC). The exact time used (or its timezone equivalent) has no impact on this test; the only requirement is for both calls to have the same timestamp. 
--- river/utils/test_rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/utils/test_rolling.py b/river/utils/test_rolling.py index 5d484ac7de..7e77ba3d10 100644 --- a/river/utils/test_rolling.py +++ b/river/utils/test_rolling.py @@ -47,6 +47,6 @@ def test_issue_1343(): """ rmean = utils.TimeRolling(proba.MultivariateGaussian(), period=dt.timedelta(microseconds=1)) - t = dt.datetime.utcnow() + t = dt.datetime.now() rmean.update({"a": 0}, t=t) rmean.update({"a": 1}, t=t) From 8dec117c138c88420883046a601f569722ebb7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89mile=20Royer?= Date: Fri, 16 Aug 2024 10:36:18 +0200 Subject: [PATCH 3/4] Remove concatenation with an empty dataframe Concatenation of empty dataframes causes a FutureWarning in pandas. None values in concatenation are ignored, as long as not all objects to concatenate are None. --- river/covariance/test_emp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/covariance/test_emp.py b/river/covariance/test_emp.py index 53167d1cf1..56e7edf74d 100644 --- a/river/covariance/test_emp.py +++ b/river/covariance/test_emp.py @@ -95,7 +95,7 @@ def test_covariance_update_sampled(): def test_covariance_update_many(ddof): cov = covariance.EmpiricalCovariance(ddof=ddof) p = 5 - X_all = pd.DataFrame(columns=range(p)) + X_all = None for _ in range(p): n = np.random.randint(1, 31) @@ -123,7 +123,7 @@ def test_covariance_update_many(ddof): def test_covariance_update_many_shuffled(ddof): cov = covariance.EmpiricalCovariance(ddof=ddof) p = 5 - X_all = pd.DataFrame(columns=range(p)) + X_all = None for _ in range(p): n = np.random.randint(5, 31) @@ -143,7 +143,7 @@ def test_covariance_update_many_sampled(): ddof = 1 cov = covariance.EmpiricalCovariance(ddof=ddof) p = 5 - X_all = pd.DataFrame(columns=range(p)) + X_all = None for _ in range(p): n = np.random.randint(5, 31) From 422aafb75ad09c098d07de6e0c8170a8a808bcd0 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=C3=89mile=20Royer?= Date: Wed, 13 Nov 2024 15:19:26 +0100 Subject: [PATCH 4/4] Preserve the sparse nature of dataframes In a sparse dataframe, all elements must have a sparse Dtype. Values added after the fact need to be converted. --- river/naive_bayes/bernoulli.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/river/naive_bayes/bernoulli.py b/river/naive_bayes/bernoulli.py index 111dbabf89..0967cb47cd 100644 --- a/river/naive_bayes/bernoulli.py +++ b/river/naive_bayes/bernoulli.py @@ -252,18 +252,24 @@ def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame: unknown = [x for x in X.columns if x not in self.feature_counts] missing = [x for x in self.feature_counts if x not in X.columns] + is_sparse = hasattr(X, "sparse") + if unknown: X = X.drop(unknown, axis="columns") if missing: - X[missing] = False + X[missing] = 0 + if is_sparse: + # The new values need to be converted to preserve the sparseness of the dataframe. + # Input values can be integers or floats; converting all to float preserves the behaviour without the need for complex conversion logic. + X = X.astype(pd.SparseDtype(float, 0.0)) index, columns = X.index, X.columns if not self.class_counts or not self.feature_counts: return pd.DataFrame(index=index) - if hasattr(X, "sparse"): + if is_sparse: X = sparse.csr_matrix(X.sparse.to_coo()) X.data = X.data > self.true_threshold else: