From fe095ecc69d1b0312d87b6d9bc9a9af6ffec1998 Mon Sep 17 00:00:00 2001 From: Matt Gregory Date: Fri, 24 Jan 2025 15:50:10 -0800 Subject: [PATCH] WIP: First implementation of RFNN with sklearn RF estimators This commit implements RFNN using strictly `RandomForestRegressor` objects from `scikit-learn` to mimic the previous implementation using `rpy2` and R's `randomForest` package. This commit was intended to show that regardless of random forest implementation, results from RF-NN should correspond closely. At present, however, these tests are currently failing. Best guess is that the implementation of random forest and hyperparameters differ enough that we can't expect that the same trees are being created in each forest. Testing predicted values will not work until neighbors and distances correspond closely between the two implementations. What I've observed is that neighbors for training data roughly correspond to each other but often don't share the same rank. We have tried using Spearman's rank correlation and the footrule distance to get a measure of the difference in neighbor position, but do not have a good standard for correctness. --- src/sknnr/_rfnn.py | 6 +- src/sknnr/transformers/_rf_sklearn.py | 19 +++++ src/sknnr/transformers/_rfnode_transformer.py | 32 +++++-- tests/test_port.py | 85 +++++++++++++++++++ 4 files changed, 134 insertions(+), 8 deletions(-) create mode 100644 src/sknnr/transformers/_rf_sklearn.py diff --git a/src/sknnr/_rfnn.py b/src/sknnr/_rfnn.py index 0d81a8b..e403997 100644 --- a/src/sknnr/_rfnn.py +++ b/src/sknnr/_rfnn.py @@ -17,6 +17,7 @@ def __init__( *, n_estimators: int = 500, mtry: int | None = None, + method: Literal["rpy2", "sklearn"] = "sklearn", weights: Literal["uniform", "distance"] | Callable = "uniform", algorithm: Literal["auto", "ball_tree", "kd_tree", "brute"] = "auto", leaf_size: int = 30, @@ -32,6 +33,9 @@ def __init__( ) self.n_estimators = n_estimators self.mtry = mtry + self.method = method def _get_transformer(self) -> TransformerMixin: - return RFNodeTransformer() + return RFNodeTransformer( + n_estimators=self.n_estimators, mtry=self.mtry, method=self.method + ) diff --git a/src/sknnr/transformers/_rf_sklearn.py b/src/sknnr/transformers/_rf_sklearn.py new file mode 100644 index 0000000..e928f21 --- /dev/null +++ b/src/sknnr/transformers/_rf_sklearn.py @@ -0,0 +1,19 @@ +from sklearn.ensemble import RandomForestRegressor + + +def sklearn_get_forest(X, y, n_tree, mt): + """ + Train a random forest regression model in sklearn. + """ + rf = RandomForestRegressor( + n_estimators=n_tree, max_features=mt, random_state=42, min_samples_leaf=5 + ) + rf.fit(X, y) + return rf + + +def sklearn_get_nodeset(rf, X): + """ + Get the nodes associated with X of the random forest regression model in sklearn. + """ + return rf.apply(X) diff --git a/src/sknnr/transformers/_rfnode_transformer.py b/src/sknnr/transformers/_rfnode_transformer.py index 86c3b14..20a096f 100644 --- a/src/sknnr/transformers/_rfnode_transformer.py +++ b/src/sknnr/transformers/_rfnode_transformer.py @@ -1,17 +1,26 @@ from __future__ import annotations +from typing import Literal + import numpy as np from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted from .._base import _validate_data from ._rf_rpy2 import rpy2_get_forest, rpy2_get_nodeset +from ._rf_sklearn import sklearn_get_forest, sklearn_get_nodeset class RFNodeTransformer(TransformerMixin, BaseEstimator): - def __init__(self, n_estimators: int = 500, mtry: int | None = None): + def __init__( + self, + n_estimators: int = 500, + mtry: int | None = None, + method: Literal["rpy2", "sklearn"] = "sklearn", + ): self.n_estimators = n_estimators self.mtry = mtry + self.method = method def fit(self, X, y=None): X = _validate_data( @@ -32,11 +41,17 @@ def fit(self, X, y=None): mt = self.mtry if self.mtry else int(np.sqrt(X.shape[1])) - # Build the individual random forests - self.rfs_ = [ - rpy2_get_forest(X, y[:, i], int(n_tree_list[i]), mt) - for i in range(y.shape[1]) - ] + # Build the individual random forests based on method + if self.method == "rpy2": + self.rfs_ = [ + rpy2_get_forest(X, y[:, i], int(n_tree_list[i]), mt) + for i in range(y.shape[1]) + ] + elif self.method == "sklearn": + self.rfs_ = [ + sklearn_get_forest(X, y[:, i], int(n_tree_list[i]), mt) + for i in range(y.shape[1]) + ] return self def transform(self, X): @@ -48,7 +63,10 @@ def transform(self, X): ensure_min_features=1, ensure_min_samples=1, ) - nodes = [rpy2_get_nodeset(rf, X) for rf in self.rfs_] + if self.method == "rpy2": + nodes = [rpy2_get_nodeset(rf, X) for rf in self.rfs_] + elif self.method == "sklearn": + nodes = [sklearn_get_nodeset(rf, X) for rf in self.rfs_] return np.hstack(nodes) def fit_transform(self, X, y=None): diff --git a/tests/test_port.py b/tests/test_port.py index 421551d..5a4c527 100644 --- a/tests/test_port.py +++ b/tests/test_port.py @@ -1,5 +1,7 @@ import pytest +from numpy import corrcoef from numpy.testing import assert_array_almost_equal, assert_array_equal +from scipy.stats import spearmanr from sklearn.metrics import r2_score from sknnr import ( @@ -112,3 +114,86 @@ def test_score_independent(result, n_components, weighted): ) est = estimator(**hyperparams).fit(dataset.X_train, dataset.y_train) assert est.independent_score_ == pytest.approx(expected_score, abs=0.001) + + +@pytest.mark.parametrize("weighted", [False, True], ids=["unweighted", "weighted"]) +@pytest.mark.parametrize("reference", [True, False], ids=["reference", "target"]) +def test_high_correlation_rfnn(weighted, reference): + """ + Test that predictions for RFNN estimator using yaImpute and sklearn + highly correlate. Currently failing. + """ + dataset = load_moscow_stjoes_results(method="randomForest", n_components=None) + weights = yaimpute_weights if weighted else None + + hyperparams = dict(n_neighbors=5, weights=weights) + rpy2_est = RFNNRegressor(method="rpy2", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + sklearn_est = RFNNRegressor(method="sklearn", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + + rpy2_pred = ( + rpy2_est.independent_prediction_ + if reference + else rpy2_est.predict(dataset.X_test) + ) + sklearn_pred = ( + sklearn_est.independent_prediction_ + if reference + else sklearn_est.predict(dataset.X_test) + ) + corr = [ + float(corrcoef(rpy2_pred[:, i], sklearn_pred[:, i])[0, 1]) + for i in range(rpy2_pred.shape[1]) + if rpy2_pred[:, i].std() > 0 and sklearn_pred[:, i].std() > 0 + ] + print(corr) + assert all(c > 0.99 for c in corr) + + +def test_kneighbors_rfnn(): + """ + Test that plots from different implementations of RF-NN share *most* + of the same neighbors. Currently failing. + """ + dataset = load_moscow_stjoes_results(method="randomForest", n_components=None) + + hyperparams = dict(n_neighbors=13) + rpy2_est = RFNNRegressor(method="rpy2", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + sklearn_est = RFNNRegressor(method="sklearn", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + + _, rpy2_nn = rpy2_est.kneighbors(return_dataframe_index=True) + _, sklearn_nn = sklearn_est.kneighbors(return_dataframe_index=True) + + def footrule_distance(list1, list2): + """ + The Footrule Distance measures how far elements in one list are from + their corresponding positions in another list. It calculates the sum of absolute + positional differences. + """ + index_map = {int(val): i for i, val in enumerate(list2)} + not_in_list = len(list2) + 1 + diffs = [] + for i, val in enumerate(list1): + if int(val) not in index_map: + diffs.append(abs(i - not_in_list)) + else: + diffs.append(abs(i - index_map[int(val)])) + return sum(diffs) + + footrule_distances = [] + for i in range(rpy2_nn.shape[0]): + fd = footrule_distance(rpy2_nn[i], sklearn_nn[i]) + print(rpy2_nn[i]) + print(sklearn_nn[i]) + print(spearmanr(rpy2_nn[i], sklearn_nn[i])) + print(fd) + footrule_distances.append(fd) + + assert all(fd < 10 for fd in footrule_distances)