diff --git a/src/sknnr/_rfnn.py b/src/sknnr/_rfnn.py index 0d81a8b..e403997 100644 --- a/src/sknnr/_rfnn.py +++ b/src/sknnr/_rfnn.py @@ -17,6 +17,7 @@ def __init__( *, n_estimators: int = 500, mtry: int | None = None, + method: Literal["rpy2", "sklearn"] = "sklearn", weights: Literal["uniform", "distance"] | Callable = "uniform", algorithm: Literal["auto", "ball_tree", "kd_tree", "brute"] = "auto", leaf_size: int = 30, @@ -32,6 +33,9 @@ def __init__( ) self.n_estimators = n_estimators self.mtry = mtry + self.method = method def _get_transformer(self) -> TransformerMixin: - return RFNodeTransformer() + return RFNodeTransformer( + n_estimators=self.n_estimators, mtry=self.mtry, method=self.method + ) diff --git a/src/sknnr/transformers/_rf_sklearn.py b/src/sknnr/transformers/_rf_sklearn.py new file mode 100644 index 0000000..e928f21 --- /dev/null +++ b/src/sknnr/transformers/_rf_sklearn.py @@ -0,0 +1,19 @@ +from sklearn.ensemble import RandomForestRegressor + + +def sklearn_get_forest(X, y, n_tree, mt): + """ + Train a random forest regression model in sklearn. + """ + rf = RandomForestRegressor( + n_estimators=n_tree, max_features=mt, random_state=42, min_samples_leaf=5 + ) + rf.fit(X, y) + return rf + + +def sklearn_get_nodeset(rf, X): + """ + Get the nodes associated with X of the random forest regression model in sklearn. + """ + return rf.apply(X) diff --git a/src/sknnr/transformers/_rfnode_transformer.py b/src/sknnr/transformers/_rfnode_transformer.py index 86c3b14..20a096f 100644 --- a/src/sknnr/transformers/_rfnode_transformer.py +++ b/src/sknnr/transformers/_rfnode_transformer.py @@ -1,17 +1,26 @@ from __future__ import annotations +from typing import Literal + import numpy as np from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted from .._base import _validate_data from ._rf_rpy2 import rpy2_get_forest, rpy2_get_nodeset +from ._rf_sklearn import sklearn_get_forest, sklearn_get_nodeset class RFNodeTransformer(TransformerMixin, BaseEstimator): - def __init__(self, n_estimators: int = 500, mtry: int | None = None): + def __init__( + self, + n_estimators: int = 500, + mtry: int | None = None, + method: Literal["rpy2", "sklearn"] = "sklearn", + ): self.n_estimators = n_estimators self.mtry = mtry + self.method = method def fit(self, X, y=None): X = _validate_data( @@ -32,11 +41,17 @@ def fit(self, X, y=None): mt = self.mtry if self.mtry else int(np.sqrt(X.shape[1])) - # Build the individual random forests - self.rfs_ = [ - rpy2_get_forest(X, y[:, i], int(n_tree_list[i]), mt) - for i in range(y.shape[1]) - ] + # Build the individual random forests based on method + if self.method == "rpy2": + self.rfs_ = [ + rpy2_get_forest(X, y[:, i], int(n_tree_list[i]), mt) + for i in range(y.shape[1]) + ] + elif self.method == "sklearn": + self.rfs_ = [ + sklearn_get_forest(X, y[:, i], int(n_tree_list[i]), mt) + for i in range(y.shape[1]) + ] return self def transform(self, X): @@ -48,7 +63,10 @@ def transform(self, X): ensure_min_features=1, ensure_min_samples=1, ) - nodes = [rpy2_get_nodeset(rf, X) for rf in self.rfs_] + if self.method == "rpy2": + nodes = [rpy2_get_nodeset(rf, X) for rf in self.rfs_] + elif self.method == "sklearn": + nodes = [sklearn_get_nodeset(rf, X) for rf in self.rfs_] return np.hstack(nodes) def fit_transform(self, X, y=None): diff --git a/tests/test_port.py b/tests/test_port.py index 421551d..5a4c527 100644 --- a/tests/test_port.py +++ b/tests/test_port.py @@ -1,5 +1,7 @@ import pytest +from numpy import corrcoef from numpy.testing import assert_array_almost_equal, assert_array_equal +from scipy.stats import spearmanr from sklearn.metrics import r2_score from sknnr import ( @@ -112,3 +114,86 @@ def test_score_independent(result, n_components, weighted): ) est = estimator(**hyperparams).fit(dataset.X_train, dataset.y_train) assert est.independent_score_ == pytest.approx(expected_score, abs=0.001) + + +@pytest.mark.parametrize("weighted", [False, True], ids=["unweighted", "weighted"]) +@pytest.mark.parametrize("reference", [True, False], ids=["reference", "target"]) +def test_high_correlation_rfnn(weighted, reference): + """ + Test that predictions for RFNN estimator using yaImpute and sklearn + highly correlate. Currently failing. + """ + dataset = load_moscow_stjoes_results(method="randomForest", n_components=None) + weights = yaimpute_weights if weighted else None + + hyperparams = dict(n_neighbors=5, weights=weights) + rpy2_est = RFNNRegressor(method="rpy2", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + sklearn_est = RFNNRegressor(method="sklearn", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + + rpy2_pred = ( + rpy2_est.independent_prediction_ + if reference + else rpy2_est.predict(dataset.X_test) + ) + sklearn_pred = ( + sklearn_est.independent_prediction_ + if reference + else sklearn_est.predict(dataset.X_test) + ) + corr = [ + float(corrcoef(rpy2_pred[:, i], sklearn_pred[:, i])[0, 1]) + for i in range(rpy2_pred.shape[1]) + if rpy2_pred[:, i].std() > 0 and sklearn_pred[:, i].std() > 0 + ] + print(corr) + assert all(c > 0.99 for c in corr) + + +def test_kneighbors_rfnn(): + """ + Test that plots from different implementations of RF-NN share *most* + of the same neighbors. Currently failing. + """ + dataset = load_moscow_stjoes_results(method="randomForest", n_components=None) + + hyperparams = dict(n_neighbors=13) + rpy2_est = RFNNRegressor(method="rpy2", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + sklearn_est = RFNNRegressor(method="sklearn", **hyperparams).fit( + dataset.X_train, dataset.y_train + ) + + _, rpy2_nn = rpy2_est.kneighbors(return_dataframe_index=True) + _, sklearn_nn = sklearn_est.kneighbors(return_dataframe_index=True) + + def footrule_distance(list1, list2): + """ + The Footrule Distance measures how far elements in one list are from + their corresponding positions in another list. It calculates the sum of absolute + positional differences. + """ + index_map = {int(val): i for i, val in enumerate(list2)} + not_in_list = len(list2) + 1 + diffs = [] + for i, val in enumerate(list1): + if int(val) not in index_map: + diffs.append(abs(i - not_in_list)) + else: + diffs.append(abs(i - index_map[int(val)])) + return sum(diffs) + + footrule_distances = [] + for i in range(rpy2_nn.shape[0]): + fd = footrule_distance(rpy2_nn[i], sklearn_nn[i]) + print(rpy2_nn[i]) + print(sklearn_nn[i]) + print(spearmanr(rpy2_nn[i], sklearn_nn[i])) + print(fd) + footrule_distances.append(fd) + + assert all(fd < 10 for fd in footrule_distances)