WIP: First implementation of RFNN with sklearn RF estimators
This commit implements RFNN strictly with `RandomForestRegressor`
objects from `scikit-learn`, mimicking the previous implementation that
used `rpy2` and R's `randomForest` package.  The intent is to show that,
regardless of the random forest implementation, RF-NN results should
correspond closely.

At present, however, these tests are failing.  The best guess is that
the random forest implementations and hyperparameters differ enough that
we can't expect the same trees to be built in each forest.  Testing
predicted values will not work until neighbors and distances correspond
closely between the two implementations.

What I've observed is that neighbors for the training data roughly
correspond between implementations but often don't share the same rank.
We have tried Spearman's rank correlation and the footrule distance to
measure the difference in neighbor position, but do not yet have a good
standard for correctness.
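
For reference, a minimal sketch (not project code; the neighbor IDs are made up) of the two measures mentioned above, applied to one plot's neighbor ordering from each forest:

from scipy.stats import spearmanr

# Hypothetical neighbor IDs for a single plot, ordered nearest to farthest;
# both lists are assumed to contain the same IDs.
rpy2_neighbors = [12, 45, 3, 78, 9]
sklearn_neighbors = [12, 3, 45, 9, 78]

sklearn_pos = {nid: i for i, nid in enumerate(sklearn_neighbors)}

# Footrule distance: sum of absolute positional differences (0 = identical order).
footrule = sum(abs(i - sklearn_pos[nid]) for i, nid in enumerate(rpy2_neighbors))

# Spearman's rank correlation between the two position sequences (1.0 = identical order).
rho, _ = spearmanr(list(range(len(rpy2_neighbors))),
                   [sklearn_pos[nid] for nid in rpy2_neighbors])

print(footrule, rho)  # 4 and 0.8 for this made-up example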
grovduck committed Jan 24, 2025
1 parent 6656528 commit fe095ec
Showing 4 changed files with 134 additions and 8 deletions.
6 changes: 5 additions & 1 deletion src/sknnr/_rfnn.py
@@ -17,6 +17,7 @@ def __init__(
        *,
        n_estimators: int = 500,
        mtry: int | None = None,
        method: Literal["rpy2", "sklearn"] = "sklearn",
        weights: Literal["uniform", "distance"] | Callable = "uniform",
        algorithm: Literal["auto", "ball_tree", "kd_tree", "brute"] = "auto",
        leaf_size: int = 30,
@@ -32,6 +33,9 @@ def __init__(
        )
        self.n_estimators = n_estimators
        self.mtry = mtry
        self.method = method

    def _get_transformer(self) -> TransformerMixin:
        return RFNodeTransformer()
        return RFNodeTransformer(
            n_estimators=self.n_estimators, mtry=self.mtry, method=self.method
        )
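
For orientation, a hedged sketch of how the new `method` keyword on the estimator is meant to be used (synthetic data; it assumes `RFNNRegressor` is importable from the package root as in the tests further down, and the `rpy2` path additionally requires R and the `randomForest` package):

import numpy as np

from sknnr import RFNNRegressor  # assumed root-level export, matching the tests below

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))
y = rng.normal(size=(100, 3))  # multi-output targets, one forest per column

# `method` selects which random forest implementation builds the node matrices.
sklearn_est = RFNNRegressor(n_neighbors=5, method="sklearn").fit(X, y)
# rpy2_est = RFNNRegressor(n_neighbors=5, method="rpy2").fit(X, y)  # needs rpy2 + R randomForest

print(sklearn_est.predict(X[:3]).shape)  # one row per query, one column per target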
19 changes: 19 additions & 0 deletions src/sknnr/transformers/_rf_sklearn.py
@@ -0,0 +1,19 @@
from sklearn.ensemble import RandomForestRegressor


def sklearn_get_forest(X, y, n_tree, mt):
    """
    Train a random forest regression model in sklearn.
    """
    rf = RandomForestRegressor(
        n_estimators=n_tree, max_features=mt, random_state=42, min_samples_leaf=5
    )
    rf.fit(X, y)
    return rf


def sklearn_get_nodeset(rf, X):
    """
    Get the nodes associated with X of the random forest regression model in sklearn.
    """
    return rf.apply(X)
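
A small sketch of how these helpers behave (synthetic data; the module path is taken from the file header above, and the shape follows from `RandomForestRegressor.apply`, which returns one leaf index per tree):

import numpy as np

from sknnr.transformers._rf_sklearn import sklearn_get_forest, sklearn_get_nodeset

rng = np.random.default_rng(42)
X = rng.normal(size=(50, 4))
y = rng.normal(size=50)

rf = sklearn_get_forest(X, y, n_tree=25, mt=2)
nodes = sklearn_get_nodeset(rf, X)

# `apply` reports the leaf node reached by each sample in each tree,
# so the node matrix has one row per sample and one column per tree.
assert nodes.shape == (50, 25)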
32 changes: 25 additions & 7 deletions src/sknnr/transformers/_rfnode_transformer.py
@@ -1,17 +1,26 @@
from __future__ import annotations

from typing import Literal

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from .._base import _validate_data
from ._rf_rpy2 import rpy2_get_forest, rpy2_get_nodeset
from ._rf_sklearn import sklearn_get_forest, sklearn_get_nodeset


class RFNodeTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, n_estimators: int = 500, mtry: int | None = None):
    def __init__(
        self,
        n_estimators: int = 500,
        mtry: int | None = None,
        method: Literal["rpy2", "sklearn"] = "sklearn",
    ):
        self.n_estimators = n_estimators
        self.mtry = mtry
        self.method = method

    def fit(self, X, y=None):
        X = _validate_data(
@@ -32,11 +41,17 @@ def fit(self, X, y=None):

        mt = self.mtry if self.mtry else int(np.sqrt(X.shape[1]))

        # Build the individual random forests
        self.rfs_ = [
            rpy2_get_forest(X, y[:, i], int(n_tree_list[i]), mt)
            for i in range(y.shape[1])
        ]
        # Build the individual random forests based on method
        if self.method == "rpy2":
            self.rfs_ = [
                rpy2_get_forest(X, y[:, i], int(n_tree_list[i]), mt)
                for i in range(y.shape[1])
            ]
        elif self.method == "sklearn":
            self.rfs_ = [
                sklearn_get_forest(X, y[:, i], int(n_tree_list[i]), mt)
                for i in range(y.shape[1])
            ]
        return self

    def transform(self, X):
@@ -48,7 +63,10 @@ def transform(self, X):
            ensure_min_features=1,
            ensure_min_samples=1,
        )
        nodes = [rpy2_get_nodeset(rf, X) for rf in self.rfs_]
        if self.method == "rpy2":
            nodes = [rpy2_get_nodeset(rf, X) for rf in self.rfs_]
        elif self.method == "sklearn":
            nodes = [sklearn_get_nodeset(rf, X) for rf in self.rfs_]
        return np.hstack(nodes)

    def fit_transform(self, X, y=None):
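To make the transformer's behavior concrete, a hedged sketch of the fitted output with the sklearn backend (synthetic data; it assumes `RFNodeTransformer` is importable from `sknnr.transformers`, and the per-target tree counts come from `n_tree_list`, which is computed outside the hunks shown above):

import numpy as np

from sknnr.transformers import RFNodeTransformer  # assumed export location

rng = np.random.default_rng(0)
X = rng.normal(size=(80, 5))
y = rng.normal(size=(80, 2))  # two targets -> two forests in rfs_

tf = RFNodeTransformer(n_estimators=100, method="sklearn").fit(X, y)
nodes = tf.transform(X)

# One node-index column per tree, with the per-target forests stacked
# side by side, so the width is the total tree count across all forests.
print(len(tf.rfs_), nodes.shape)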
85 changes: 85 additions & 0 deletions tests/test_port.py
@@ -1,5 +1,7 @@
import pytest
from numpy import corrcoef
from numpy.testing import assert_array_almost_equal, assert_array_equal
from scipy.stats import spearmanr
from sklearn.metrics import r2_score

from sknnr import (
@@ -112,3 +114,86 @@ def test_score_independent(result, n_components, weighted):
    )
    est = estimator(**hyperparams).fit(dataset.X_train, dataset.y_train)
    assert est.independent_score_ == pytest.approx(expected_score, abs=0.001)


@pytest.mark.parametrize("weighted", [False, True], ids=["unweighted", "weighted"])
@pytest.mark.parametrize("reference", [True, False], ids=["reference", "target"])
def test_high_correlation_rfnn(weighted, reference):
"""
Test that predictions for RFNN estimator using yaImpute and sklearn
highly correlate. Currently failing.
"""
dataset = load_moscow_stjoes_results(method="randomForest", n_components=None)
weights = yaimpute_weights if weighted else None

hyperparams = dict(n_neighbors=5, weights=weights)
rpy2_est = RFNNRegressor(method="rpy2", **hyperparams).fit(
dataset.X_train, dataset.y_train
)
sklearn_est = RFNNRegressor(method="sklearn", **hyperparams).fit(
dataset.X_train, dataset.y_train
)

rpy2_pred = (
rpy2_est.independent_prediction_
if reference
else rpy2_est.predict(dataset.X_test)
)
sklearn_pred = (
sklearn_est.independent_prediction_
if reference
else sklearn_est.predict(dataset.X_test)
)
corr = [
float(corrcoef(rpy2_pred[:, i], sklearn_pred[:, i])[0, 1])
for i in range(rpy2_pred.shape[1])
if rpy2_pred[:, i].std() > 0 and sklearn_pred[:, i].std() > 0
]
print(corr)
assert all(c > 0.99 for c in corr)


def test_kneighbors_rfnn():
    """
    Test that plots from different implementations of RF-NN share *most*
    of the same neighbors. Currently failing.
    """
    dataset = load_moscow_stjoes_results(method="randomForest", n_components=None)

    hyperparams = dict(n_neighbors=13)
    rpy2_est = RFNNRegressor(method="rpy2", **hyperparams).fit(
        dataset.X_train, dataset.y_train
    )
    sklearn_est = RFNNRegressor(method="sklearn", **hyperparams).fit(
        dataset.X_train, dataset.y_train
    )

    _, rpy2_nn = rpy2_est.kneighbors(return_dataframe_index=True)
    _, sklearn_nn = sklearn_est.kneighbors(return_dataframe_index=True)

    def footrule_distance(list1, list2):
        """
        The Footrule Distance measures how far elements in one list are from
        their corresponding positions in another list. It calculates the sum
        of absolute positional differences.
        """
        index_map = {int(val): i for i, val in enumerate(list2)}
        not_in_list = len(list2) + 1
        diffs = []
        for i, val in enumerate(list1):
            if int(val) not in index_map:
                diffs.append(abs(i - not_in_list))
            else:
                diffs.append(abs(i - index_map[int(val)]))
        return sum(diffs)

    footrule_distances = []
    for i in range(rpy2_nn.shape[0]):
        fd = footrule_distance(rpy2_nn[i], sklearn_nn[i])
        print(rpy2_nn[i])
        print(sklearn_nn[i])
        print(spearmanr(rpy2_nn[i], sklearn_nn[i]))
        print(fd)
        footrule_distances.append(fd)

    assert all(fd < 10 for fd in footrule_distances)
