Skip to content

Commit

Permalink
Release
Browse files Browse the repository at this point in the history
  • Loading branch information
cerlymarco committed Oct 1, 2021
1 parent bb7bf57 commit 5d77def
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 137 deletions.
Binary file modified imgs/linear_boost_importances.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
85 changes: 36 additions & 49 deletions lineartree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from joblib import Parallel, effective_n_jobs # , delayed

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.base import is_regressor
Expand All @@ -19,6 +19,9 @@
from ._criterion import mse, rmse, mae, poisson
from ._criterion import hamming, crossentropy

import sklearn
# True when the installed scikit-learn has a major version of 1 or higher.
# Used below to choose the tree criterion name ('squared_error' vs 'mse').
# The major component is parsed with int() rather than eval() so the version
# string is never executed as code.
_sklearn_v1 = int(sklearn.__version__.split('.')[0]) > 0


CRITERIA = {"mse": mse,
"rmse": rmse,
Expand Down Expand Up @@ -853,8 +856,7 @@ def __init__(self, base_estimator, *, loss, n_estimators,
max_depth, min_samples_split, min_samples_leaf,
min_weight_fraction_leaf, max_features,
random_state, max_leaf_nodes,
min_impurity_decrease, min_impurity_split,
ccp_alpha):
min_impurity_decrease, ccp_alpha):

self.base_estimator = base_estimator
self.loss = loss
Expand All @@ -867,7 +869,6 @@ def __init__(self, base_estimator, *, loss, n_estimators,
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.ccp_alpha = ccp_alpha

def _fit(self, X, y, sample_weight=None):
Expand Down Expand Up @@ -918,47 +919,33 @@ def _fit(self, X, y, sample_weight=None):
else:
resid = SCORING[self.loss](y, pred)

if self.loss == 'hamming':
tree = DecisionTreeClassifier(
criterion='gini', max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
random_state=self.random_state,
max_leaf_nodes=self.max_leaf_nodes,
min_impurity_decrease=self.min_impurity_decrease,
min_impurity_split=self.min_impurity_split,
ccp_alpha=self.ccp_alpha
)
else:
tree = DecisionTreeRegressor(
criterion='mse', max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
random_state=self.random_state,
max_leaf_nodes=self.max_leaf_nodes,
min_impurity_decrease=self.min_impurity_decrease,
min_impurity_split=self.min_impurity_split,
ccp_alpha=self.ccp_alpha
)
if resid.ndim > 1:
resid = resid.mean(1)

criterion = 'squared_error' if _sklearn_v1 else 'mse'

tree = DecisionTreeRegressor(
criterion=criterion, max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
random_state=self.random_state,
max_leaf_nodes=self.max_leaf_nodes,
min_impurity_decrease=self.min_impurity_decrease,
ccp_alpha=self.ccp_alpha
)

tree.fit(X, resid, sample_weight=sample_weight, check_input=False)
self._trees.append(tree)

impurity = tree.tree_.impurity
pred_leaves = tree.apply(X, check_input=False)
leaves = np.unique(pred_leaves)

worst_leaf = np.argmax([impurity[l] for l in leaves])
worst_leaf = leaves[worst_leaf]
self._leaves.append(worst_leaf)
pred_tree = np.abs(tree.predict(X, check_input=False))
worst_pred = np.max(pred_tree)
self._leaves.append(worst_pred)

pred_leaves = (pred_leaves == worst_leaf).astype(np.float32)
pred_leaves = pred_leaves.reshape(-1, 1)
X = np.concatenate([X, pred_leaves], axis=1)
pred_tree = (pred_tree == worst_pred).astype(np.float32)
pred_tree = pred_tree.reshape(-1, 1)
X = np.concatenate([X, pred_tree], axis=1)

self.base_estimator_ = deepcopy(self.base_estimator)
self.base_estimator_.fit(X, y, sample_weight=sample_weight)
Expand Down Expand Up @@ -993,10 +980,10 @@ def transform(self, X):
self._check_n_features(X, reset=False)

for tree, leaf in zip(self._trees, self._leaves):
pred_leaves = tree.apply(X, check_input=False)
pred_leaves = (pred_leaves == leaf).astype(np.float32)
pred_leaves = pred_leaves.reshape(-1, 1)
X = np.concatenate([X, pred_leaves], axis=1)
pred_tree = np.abs(tree.predict(X, check_input=False))
pred_tree = (pred_tree == leaf).astype(np.float32)
pred_tree = pred_tree.reshape(-1, 1)
X = np.concatenate([X, pred_tree], axis=1)

return X

Expand All @@ -1010,8 +997,8 @@ class _LinearForest(BaseEstimator):
def __init__(self, base_estimator, *, n_estimators, max_depth,
min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
max_features, max_leaf_nodes, min_impurity_decrease,
min_impurity_split, bootstrap, oob_score, n_jobs,
random_state, ccp_alpha, max_samples):
bootstrap, oob_score, n_jobs, random_state,
ccp_alpha, max_samples):

self.base_estimator = base_estimator
self.n_estimators = n_estimators
Expand All @@ -1022,7 +1009,6 @@ def __init__(self, base_estimator, *, n_estimators, max_depth,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
Expand Down Expand Up @@ -1100,17 +1086,18 @@ def _fit(self, X, y, sample_weight=None):
self.base_estimator_.fit(X, y, sample_weight)
resid = y - self.base_estimator_.predict(X)

criterion = 'squared_error' if _sklearn_v1 else 'mse'

self.forest_estimator_ = RandomForestRegressor(
n_estimators=self.n_estimators,
criterion='mse',
criterion=criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
max_leaf_nodes=self.max_leaf_nodes,
min_impurity_decrease=self.min_impurity_decrease,
min_impurity_split=self.min_impurity_split,
bootstrap=self.bootstrap,
oob_score=self.oob_score,
n_jobs=self.n_jobs,
Expand Down
41 changes: 8 additions & 33 deletions lineartree/lineartree.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,10 +566,6 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin):
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
min_impurity_split : float, default=0
Threshold for early stopping in tree growth. A node will split
if its impurity is above the threshold, otherwise it is a leaf.
ccp_alpha : non-negative float, default=0.0
Complexity parameter used for Minimal Cost-Complexity Pruning. The
subtree with the largest cost complexity that is smaller than
Expand Down Expand Up @@ -619,8 +615,7 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10,
max_depth=3, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_features=None,
random_state=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
ccp_alpha=0.0):
min_impurity_decrease=0.0, ccp_alpha=0.0):

self.base_estimator = base_estimator
self.loss = loss
Expand All @@ -633,7 +628,6 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10,
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.ccp_alpha = ccp_alpha

def fit(self, X, y, sample_weight=None):
Expand Down Expand Up @@ -777,10 +771,6 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin):
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
min_impurity_split : float, default=0
Threshold for early stopping in tree growth. A node will split
if its impurity is above the threshold, otherwise it is a leaf.
ccp_alpha : non-negative float, default=0.0
Complexity parameter used for Minimal Cost-Complexity Pruning. The
subtree with the largest cost complexity that is smaller than
Expand Down Expand Up @@ -830,8 +820,7 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10,
max_depth=3, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_features=None,
random_state=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
ccp_alpha=0.0):
min_impurity_decrease=0.0, ccp_alpha=0.0):

self.base_estimator = base_estimator
self.loss = loss
Expand All @@ -844,7 +833,6 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10,
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.ccp_alpha = ccp_alpha

def fit(self, X, y, sample_weight=None):
Expand Down Expand Up @@ -1039,10 +1027,6 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin):
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
min_impurity_split : float, default=None
Threshold for early stopping in tree growth. A node will split
if its impurity is above the threshold, otherwise it is a leaf.
bootstrap : bool, default=True
Whether bootstrap samples are used when building trees. If False, the
whole dataset is used to build each tree.
Expand Down Expand Up @@ -1076,7 +1060,7 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0, 1]`.
Attributes
----------
Expand Down Expand Up @@ -1129,9 +1113,8 @@ def __init__(self, base_estimator, *, n_estimators=100,
max_depth=None, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0., max_features="auto",
max_leaf_nodes=None, min_impurity_decrease=0.,
min_impurity_split=None, bootstrap=True,
oob_score=False, n_jobs=None, random_state=None,
ccp_alpha=0.0, max_samples=None):
bootstrap=True, oob_score=False, n_jobs=None,
random_state=None, ccp_alpha=0.0, max_samples=None):

self.base_estimator = base_estimator
self.n_estimators = n_estimators
Expand All @@ -1142,7 +1125,6 @@ def __init__(self, base_estimator, *, n_estimators=100,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
Expand Down Expand Up @@ -1351,10 +1333,6 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
min_impurity_split : float, default=None
Threshold for early stopping in tree growth. A node will split
if its impurity is above the threshold, otherwise it is a leaf.
bootstrap : bool, default=True
Whether bootstrap samples are used when building trees. If False, the
whole dataset is used to build each tree.
Expand Down Expand Up @@ -1388,7 +1366,7 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0, 1]`.
Attributes
----------
Expand Down Expand Up @@ -1437,14 +1415,12 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu.
(https://arxiv.org/abs/1904.10416)
"""

def __init__(self, base_estimator, *, n_estimators=100,
max_depth=None, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0., max_features="auto",
max_leaf_nodes=None, min_impurity_decrease=0.,
min_impurity_split=None, bootstrap=True,
oob_score=False, n_jobs=None, random_state=None,
ccp_alpha=0.0, max_samples=None):
bootstrap=True, oob_score=False, n_jobs=None,
random_state=None, ccp_alpha=0.0, max_samples=None):

self.base_estimator = base_estimator
self.n_estimators = n_estimators
Expand All @@ -1455,7 +1431,6 @@ def __init__(self, base_estimator, *, n_estimators=100,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
Expand Down
Loading

0 comments on commit 5d77def

Please sign in to comment.