Release

cerlymarco · Oct 1, 2021 · 5d77def
1 parent bb7bf57
commit 5d77def
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 137 deletions.
diff --git a/imgs/linear_boost_importances.png b/imgs/linear_boost_importances.png
diff --git a/lineartree/_classes.py b/lineartree/_classes.py
@@ -6,7 +6,7 @@
 from joblib import Parallel, effective_n_jobs  # , delayed
 
 from sklearn.dummy import DummyClassifier
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
+from sklearn.tree import DecisionTreeRegressor
 from sklearn.ensemble import RandomForestRegressor
 
 from sklearn.base import is_regressor
@@ -19,6 +19,9 @@
 from ._criterion import mse, rmse, mae, poisson
 from ._criterion import hamming, crossentropy
 
+import sklearn
+_sklearn_v1 = int(sklearn.__version__.split('.')[0]) > 0  # True on scikit-learn >= 1.0
+
 
 CRITERIA = {"mse": mse,
             "rmse": rmse,
@@ -853,8 +856,7 @@ def __init__(self, base_estimator, *, loss, n_estimators,
                  max_depth, min_samples_split, min_samples_leaf,
                  min_weight_fraction_leaf, max_features,
                  random_state, max_leaf_nodes,
-                 min_impurity_decrease, min_impurity_split,
-                 ccp_alpha):
+                 min_impurity_decrease, ccp_alpha):
 
         self.base_estimator = base_estimator
         self.loss = loss
@@ -867,7 +869,6 @@ def __init__(self, base_estimator, *, loss, n_estimators,
         self.random_state = random_state
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
         self.ccp_alpha = ccp_alpha
 
     def _fit(self, X, y, sample_weight=None):
@@ -918,47 +919,33 @@ def _fit(self, X, y, sample_weight=None):
             else:
                 resid = SCORING[self.loss](y, pred)
 
-            if self.loss == 'hamming':
-                tree = DecisionTreeClassifier(
-                    criterion='gini', max_depth=self.max_depth,
-                    min_samples_split=self.min_samples_split,
-                    min_samples_leaf=self.min_samples_leaf,
-                    min_weight_fraction_leaf=self.min_weight_fraction_leaf,
-                    max_features=self.max_features,
-                    random_state=self.random_state,
-                    max_leaf_nodes=self.max_leaf_nodes,
-                    min_impurity_decrease=self.min_impurity_decrease,
-                    min_impurity_split=self.min_impurity_split,
-                    ccp_alpha=self.ccp_alpha
-                )
-            else:
-                tree = DecisionTreeRegressor(
-                    criterion='mse', max_depth=self.max_depth,
-                    min_samples_split=self.min_samples_split,
-                    min_samples_leaf=self.min_samples_leaf,
-                    min_weight_fraction_leaf=self.min_weight_fraction_leaf,
-                    max_features=self.max_features,
-                    random_state=self.random_state,
-                    max_leaf_nodes=self.max_leaf_nodes,
-                    min_impurity_decrease=self.min_impurity_decrease,
-                    min_impurity_split=self.min_impurity_split,
-                    ccp_alpha=self.ccp_alpha
-                )
+            if resid.ndim > 1:
+                resid = resid.mean(1)
+
+            criterion = 'squared_error' if _sklearn_v1 else 'mse'
+
+            tree = DecisionTreeRegressor(
+                criterion=criterion, max_depth=self.max_depth,
+                min_samples_split=self.min_samples_split,
+                min_samples_leaf=self.min_samples_leaf,
+                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+                max_features=self.max_features,
+                random_state=self.random_state,
+                max_leaf_nodes=self.max_leaf_nodes,
+                min_impurity_decrease=self.min_impurity_decrease,
+                ccp_alpha=self.ccp_alpha
+            )
 
             tree.fit(X, resid, sample_weight=sample_weight, check_input=False)
             self._trees.append(tree)
 
-            impurity = tree.tree_.impurity
-            pred_leaves = tree.apply(X, check_input=False)
-            leaves = np.unique(pred_leaves)
-
-            worst_leaf = np.argmax([impurity[l] for l in leaves])
-            worst_leaf = leaves[worst_leaf]
-            self._leaves.append(worst_leaf)
+            pred_tree = np.abs(tree.predict(X, check_input=False))
+            worst_pred = np.max(pred_tree)
+            self._leaves.append(worst_pred)
 
-            pred_leaves = (pred_leaves == worst_leaf).astype(np.float32)
-            pred_leaves = pred_leaves.reshape(-1, 1)
-            X = np.concatenate([X, pred_leaves], axis=1)
+            pred_tree = (pred_tree == worst_pred).astype(np.float32)
+            pred_tree = pred_tree.reshape(-1, 1)
+            X = np.concatenate([X, pred_tree], axis=1)
 
         self.base_estimator_ = deepcopy(self.base_estimator)
         self.base_estimator_.fit(X, y, sample_weight=sample_weight)
@@ -993,10 +980,10 @@ def transform(self, X):
         self._check_n_features(X, reset=False)
 
         for tree, leaf in zip(self._trees, self._leaves):
-            pred_leaves = tree.apply(X, check_input=False)
-            pred_leaves = (pred_leaves == leaf).astype(np.float32)
-            pred_leaves = pred_leaves.reshape(-1, 1)
-            X = np.concatenate([X, pred_leaves], axis=1)
+            pred_tree = np.abs(tree.predict(X, check_input=False))
+            pred_tree = (pred_tree == leaf).astype(np.float32)
+            pred_tree = pred_tree.reshape(-1, 1)
+            X = np.concatenate([X, pred_tree], axis=1)
 
         return X
 
@@ -1010,8 +997,8 @@ class _LinearForest(BaseEstimator):
     def __init__(self, base_estimator, *, n_estimators, max_depth,
                  min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
                  max_features, max_leaf_nodes, min_impurity_decrease,
-                 min_impurity_split, bootstrap, oob_score, n_jobs,
-                 random_state, ccp_alpha, max_samples):
+                 bootstrap, oob_score, n_jobs, random_state,
+                 ccp_alpha, max_samples):
 
         self.base_estimator = base_estimator
         self.n_estimators = n_estimators
@@ -1022,7 +1009,6 @@ def __init__(self, base_estimator, *, n_estimators, max_depth,
         self.max_features = max_features
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
         self.bootstrap = bootstrap
         self.oob_score = oob_score
         self.n_jobs = n_jobs
@@ -1100,17 +1086,18 @@ def _fit(self, X, y, sample_weight=None):
         self.base_estimator_.fit(X, y, sample_weight)
         resid = y - self.base_estimator_.predict(X)
 
+        criterion = 'squared_error' if _sklearn_v1 else 'mse'
+
         self.forest_estimator_ = RandomForestRegressor(
             n_estimators=self.n_estimators,
-            criterion='mse',
+            criterion=criterion,
             max_depth=self.max_depth,
             min_samples_split=self.min_samples_split,
             min_samples_leaf=self.min_samples_leaf,
             min_weight_fraction_leaf=self.min_weight_fraction_leaf,
             max_features=self.max_features,
             max_leaf_nodes=self.max_leaf_nodes,
             min_impurity_decrease=self.min_impurity_decrease,
-            min_impurity_split=self.min_impurity_split,
             bootstrap=self.bootstrap,
             oob_score=self.oob_score,
             n_jobs=self.n_jobs,

diff --git a/lineartree/lineartree.py b/lineartree/lineartree.py
@@ -566,10 +566,6 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin):
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
-    min_impurity_split : float, default=0
-        Threshold for early stopping in tree growth. A node will split
-        if its impurity is above the threshold, otherwise it is a leaf.
-
     ccp_alpha : non-negative float, default=0.0
         Complexity parameter used for Minimal Cost-Complexity Pruning. The
         subtree with the largest cost complexity that is smaller than
@@ -619,8 +615,7 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10,
                  max_depth=3, min_samples_split=2, min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0, max_features=None,
                  random_state=None, max_leaf_nodes=None,
-                 min_impurity_decrease=0.0, min_impurity_split=None,
-                 ccp_alpha=0.0):
+                 min_impurity_decrease=0.0, ccp_alpha=0.0):
 
         self.base_estimator = base_estimator
         self.loss = loss
@@ -633,7 +628,6 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10,
         self.random_state = random_state
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
         self.ccp_alpha = ccp_alpha
 
     def fit(self, X, y, sample_weight=None):
@@ -777,10 +771,6 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin):
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
-    min_impurity_split : float, default=0
-        Threshold for early stopping in tree growth. A node will split
-        if its impurity is above the threshold, otherwise it is a leaf.
-
     ccp_alpha : non-negative float, default=0.0
         Complexity parameter used for Minimal Cost-Complexity Pruning. The
         subtree with the largest cost complexity that is smaller than
@@ -830,8 +820,7 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10,
                  max_depth=3, min_samples_split=2, min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0, max_features=None,
                  random_state=None, max_leaf_nodes=None,
-                 min_impurity_decrease=0.0, min_impurity_split=None,
-                 ccp_alpha=0.0):
+                 min_impurity_decrease=0.0, ccp_alpha=0.0):
 
         self.base_estimator = base_estimator
         self.loss = loss
@@ -844,7 +833,6 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10,
         self.random_state = random_state
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
         self.ccp_alpha = ccp_alpha
 
     def fit(self, X, y, sample_weight=None):
@@ -1039,10 +1027,6 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin):
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
-    min_impurity_split : float, default=None
-        Threshold for early stopping in tree growth. A node will split
-        if its impurity is above the threshold, otherwise it is a leaf.
-
     bootstrap : bool, default=True
         Whether bootstrap samples are used when building trees. If False, the
         whole dataset is used to build each tree.
@@ -1076,7 +1060,7 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin):
         - If None (default), then draw `X.shape[0]` samples.
         - If int, then draw `max_samples` samples.
         - If float, then draw `max_samples * X.shape[0]` samples. Thus,
-          `max_samples` should be in the interval `(0, 1)`.
+          `max_samples` should be in the interval `(0, 1]`.
 
     Attributes
     ----------
@@ -1129,9 +1113,8 @@ def __init__(self, base_estimator, *, n_estimators=100,
                  max_depth=None, min_samples_split=2, min_samples_leaf=1,
                  min_weight_fraction_leaf=0., max_features="auto",
                  max_leaf_nodes=None, min_impurity_decrease=0.,
-                 min_impurity_split=None, bootstrap=True,
-                 oob_score=False, n_jobs=None, random_state=None,
-                 ccp_alpha=0.0, max_samples=None):
+                 bootstrap=True, oob_score=False, n_jobs=None,
+                 random_state=None, ccp_alpha=0.0, max_samples=None):
 
         self.base_estimator = base_estimator
         self.n_estimators = n_estimators
@@ -1142,7 +1125,6 @@ def __init__(self, base_estimator, *, n_estimators=100,
         self.max_features = max_features
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
         self.bootstrap = bootstrap
         self.oob_score = oob_score
         self.n_jobs = n_jobs
@@ -1351,10 +1333,6 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
-    min_impurity_split : float, default=None
-        Threshold for early stopping in tree growth. A node will split
-        if its impurity is above the threshold, otherwise it is a leaf.
-
     bootstrap : bool, default=True
         Whether bootstrap samples are used when building trees. If False, the
         whole dataset is used to build each tree.
@@ -1388,7 +1366,7 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
         - If None (default), then draw `X.shape[0]` samples.
         - If int, then draw `max_samples` samples.
         - If float, then draw `max_samples * X.shape[0]` samples. Thus,
-          `max_samples` should be in the interval `(0, 1)`.
+          `max_samples` should be in the interval `(0, 1]`.
 
     Attributes
     ----------
@@ -1437,14 +1415,12 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
     Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu.
     (https://arxiv.org/abs/1904.10416)
     """
-
     def __init__(self, base_estimator, *, n_estimators=100,
                  max_depth=None, min_samples_split=2, min_samples_leaf=1,
                  min_weight_fraction_leaf=0., max_features="auto",
                  max_leaf_nodes=None, min_impurity_decrease=0.,
-                 min_impurity_split=None, bootstrap=True,
-                 oob_score=False, n_jobs=None, random_state=None,
-                 ccp_alpha=0.0, max_samples=None):
+                 bootstrap=True, oob_score=False, n_jobs=None,
+                 random_state=None, ccp_alpha=0.0, max_samples=None):
 
         self.base_estimator = base_estimator
         self.n_estimators = n_estimators
@@ -1455,7 +1431,6 @@ def __init__(self, base_estimator, *, n_estimators=100,
         self.max_features = max_features
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
         self.bootstrap = bootstrap
         self.oob_score = oob_score
         self.n_jobs = n_jobs