andosa · jbaez1212 · May 11, 2018 · May 11, 2018 · May 11, 2018
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@
 # C extensions
 *.so
 
+
 # Packages
 *.egg
 *.egg-info
@@ -41,3 +42,6 @@ output/*/index.html
 
 # Sphinx
 docs/_build
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
diff --git a/treeinterpreter/treeinterpreter.py b/treeinterpreter/treeinterpreter.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import numpy as np
 import sklearn
-
+from collections import Counter
 from sklearn.ensemble.forest import ForestClassifier, ForestRegressor
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, _tree
 from distutils.version import LooseVersion
@@ -38,7 +38,7 @@ def _predict_tree(model, X, joint_contribution=False):
     For a given DecisionTreeRegressor, DecisionTreeClassifier,
     ExtraTreeRegressor, or ExtraTreeClassifier,
     returns a triple of [prediction, bias and feature_contributions], such
-    that prediction ≈ bias + feature_contributions.
+    that prediction ≈ bias + feature_contributions.
     """
     leaves = model.apply(X)
     paths = _get_tree_paths(model.tree_, 0)
@@ -109,11 +109,11 @@ def _predict_tree(model, X, joint_contribution=False):
         return direct_prediction, biases, np.array(contributions)
 
 
-def _predict_forest(model, X, joint_contribution=False):
+def _predict_forest(model, X, joint_contribution=False, aggregated_contributions=False):
     """
     For a given RandomForestRegressor, RandomForestClassifier,
     ExtraTreesRegressor, or ExtraTreesClassifier returns a triple of
-    [prediction, bias and feature_contributions], such that prediction ≈ bias +
+    [prediction, bias and feature_contributions], such that prediction ≈ bias +
     feature_contributions.
     """
     biases = []
@@ -123,27 +123,48 @@ def _predict_forest(model, X, joint_contribution=False):
 
     if joint_contribution:
 
-        for tree in model.estimators_:
-            pred, bias, contribution = _predict_tree(tree, X, joint_contribution=joint_contribution)
+        # If the caller wants the returned contributions already aggregated, run this section. It uses a
+        # Counter to sum the contributions over every tree and sample; the sums are then divided by len(X).
+        if aggregated_contributions:
+
+            total_contributions = Counter()
 
-            biases.append(bias)
-            contributions.append(contribution)
-            predictions.append(pred)
-
-
-        total_contributions = []
-
-        for i in range(len(X)):
-            contr = {}
-            for j, dct in enumerate(contributions):
-                for k in set(dct[i]).union(set(contr.keys())):
-                    contr[k] = (contr.get(k, 0)*j + dct[i].get(k,0) ) / (j+1)
+            for tree in model.estimators_:
+                pred, bias, contribution = _predict_tree(tree, X, joint_contribution=joint_contribution)
+
+                biases.append(bias)
+                predictions.append(pred)
+                for dct in contribution: 
+                    total_contributions.update(dct)
 
-            total_contributions.append(contr)    
+            # Total Contributions already aggregated. 
+            total_contributions = {x: total_contributions[x]/len(X) for x in total_contributions.keys()}
+
+
+
 
-        for i, item in enumerate(contribution):
-            total_contributions[i]
-            sm = sum([v for v in contribution[i].values()])
+        else:
+            for tree in model.estimators_:
+                pred, bias, contribution = _predict_tree(tree, X, joint_contribution=joint_contribution)
+
+                biases.append(bias)
+                contributions.append(contribution)
+                predictions.append(pred)
+
+
+            total_contributions = []
+
+            for i in range(len(X)):
+                contr = {}
+                for j, dct in enumerate(contributions):
+                    for k in set(dct[i]).union(set(contr.keys())):
+                        contr[k] = (contr.get(k, 0)*j + dct[i].get(k,0) ) / (j+1)
+
+                total_contributions.append(contr)    
+
+            for i, item in enumerate(contribution):
+                total_contributions[i]
+                sm = sum([v for v in contribution[i].values()])
 
 
 
@@ -162,9 +183,9 @@ def _predict_forest(model, X, joint_contribution=False):
             np.mean(contributions, axis=0))
 
 
-def predict(model, X, joint_contribution=False):
+def predict(model, X, joint_contribution=False, aggregated_contributions=False):
     """ Returns a triple (prediction, bias, feature_contributions), such
-    that prediction ≈ bias + feature_contributions.
+    that prediction ≈ bias + feature_contributions.
     Parameters
     ----------
     model : DecisionTreeRegressor, DecisionTreeClassifier,
@@ -179,6 +200,10 @@ def predict(model, X, joint_contribution=False):
     joint_contribution : boolean
     Specifies if contributions are given individually from each feature,
     or jointly over them
+
+    aggregated_contributions : boolean
+    Specifies if contributions are the aggregated contribution of all the 
+    data samples. 
 
     Returns
     -------
@@ -194,6 +219,9 @@ def predict(model, X, joint_contribution=False):
         If joint_contribution is True, then shape is array of size n_samples,
         where each array element is a dict from a tuple of feature indices to
         to a value denoting the contribution from that feature tuple.
+        If aggregated_contributions is False then nothing changes. 
+        If aggregated_contributions is True then contributions is a dictionary 
+        of the average contribution across all samples. 
     """
     # Only single out response variable supported,
     if model.n_outputs_ > 1:
@@ -204,7 +232,7 @@ def predict(model, X, joint_contribution=False):
         return _predict_tree(model, X, joint_contribution=joint_contribution)
     elif (isinstance(model, ForestClassifier) or
           isinstance(model, ForestRegressor)):
-        return _predict_forest(model, X, joint_contribution=joint_contribution)
+        return _predict_forest(model, X, joint_contribution=joint_contribution, aggregated_contributions=aggregated_contributions)
     else:
         raise ValueError("Wrong model type. Base learner needs to be a "
                          "DecisionTreeClassifier or DecisionTreeRegressor.")