Skip to content

Commit

Permalink
Merge branch 'feature-bestfit' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
bbengfort committed Jun 27, 2016
2 parents d769cc3 + 6959820 commit fc91da0
Show file tree
Hide file tree
Showing 7 changed files with 393 additions and 229 deletions.
230 changes: 47 additions & 183 deletions examples/examples.ipynb

Large diffs are not rendered by default.

39 changes: 0 additions & 39 deletions linebestfit.py

This file was deleted.

128 changes: 128 additions & 0 deletions tests/test_bestfit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# tests.test_bestfit
# Tests for the bestfit module.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Sun Jun 26 19:27:39 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: test_bestfit.py [] [email protected] $

"""
Tests for the bestfit module.
"""

##########################################################################
## Imports
##########################################################################

import unittest
import numpy as np
import matplotlib.pyplot as plt

from yellowbrick.bestfit import *
from yellowbrick.anscombe import ANSCOMBE
from yellowbrick.exceptions import YellowbrickValueError
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline


##########################################################################
## Best fit tests
##########################################################################

class BestFitTests(unittest.TestCase):
    """
    Tests for the draw_best_fit function.
    """

    def setUp(self):
        """
        Create a fresh figure and axes for every test.
        """
        self.fig, self.axe = plt.subplots()

    def tearDown(self):
        """
        Close the figure created in setUp so that open figures do not
        accumulate over the course of the test run.
        """
        plt.close(self.fig)

    def test_bad_estimator(self):
        """
        Test that a bad estimator name raises a value error.
        """
        X, y = ANSCOMBE[1]

        with self.assertRaises(YellowbrickValueError):
            draw_best_fit(X, y, self.axe, 'pepper')

    def test_ensure_same_length(self):
        """
        Ensure that vectors of different lengths raise a value error.
        """
        X = np.array([1, 2, 3, 5, 8, 10, 2])
        y = np.array([1, 3, 6, 2])

        # One dimensional X whose length does not match y
        with self.assertRaises(YellowbrickValueError):
            draw_best_fit(X, y, self.axe, 'linear')

        # The same X as a column vector must be rejected as well
        with self.assertRaises(YellowbrickValueError):
            draw_best_fit(X[:, np.newaxis], y, self.axe, 'linear')

    def test_draw_best_fit(self):
        """
        Test that drawing a best fit line works.
        """
        X, y = ANSCOMBE[0]

        # draw_best_fit hands back the axes it was given
        self.assertEqual(self.axe, draw_best_fit(X, y, self.axe, 'linear'))
        self.assertEqual(self.axe, draw_best_fit(X, y, self.axe, 'quadratic'))


##########################################################################
## Estimator tests
##########################################################################

class EstimatorTests(unittest.TestCase):
    """
    Exercise each of the best fit estimator functions directly.
    """

    @staticmethod
    def _load(index):
        """
        Unpack the ANSCOMBE entry at index into numpy arrays, with X
        reshaped into a single column.
        """
        xs, ys = ANSCOMBE[index]
        return np.array(xs)[:, np.newaxis], np.array(ys)

    def _check_model(self, model, expected_type):
        """
        Assert that a model was returned and that it has the expected type.
        """
        self.assertIsNotNone(model)
        self.assertIsInstance(model, expected_type)

    def test_linear(self):
        """
        The linear estimator returns a LinearRegression model
        """
        X, y = self._load(0)
        self._check_model(fit_linear(X, y), LinearRegression)

    def test_quadratic(self):
        """
        The quadratic estimator returns a Pipeline model
        """
        X, y = self._load(1)
        self._check_model(fit_quadratic(X, y), Pipeline)

    def test_select_best(self):
        """
        The select best estimator returns the expected model type per dataset
        """
        expectations = (
            (1, Pipeline),
            (3, LinearRegression),
        )

        for index, expected_type in expectations:
            X, y = self._load(index)
            self._check_model(fit_select_best(X, y), expected_type)
19 changes: 15 additions & 4 deletions yellowbrick/anscombe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@
## Imports
##########################################################################


import numpy as np
import matplotlib.pyplot as plt

from yellowbrick.bestfit import draw_best_fit


##########################################################################
## Anscombe Data Arrays
Expand Down Expand Up @@ -55,9 +56,19 @@ def anscombe():
x = arr[0]
y = arr[1]

# Set the X and Y limits
ax.set_xlim(0, 15)
ax.set_ylim(0, 15)

# Draw the points in the scatter plot
ax.scatter(x, y, c='g')
m,b = np.polyfit(x, y, 1)
X = np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 100)
ax.plot(X, m*X+b, '-')

# Draw the linear best fit line on the plot
draw_best_fit(x, y, ax)

return (axa, axb, axc, axd)


if __name__ == '__main__':
anscombe()
plt.show()
189 changes: 189 additions & 0 deletions yellowbrick/bestfit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# yellowbrick.bestfit
# Uses Scikit-Learn to compute a best fit function, then draws it in the plot.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Sun Jun 26 17:27:08 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: bestfit.py [] [email protected] $

"""
Uses Scikit-Learn to compute a best fit function, then draws it in the plot.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error as mse

from operator import itemgetter
from yellowbrick.exceptions import YellowbrickValueError


##########################################################################
## Module Constants
##########################################################################

# Names of the various estimator functions
LINEAR = 'linear'
QUADRATIC = 'quadratic'
EXPONENTIAL = 'exponential'
LOG = 'log'
SELECT_BEST = 'select_best'


##########################################################################
## Draw Line of Best Fit
##########################################################################

def draw_best_fit(X, y, ax, estimator='linear', **kwargs):
    """
    Fits a model to X and y, then draws the curve predicted by that model
    onto ax (a matplotlib axes), which must be passed as the third argument.

    The estimator name can be one of the following:

        'linear':      Uses OLS to fit the regression
        'quadratic':   Uses OLS with Polynomial order 2
        'exponential': Not implemented yet (raises NotImplementedError)
        'log':         Not implemented yet (raises NotImplementedError)
        'select_best': Selects the best fit via MSE

    X may be a sequence of n values or an (n, 1) array; y must be one
    dimensional and the same length as X. The remaining keyword arguments
    are passed to ax.plot to define and describe the line of best fit.

    Returns the ax that was passed in. Raises YellowbrickValueError if the
    estimator name is unknown, if X and y differ in length, or if X or y
    have unsupported dimensions.
    """

    # Estimators are the types of best fit lines that can be drawn.
    estimators = {
        LINEAR: fit_linear,            # Uses OLS to fit the regression
        QUADRATIC: fit_quadratic,      # Uses OLS with Polynomial order 2
        EXPONENTIAL: fit_exponential,  # Not implemented yet
        LOG: fit_log,                  # Not implemented yet
        SELECT_BEST: fit_select_best,  # Selects the best fit via MSE
    }

    # Check to make sure that a correct estimator value was passed in.
    if estimator not in estimators:
        raise YellowbrickValueError(
            "'{}' not a valid type of estimator; choose from {}".format(
                estimator, ", ".join(estimators.keys())
            )
        )

    # Then collect the estimator function from the mapping.
    fit = estimators[estimator]

    # Ensure that X and y are the same length
    if len(X) != len(y):
        raise YellowbrickValueError((
            "X and y must have same length:"
            " X len {} doesn't match y len {}!"
        ).format(len(X), len(y)))

    # Ensure that X and y are np.arrays
    X = np.array(X)
    y = np.array(y)

    # Scikit-Learn estimators need a two dimensional X, so a flat vector
    # is reshaped into a single column of shape (n, 1).
    if X.ndim < 2:
        X = X[:, np.newaxis]

    # Anything that is not exactly one column cannot be drawn as a curve
    # over the x axis; reject it here instead of failing later in predict.
    if X.ndim > 2 or X.shape[1] != 1:
        raise YellowbrickValueError(
            "X must be a (1,) or (n,1) dimensional array not {}".format(X.shape)
        )

    # Verify that y is a (n,) dimensional array
    if y.ndim > 1:
        raise YellowbrickValueError(
            "y must be a (1,) dimensional array not {}".format(y.shape)
        )

    # Uses the estimator to fit the data and get the model back.
    model = fit(X, y)

    # Plot line of best fit onto the axes that were passed in, sampling
    # 100 evenly spaced points across the current x limits of the axes.
    # TODO: determine if xlim or X.min(), X.max() are better params
    xr = np.linspace(*ax.get_xlim(), num=100)
    ax.plot(xr, model.predict(xr[:, np.newaxis]), **kwargs)
    return ax


##########################################################################
## Estimator Functions
##########################################################################

def fit_select_best(X, y):
    """
    Fits every implemented estimator and returns the fitted model whose
    predictions on the training data have the lowest mean squared error.
    """
    # All candidates are fitted up front, before any of them is scored.
    candidates = [fit_linear(X, y), fit_quadratic(X, y)]

    def training_error(model):
        # Error of the model's predictions against the values it was fit on
        return mse(y, model.predict(X))

    # On a tie, min keeps the earliest candidate in the list.
    return min(candidates, key=training_error)


def fit_linear(X, y):
    """
    Fits an ordinary least squares regression to X and y and returns the
    fitted model.
    """
    ols = linear_model.LinearRegression()

    # fit returns the estimator itself, so the fitted model is returned.
    return ols.fit(X, y)


def fit_quadratic(X, y):
    """
    Fits an ordinary least squares regression on degree 2 polynomial
    features of X and returns the fitted pipeline.
    """
    # Expand X into polynomial features, then regress on the expansion.
    steps = (PolynomialFeatures(2), linear_model.LinearRegression())

    pipeline = make_pipeline(*steps)
    pipeline.fit(X, y)
    return pipeline


def fit_exponential(X, y):
    """
    Placeholder for an exponential curve estimator; X and y are accepted
    but unused and NotImplementedError is always raised.
    """
    message = "Exponential best fit lines are not implemented"
    raise NotImplementedError(message)


def fit_log(X, y):
    """
    Placeholder for a logarithmic curve estimator; X and y are accepted
    but unused and NotImplementedError is always raised.
    """
    raise NotImplementedError("Logarithmic best fit lines are not implemented")



if __name__ == '__main__':
    # Manual smoke test: scatter two columns of the concrete example data
    # set and draw the automatically selected best fit curve over them.
    import os
    import pandas as pd
    import matplotlib.pyplot as plt

    # The example data lives next to the package, under examples/data.
    here = os.path.dirname(__file__)
    dataset = os.path.join(here, "..", "examples", "data", "concrete.xls")
    if os.path.exists(dataset) is False:
        raise Exception("Could not find path for testing")

    frame = pd.read_excel(dataset)
    xs = frame['Fine Aggregate (component 7)(kg in a m^3 mixture)']
    ys = frame['Coarse Aggregate (component 6)(kg in a m^3 mixture)']

    figure, axes = plt.subplots()
    axes.scatter(xs, ys)
    draw_best_fit(xs, ys, axes, 'select_best')

    plt.show()
Loading

0 comments on commit fc91da0

Please sign in to comment.