Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Random Forest Step #46

Draft
wants to merge 28 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
4ae8385
Use decorator to register imported steps when initializing
Gitiauxx Sep 24, 2018
7f6f916
add get_input_columns as a step attribute to avoid ugly code to get t…
Gitiauxx Sep 24, 2018
360e3d8
Add random forest as a step
Gitiauxx Sep 24, 2018
fdde25d
Add test for random forest
Gitiauxx Sep 24, 2018
e20edf6
Merge branch 'master' into xavier_testing
Gitiauxx Sep 24, 2018
7f33e8f
Add cross validation and feature importance (for random forest)
Gitiauxx Sep 26, 2018
577e4cc
add cross validation
Gitiauxx Oct 2, 2018
fb70a95
add a parsing capability for random forest to eval numpy functions fr…
Gitiauxx Oct 3, 2018
c495729
Add the possibility to parse output date and possible numpy transform…
Gitiauxx Oct 3, 2018
6f9b8fa
Remove useless inputs in random forest steps
Gitiauxx Oct 3, 2018
102fb60
Fix __init__ for random forest
Gitiauxx Oct 5, 2018
066be5f
add inheritance from OLSRegression in RF
Gitiauxx Oct 5, 2018
5cb5251
Fix bug to import/export model constructed from pickle files
Gitiauxx Oct 10, 2018
e91193d
add demo of different methods and some comparison across them
Gitiauxx Oct 10, 2018
df023d8
Add Doc String for utils
Gitiauxx Oct 16, 2018
4057264
Add Doc String for utils
Gitiauxx Oct 16, 2018
b161cd1
Add examples in doc string
Gitiauxx Oct 17, 2018
5dc5ea5
Add examples in doc string for regression
Gitiauxx Oct 17, 2018
4754ccc
Add doc string for splits and cross_validate_score in shared.py
Gitiauxx Oct 17, 2018
96cffbd
Add new required libraries in setup.py
Gitiauxx Oct 17, 2018
5d151e7
Add new required libraries in environment.yml
Gitiauxx Oct 17, 2018
637c2b2
remove duplicates import of modelmanager
Gitiauxx Oct 17, 2018
fc9a301
Delete neuro_network.py
Gitiauxx Oct 18, 2018
e7553b1
Add a tag role to pickle file
Gitiauxx Oct 19, 2018
ae7d465
branch 'xavier_testing' of https://github.com/udst/urbansim_templates…
Gitiauxx Oct 19, 2018
990e85a
Add tests for utils, random forest and gradient boosting
Gitiauxx Oct 24, 2018
fcbc256
Remove get_inputs_columns
Gitiauxx Oct 25, 2018
c6e40f5
Follow naming convention
Gitiauxx Oct 25, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
name: template-env

channels:
- udst # for orca and pandana
- conda-forge # for choicemodels, statsmodels, and many dependencies
- timothyb0912 # for pylogit

dependencies:
- python=3.6
- geopandas=0.3
- jupyter=1.0
- line_profiler=2.1
- matplotlib=2.2
- memory_profiler=0.54
- numpy=1.15
- orca=1.5
- pandana=0.4
- pandas=0.23
- pylogit=0.2
- pytest=3.8
- scipy=1.1
- statsmodels=0.9
- scikit-learn=0.19.2 # conda package is named scikit-learn, not sklearn
- dill=0.2.8.2


# This Conda environment includes the direct dependencies for template-based UrbanSim
# models, plus a variety of other packages that are useful for validation and testing.

# One-time setup (several minutes):
# `conda env create -f environment.yml`

# Activate the environment:
# `source activate template-env`

# Install development versions of ChoiceModels and UrbanSim Templates
# (only needs to be done once, but run git-pull from these directories
# periodically to update the codebases)

# Navigate to directory where choicemodels folder should go:
# `git clone https://github.com/udst/choicemodels.git`
# `cd choicemodels`
# `python setup.py develop`

# Navigate to directory where urbansim_templates folder should go:
# `git clone https://github.com/udst/urbansim_templates.git`
# `cd urbansim_templates`
# `python setup.py develop`
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='urbansim_templates',
version='0.1.dev13',
version='0.1.dev16',
description='UrbanSim extension for managing model steps',
author='UrbanSim Inc.',
author_email='[email protected]',
Expand All @@ -21,6 +21,8 @@
'pandana >= 0.3',
'pandas >= 0.22',
'statsmodels >= 0.8',
'urbansim >= 3.1.1'
'urbansim >= 3.1.1',
'sklearn >= 0.19.2',
'dill >= 0.2.8.2'
]
)
15 changes: 9 additions & 6 deletions urbansim_templates/modelmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import copy
import pickle
import dill as pickle
from collections import OrderedDict

import orca
Expand All @@ -28,7 +28,6 @@ def template(cls):
"""
_templates[cls.__name__] = cls
return cls


def initialize(path='configs'):
"""
Expand Down Expand Up @@ -99,9 +98,9 @@ def build_step(d):
"""
if 'supplemental_objects' in d:
for i, item in enumerate(d['supplemental_objects']):
content = load_supplemental_object(d['name'], **item)
content = load_supplemental_object(d['name'], item['name'], item['content_type'])
d['supplemental_objects'][i]['content'] = content

return _templates[d['template']].from_dict(d)


Expand Down Expand Up @@ -188,8 +187,12 @@ def save_step_to_disk(step):
# Save supplemental objects
if 'supplemental_objects' in d:
for item in filter(None, d['supplemental_objects']):
save_supplemental_object(step.name, **item)
content = item['content']
content.role = item['object_name']
save_supplemental_object(step.name, item['name'], content, item['content_type'])
del item['content']
del item['object_name']


# Save main yaml file
headers = {'modelmanager_version': __version__}
Expand Down Expand Up @@ -219,7 +222,7 @@ def save_supplemental_object(step_name, name, content, content_type, required=Tr

"""
if content_type is 'pickle':
content.to_pickle(os.path.join(_disk_store, step_name+'-'+name+'.pkl'))
pickle.dump(content, open(os.path.join(_disk_store, step_name+'-'+name+'.pkl'), 'wb'))


def get_step(name):
Expand Down
176 changes: 176 additions & 0 deletions urbansim_templates/models/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
from urbansim.models import RegressionModel
from urbansim.utils import yamlio


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from .. import modelmanager
from ..utils import convert_to_model
from .shared import TemplateStep


Expand Down Expand Up @@ -203,4 +207,176 @@ def run(self):

orca.get_table(tabname).update_col_from_series(colname, values, cast=True)

@modelmanager.template
class RandomForestRegressionStep(OLSRegressionStep):
    """
    Template step that fits a random forest regression using sklearn's
    RandomForestRegressor. Data binding, output handling, and step naming
    are inherited from OLSRegressionStep.

    """
    def __init__(self, tables=None, model_expression=None, filters=None, out_tables=None,
                 out_column=None, out_transform=None, out_filters=None, name=None, tags=None):
        # Fix: use None as the default for `tags` instead of a mutable list
        # (a shared list default would leak state across instances).
        tags = [] if tags is None else tags

        super().__init__(tables=tables, model_expression=model_expression, filters=filters,
                         out_tables=out_tables, out_column=out_column,
                         out_transform=out_transform, name=name)

        # NOTE(review): `out_filters` and `tags` are accepted here (and
        # round-tripped by from_dict) but not forwarded to the parent
        # constructor — confirm whether OLSRegressionStep should receive them.
        self.cv_metric = None    # cross-validation score, if computed elsewhere
        self.importance = None   # {variable name: feature importance}, set by fit()

    @classmethod
    def from_dict(cls, d):
        """
        Create an object instance from a saved dictionary representation,
        restoring the pickled random forest model from the supplemental
        objects.

        Parameters
        ----------
        d : dict

        Returns
        -------
        RandomForestRegressionStep

        """
        # Pass values from the dictionary to the __init__() method
        obj = cls(tables=d['tables'], model_expression=d['model_expression'],
                  filters=d['filters'], out_tables=d['out_tables'],
                  out_column=d['out_column'], out_transform=d['out_transform'],
                  out_filters=d['out_filters'], name=d['name'], tags=d['tags'])

        # Re-attach supplemental objects (e.g. the fitted model) under the
        # attribute name stored in each object's `role` tag.
        for item in d['supplemental_objects']:
            content = item['content']
            setattr(obj, content.role, content)
        return obj

    def fit(self):
        """
        Fit a RandomForestRegressor using the sklearn library, save the fitted
        model on the step, and compute feature importances.

        """
        # Wrap the sklearn estimator so it exposes fit/predict with the same
        # structure as the other template steps.
        self.model = convert_to_model(RandomForestRegressor(),
                                      self.model_expression,
                                      ytransform=self.out_transform)

        self.model.fit(self._get_data())
        self.name = self._generate_name()

        # Map each right-hand-side variable to its feature importance score.
        importance = self.model.feature_importances_
        self.importance = {variable: float(score)
                           for variable, score in zip(self.model.rhs, importance)}

    def to_dict(self):
        """
        Create a dictionary representation of the object. The fitted model is
        registered as a supplemental object so it can be pickled separately.

        Returns
        -------
        dict

        """
        d = TemplateStep.to_dict(self)

        # Add parameters not in parent class
        d.update({
            'cross_validation_metric': self.cv_metric,
            'features_importance': self.importance
        })

        # The model itself is saved to disk as a pickled file.
        d['supplemental_objects'] = [{'name': self.name,
                                      'object_name': 'model',
                                      'content': self.model,
                                      'content_type': 'pickle'}]
        return d

    def run(self):
        """
        Run the model step: calculate predicted values and use them to update a
        column.

        The predicted values are written to Orca and also saved to the class
        object for interactive use (`predicted_values`). They are not saved in
        the dictionary representation of the model step.

        """
        # TO DO - figure out what we can infer about requirements for the underlying data
        # and write an 'orca_test' assertion to confirm compliance.
        output_column = self._get_out_column()

        # Fix: fetch the prediction data once instead of calling
        # self._get_data('predict') twice.
        data = self._get_data('predict')
        values = self.model.predict(data)
        self.predicted_values = values

        tabname = self._get_out_table()
        orca.get_table(tabname).update_col_from_series(output_column, values, cast=True)




@modelmanager.template
class GradientBoostingRegressionStep(RandomForestRegressionStep):
    """
    Template step that fits a gradient boosting regression using sklearn's
    GradientBoostingRegressor. All other behavior is shared with
    RandomForestRegressionStep.

    """
    def fit(self):
        """
        Fit a GradientBoostingRegressor using the sklearn library and save the
        fitted model on the step.

        """
        # Wrap the sklearn estimator so it exposes fit/predict with the same
        # structure as the other template steps.
        self.model = convert_to_model(GradientBoostingRegressor(),
                                      self.model_expression,
                                      ytransform=self.out_transform)

        self.model.fit(self._get_data())
        self.name = self._generate_name()

    def to_dict(self):
        """
        Create a dictionary representation of the object. The fitted model is
        registered as a supplemental object so it can be pickled separately.

        Returns
        -------
        dict

        """
        d = TemplateStep.to_dict(self)

        # Add parameters not in parent class. Fix: use the same key naming
        # convention as RandomForestRegressionStep ('cross_validation_metric'
        # rather than 'cross validation metric').
        d.update({
            'model': self.name,
            'cross_validation_metric': self.cv_metric
        })

        # The model itself is saved to disk as a pickled file.
        d['supplemental_objects'] = [{'name': self.name,
                                      'object_name': 'model',
                                      'content': self.model,
                                      'content_type': 'pickle'}]
        return d



Loading