Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup PR for existing Survey Assist RF PR #938 #995

Open
wants to merge 34 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
6471a96
add copy of code used in TRB paper
hlu109 Aug 12, 2022
9d6a1af
update user uuid lookup; add documentation note
hlu109 Dec 16, 2022
9bd9b18
Add additional logging to the calculation so that we can monitor the …
shankari Feb 15, 2023
1b9ece0
making `cluster_performance.ipynb`, `generate_figs_for_poster` and `…
humbleOldSage Aug 22, 2023
e7d2a14
Unified Interface for fit function
humbleOldSage Aug 26, 2023
59633e0
Fixing `models.py` to support `regenerate_classification_performance_…
humbleOldSage Aug 30, 2023
0adb5fe
[PARTIALLY TESTED] Single database read and Code Cleanuo
humbleOldSage Sep 14, 2023
e9abd51
[PARTIALLY TESTED] Survey Assist Using RF
humbleOldSage Oct 2, 2023
3820d87
[NOT TESTED]Predict implemented
humbleOldSage Oct 3, 2023
5b2572e
[NOT TESTED] Model storage and Model Testing included
humbleOldSage Oct 9, 2023
bf7f406
[TESTED]Forest Model Integration
humbleOldSage Nov 2, 2023
1d7be5a
Minor fixes
humbleOldSage Nov 2, 2023
b3d0db2
Delete Config file
humbleOldSage Nov 3, 2023
c514fe0
Merge remote-tracking branch 'e-mission-eval-private-data/move-models…
humbleOldSage Nov 3, 2023
3b038a9
removedfile
humbleOldSage Nov 3, 2023
94fc848
Update model.py
humbleOldSage Nov 3, 2023
87f109c
Merge branch 'master' into SurveyAssistUsingRandomForest
humbleOldSage Dec 7, 2023
33cdaab
[Tested, Will fail]Integrating RF model on server and more Unit test
humbleOldSage Dec 9, 2023
01fcb2a
minor fix
humbleOldSage Dec 9, 2023
f5fec64
Delete model.py
humbleOldSage Dec 12, 2023
585cc90
Update TestForestModel.py
humbleOldSage Dec 13, 2023
61bbe3f
Minor Fixes
humbleOldSage Dec 16, 2023
a32ce4f
[Tested]Adding Integration test
humbleOldSage Jan 2, 2024
052cb08
Improving test
humbleOldSage Jan 10, 2024
104dd9a
Integration Testing for forest model
humbleOldSage Feb 5, 2024
1b523ed
[Tested] Improvements for model integration
humbleOldSage Mar 15, 2024
35a1346
Forest Model related data additions
humbleOldSage Mar 21, 2024
19bb394
Update TestForestModelIntegration.py
humbleOldSage Mar 22, 2024
450094c
[TESTED] Updated ForestModelLoadAndSave.py
humbleOldSage Mar 22, 2024
ad968de
Fixing TestForestModelLoadandSave.py
humbleOldSage Mar 28, 2024
6d4cc3a
Import format changed + Whitespace / Newline fixes
Nov 18, 2024
419babb
Reverting changes; utility functions already present in clustering.py
Nov 18, 2024
e7f5d21
Cleaned up TestForestModelLoadandSave.py
Nov 21, 2024
6daf0b8
Using train / test data split + Added value-check tests + Reduced ins…
Nov 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion conf/analysis/trip_model.conf.json.sample
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"model_type": "greedy",
"model_type": "forest",
"model_storage": "document_database",
"minimum_trips": 14,
"model_parameters": {
Expand All @@ -8,6 +8,24 @@
"similarity_threshold_meters": 500,
"apply_cutoff": false,
"incremental_evaluation": false
},
"forest": {
"loc_feature" : "coordinates",
"radius": 100,
"size_thresh":1,
"purity_thresh":1.0,
"gamma":0.05,
"C":1,
"n_estimators":100,
"criterion":"gini",
"max_depth":null,
"min_samples_split":2,
"min_samples_leaf":1,
"max_features":"sqrt",
"bootstrap":true,
"random_state":42,
"use_start_clusters":false,
"use_trip_clusters":true
}
}
}
30 changes: 15 additions & 15 deletions emission/analysis/modelling/trip_model/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import logging

# import clustering algorithms
import sklearn.metrics.pairwise as smp
import sklearn.cluster as sc
from sklearn import metrics
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.metrics.pairwise as sklmp
import sklearn.cluster as sklc
import sklearn.metrics.cluster as sklmc
import sklearn.svm as skls
import sklearn.pipeline as sklpl
import sklearn.preprocessing as sklpp

# our imports
# NOTE: this requires changing the branch of e-mission-server to
Expand Down Expand Up @@ -102,7 +102,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(loc_df, loc_type)

for r in radii:
model = sc.DBSCAN(r, metric="precomputed",
model = sklc.DBSCAN(r, metric="precomputed",
min_samples=min_samples).fit(dist_matrix_meters)
labels = model.labels_
# print(model.n_features_in_)
Expand Down Expand Up @@ -150,7 +150,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(loc_df, loc_type)

for r in radii:
labels = sc.OPTICS(
labels = sklc.OPTICS(
min_samples=optics_min_samples,
max_eps=r,
xi=optics_xi,
Expand Down Expand Up @@ -178,7 +178,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(p_loc_df, loc_type)

for r in radii:
labels = sc.DBSCAN(
labels = sklc.DBSCAN(
r, metric="precomputed",
min_samples=min_samples).fit(dist_matrix_meters).labels_

Expand Down Expand Up @@ -231,7 +231,7 @@ def add_loc_clusters(
# what the bandwidth roughly corresponds to in the real world/make
# the value a little more interpretable.
LATLON_TO_M = 1 / 111139
labels = sc.MeanShift(
labels = sklc.MeanShift(
bandwidth=LATLON_TO_M * r,
min_bin_freq=min_samples,
cluster_all=False,
Expand Down Expand Up @@ -325,9 +325,9 @@ def add_loc_SVM(loc_df,
]]
y_train = labeled_points_in_cluster.purpose_confirm.to_list()

labels = make_pipeline(
StandardScaler(),
svm.SVC(
labels = sklpl.make_pipeline(
sklpp.StandardScaler(),
skls.SVC(
kernel='rbf',
gamma=svm_gamma,
C=svm_C,
Expand Down Expand Up @@ -381,7 +381,7 @@ def get_distance_matrix(loc_df, loc_type):
radians_lat_lon = np.radians(loc_df[[loc_type + "_lat", loc_type + "_lon"]])

dist_matrix_meters = pd.DataFrame(
smp.haversine_distances(radians_lat_lon, radians_lat_lon) *
sklmp.haversine_distances(radians_lat_lon, radians_lat_lon) *
EARTH_RADIUS)
return dist_matrix_meters

Expand All @@ -404,7 +404,7 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'):


def purity_score(y_true, y_pred):
contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
contingency_matrix = sklmc.contingency_matrix(y_true, y_pred)
purity = np.sum(np.amax(contingency_matrix,
axis=0)) / np.sum(contingency_matrix)
return purity
197 changes: 197 additions & 0 deletions emission/analysis/modelling/trip_model/forest_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
import joblib
from typing import Dict, List, Optional, Tuple
import sklearn.metrics.pairwise as smp
import emission.core.wrapper.confirmedtrip as ecwc
import logging
from io import BytesIO

import json
import emission.analysis.modelling.trip_model.trip_model as eamuu
import emission.analysis.modelling.trip_model.config as eamtc
import emission.storage.timeseries.builtin_timeseries as estb
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.modelling.trip_model.models as eamtm

EARTH_RADIUS = 6371000

class ForestClassifierModel(eamuu.TripModel):

def __init__(self,config=None):

if config is None:
config = eamtc.get_config_value_or_raise('model_parameters.forest')
logging.debug(f'ForestClassifier loaded model config from file')
else:
logging.debug(f'ForestClassifier using model config argument')

random_forest_expected_keys = [
'loc_feature',
'n_estimators',
'criterion',
'min_samples_split',
'min_samples_leaf',
'max_features',
'bootstrap',
]
######### Not Tested #########
# The below code is used when we cluster the coordinates (loc_cluster parameter = True)
# before passing to Random Forest. Commenting this for now since it is not used. Not tested either.
###############################

# cluster_expected_keys= [
# 'radius',
# 'size_thresh',
# 'purity_thresh',
# 'gamma',
# 'C',
# 'use_start_clusters',
# 'use_trip_clusters',
# ]
#
# if config['loc_feature'] == 'cluster':
# for k in cluster_expected_keys:
# if config.get(k) is None:
# msg = f"cluster trip model config missing expected key {k}"
# raise KeyError(msg)
#######################################
for k in random_forest_expected_keys:
if config.get(k) is None:
msg = f"forest trip model config missing expected key {k}"
raise KeyError(msg)
self.model=eamtm.ForestClassifier(**config)


def fit(self,trips: List[ecwc.Confirmedtrip]):
'''
trips : List of Entry type data
'''
# check and raise exception if no data to fit
logging.debug(f'fit called with {len(trips)} trips')

unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips))
if len(unlabeled) > 0:
msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}'
raise Exception(msg)

#Convert List of Entry to dataframe
data_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips)
labeled_trip_df = esdtq.filter_labeled_trips(data_df)
expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df)
#fit models on dataframe
self.model.fit(expanded_labeled_trip_df)


def predict(self, trip: List[float]) -> Tuple[List[Dict], int]:
'''
trip : A single trip whose mode, pupose and replaced mode are required
returns.
'''

#check if theres no trip to predict
logging.debug(f"forest classifier predict called with {len(trip)} trips")
if len(trip) == 0:
msg = f'model.predict cannot be called with an empty trip'
raise Exception(msg)
# CONVERT TRIP TO dataFrame
test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",[trip])
predcitions_df= self.model.predict(test_df)

# the predictions_df currently holds the highest probable options
# individually in all three categories. the predictions_df are in the form
#
# purpose_pred | purpose_proba | mode_pred | mode_proba | replaced_pred | replaced proba
# dog-park | 1.0 | e-bike | 0.99 | walk | 1.1
#
#
# However, to keep the trip model general, the forest model is expected to return
#
#PREDICTIONS [ {'labels': {'mode_confirm': 'e-bike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'},
# 'p': ( Currently average of the 3 probabilities)}]
labels= {
'mode_confirm': predcitions_df['mode_pred'].iloc[0],
'replaced_mode' : predcitions_df['replaced_pred'].iloc[0],
'purpose_confirm' : predcitions_df['purpose_pred'].iloc[0]
}

avg_proba = predcitions_df[['purpose_proba','mode_proba','replaced_proba']].mean(axis=1).iloc[0]
predictions =[{
'labels' : labels,
'p' : avg_proba
}]
return predictions, len(predictions)

def to_dict(self):
"""
Convert the model to a dictionary suitable for storage.
"""
data={}
attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df']

######### Not Tested #########
# The below code is used when we cluster the coordinates (loc_cluster parameter = True)
# before passing to Random Forest. Commenting this for now since it is not used. Not tested either.
###############################
# if self.model.loc_feature == 'cluster':
# ## confirm this includes all the extra encoders/models
# attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper'])

for attribute_name in attr:
if not hasattr(self.model,attribute_name):
raise ValueError(f"Attribute {attribute_name} not found in the model")

buffer=BytesIO()
try:
joblib.dump(getattr(self.model,attribute_name),buffer)
except Exception as e:
raise RuntimeError(f"Error serializing { attribute_name}: {str(e)}")
buffer.seek(0)
data[attribute_name]=buffer.getvalue()

return data

def from_dict(self,model: Dict):
"""
Load the model from a dictionary.
"""
attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df']

######### Not Tested #########
# The below code is used when we cluster the coordinates (loc_cluster parameter = True)
# before passing to Random Forest. Commenting this for now since it is not used. Not tested either.
###############################
# if self.model.loc_feature == 'cluster':
# ## TODO : confirm this includes all the extra encoders/models
# attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper'])
for attribute_name in attr:
if attribute_name not in model:
raise ValueError(f"Attribute {attribute_name} missing in the model")
try:
buffer = BytesIO(model[attribute_name])
setattr(self.model,attribute_name, joblib.load(buffer))
except Exception as e:
raise RuntimeError(f"Error deserializing { attribute_name}: {str(e)}")
# If we do not wish to raise the exception after logging the error, comment the line above

def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
"""
extract the relevant features for learning from a trip for this model instance

:param trip: the trip to extract features from
:type trip: Confirmedtrip
:return: a vector containing features to predict from
:rtype: List[float]
"""
# ForestClassifier class in models.py file handles features extraction.
pass

def is_incremental(self) -> bool:
"""
whether this model requires the complete user history to build (False),
or, if only the incremental data since last execution is required (True).

:return: if the model is incremental. the current timestamp will be recorded
in the analysis pipeline. the next call to this model will only include
trip data for trips later than the recorded timestamp.
:rtype: bool
"""
pass
18 changes: 10 additions & 8 deletions emission/analysis/modelling/trip_model/model_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import emission.analysis.modelling.trip_model.trip_model as eamuu
import emission.analysis.modelling.similarity.od_similarity as eamso
import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug
import emission.analysis.modelling.trip_model.forest_classifier as eamuf


SIMILARITY_THRESHOLD_METERS=500
Expand All @@ -11,6 +12,7 @@
class ModelType(Enum):
# ENUM_NAME_CAPS = 'SHORTHAND_NAME_CAPS'
GREEDY_SIMILARITY_BINNING = 'GREEDY'
RANDOM_FOREST_CLASSIFIER = 'FOREST'

def build(self, config=None) -> eamuu.TripModel:
"""
Expand All @@ -24,16 +26,16 @@ def build(self, config=None) -> eamuu.TripModel:
:raises KeyError: if the requested model name does not exist
"""
# Dict[ModelType, TripModel]
MODELS = {
ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config)
}
MODELS = {
ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning,
ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifierModel
}
model = MODELS.get(self)
if model is None:
model_names = list(lambda e: e.name, MODELS.keys())
models = ",".join(model_names)
raise KeyError(f"ModelType {self.name} not found in factory, please add to build method")

return model
available_models = ', '.join([ e.name for e in ModelType])
raise KeyError(f"ModelType {self.name} not found in factory, Available models are {available_models}."\
"Otherwise please add new model to build method")
return model(config)

@classmethod
def names(cls):
Expand Down
Loading
Loading