e-mission · MukuFlash03 · Aug 12, 2022 · Dec 16, 2022 · Feb 15, 2023 · Aug 22, 2023
diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample
@@ -1,5 +1,5 @@
 {
-    "model_type": "greedy",
+    "model_type": "forest",
     "model_storage": "document_database",
     "minimum_trips": 14,
     "model_parameters": {
@@ -8,6 +8,24 @@
             "similarity_threshold_meters": 500,
             "apply_cutoff": false,
             "incremental_evaluation": false
+        },
+        "forest": {
+            "loc_feature" : "coordinates",
+            "radius": 100,
+            "size_thresh":1,
+            "purity_thresh":1.0,
+            "gamma":0.05,
+            "C":1,
+            "n_estimators":100,
+            "criterion":"gini",
+            "max_depth":null,
+            "min_samples_split":2,
+            "min_samples_leaf":1,
+            "max_features":"sqrt",
+            "bootstrap":true,
+            "random_state":42,
+            "use_start_clusters":false,
+            "use_trip_clusters":true            
         }
     }
 }
diff --git a/emission/analysis/modelling/trip_model/clustering.py b/emission/analysis/modelling/trip_model/clustering.py
@@ -6,12 +6,12 @@
 import logging
 
 # import clustering algorithms
-import sklearn.metrics.pairwise as smp
-import sklearn.cluster as sc
-from sklearn import metrics
-from sklearn import svm
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
+import sklearn.metrics.pairwise as sklmp
+import sklearn.cluster as sklc
+import sklearn.metrics.cluster as sklmc
+import sklearn.svm as skls
+import sklearn.pipeline as sklpl
+import sklearn.preprocessing as sklpp
 
 # our imports
 # NOTE: this requires changing the branch of e-mission-server to
@@ -102,7 +102,7 @@ def add_loc_clusters(
         dist_matrix_meters = get_distance_matrix(loc_df, loc_type)
 
         for r in radii:
-            model = sc.DBSCAN(r, metric="precomputed",
+            model = sklc.DBSCAN(r, metric="precomputed",
                               min_samples=min_samples).fit(dist_matrix_meters)
             labels = model.labels_
             # print(model.n_features_in_)
@@ -150,7 +150,7 @@ def add_loc_clusters(
         dist_matrix_meters = get_distance_matrix(loc_df, loc_type)
 
         for r in radii:
-            labels = sc.OPTICS(
+            labels = sklc.OPTICS(
                 min_samples=optics_min_samples,
                 max_eps=r,
                 xi=optics_xi,
@@ -178,7 +178,7 @@ def add_loc_clusters(
             dist_matrix_meters = get_distance_matrix(p_loc_df, loc_type)
 
             for r in radii:
-                labels = sc.DBSCAN(
+                labels = sklc.DBSCAN(
                     r, metric="precomputed",
                     min_samples=min_samples).fit(dist_matrix_meters).labels_
 
@@ -231,7 +231,7 @@ def add_loc_clusters(
             # what the bandwidth roughly corresponds to in the real world/make
             # the value a little more interpretable.
             LATLON_TO_M = 1 / 111139
-            labels = sc.MeanShift(
+            labels = sklc.MeanShift(
                 bandwidth=LATLON_TO_M * r,
                 min_bin_freq=min_samples,
                 cluster_all=False,
@@ -325,9 +325,9 @@ def add_loc_SVM(loc_df,
                 ]]
                 y_train = labeled_points_in_cluster.purpose_confirm.to_list()
 
-                labels = make_pipeline(
-                    StandardScaler(),
-                    svm.SVC(
+                labels = sklpl.make_pipeline(
+                    sklpp.StandardScaler(),
+                    skls.SVC(
                         kernel='rbf',
                         gamma=svm_gamma,
                         C=svm_C,
@@ -381,7 +381,7 @@ def get_distance_matrix(loc_df, loc_type):
     radians_lat_lon = np.radians(loc_df[[loc_type + "_lat", loc_type + "_lon"]])
 
     dist_matrix_meters = pd.DataFrame(
-        smp.haversine_distances(radians_lat_lon, radians_lat_lon) *
+        sklmp.haversine_distances(radians_lat_lon, radians_lat_lon) *
         EARTH_RADIUS)
     return dist_matrix_meters
 
@@ -404,7 +404,7 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'):
 
 
 def purity_score(y_true, y_pred):
-    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
+    contingency_matrix = sklmc.contingency_matrix(y_true, y_pred)
     purity = np.sum(np.amax(contingency_matrix,
                             axis=0)) / np.sum(contingency_matrix)
     return purity
diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py
@@ -0,0 +1,197 @@
+import joblib
+from typing import Dict, List, Optional, Tuple
+import sklearn.metrics.pairwise as smp 
+import emission.core.wrapper.confirmedtrip as ecwc
+import logging
+from io import BytesIO
+
+import json
+import emission.analysis.modelling.trip_model.trip_model as eamuu
+import emission.analysis.modelling.trip_model.config as eamtc
+import emission.storage.timeseries.builtin_timeseries as estb
+import emission.storage.decorations.trip_queries as esdtq
+import emission.analysis.modelling.trip_model.models as eamtm
+
+EARTH_RADIUS = 6371000
+
+class ForestClassifierModel(eamuu.TripModel):
+
+    def __init__(self,config=None):
+
+        if config is None:
+            config = eamtc.get_config_value_or_raise('model_parameters.forest')
+            logging.debug(f'ForestClassifier loaded model config from file')
+        else:
+            logging.debug(f'ForestClassifier using model config argument')
+
+        random_forest_expected_keys = [
+            'loc_feature',
+            'n_estimators',
+            'criterion',
+            'min_samples_split',
+            'min_samples_leaf',
+            'max_features',
+            'bootstrap',
+        ]     
+        ######### Not Tested #########
+        # The below code is used when we cluster the coordinates (loc_cluster parameter = True)
+        # before passing to Random Forest. Commenting this for now since it is not used. Not tested either.
+        ###############################
+
+        # cluster_expected_keys= [
+        #     'radius',
+        #     'size_thresh',  
+        #     'purity_thresh',
+        #     'gamma',
+        #     'C',
+        #     'use_start_clusters',
+        #     'use_trip_clusters',
+        # ]
+        #
+        # if config['loc_feature'] == 'cluster':
+        #     for k in cluster_expected_keys:
+        #         if config.get(k) is None:
+        #             msg = f"cluster trip model config missing expected key {k}"
+        #             raise KeyError(msg)
+        #######################################
+        for k in random_forest_expected_keys:
+            if config.get(k) is None:
+                msg = f"forest trip model config missing expected key {k}"
+                raise KeyError(msg)   
+        self.model=eamtm.ForestClassifier(**config)
+
+
+    def fit(self,trips: List[ecwc.Confirmedtrip]):
+        '''
+        trips : List of Entry type data 
+        '''
+        # check and raise exception if no data to fit
+        logging.debug(f'fit called with {len(trips)} trips')
+
+        unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips))
+        if len(unlabeled) > 0:
+            msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}'
+            raise Exception(msg)    
+
+        #Convert List of Entry to dataframe     
+        data_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips)
+        labeled_trip_df = esdtq.filter_labeled_trips(data_df)
+        expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df)
+        #fit models on dataframe
+        self.model.fit(expanded_labeled_trip_df)       
+
+
+    def predict(self, trip: List[float]) -> Tuple[List[Dict], int]:
+        '''
+        trip : A single trip whose mode, pupose and replaced mode are required
+        returns.
+        '''
+
+        #check if theres no trip to predict        
+        logging.debug(f"forest classifier predict called with {len(trip)} trips")
+        if len(trip) == 0:
+            msg = f'model.predict cannot be called with an empty trip'
+            raise Exception(msg)        
+        # CONVERT TRIP TO dataFrame        
+        test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",[trip])
+        predcitions_df= self.model.predict(test_df)
+
+        # the predictions_df currently holds the highest probable options
+        # individually in all three categories. the predictions_df are in the form 
+        #
+        # purpose_pred | purpose_proba | mode_pred | mode_proba | replaced_pred | replaced proba 
+        # dog-park     |   1.0         |  e-bike   | 0.99       | walk          | 1.1 
+        #
+        #
+        # However, to keep the trip model general, the forest model is expected to return  
+        #
+        #PREDICTIONS [  {'labels': {'mode_confirm': 'e-bike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'},
+        #                 'p':  ( Currently average of the 3 probabilities)}]
+        labels= {
+            'mode_confirm': predcitions_df['mode_pred'].iloc[0],
+            'replaced_mode' : predcitions_df['replaced_pred'].iloc[0],
+            'purpose_confirm' : predcitions_df['purpose_pred'].iloc[0]
+            }
+
+        avg_proba = predcitions_df[['purpose_proba','mode_proba','replaced_proba']].mean(axis=1).iloc[0]
+        predictions =[{
+            'labels' : labels,
+            'p' : avg_proba
+        }]
+        return predictions, len(predictions)
+
+    def to_dict(self):
+        """
+        Convert the model to a dictionary suitable for storage.
+        """
+        data={}
+        attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df']
+
+        ######### Not Tested #########
+        # The below code is used when we cluster the coordinates (loc_cluster parameter = True)
+        # before passing to Random Forest. Commenting this for now since it is not used. Not tested either.
+        ###############################
+        # if self.model.loc_feature == 'cluster':
+        #     ## confirm this includes all the extra encoders/models
+        #     attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper'])
+
+        for attribute_name in attr:
+            if not hasattr(self.model,attribute_name):
+                raise ValueError(f"Attribute {attribute_name} not found in the model")
+
+            buffer=BytesIO()
+            try:
+                joblib.dump(getattr(self.model,attribute_name),buffer)
+            except Exception as e:
+                raise RuntimeError(f"Error serializing { attribute_name}: {str(e)}")    
+            buffer.seek(0)
+            data[attribute_name]=buffer.getvalue()
+
+        return data
+
+    def from_dict(self,model: Dict):
+        """
+        Load the model from a dictionary.
+        """
+        attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df']
+
+        ######### Not Tested #########
+        # The below code is used when we cluster the coordinates (loc_cluster parameter = True)
+        # before passing to Random Forest. Commenting this for now since it is not used. Not tested either.
+        ###############################
+        # if self.model.loc_feature == 'cluster':
+        #     ## TODO : confirm this includes all the extra encoders/models
+        #     attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper'])
+        for attribute_name in attr:
+            if attribute_name not in model:
+                raise ValueError(f"Attribute {attribute_name} missing in the model")
+            try:
+                buffer = BytesIO(model[attribute_name])
+                setattr(self.model,attribute_name, joblib.load(buffer))
+            except Exception as e:
+                raise RuntimeError(f"Error deserializing { attribute_name}: {str(e)}")    
+                # If we do not wish to raise the exception after logging the error, comment the line above
+
+    def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
+        """
+        extract the relevant features for learning from a trip for this model instance
+
+        :param trip: the trip to extract features from
+        :type trip: Confirmedtrip
+        :return: a vector containing features to predict from
+        :rtype: List[float]
+        """
+        # ForestClassifier class in models.py file handles features extraction.
+        pass
+
+    def is_incremental(self) -> bool:
+        """
+        whether this model requires the complete user history to build (False),
+        or, if only the incremental data since last execution is required (True).
+
+        :return: if the model is incremental. the current timestamp will be recorded
+        in the analysis pipeline. the next call to this model will only include 
+        trip data for trips later than the recorded timestamp.
+        :rtype: bool
+        """
+        pass
diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py
@@ -3,6 +3,7 @@
 import emission.analysis.modelling.trip_model.trip_model as eamuu
 import emission.analysis.modelling.similarity.od_similarity as eamso
 import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug
+import emission.analysis.modelling.trip_model.forest_classifier as eamuf
 
 
 SIMILARITY_THRESHOLD_METERS=500
@@ -11,6 +12,7 @@
 class ModelType(Enum):
     # ENUM_NAME_CAPS = 'SHORTHAND_NAME_CAPS'
     GREEDY_SIMILARITY_BINNING = 'GREEDY'
+    RANDOM_FOREST_CLASSIFIER = 'FOREST'
 
     def build(self, config=None) -> eamuu.TripModel:
         """
@@ -24,16 +26,16 @@ def build(self, config=None) -> eamuu.TripModel:
         :raises KeyError: if the requested model name does not exist
         """
         # Dict[ModelType, TripModel]
-        MODELS = {
-                ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config)
-        }
+        MODELS = {                
+                ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning,
+                ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifierModel
+                }    
         model = MODELS.get(self)
         if model is None:
-            model_names = list(lambda e: e.name, MODELS.keys())
-            models = ",".join(model_names)
-            raise KeyError(f"ModelType {self.name} not found in factory, please add to build method")
-
-        return model
+            available_models = ', '.join([ e.name for e in ModelType])
+            raise KeyError(f"ModelType {self.name} not found in factory, Available models are {available_models}."\
+                           "Otherwise please add new model to build method")
+        return model(config)
 
     @classmethod
     def names(cls):