From 3e87e500dddaa1a19582171e31cb7f1a36730b7a Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 12 May 2022 13:36:00 -0600 Subject: [PATCH 01/46] checkpoint: initial work on clustering abstraction --- .../__init__.py | 0 .../greedy_similarity_clustering.py | 142 ++++++++++++++++++ .../prediction.py | 15 ++ .../probabilistic_clustering_model.py | 67 +++++++++ .../probabilistic_clustering_model/util.py | 63 ++++++++ .../analysis/modelling/similarity/__init__.py | 0 .../confirmed_trip_feature_extraction.py | 28 ++++ .../modelling/similarity/od_similarity.py | 26 ++++ .../modelling/similarity/similarity_metric.py | 46 ++++++ 9 files changed, 387 insertions(+) create mode 100644 emission/analysis/modelling/probabilistic_clustering_model/__init__.py create mode 100644 emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py create mode 100644 emission/analysis/modelling/probabilistic_clustering_model/prediction.py create mode 100644 emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py create mode 100644 emission/analysis/modelling/probabilistic_clustering_model/util.py create mode 100644 emission/analysis/modelling/similarity/__init__.py create mode 100644 emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py create mode 100644 emission/analysis/modelling/similarity/od_similarity.py create mode 100644 emission/analysis/modelling/similarity/similarity_metric.py diff --git a/emission/analysis/modelling/probabilistic_clustering_model/__init__.py b/emission/analysis/modelling/probabilistic_clustering_model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py b/emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py new file mode 100644 index 000000000..e04fe6411 --- /dev/null +++ 
b/emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py @@ -0,0 +1,142 @@ + +import logging +from pathlib import Path +from typing import Callable, List, Optional, Tuple +from emission.analysis.modelling.probabilistic_clustering_model.prediction import Prediction +from emission.analysis.modelling.probabilistic_clustering_model.probabilistic_clustering_model import ProbabilisticClusteringModel +from emission.analysis.modelling.similarity.similarity_metric import SimilarityMetric +from emission.analysis.modelling.tour_model.similarity import similarity +from emission.analysis.modelling.tour_model_first_only.load_predict import loadModelStage +import emission.analysis.modelling.tour_model.data_preprocessing as preprocess +from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.analysis.modelling.probabilistic_clustering_model.util as util +import emission.analysis.modelling.similarity.confirmed_trip_feature_extraction as ctfe + +RADIUS=500 + + +class GreedySimilarityClustering(ProbabilisticClusteringModel): + + def __init__(self, dir: Path, user_id: str, metric: SimilarityMetric, sim_thresh: float, apply_cutoff: bool = True) -> None: + """instantiate a clustering model for a user + + :param dir: the model load/save directory + :type dir: Path + :param user_id: identity (UUID) of the e-mission user + :type user_id: str + :param metric: type of similarity metric to use + :type metric: SimilarityMetric + :param sim_thresh: max distance threshold for similarity (assumed meters) + :type sim_thresh: float + :param apply_cutoff: ignore clusters which are small, based on a "knee point" heuristic (default True) + :type apply_cutoff: bool + """ + super().__init__() + self.directory = dir + self.user_id = user_id + self.metric = metric + self.sim_thresh = sim_thresh + self.apply_cutoff = apply_cutoff + self.trip_locations_by_bin = {} + self.trip_labels_by_bin = {} + self.loaded = False + + + def load(self, user_id: 
str): + self.trip_locations_by_bin = util.load_json_model_stage('locations_first_round_' + str(user_id)) + self.trip_labels_by_bin = util.load_json_model_stage('user_labels_first_round_' + str(user_id)) + self.loaded = True + + def save(self, user_id: str): + """save this model to disk for the given user + + :param user_id: id for the user associated with this model + :type user_id: str + """ + + # see load_predict.py save_models as called by build_user_model + + pass + + def fit(self, data: List[Confirmedtrip], labels: List[int]): + """train the model by passing data, where each row in the data + corresponds to a label at the matching index of the label input + + :param data: 2D array of features to train from + :type data: List[Confirmedtrip] + :param labels: vector of labels associated with the input data + :type labels: List[int] + """ + self.trip_labels_by_bin = {} + self.trip_locations_by_bin = {} + num_bins = 0 + + # assign bins to all trips + for trip in data: + trip_location = self.metric.extract_features(trip) + trip_labels = ctfe.label_features(trip) + bin_id = self.find_bin(trip) + if bin_id is not None: + # add to existing bin + self.trip_locations_by_bin[bin_id].append(trip_location) + self.trip_labels_by_bin[bin_id].append(trip_labels) + else: + # create new bin + new_bin_id = num_bins + self.trip_locations_by_bin[new_bin_id] = [trip_location] + self.trip_labels_by_bin[new_bin_id] = [trip_labels] + num_bins += 1 + + if len(self.trip_locations_by_bin) > 1 and self.apply_cutoff: + # apply the cutoff heuristic to reduce over-fitting to small clusters + bin_sizes = [len(b) for b in self.trip_locations_by_bin.values()] + cutoff_idx, cutoff_value = util.find_knee_point(bin_sizes) + logging.debug("bins = %s, elbow distance = %s" % (num_bins, cutoff_value)) + + # bins = bins.sort(key=lambda bin: len(bin), reverse=True) + + # see similarity.py fit method + # if self.cutoff: + # self.delete_bins() + # self.labels_ = self.get_result_labels() + + # see 
build_save_model for creating the self.user_labels + + pass + + def predict(self, trip: Confirmedtrip) -> Tuple[List[Prediction], int]: + if not self.loaded: + msg = ( + "predict called on unloaded clustering model " + f"for user {self.user_id}" + ) + raise IOError(msg) + + logging.debug(f"running greedy similarity clustering") + selected_bin = self.find_bin(trip) + if selected_bin is None: + logging.debug(f"unable to predict bin for trip {trip}") + return [], -1 + else: + labels = self.trip_labels_by_bin[selected_bin] + logging.debug(f"found cluster {selected_bin} with labels {labels}") + return labels, len(self.trip_locations_by_bin[selected_bin]) + + def find_bin(self, trip: Confirmedtrip) -> Optional[int]: + """finds a bin which contains at least one matching feature + + :param trip: incoming trip features to test with + :type trip: Confirmedtrip + :return: either a bin number, or, None if no match was found + :rtype: Optional[int] + """ + trip_features = self.metric.extract_features(trip) + bin_ids = list(self.trip_locations_by_bin.keys()) + selected_bin = None + for bin_id in bin_ids: + this_bin = self.trip_locations_by_bin[bin_id] + for binned_features in this_bin: + if self.metric.similar(trip_features, binned_features, self.sim_thresh): + selected_bin = bin_id + break + return selected_bin \ No newline at end of file diff --git a/emission/analysis/modelling/probabilistic_clustering_model/prediction.py b/emission/analysis/modelling/probabilistic_clustering_model/prediction.py new file mode 100644 index 000000000..ed7311b79 --- /dev/null +++ b/emission/analysis/modelling/probabilistic_clustering_model/prediction.py @@ -0,0 +1,15 @@ +from typing import Dict, TypedDict + +# something like this: +# x = {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, 'p': 0.9333333333333333} + +class Prediction(TypedDict): + labels: Dict[str, str] + p: float + + @classmethod + def from_dict(cls, d) -> Prediction: + labels 
= d.get('labels') + p = d.get('p') + return Prediction(labels, p +) \ No newline at end of file diff --git a/emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py b/emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py new file mode 100644 index 000000000..f21e9dbb8 --- /dev/null +++ b/emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py @@ -0,0 +1,67 @@ +from __future__ import annotations +from abc import ABCMeta, abstractmethod +from typing import List, Tuple +import numpy as np +from pathlib import Path + +from emission.analysis.modelling.probabilistic_clustering_model.prediction import Prediction + + +class ProbabilisticClusteringModel(metaclass=ABCMeta): + + @abstractmethod + def load(self, user_id: str): + """load a model from disk for the given user + + :param user_id: id for the user associated with this model + :type user_id: str + """ + + @abstractmethod + def save(self, user_id: str): + """save this model to disk for the given user + + :param user_id: id for the user associated with this model + :type user_id: str + """ + pass + + @abstractmethod + def fit(data: List[List[float]], labels: List[int]): + """train the model on data, where each row in the data + corresponds to a label at the matching index of the label input + + :param data: 2D array of features to train from + :type data: List[List[float]] + :param labels: vector of labels associated with the input data + :type labels: List[int] + """ + pass + + @abstractmethod + def predict(self, data: List[float]) -> Tuple[List[Prediction], int]: + """use this model to predict labels for some data + + :param data: a single row of features in the model's feature space + :type data: List[float] + :return: the predictions and the total count of observations + :rtype: Tuple[List[Prediction], int] + """ + pass + + +# data: List[ConfirmedTrip] +# +# for user in users: +# time: int = 
get_latest_time(user) +# data = preprocess.read_data_since_time(user, time) +# +# filtered_data = preprocess.filter_data(data, RADIUS) +# if not valid: +# ... +# filepath = file_path(user) +# model = c.load(filepath) +# X = extract_features(filtered_data) +# model.fit() +# model.save(filepath) + diff --git a/emission/analysis/modelling/probabilistic_clustering_model/util.py b/emission/analysis/modelling/probabilistic_clustering_model/util.py new file mode 100644 index 000000000..d7889c703 --- /dev/null +++ b/emission/analysis/modelling/probabilistic_clustering_model/util.py @@ -0,0 +1,63 @@ +from typing import List, Tuple, Union +import jsonpickle as jpickle +import logging +from past.utils import old_div +import numpy +from numpy.linalg import norm + + +def load_json_model_stage(filename: str, numpy_decode: bool = True) -> Union[dict, list]: + """loads some clustering model resource, assumed to be a + json object. if the file is not found, returns an empty list. + + :param filename: file name to load + :type filename: str + :param numpy_decode: if part of the data is numpy encoded + :type numpy_decode: bool + :return: json object parsed, or, an empty list + :rtype: Union[dict, list] + """ + logging.debug(f"At stage: loading model") + try: + with open(filename, "r") as f: + if numpy_decode: + # see https://jsonpickle.github.io/extensions.html + import jsonpickle.ext.numpy as jsonpickle_numpy + jsonpickle_numpy.register_handlers() + result = jpickle.loads(f.read()) + return result + except IOError: + logging.info(f"No model found at {filename}, no prediction") + return [] + +def find_knee_point(values: List[float]) -> Tuple[float, int]: + """for a list of values, find the value which represents the cut-off point + or "elbow" in the function when values are sorted. 
+ + based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 + And summarized by the statement: "A quick way of finding the elbow is to draw a + line from the first to the last point of the curve and then find the data point + that is farthest away from that line." + + :param values: list of values from which to select a cut-off + :type values: List[float] + :return: the index and value to use as a cutoff + :rtype: Tuple[int, float] + """ + N = len(values) + x = list(range(N)) + max = 0 + index = -1 + a = numpy.array([x[0], values[0]]) + b = numpy.array([x[-1], values[-1]]) + n = norm(b-a) + new_y = [] + for i in range(0, N): + p = numpy.array([x[i], values[i]]) + dist = old_div(norm(numpy.cross(p-a,p-b)),n) + new_y.append(dist) + if dist > max: + max = dist + index = i + value = values[index] + return [index, value] diff --git a/emission/analysis/modelling/similarity/__init__.py b/emission/analysis/modelling/similarity/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py new file mode 100644 index 000000000..1ff3594ab --- /dev/null +++ b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py @@ -0,0 +1,28 @@ +from typing import Dict, List +from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.analysis.modelling.tour_model.label_processing as lp + + +def origin_features(trip: Confirmedtrip) -> List[float]: + origin = trip.data.start_loc["coordinates"] + return origin + +def destination_features(trip: Confirmedtrip) -> List[float]: + destination = trip.data.end_loc["coordinates"] + return destination + +def od_features(trip: Confirmedtrip) -> List[float]: + o_lat, o_lon = origin_features(trip) + d_lat, d_lon = destination_features(trip) + return [o_lat, o_lon, d_lat, d_lon] + +def distance_feature(trip: Confirmedtrip) -> 
List[float]: + return [trip.data.distance] + +def duration_feature(trip: Confirmedtrip) -> List[float]: + return [trip.data.duration] + +def label_features(trip: Confirmedtrip) -> Dict: + labels = trip.data.user_input + labels_normalized = lp.map_labels(labels) # could be replaced by localization logic + return labels_normalized diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py new file mode 100644 index 000000000..12d0c12c8 --- /dev/null +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -0,0 +1,26 @@ +from typing import List +from emission.analysis.modelling.similarity.similarity_metric import SimilarityMetric +import emission.analysis.modelling.similarity.confirmed_trip_feature_extraction as ctfe +from emission.analysis.modelling.tour_model.similarity import similarity +from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.core.common as ecc + + +class OriginDestinationSimilarity(SimilarityMetric): + """ + similarity metric which compares, for two trips, + the distance for origin to origin, and destination to destination + """ + + def extract_features(self, trip: Confirmedtrip) -> List[float]: + return ctfe.od_features(trip) + + def similarity(self, a: List[float], b: List[float], thresh: float) -> List[float]: + o_dist = ecc.calDistance([a[0], a[1]], [b[0], b[1]]) + d_dist = ecc.calDistance([a[2], a[3]], [b[2], b[3]]) + return [o_dist, d_dist] + + def similar(self, a: List[float], b: List[float], thresh: float) -> bool: + o_dist, d_dist = self.similarity(a, b) + is_similar = o_dist <= thresh and d_dist <= thresh + return is_similar \ No newline at end of file diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py new file mode 100644 index 000000000..8dfab1902 --- /dev/null +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -0,0 +1,46 @@ +from abc 
import ABCMeta, abstractmethod +from typing import List + +from emission.core.wrapper.confirmedtrip import Confirmedtrip + + +class SimilarityMetric(metaclass=ABCMeta): + + @abstractmethod + def extract_features(self, trip: Confirmedtrip) -> List[float]: + """extracts the features we want to compare for similarity + + :param trip: a confirmed trip + :type trip: Confirmedtrip + :return: the features to compare + :rtype: List[float] + """ + pass + + def similarity(self, a: List[float], b: List[float]) -> List[float]: + """compares the features, producing their similarity + as computed by this similarity metric + + :param a: features for a trip + :type a: List[float] + :param b: features for another trip + :type b: List[float] + :return: for each feature, the similarity of these features + :rtype: List[float] + """ + pass + + def similar(self, a: List[float], b: List[float], thresh: float) -> bool: + """compares the features, returning true if they are similar + within some threshold + + :param a: features for a trip + :type a: List[float] + :param b: features for another trip + :type b: List[float] + :param thresh: threshold for similarity + :type thresh: float + :return: true if the feature similarity is within some threshold + :rtype: float + """ + pass \ No newline at end of file From a1e75a6de155b32c17849fb2da7d803bb27a09a1 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 19 May 2022 14:37:12 -0600 Subject: [PATCH 02/46] user label prediction refactor --- .gitignore | 1 + .../greedy_similarity_clustering.py | 142 ------------ .../prediction.py | 15 -- .../probabilistic_clustering_model.py | 67 ------ .../probabilistic_clustering_model/util.py | 63 ----- .../modelling/similarity/od_similarity.py | 3 +- .../__init__.py | 0 .../modelling/user_label_model/bin_record.py | 26 +++ .../greedy_similarity_binning.py | 215 ++++++++++++++++++ .../user_label_model/model_storage.py | 69 ++++++ .../modelling/user_label_model/model_type.py | 23 ++ 
.../modelling/user_label_model/prediction.py | 12 + .../modelling/user_label_model/run_model.py | 201 ++++++++++++++++ .../user_label_prediction_model.py | 81 +++++++ .../modelling/user_label_model/util.py | 130 +++++++++++ ...reedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 | 1 + 16 files changed, 761 insertions(+), 288 deletions(-) delete mode 100644 emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py delete mode 100644 emission/analysis/modelling/probabilistic_clustering_model/prediction.py delete mode 100644 emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py delete mode 100644 emission/analysis/modelling/probabilistic_clustering_model/util.py rename emission/analysis/modelling/{probabilistic_clustering_model => user_label_model}/__init__.py (100%) create mode 100644 emission/analysis/modelling/user_label_model/bin_record.py create mode 100644 emission/analysis/modelling/user_label_model/greedy_similarity_binning.py create mode 100644 emission/analysis/modelling/user_label_model/model_storage.py create mode 100644 emission/analysis/modelling/user_label_model/model_type.py create mode 100644 emission/analysis/modelling/user_label_model/prediction.py create mode 100644 emission/analysis/modelling/user_label_model/run_model.py create mode 100644 emission/analysis/modelling/user_label_model/user_label_prediction_model.py create mode 100644 emission/analysis/modelling/user_label_model/util.py create mode 100644 user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 diff --git a/.gitignore b/.gitignore index e1a0cf045..1b467ec07 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.swp *debug.log .DS_Store +.vscode CFC_WebApp/config.json CFC_WebApp/keys.json diff --git a/emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py b/emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py deleted file mode 100644 index 
e04fe6411..000000000 --- a/emission/analysis/modelling/probabilistic_clustering_model/greedy_similarity_clustering.py +++ /dev/null @@ -1,142 +0,0 @@ - -import logging -from pathlib import Path -from typing import Callable, List, Optional, Tuple -from emission.analysis.modelling.probabilistic_clustering_model.prediction import Prediction -from emission.analysis.modelling.probabilistic_clustering_model.probabilistic_clustering_model import ProbabilisticClusteringModel -from emission.analysis.modelling.similarity.similarity_metric import SimilarityMetric -from emission.analysis.modelling.tour_model.similarity import similarity -from emission.analysis.modelling.tour_model_first_only.load_predict import loadModelStage -import emission.analysis.modelling.tour_model.data_preprocessing as preprocess -from emission.core.wrapper.confirmedtrip import Confirmedtrip -import emission.analysis.modelling.probabilistic_clustering_model.util as util -import emission.analysis.modelling.similarity.confirmed_trip_feature_extraction as ctfe - -RADIUS=500 - - -class GreedySimilarityClustering(ProbabilisticClusteringModel): - - def __init__(self, dir: Path, user_id: str, metric: SimilarityMetric, sim_thresh: float, apply_cutoff: bool = True) -> None: - """instantiate a clustering model for a user - - :param dir: the model load/save directory - :type dir: Path - :param user_id: identity (UUID) of the e-mission user - :type user_id: str - :param metric: type of similarity metric to use - :type metric: SimilarityMetric - :param sim_thresh: max distance threshold for similarity (assumed meters) - :type sim_thresh: float - :param apply_cutoff: ignore clusters which are small, based on a "knee point" heuristic (default True) - :type apply_cutoff: bool - """ - super().__init__() - self.directory = dir - self.user_id = user_id - self.metric = metric - self.sim_thresh = sim_thresh - self.apply_cutoff = apply_cutoff - self.trip_locations_by_bin = {} - self.trip_labels_by_bin = {} - self.loaded = 
False - - - def load(self, user_id: str): - self.trip_locations_by_bin = util.load_json_model_stage('locations_first_round_' + str(user_id)) - self.trip_labels_by_bin = util.load_json_model_stage('user_labels_first_round_' + str(user_id)) - self.loaded = True - - def save(self, user_id: str): - """save this model to disk for the given user - - :param user_id: id for the user associated with this model - :type user_id: str - """ - - # see load_predict.py save_models as called by build_user_model - - pass - - def fit(self, data: List[Confirmedtrip], labels: List[int]): - """train the model by passing data, where each row in the data - corresponds to a label at the matching index of the label input - - :param data: 2D array of features to train from - :type data: List[Confirmedtrip] - :param labels: vector of labels associated with the input data - :type labels: List[int] - """ - self.trip_labels_by_bin = {} - self.trip_locations_by_bin = {} - num_bins = 0 - - # assign bins to all trips - for trip in data: - trip_location = self.metric.extract_features(trip) - trip_labels = ctfe.label_features(trip) - bin_id = self.find_bin(trip) - if bin_id is not None: - # add to existing bin - self.trip_locations_by_bin[bin_id].append(trip_location) - self.trip_labels_by_bin[bin_id].append(trip_labels) - else: - # create new bin - new_bin_id = num_bins - self.trip_locations_by_bin[new_bin_id] = [trip_location] - self.trip_labels_by_bin[new_bin_id] = [trip_labels] - num_bins += 1 - - if len(self.trip_locations_by_bin) > 1 and self.apply_cutoff: - # apply the cutoff heuristic to reduce over-fitting to small clusters - bin_sizes = [len(b) for b in self.trip_locations_by_bin.values()] - cutoff_idx, cutoff_value = util.find_knee_point(bin_sizes) - logging.debug("bins = %s, elbow distance = %s" % (num_bins, cutoff_value)) - - # bins = bins.sort(key=lambda bin: len(bin), reverse=True) - - # see similarity.py fit method - # if self.cutoff: - # self.delete_bins() - # self.labels_ = 
self.get_result_labels() - - # see build_save_model for creating the self.user_labels - - pass - - def predict(self, trip: Confirmedtrip) -> Tuple[List[Prediction], int]: - if not self.loaded: - msg = ( - "predict called on unloaded clustering model " - f"for user {self.user_id}" - ) - raise IOError(msg) - - logging.debug(f"running greedy similarity clustering") - selected_bin = self.find_bin(trip) - if selected_bin is None: - logging.debug(f"unable to predict bin for trip {trip}") - return [], -1 - else: - labels = self.trip_labels_by_bin[selected_bin] - logging.debug(f"found cluster {selected_bin} with labels {labels}") - return labels, len(self.trip_locations_by_bin[selected_bin]) - - def find_bin(self, trip: Confirmedtrip) -> Optional[int]: - """finds a bin which contains at least one matching feature - - :param trip: incoming trip features to test with - :type trip: Confirmedtrip - :return: either a bin number, or, None if no match was found - :rtype: Optional[int] - """ - trip_features = self.metric.extract_features(trip) - bin_ids = list(self.trip_locations_by_bin.keys()) - selected_bin = None - for bin_id in bin_ids: - this_bin = self.trip_locations_by_bin[bin_id] - for binned_features in this_bin: - if self.metric.similar(trip_features, binned_features, self.sim_thresh): - selected_bin = bin_id - break - return selected_bin \ No newline at end of file diff --git a/emission/analysis/modelling/probabilistic_clustering_model/prediction.py b/emission/analysis/modelling/probabilistic_clustering_model/prediction.py deleted file mode 100644 index ed7311b79..000000000 --- a/emission/analysis/modelling/probabilistic_clustering_model/prediction.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Dict, TypedDict - -# something like this: -# x = {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, 'p': 0.9333333333333333} - -class Prediction(TypedDict): - labels: Dict[str, str] - p: float - - @classmethod - def 
from_dict(cls, d) -> Prediction: - labels = d.get('labels') - p = d.get('p') - return Prediction(labels, p -) \ No newline at end of file diff --git a/emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py b/emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py deleted file mode 100644 index f21e9dbb8..000000000 --- a/emission/analysis/modelling/probabilistic_clustering_model/probabilistic_clustering_model.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import annotations -from abc import ABCMeta, abstractmethod -from typing import List, Tuple -import numpy as np -from pathlib import Path - -from emission.analysis.modelling.probabilistic_clustering_model.prediction import Prediction - - -class ProbabilisticClusteringModel(metaclass=ABCMeta): - - @abstractmethod - def load(self, user_id: str): - """load a model from disk for the given user - - :param user_id: id for the user associated with this model - :type user_id: str - """ - - @abstractmethod - def save(self, user_id: str): - """save this model to disk for the given user - - :param user_id: id for the user associated with this model - :type user_id: str - """ - pass - - @abstractmethod - def fit(data: List[List[float]], labels: List[int]): - """train the model on data, where each row in the data - corresponds to a label at the matching index of the label input - - :param data: 2D array of features to train from - :type data: List[List[float]] - :param labels: vector of labels associated with the input data - :type labels: List[int] - """ - pass - - @abstractmethod - def predict(self, data: List[float]) -> Tuple[List[Prediction], int]: - """use this model to predict labels for some data - - :param data: a single row of features in the model's feature space - :type data: List[float] - :return: the predictions and the total count of observations - :rtype: Tuple[List[Prediction], int] - """ - pass - - -# data: List[ConfirmedTrip] -# -# 
for user in users: -# time: int = get_latest_time(user) -# data = preprocess.read_data_since_time(user, time) -# -# filtered_data = preprocess.filter_data(data, RADIUS) -# if not valid: -# ... -# filepath = file_path(user) -# model = c.load(filepath) -# X = extract_features(filtered_data) -# model.fit() -# model.save(filepath) - diff --git a/emission/analysis/modelling/probabilistic_clustering_model/util.py b/emission/analysis/modelling/probabilistic_clustering_model/util.py deleted file mode 100644 index d7889c703..000000000 --- a/emission/analysis/modelling/probabilistic_clustering_model/util.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import List, Tuple, Union -import jsonpickle as jpickle -import logging -from past.utils import old_div -import numpy -from numpy.linalg import norm - - -def load_json_model_stage(filename: str, numpy_decode: bool = True) -> Union[dict, list]: - """loads some clustering model resource, assumed to be a - json object. if the file is not found, returns an empty list. - - :param filename: file name to load - :type filename: str - :param numpy_decode: if part of the data is numpy encoded - :type numpy_decode: bool - :return: json object parsed, or, an empty list - :rtype: Union[dict, list] - """ - logging.debug(f"At stage: loading model") - try: - with open(filename, "r") as f: - if numpy_decode: - # see https://jsonpickle.github.io/extensions.html - import jsonpickle.ext.numpy as jsonpickle_numpy - jsonpickle_numpy.register_handlers() - result = jpickle.loads(f.read()) - return result - except IOError: - logging.info(f"No model found at {filename}, no prediction") - return [] - -def find_knee_point(values: List[float]) -> Tuple[float, int]: - """for a list of values, find the value which represents the cut-off point - or "elbow" in the function when values are sorted. 
- - based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 - And summarized by the statement: "A quick way of finding the elbow is to draw a - line from the first to the last point of the curve and then find the data point - that is farthest away from that line." - - :param values: list of values from which to select a cut-off - :type values: List[float] - :return: the index and value to use as a cutoff - :rtype: Tuple[int, float] - """ - N = len(values) - x = list(range(N)) - max = 0 - index = -1 - a = numpy.array([x[0], values[0]]) - b = numpy.array([x[-1], values[-1]]) - n = norm(b-a) - new_y = [] - for i in range(0, N): - p = numpy.array([x[i], values[i]]) - dist = old_div(norm(numpy.cross(p-a,p-b)),n) - new_y.append(dist) - if dist > max: - max = dist - index = i - value = values[index] - return [index, value] diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 12d0c12c8..b55e51618 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -9,7 +9,8 @@ class OriginDestinationSimilarity(SimilarityMetric): """ similarity metric which compares, for two trips, - the distance for origin to origin, and destination to destination + the distance for origin to origin, and destination to destination, + in meters. 
""" def extract_features(self, trip: Confirmedtrip) -> List[float]: diff --git a/emission/analysis/modelling/probabilistic_clustering_model/__init__.py b/emission/analysis/modelling/user_label_model/__init__.py similarity index 100% rename from emission/analysis/modelling/probabilistic_clustering_model/__init__.py rename to emission/analysis/modelling/user_label_model/__init__.py diff --git a/emission/analysis/modelling/user_label_model/bin_record.py b/emission/analysis/modelling/user_label_model/bin_record.py new file mode 100644 index 000000000..dc9f57d9f --- /dev/null +++ b/emission/analysis/modelling/user_label_model/bin_record.py @@ -0,0 +1,26 @@ +from typing import Dict, List +from emission.analysis.modelling.user_label_model.prediction import Prediction + +# something like this: +# bin_data = { +# "predictions": [ +# {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'insurance', 'replaced_mode': 'drove_alone'}, 'p': 0.9333333333333333} +# ], +# "locations": [ +# [-122.00, 39.00, -122.01, 39.00] +# ], +# "labels": [ +# {'mode_confirm': 'shared_ride', 'purpose_confirm': 'insurance_payment', 'replaced_mode': 'drove_alone'} +# ] +# } + +BinRecord = Dict + +# todo: if OpenPATH goes to Python 3.8, we can use this: +# +# class BinRecord(TypedDict): +# predictions: List[Prediction] +# features: List[List[float]] +# labels: List[Dict[str, str]] + + diff --git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py new file mode 100644 index 000000000..5f0f00f47 --- /dev/null +++ b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py @@ -0,0 +1,215 @@ +import logging +import pandas as pd +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from emission.analysis.modelling.user_label_model.bin_record import BinRecord +from emission.analysis.modelling.user_label_model.prediction import ( + Prediction, +) +from 
emission.analysis.modelling.user_label_model.user_label_prediction_model import ( + UserLabelPredictionModel, +) +from emission.analysis.modelling.similarity.similarity_metric import SimilarityMetric +from emission.analysis.modelling.tour_model.similarity import similarity +from emission.analysis.modelling.tour_model_first_only.load_predict import ( + loadModelStage, +) +import emission.analysis.modelling.tour_model.data_preprocessing as preprocess +from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.analysis.modelling.similarity.confirmed_trip_feature_extraction as ctfe +import emission.analysis.modelling.user_label_model.util as util +import emission.analysis.modelling.tour_model.label_processing as lp + + +class GreedySimilarityBinning(UserLabelPredictionModel): + + def __init__( + self, + metric: SimilarityMetric, + sim_thresh: float, + apply_cutoff: bool = False, + ) -> None: + """ + instantiate a clustering model for a user. + + this technique employs a greedy similarity heuristic to associate + trips with collections of probabilistic class labels. in pseudocode: + + # fit + for each bin_id, bin in bins: + for each bin_trip in bin.trips: + if similar(trip, bin_trip): + append trip to bin.trips + + # prediction + for each bin_id, bin in bins: + for each bin_trip in bin.trips: + if similar(trip, bin_trip): + return bin.predictions: List[Prediction] + + the number of predictions is not assumed to be the number of features. 
+ + :param dir: the model load/save directory + :type dir: Path + :param user_id: identity (UUID) of the e-mission user + :type user_id: str + :param metric: type of similarity metric to use + :type metric: SimilarityMetric + :param sim_thresh: max distance threshold for similarity (assumed meters) + :type sim_thresh: float + :param apply_cutoff: ignore clusters which are small, based on a "knee point" heuristic (default False) + :type apply_cutoff: bool + """ + super().__init__() + self.metric = metric + self.sim_thresh = sim_thresh + self.apply_cutoff = apply_cutoff + self.bins: Dict[int, BinRecord] = {} + self.loaded = False + + def fit(self, trips: List[Confirmedtrip]): + """train the model by passing data, where each row in the data + corresponds to a label at the matching index of the label input + + :param trips: 2D array of features to train from + :type trips: List[Confirmedtrip] + """ + self.bins = {} + self._assign_bins(trips) + if len(self.bins) > 1 and self.apply_cutoff: + self._apply_cutoff() + self._generate_predictions() + logging.info(f"model fit to trip data") + + def predict(self, trip: Confirmedtrip) -> Tuple[List[Prediction], int]: + if not self.loaded: + msg = ( + "predict called on unloaded model " + f"for user {self.user_id}" + ) + raise IOError(msg) + + logging.debug(f"running greedy similarity clustering") + predicted_bin, bin_record = self._nearest_bin(trip) + if predicted_bin is None: + logging.debug(f"unable to predict bin for trip {trip}") + return [], -1 + else: + labels = bin_record['prediction'] + n_features = len(bin_record['features']) + logging.debug(f"found cluster {predicted_bin} with labels {labels}") + return labels, n_features + + def is_incremental(self) -> bool: + return False + + def to_dict(self) -> Dict: + return self.bins + + def from_dict(self, model: Dict): + self.bins = model + + def extract_features(self, trip: Confirmedtrip) -> List[float]: + features = self.metric.extract_features(trip) + return features + + def 
_assign_bins(self, trips: List[Confirmedtrip]): + """ + assigns each trip to a bin by greedy similarity search + + :param data: trips to assign to bins + :type data: List[Confirmedtrip] + """ + for trip in trips: + trip_features = self.extract_features(trip) + trip_labels = trip['data']['user_input'] + bin_id, bin_record = self._nearest_bin(trip) + if bin_id is not None: + # add to existing bin + bin_record['features'].append(trip_features) + bin_record['labels'].append(trip_labels) + else: + # create new bin + new_bin_id = len(self.bins) + new_bin_record = { + "features": [trip_features], + "labels": [trip_labels], + "predictions": [] + } + self.bins[new_bin_id] = new_bin_record + + def _nearest_bin(self, trip: Confirmedtrip) -> Tuple[Optional[int], Optional[BinRecord]]: + """ + finds a bin which contains at least one matching feature. the + first record matching by similarity measure is returned. if + none are found, (None, None) is returned. + + :param trip: incoming trip features to test with + :type trip: Confirmedtrip + :return: nearest record and bin number, if found + :rtype: Optional[Tuple[BinRecord, Int]] + """ + trip_features = self.extract_features(trip) + selected_bin = None + selected_record = None + + for bin_id, bin_record in self.bins.items(): + if self.metric.similar(trip_features, bin_record['features'], self.sim_thresh): + selected_bin = bin_id + selected_record = bin_record + break + + return selected_bin, selected_record + + def _apply_cutoff(self): + """ + removes small clusters by an "elbow search" heuristic. see + https://stackoverflow.com/a/2022348/4803266. 
+ """ + num_bins = len(self.bins) + bin_sizes = [len(bin_rec['features']) for bin_rec in self.bins.values()] + _, cutoff_bin_size = util.find_knee_point(bin_sizes) + logging.debug( + "bins = %s, elbow distance = %s" % (num_bins, cutoff_bin_size) + ) + + updated_bins = {bin_id: bin_rec + for bin_id, bin_rec in self.bins.items() + if len(bin_rec['features']) >= cutoff_bin_size} + + removed = len(self.bins) - len(updated_bins) + logging.debug( + f"removed %s bins with less than %s entries" + % (removed, cutoff_bin_size) + ) + # previous version held onto the removed bins for analysis, + # we could do that here if that use case is still relevant + self.bins = updated_bins + + def _generate_predictions(self): + """ + helper function to transform binned features and labels into predictions. + + for each bin, the unique label combinations are counted. their + probability is estimated with label_count / total_labels. + """ + for _, bin_record in self.bins: + user_label_df = pd.DataFrame(bin_record['labels']) + user_label_df = lp.map_labels(user_label_df).dropna() + # compute the sum of trips in this cluster + sum_trips = len(user_label_df) + # compute unique label sets and their probabilities in one cluster + # 'p' refers to probability + unique_labels = user_label_df.groupby(user_label_df.columns.tolist()).size().reset_index(name='uniqcount') + unique_labels['p'] = unique_labels.uniqcount / sum_trips + labels_columns = user_label_df.columns.to_list() + bin_label_combo_list = [] + for i in range(len(unique_labels)): + one_set_labels = {} + # e.g. labels_only={'mode_confirm': 'pilot_ebike', 'purpose_confirm': 'work', 'replaced_mode': 'walk'} + labels_only = {column: unique_labels.iloc[i][column] for column in labels_columns} + one_set_labels["labels"] = labels_only + one_set_labels['p'] = unique_labels.iloc[i]['p'] + # e.g. 
one_set_labels = {'labels': {'mode_confirm': 'walk', 'replaced_mode': 'walk', 'purpose_confirm': 'exercise'}, 'p': 1.0} + bin_label_combo_list.append(one_set_labels) + bin_record['predictions'] = bin_label_combo_list \ No newline at end of file diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/user_label_model/model_storage.py new file mode 100644 index 000000000..4be59c9c0 --- /dev/null +++ b/emission/analysis/modelling/user_label_model/model_storage.py @@ -0,0 +1,69 @@ +from enum import Enum +from typing import Dict +import emission.analysis.modelling.user_label_model.util as util + +class ModelStorage(Enum): + FILE_SYSTEM = 0 + DATABASE = 1 + @classmethod + def names(cls): + return list(map(lambda e: e.name, list(cls))) + + +def create_filename(user_id, model_name) -> str: + return f"user_label_model_{model_name}_{str(user_id)}" + + +def create_database_table_name(model_name) -> str: + return f"user_label_model_{model_name}" + + +def load(user_id, model_name: str, model_storage: ModelStorage) -> Dict: + """load a user label model from a model storage location + + :param user_id: the user to request a model for + :type user_id: UUID + :param model_name: _description_ + :type model_name: str + :param save_format: _description_ + :type save_format: SaveFormat + :return: _description_ + :rtype: Dict + """ + if model_storage == ModelStorage.FILE_SYSTEM: + filename = create_filename(user_id, model_name) + model_data = util.load_fs(filename) + return model_data + elif model_storage == ModelStorage.DATABASE: + table_name = create_database_table_name(model_name) + model_data = util.load_db(user_id, table_name) + return model_data + else: + storage_types_str = ",".join(ModelStorage.names()) + msg = ( + f"unknown model storage type {model_storage}, must be one of " + f"{{{storage_types_str}}}" + ) + raise TypeError(msg) + +def save(user_id, model_data: Dict, model_name: str, model_storage: ModelStorage): + try: + if 
model_storage == ModelStorage.FILE_SYSTEM: + filename = create_filename(user_id, model_name) + util.save_fs(filename, model_data) + elif model_storage == ModelStorage.DATABASE: + table_name = create_database_table_name(model_name) + util.save_db(user_id, table_name, model_data) + else: + storage_types_str = ",".join(ModelStorage.names()) + msg = ( + f"unknown model storage type {model_storage}, must be one of " + f"{{{storage_types_str}}}" + ) + raise TypeError(msg) + except IOError as e: + msg = ( + f"cannot save model for user {user_id}, model_name {model_name} " + f"to the file system" + ) + raise IOError(msg) from e diff --git a/emission/analysis/modelling/user_label_model/model_type.py b/emission/analysis/modelling/user_label_model/model_type.py new file mode 100644 index 000000000..43aad7c1f --- /dev/null +++ b/emission/analysis/modelling/user_label_model/model_type.py @@ -0,0 +1,23 @@ +from enum import Enum + + +class ModelType(Enum): + GREEDY_SIMILARITY_BINNING = 'greedy' + + @classmethod + def names(cls): + return list(map(lambda e: e.name, list(cls))) + + @property + def model_name(self): + """ + used in filenames, database tables, etc. should be + a POSIX-compliant name. + + when adding new model types, this should be set on the + right-hand side of the enum, above. 
+ + :return: a simple name for this model type + :rtype: str + """ + return self.value \ No newline at end of file diff --git a/emission/analysis/modelling/user_label_model/prediction.py b/emission/analysis/modelling/user_label_model/prediction.py new file mode 100644 index 000000000..f037f2f28 --- /dev/null +++ b/emission/analysis/modelling/user_label_model/prediction.py @@ -0,0 +1,12 @@ +from typing import Dict + +# something like this: +# x = {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, 'p': 0.9333333333333333} + +Prediction = Dict + +# todo: if OpenPATH goes to Python 3.8, we can use this: +# +# class Prediction(TypedDict): +# labels: Dict[str, str] +# p: float diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py new file mode 100644 index 000000000..d96947027 --- /dev/null +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -0,0 +1,201 @@ +import logging +from datetime import datetime +from tracemalloc import start +from typing import Optional + +import emission.storage.timeseries.abstract_timeseries as esta +from emission.analysis.modelling.similarity.od_similarity import \ + OriginDestinationSimilarity +from emission.analysis.modelling.user_label_model.greedy_similarity_binning import \ + GreedySimilarityBinning +from emission.analysis.modelling.user_label_model.model_storage import ( + ModelStorage, load, save) +from emission.analysis.modelling.user_label_model.model_type import ModelType +from emission.analysis.modelling.user_label_model.user_label_prediction_model import \ + UserLabelPredictionModel +from emission.core.wrapper.confirmedtrip import Confirmedtrip +from numpy import isin + +import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline +import emission.storage.decorations.analysis_timeseries_queries as esda +from emission.storage.timeseries.timequery import TimeQuery + 
+SIMILARITY_THRESHOLD_METERS = 500 + + +def _model_factory(model_type: ModelType): + """ + instantiates the requested user model type with the configured + parameters. if future model types are created, they should be + added here. + + :param model_type: internally-used model name + :type model_type: ModelType + :raises KeyError: if the requested model name does not exist + :return: a user label prediction model + :rtype: UserLabelPredictionModel + """ + MODELS = { + ModelType.GREEDY_SIMILARITY_BINNING: GreedySimilarityBinning( + metric=OriginDestinationSimilarity(), + sim_thresh=SIMILARITY_THRESHOLD_METERS, + apply_cutoff=False + ) + } + model = MODELS.get(model_type) + if model is None: + if not isinstance(model_type, ModelType): + raise TypeError(f"provided model type {model_type} is not an instance of ModelType") + else: + model_names = list(lambda e: e.name, MODELS.keys()) + models = ",".join(model_names) + raise KeyError(f"user label model {model_type.name} not found in factory, must be one of {{{models}}}") + return model + + +def update_user_label_model(user_id, model_type: ModelType, model_storage: ModelStorage, min_trips: int = 14): + """ + create/update a user label model for a user. + + updating the user label model occurs as a background task for the server. + trips for the user are collected and the data is fit to the requested model type. + if the model type is "incremental", only the newest trips are used. 
+ + :param user_id: id of user + :type user_id: _type_ + :param model_type: type of model to build + :type model_type: str + :param model_storage: storage destination for built model + :type model_storage: ModelStorage + :param min_trips: minimum number of labeled trips per user to apply prediction (default 14) + :type min_trips: int + """ + model = _model_factory(model_type) + model_name = model_type.model_name + + if model.is_incremental: + + # read in existing model, if it exists + model_data = load(user_id, model_name, model_storage) + model.from_dict(model_data) + + # todo: get timestamp from pipeline, use as filter in query + start_ts = -1 + end_ts = datetime.now() + time_query = TimeQuery( + timeType="data.start_ts", + startTs=start_ts, + endTs=end_ts + ) + else: + time_query = None + + trips = _get_trips_for_user(user_id, time_query, min_trips) + + # fit and store the updated model + model.fit(trips) + model_data = model.to_dict() + save(user_id, model_data, model_name, model_storage) + + if model.is_incremental: + new_timestamp = datetime.now() + # todo: update pipeline with new timestamp + + logging.debug(f"{model_name} label prediction model built for user {user_id}") + + +def predict_labels_with_n( + trip: Confirmedtrip, + model_type = ModelType.GREEDY_SIMILARITY_BINNING, + model_storage = ModelStorage.FILE_SYSTEM): + """ + invoke the user label prediction model to predict labels for a trip. 
+ + :param trip: the trip to predict labels for + :type trip: Confirmedtrip + :param model_type: type of prediction model to run + :type model_type: ModelType + :param model_storage: location to read/write models + :type model_storage: ModelStorage + :return: a list of predictions + :rtype: List[Prediction] + """ + user_id = trip['user_id'] + model = _load_user_label_model(user_id, model_type, model_storage) + predictions, n = model.predict(trip) + return predictions, n + + +def _get_trips_for_user(user_id, time_query: Optional[TimeQuery]=None, min_trips: int=14): + """ + load the labeled trip data for this user, subject to a time query. if the user + does not have at least $min_trips trips with labels, then return an empty list. + + :param user_id: user to collect trips from + :type user_id: _type_ + :param time_query: query to restrict the time (optional) + :type time_query: Optional[TimeQuery] + :param min_trips: minimum number of labeled trips required to train + :type min_trips: int + """ + trips = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=user_id, time_query=time_query) + labeled_trips = [trip for trip in trips if trip['data']['user_input'] != {}] + if not len(labeled_trips) >= min_trips: + msg = ( + f"Total: {len(trips)}, labeled: {len(labeled_trips)}, user " + f"{user_id} doesn't have enough valid trips for further analysis." 
+ ) + logging.debug(msg) + return [] + return labeled_trips + + +def _load_user_label_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> UserLabelPredictionModel: + model = _model_factory(model_type) + model_name = model_type.model_name + + model_data = load(user_id, model_name, model_storage) + model.from_dict(model_data) + + return model + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + all_users = esta.TimeSeries.get_uuid_list() + + # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round + user_id = all_users[0] + update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) + filter_trips = _get_trips_for_user(user_id) + new_trip = filter_trips[4] + # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, + # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', + # 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] + pl, _ = predict_labels_with_n(new_trip) + assert len(pl) > 0, f"Invalid prediction {pl}" + + # case 2: no existing files for the user who has the new trip: + # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) + # 2. 
the user doesn't have common trips + user_id = all_users[1] + update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) + filter_trips = _get_trips_for_user(user_id) + new_trip = filter_trips[0] + # result is [] + pl, _ = predict_labels_with_n(new_trip) + assert len(pl) == 0, f"Invalid prediction {pl}" + + # case3: the new trip is novel trip(doesn't fall in any 1st round bins) + user = all_users[0] + update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) + filter_trips = _get_trips_for_user(user_id) + new_trip = filter_trips[0] + # result is [] + pl = predict_labels_with_n(new_trip) + assert len(pl) == 0, f"Invalid prediction {pl}" + + # case 4: the new trip falls in a 1st round bin, but predict to be a new cluster in the 2nd round + # result is [] + # no example for now diff --git a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py b/emission/analysis/modelling/user_label_model/user_label_prediction_model.py new file mode 100644 index 000000000..f9ff4e21c --- /dev/null +++ b/emission/analysis/modelling/user_label_model/user_label_prediction_model.py @@ -0,0 +1,81 @@ +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple + +from emission.analysis.modelling.user_label_model.prediction import Prediction +from emission.core.wrapper.confirmedtrip import Confirmedtrip + + +class UserLabelPredictionModel(metaclass=ABCMeta): + + @abstractmethod + def fit(data: List[List[float]]): + """ + train the model on data in an unsupervised learning setting. 
+ + :param data: 2D array of features to train from + :type data: List[List[float]] + """ + pass + + @abstractmethod + def predict(self, data: List[float]) -> Tuple[List[Prediction], int]: + """use this model to predict labels for some data + + :param data: a single row of features in the model's feature space + :type data: List[float] + :return: the predictions and the total count of observations + :rtype: Tuple[List[Prediction], int] + """ + pass + + @abstractmethod + def to_dict(self) -> Dict: + """ + export the model as a python Dict, to be stored via the file + system or a document database. + + should be serializable. supported types at this time + (2022-05-19) include all built-in Python types and Numpy types. + + :return: the model as a Dict + :rtype: Dict + """ + pass + + @abstractmethod + def from_dict(self, model: Dict): + """ + import the model from a python Dict that was stored in the file + system or a database. forms a codec which should be idempotent + when composed with to_dict. + + :param model: the model as a python Dict + :type model: Dict + """ + pass + + @property + @abstractmethod + def is_incremental(self) -> bool: + """ + whether this model requires the complete user history to build (False), + or, if only the incremental data since last execution is required (True). + + :return: if the model is incremental. the current timestamp will be recorded + in the analysis pipeline. the next call to this model will only include + trip data for trips later than the recorded timestamp. 
+ :rtype: bool + """ + pass + + @abstractmethod + def extract_features(self, trip: Confirmedtrip) -> List[float]: + """ + extract the relevant features for learning from a trip for this model instance + + :param trip: the trip to extract features from + :type trip: Confirmedtrip + :return: a vector containing features to predict from + :rtype: List[float] + """ + pass diff --git a/emission/analysis/modelling/user_label_model/util.py b/emission/analysis/modelling/user_label_model/util.py new file mode 100644 index 000000000..765aadcd3 --- /dev/null +++ b/emission/analysis/modelling/user_label_model/util.py @@ -0,0 +1,130 @@ +from typing import Dict, List, Optional, Tuple +import jsonpickle as jpickle +import logging +from past.utils import old_div +import numpy +from numpy.linalg import norm + + +def load_fs(filename: str, numpy_decode: bool = True) -> dict: + """loads model state as a pickled object on the file system. + if the file is not found, returns an empty dict. + + :param filename: file name to load + :type filename: str + :param numpy_decode: if part of the data is numpy encoded + :type numpy_decode: bool + :return: json object parsed, or, an empty list + :rtype: Dict + """ + logging.debug(f"At stage: loading model") + try: + with open(filename, "r") as f: + contents = f.read() + except FileNotFoundError: + logging.info(f"No model found at {filename}, no prediction") + return {} + + try: + if numpy_decode: + # see https://jsonpickle.github.io/extensions.html + import jsonpickle.ext.numpy as jsonpickle_numpy + jsonpickle_numpy.register_handlers() + result = jpickle.loads(contents) + return result + except Exception as e: + msg = ( + f"failure decoding stored model at {filename}, " + f"numpy_decode={numpy_decode}" + ) + raise IOError(msg) from e + + +def save_fs(filename: str, obj: object): + """save model state as a pickled object on the file system + + :param filename: filename to write + :type filename: str + :param obj: the object to pickle + store + 
:type obj: object + + """ + try: + logging.debug("At stage: saving model") + obj_capsule = jpickle.dumps(obj) + with open(filename, "w") as fd: + fd.write(obj_capsule) + except Exception as e: + msg = f"failed writing clustering model contents to file system" + raise IOError(msg) from e + + +def load_db(user_id: str, table: str, timestamp: Optional[int] = None) -> Dict: + """ + loads a user label prediction model from a database table. + + data is assumed stored in a document database, with the structure: + + { "user_id": user_id, "data": model_data } + + :param user_id: user id to filter on + :type user_id: str + :param table: the table name + :type table: str + :param timestamp: optional time to + :return: + :rtype: Dict + """ + # build the time query if a timestamp is provided + time_query = lambda confirmed_trip: confirmed_trip['data']['start_ts'] >= timestamp \ + if timestamp is not None else None + pass + +def save_db(user_id, table: str, model_data: Dict): + """ + saves a user label prediction model to the database. + + data is assumed stored in a document database, with the structure: + + { "user_id": user_id, "data": model_data } + + :param user_id: the user to store data for + :type user_id: object + :param table: the table name + :type table: str + :param model_data: the data row to store tagged by this user id + :type model_data: Dict + """ + pass + +def find_knee_point(values: List[float]) -> Tuple[float, int]: + """for a list of values, find the value which represents the cut-off point + or "elbow" in the function when values are sorted. + + based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 + And summarized by the statement: "A quick way of finding the elbow is to draw a + line from the first to the last point of the curve and then find the data point + that is farthest away from that line." 
+ + :param values: list of values from which to select a cut-off + :type values: List[float] + :return: the index and value to use as a cutoff + :rtype: Tuple[int, float] + """ + N = len(values) + x = list(range(N)) + max = 0 + index = -1 + a = numpy.array([x[0], values[0]]) + b = numpy.array([x[-1], values[-1]]) + n = norm(b - a) + new_y = [] + for i in range(0, N): + p = numpy.array([x[i], values[i]]) + dist = old_div(norm(numpy.cross(p - a, p - b)), n) + new_y.append(dist) + if dist > max: + max = dist + index = i + value = values[index] + return [index, value] diff --git a/user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 b/user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 @@ -0,0 +1 @@ +{} \ No newline at end of file From 091bf66b87e533e08bdbd6cb6a61321ed2bd9c53 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 15:14:04 -0600 Subject: [PATCH 03/46] integrate user label model into pipeline --- .../user_label_model/model_storage.py | 126 +++++++++++++----- .../modelling/user_label_model/run_model.py | 95 ++++++------- .../modelling/user_label_model/util.py | 24 ++-- .../wrapper/user_label_prediction_model.py | 20 +++ .../analysis_timeseries_queries.py | 1 + emission/storage/pipeline_queries.py | 8 ++ .../storage/timeseries/builtin_timeseries.py | 27 +++- 7 files changed, 204 insertions(+), 97 deletions(-) create mode 100644 emission/core/wrapper/user_label_prediction_model.py diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/user_label_model/model_storage.py index 4be59c9c0..b04db7366 100644 --- a/emission/analysis/modelling/user_label_model/model_storage.py +++ b/emission/analysis/modelling/user_label_model/model_storage.py @@ -1,6 +1,17 @@ from enum import Enum -from typing import Dict +from typing import Dict, Optional +from 
emission.analysis.modelling.user_label_model.model_type import ModelType +import emission.core.wrapper.user_label_prediction_model as ecwu +import emission.storage.pipeline_queries as epq +import emission.storage.timeseries.abstract_timeseries as esta +import emission.storage.decorations.analysis_timeseries_queries as esda import emission.analysis.modelling.user_label_model.util as util +import logging +import arrow +import pymongo +from emission.storage.timeseries.builtin_timeseries import BuiltinTimeSeries + +from emission.storage.timeseries.timequery import TimeQuery class ModelStorage(Enum): FILE_SYSTEM = 0 @@ -10,34 +21,44 @@ def names(cls): return list(map(lambda e: e.name, list(cls))) -def create_filename(user_id, model_name) -> str: - return f"user_label_model_{model_name}_{str(user_id)}" - - -def create_database_table_name(model_name) -> str: - return f"user_label_model_{model_name}" +def create_filename(user_id, model_type: ModelType) -> str: + return f"user_label_model_{model_type.name}_{str(user_id)}" -def load(user_id, model_name: str, model_storage: ModelStorage) -> Dict: +def load(user_id, model_type: ModelType, model_storage: ModelStorage) -> Optional[Dict]: """load a user label model from a model storage location :param user_id: the user to request a model for - :type user_id: UUID - :param model_name: _description_ - :type model_name: str - :param save_format: _description_ - :type save_format: SaveFormat - :return: _description_ - :rtype: Dict + :param model_type: expected type of model stored + :param model_storage: storage format + :return: the model representation as a Python Dict or None + :raises: TypeError if loaded model has different type than expected type """ if model_storage == ModelStorage.FILE_SYSTEM: - filename = create_filename(user_id, model_name) + filename = create_filename(user_id, model_type) model_data = util.load_fs(filename) return model_data elif model_storage == ModelStorage.DATABASE: - table_name = 
create_database_table_name(model_name) - model_data = util.load_db(user_id, table_name) - return model_data + + # retrieve stored model with timestamp that matches/exceeds the most + # recent PipelineState.USER_LABEL_MODEL entry + ts = esda.get_timeseries_for_user(user_id) + if not isinstance(ts, BuiltinTimeSeries): + raise Exception('user model storage requires BuiltInTimeSeries') + latest_model_entry = ts.get_first_entry( + key=esda.USER_LABEL_MODEL_STORE_KEY, + field='data.model_ts', + sort_order=pymongo.DESCENDING + ) + if latest_model_entry.model_type != model_type: + msg = ( + f"loading model for user {user_id} has model type {latest_model_entry.model_type} " + f"but was expected to have model type {model_type}" + ) + raise TypeError(msg) + model = latest_model_entry['data']['model'] if latest_model_entry is not None else None + return model + else: storage_types_str = ",".join(ModelStorage.names()) msg = ( @@ -46,24 +67,63 @@ def load(user_id, model_name: str, model_storage: ModelStorage) -> Dict: ) raise TypeError(msg) -def save(user_id, model_data: Dict, model_name: str, model_storage: ModelStorage): - try: - if model_storage == ModelStorage.FILE_SYSTEM: - filename = create_filename(user_id, model_name) +def save( + user_id, + model_type: ModelType, + model_data: Dict, + model_timestamp: int, + model_storage: ModelStorage = ModelStorage.DATABASE): + """saves a model to storage + + :param user_id: user associated with this model + :param model_type: type of model stored + :param model_data: data for this model to store, should be a dict + :param model_storage: type of storage to load from, defaults to ModelStorage.DATABASE + :raises TypeError: unknown ModelType + :raises IOError: failure when writing to storage medium + """ + + if model_storage == ModelStorage.FILE_SYSTEM: + try: + filename = create_filename(user_id, model_type) util.save_fs(filename, model_data) - elif model_storage == ModelStorage.DATABASE: - table_name = 
create_database_table_name(model_name) - util.save_db(user_id, table_name, model_data) - else: + except IOError as e: + msg = ( + f"failure storing model for user {user_id}, model {model_type.name} " + f"to the file system" + ) + raise IOError(msg) from e + + elif model_storage == ModelStorage.DATABASE: + + row = ecwu.UserLabelPredictionModel() + row.user_id = user_id + row.model_ts = model_timestamp + row.model_type = model_type + row.model = model_data + + try: + ts = esta.TimeSeries.get_time_series(user_id) + ts.insert_data(user_id, esda.USER_LABEL_MODEL_STORE_KEY, row) + except Exception as e: + msg = ( + f"failure storing model for user {user_id}, model {model_type.name} " + f"to the database" + ) + raise IOError(msg) from e + + try: + epq.mark_user_label_model_done(user_id, model_timestamp) + except Exception as e: + msg = ( + f"failure updating user label pipeline state for user {user_id}" + ) + raise IOError(msg) from e + + else: storage_types_str = ",".join(ModelStorage.names()) msg = ( f"unknown model storage type {model_storage}, must be one of " f"{{{storage_types_str}}}" ) raise TypeError(msg) - except IOError as e: - msg = ( - f"cannot save model for user {user_id}, model_name {model_name} " - f"to the file system" - ) - raise IOError(msg) from e diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index d96947027..1c3dad2de 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -1,5 +1,6 @@ import logging from datetime import datetime +import arrow from tracemalloc import start from typing import Optional @@ -17,6 +18,7 @@ from numpy import isin import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline +import emission.storage.pipeline_queries as epq import emission.storage.decorations.analysis_timeseries_queries as esda from emission.storage.timeseries.timequery import 
TimeQuery @@ -53,7 +55,11 @@ def _model_factory(model_type: ModelType): return model -def update_user_label_model(user_id, model_type: ModelType, model_storage: ModelStorage, min_trips: int = 14): +def update_user_label_model( + user_id, + model_type: ModelType, + model_storage: ModelStorage = ModelStorage.DATABASE, + min_trips: int = 14): """ create/update a user label model for a user. @@ -62,46 +68,31 @@ def update_user_label_model(user_id, model_type: ModelType, model_storage: Model if the model type is "incremental", only the newest trips are used. :param user_id: id of user - :type user_id: _type_ - :param model_type: type of model to build - :type model_type: str - :param model_storage: storage destination for built model - :type model_storage: ModelStorage + :param model_type: type of model to build. this is also stored on the database. if + there is a mismatch, an exception is thrown + :param model_storage: storage destination for built model (default DATABASE) :param min_trips: minimum number of labeled trips per user to apply prediction (default 14) - :type min_trips: int """ + + # this timestamp is used for recording the state of the updated model + timestamp = arrow.now() model = _model_factory(model_type) - model_name = model_type.model_name - - if model.is_incremental: - - # read in existing model, if it exists - model_data = load(user_id, model_name, model_storage) - model.from_dict(model_data) - - # todo: get timestamp from pipeline, use as filter in query - start_ts = -1 - end_ts = datetime.now() - time_query = TimeQuery( - timeType="data.start_ts", - startTs=start_ts, - endTs=end_ts - ) - else: - time_query = None + # if a previous model exists, deserialize the stored model + model_data_prev = load(user_id, model_type, model_storage) + if model_data_prev is not None: + model.from_dict(model_data_prev) + + # get all relevant trips + time_query = epq.get_time_query_for_user_label_model(user_id) if model.is_incremental else None trips = 
_get_trips_for_user(user_id, time_query, min_trips) - # fit and store the updated model + # train and store the model model.fit(trips) - model_data = model.to_dict() - save(user_id, model_data, model_name, model_storage) + model_data_next = model.to_dict() + save(user_id, model_type, model_data_next, timestamp, model_storage) - if model.is_incremental: - new_timestamp = datetime.now() - # todo: update pipeline with new timestamp - - logging.debug(f"{model_name} label prediction model built for user {user_id}") + logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") def predict_labels_with_n( @@ -112,18 +103,17 @@ def predict_labels_with_n( invoke the user label prediction model to predict labels for a trip. :param trip: the trip to predict labels for - :type trip: Confirmedtrip :param model_type: type of prediction model to run - :type model_type: ModelType :param model_storage: location to read/write models - :type model_storage: ModelStorage :return: a list of predictions - :rtype: List[Prediction] """ user_id = trip['user_id'] model = _load_user_label_model(user_id, model_type, model_storage) - predictions, n = model.predict(trip) - return predictions, n + if model is None: + return [], -1 + else: + predictions, n = model.predict(trip) + return predictions, n def _get_trips_for_user(user_id, time_query: Optional[TimeQuery]=None, min_trips: int=14): @@ -150,15 +140,26 @@ def _get_trips_for_user(user_id, time_query: Optional[TimeQuery]=None, min_trips return labeled_trips -def _load_user_label_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> UserLabelPredictionModel: - model = _model_factory(model_type) - model_name = model_type.model_name - - model_data = load(user_id, model_name, model_storage) - model.from_dict(model_data) - - return model +def _load_user_label_model( + user_id, + model_type: ModelType, + model_storage: ModelStorage) -> Optional[UserLabelPredictionModel]: + """helper 
to build a user label prediction model class with the + contents of a stored model for some user. + :param user_id: user to retrieve the model for + :param model_type: UserLabelPredictionModel type configured for this OpenPATH server + :param model_storage: storage type + :return: model, or None if no model is stored for this user + """ + model_dict = load(user_id, model_type, model_storage) + if model_dict is None: + return None + else: + model = _model_factory(model_type) + model.from_dict(model_dict) + return model + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', diff --git a/emission/analysis/modelling/user_label_model/util.py b/emission/analysis/modelling/user_label_model/util.py index 765aadcd3..33fb8b07c 100644 --- a/emission/analysis/modelling/user_label_model/util.py +++ b/emission/analysis/modelling/user_label_model/util.py @@ -5,8 +5,10 @@ import numpy from numpy.linalg import norm +import emission.storage.decorations.analysis_timeseries_queries as esda -def load_fs(filename: str, numpy_decode: bool = True) -> dict: + +def load_fs(filename: str, numpy_decode: bool = True) -> Optional[dict]: """loads model state as a pickled object on the file system. if the file is not found, returns an empty dict. 
@@ -17,13 +19,14 @@ def load_fs(filename: str, numpy_decode: bool = True) -> dict: :return: json object parsed, or, an empty list :rtype: Dict """ + raise Exception("deprecated, use db instead") logging.debug(f"At stage: loading model") try: with open(filename, "r") as f: contents = f.read() except FileNotFoundError: logging.info(f"No model found at {filename}, no prediction") - return {} + return None try: if numpy_decode: @@ -49,6 +52,7 @@ def save_fs(filename: str, obj: object): :type obj: object """ + raise Exception("deprecated, use db instead") try: logging.debug("At stage: saving model") obj_capsule = jpickle.dumps(obj) @@ -59,25 +63,21 @@ def save_fs(filename: str, obj: object): raise IOError(msg) from e -def load_db(user_id: str, table: str, timestamp: Optional[int] = None) -> Dict: +def load_db(user_id: str, timestamp: Optional[int] = None) -> Dict: """ - loads a user label prediction model from a database table. - - data is assumed stored in a document database, with the structure: - - { "user_id": user_id, "data": model_data } + loads a user label prediction model from the analysis database. 
:param user_id: user id to filter on :type user_id: str - :param table: the table name - :type table: str :param timestamp: optional time to :return: :rtype: Dict """ + esda.get_entry # build the time query if a timestamp is provided - time_query = lambda confirmed_trip: confirmed_trip['data']['start_ts'] >= timestamp \ - if timestamp is not None else None + # time_query = lambda trip: trip['data']['start_ts'] >= timestamp \ + # if timestamp is not None else None + pass def save_db(user_id, table: str, model_data: Dict): diff --git a/emission/core/wrapper/user_label_prediction_model.py b/emission/core/wrapper/user_label_prediction_model.py new file mode 100644 index 000000000..1326eaa0a --- /dev/null +++ b/emission/core/wrapper/user_label_prediction_model.py @@ -0,0 +1,20 @@ +# Based on modeprediction.py +from emission.analysis.modelling.user_label_model.model_type import ModelType +import emission.core.wrapper.wrapperbase as ecwb + + +class UserLabelPredictionModel(ecwb.WrapperBase): + props = {"user_id": ecwb.WrapperBase.Access.WORM, # the trip that this is part of + "model_type": ecwb.WrapperBase.Access.WORM, # emission.analysis.modelling.user_label_model.model_type.py + "model": ecwb.WrapperBase.Access.WORM, # the (serialized) state of the model for this trip + "model_ts": ecwb.WrapperBase.Access.WORM, # time that this model was stored + } + + enums = { + "model_type": ModelType + } + geojson = {} + local_dates = {} + + def _populateDependencies(self): + pass diff --git a/emission/storage/decorations/analysis_timeseries_queries.py b/emission/storage/decorations/analysis_timeseries_queries.py index 839ebedfc..d63d32ab4 100644 --- a/emission/storage/decorations/analysis_timeseries_queries.py +++ b/emission/storage/decorations/analysis_timeseries_queries.py @@ -37,6 +37,7 @@ METRICS_DAILY_USER_MEDIAN_SPEED = "metrics/daily_user_median_speed" METRICS_DAILY_MEAN_MEDIAN_SPEED = "metrics/daily_mean_median_speed" INFERRED_LABELS_KEY = "inference/labels" 
+USER_LABEL_MODEL_STORE_KEY = "inference/user_label_model" # General methods diff --git a/emission/storage/pipeline_queries.py b/emission/storage/pipeline_queries.py index ae77199bd..a7fdfe7bb 100644 --- a/emission/storage/pipeline_queries.py +++ b/emission/storage/pipeline_queries.py @@ -121,6 +121,14 @@ def mark_mode_inference_complete(user_id): def mark_mode_inference_failed(user_id): mark_stage_failed(user_id, ps.PipelineStages.MODE_INFERENCE) +def get_time_query_for_user_label_model(user_id): # TODO: here + tq = get_time_range_for_stage(user_id, ps.PipelineStages.USER_LABEL_MODEL) + tq.timeType = 'data.model_ts' + return tq + +def mark_user_label_model_done(user_id, last_ts=None): + mark_stage_done(user_id, ps.PipelineStages.USER_LABEL_MODEL, last_ts) + def get_time_range_for_confirmed_object_creation(user_id): tq = get_time_range_for_stage(user_id, ps.PipelineStages.CREATE_CONFIRMED_OBJECTS) tq.timeType = "data.end_ts" diff --git a/emission/storage/timeseries/builtin_timeseries.py b/emission/storage/timeseries/builtin_timeseries.py index ac95aa0a1..d64e158b2 100644 --- a/emission/storage/timeseries/builtin_timeseries.py +++ b/emission/storage/timeseries/builtin_timeseries.py @@ -86,6 +86,7 @@ def __init__(self, user_id): "metrics/daily_mean_median_speed": self.analysis_timeseries_db, "inference/prediction": self.analysis_timeseries_db, "inference/labels": self.analysis_timeseries_db, + "inference/user_label_model": self.analysis_timeseries_db, "analysis/inferred_section": self.analysis_timeseries_db, "analysis/inferred_labels": self.analysis_timeseries_db, "analysis/inferred_trip": self.analysis_timeseries_db, @@ -298,6 +299,24 @@ def to_data_df(key, entry_it, map_fn = None): return deduped_df.reset_index(drop=True) + def get_first_entry(self, key, field, sort_order, time_query=None): + """gets the first entry with the provided key when sorted by some field + + :param key: the metadata key for the entries, used to identify the stream + :param field: the field 
in the stream whose max value we want. + :param sort_order: pymongo.ASCENDING or pymongon.DESCENDING + :param time_query: the time range in which to search the stream + :return: a database row, or None if no match is found + """ + result_it = self.get_timeseries_db(key).find(self._get_query([key], time_query), + {"_id": False, field: True}).sort(field, sort_order).limit(1) + result_list = list(result_it) + if len(result_list) == 0: + return None + else: + return result_list[0] + + def get_first_value_for_field(self, key, field, sort_order, time_query=None): """ Currently used to get the max value of the location values so that we can send data @@ -310,13 +329,11 @@ def get_first_value_for_field(self, key, field, sort_order, time_query=None): It is assumed that the values for the field are sortable. :return: the max value for the field in the stream identified by key. -1 if there are no entries for the key. """ - result_it = self.get_timeseries_db(key).find(self._get_query([key], time_query), - {"_id": False, field: True}).sort(field, sort_order).limit(1) - result_list = list(result_it) - if len(result_list) == 0: + retVal = self.get_first_entry(key, field, sort_order, time_query) + if retVal is None: return -1 - retVal = result_list[0] + # extract the specified field from the entry that was found field_parts = field.split(".") for part in field_parts: retVal = retVal[part] From 5fcd5080e1d500b586b45319786008bd7b134f3a Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 15:19:13 -0600 Subject: [PATCH 04/46] cleanup --- .../modelling/user_label_model/util.py | 36 +------------------ 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/emission/analysis/modelling/user_label_model/util.py b/emission/analysis/modelling/user_label_model/util.py index 33fb8b07c..cc4d82406 100644 --- a/emission/analysis/modelling/user_label_model/util.py +++ b/emission/analysis/modelling/user_label_model/util.py @@ -61,41 +61,7 @@ def save_fs(filename: str, obj: object): 
except Exception as e: msg = f"failed writing clustering model contents to file system" raise IOError(msg) from e - - -def load_db(user_id: str, timestamp: Optional[int] = None) -> Dict: - """ - loads a user label prediction model from the analysis database. - - :param user_id: user id to filter on - :type user_id: str - :param timestamp: optional time to - :return: - :rtype: Dict - """ - esda.get_entry - # build the time query if a timestamp is provided - # time_query = lambda trip: trip['data']['start_ts'] >= timestamp \ - # if timestamp is not None else None - - pass - -def save_db(user_id, table: str, model_data: Dict): - """ - saves a user label prediction model to the database. - - data is assumed stored in a document database, with the structure: - - { "user_id": user_id, "data": model_data } - - :param user_id: the user to store data for - :type user_id: object - :param table: the table name - :type table: str - :param model_data: the data row to store tagged by this user id - :type model_data: Dict - """ - pass + def find_knee_point(values: List[float]) -> Tuple[float, int]: """for a list of values, find the value which represents the cut-off point From c6306b41542493ee6f495f85cda5c130382ff508 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 15:23:25 -0600 Subject: [PATCH 05/46] cleanup --- .../modelling/user_label_model/run_model.py | 44 ++++++++----------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index 1c3dad2de..7c60baf2d 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -1,45 +1,37 @@ import logging -from datetime import datetime -import arrow -from tracemalloc import start from typing import Optional +import arrow +import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline +import 
emission.storage.decorations.analysis_timeseries_queries as esda +import emission.storage.pipeline_queries as epq import emission.storage.timeseries.abstract_timeseries as esta -from emission.analysis.modelling.similarity.od_similarity import \ - OriginDestinationSimilarity -from emission.analysis.modelling.user_label_model.greedy_similarity_binning import \ - GreedySimilarityBinning -from emission.analysis.modelling.user_label_model.model_storage import ( - ModelStorage, load, save) +import emission.analysis.modelling.similarity.od_similarity as eamso +import emission.analysis.modelling.user_label_model.greedy_similarity_binning as eamug +import emission.analysis.modelling.user_label_model.model_storage as eamum +from emission.analysis.modelling.user_label_model.model_storage import ModelStorage from emission.analysis.modelling.user_label_model.model_type import ModelType -from emission.analysis.modelling.user_label_model.user_label_prediction_model import \ - UserLabelPredictionModel +import emission.analysis.modelling.user_label_model.user_label_prediction_model as eamuu from emission.core.wrapper.confirmedtrip import Confirmedtrip -from numpy import isin - -import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline -import emission.storage.pipeline_queries as epq -import emission.storage.decorations.analysis_timeseries_queries as esda from emission.storage.timeseries.timequery import TimeQuery +from numpy import isin SIMILARITY_THRESHOLD_METERS = 500 -def _model_factory(model_type: ModelType): +def _model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: """ instantiates the requested user model type with the configured parameters. if future model types are created, they should be added here. 
:param model_type: internally-used model name - :type model_type: ModelType - :raises KeyError: if the requested model name does not exist :return: a user label prediction model - :rtype: UserLabelPredictionModel + :raises KeyError: if the requested model name does not exist """ MODELS = { - ModelType.GREEDY_SIMILARITY_BINNING: GreedySimilarityBinning( - metric=OriginDestinationSimilarity(), + ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( + metric=eamso.OriginDestinationSimilarity(), sim_thresh=SIMILARITY_THRESHOLD_METERS, apply_cutoff=False ) @@ -79,7 +71,7 @@ def update_user_label_model( model = _model_factory(model_type) # if a previous model exists, deserialize the stored model - model_data_prev = load(user_id, model_type, model_storage) + model_data_prev = eamum.load(user_id, model_type, model_storage) if model_data_prev is not None: model.from_dict(model_data_prev) @@ -90,7 +82,7 @@ def update_user_label_model( # train and store the model model.fit(trips) model_data_next = model.to_dict() - save(user_id, model_type, model_data_next, timestamp, model_storage) + eamum.save(user_id, model_type, model_data_next, timestamp, model_storage) logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") @@ -143,7 +135,7 @@ def _get_trips_for_user(user_id, time_query: Optional[TimeQuery]=None, min_trips def _load_user_label_model( user_id, model_type: ModelType, - model_storage: ModelStorage) -> Optional[UserLabelPredictionModel]: + model_storage: ModelStorage) -> Optional[eamuu.UserLabelPredictionModel]: """helper to build a user label prediction model class with the contents of a stored model for some user. 
@@ -152,7 +144,7 @@ def _load_user_label_model( :param model_storage: storage type :return: model, or None if no model is stored for this user """ - model_dict = load(user_id, model_type, model_storage) + model_dict = eamum.load(user_id, model_type, model_storage) if model_dict is None: return None else: From 714c017eb4fd37018e3cba1db51ed07590925975 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 15:33:15 -0600 Subject: [PATCH 06/46] integration --- bin/build_label_model.py | 9 +++++++-- .../classification/inference/labels/inferrers.py | 7 ++++++- .../analysis/modelling/user_label_model/run_model.py | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bin/build_label_model.py b/bin/build_label_model.py index 7ba3fe066..afee0d813 100644 --- a/bin/build_label_model.py +++ b/bin/build_label_model.py @@ -6,12 +6,14 @@ import argparse import uuid import copy +from emission.analysis.modelling.user_label_model.model_storage import ModelStorage +from emission.analysis.modelling.user_label_model.model_type import ModelType import emission.pipeline.reset as epr import emission.core.get_database as edb import emission.core.wrapper.user as ecwu import emission.storage.timeseries.abstract_timeseries as esta -import emission.analysis.modelling.tour_model_first_only.build_save_model as eamtb +import emission.analysis.modelling.user_label_model.run_model as eamur def _get_user_list(args): if args.all: @@ -64,4 +66,7 @@ def _email_2_user_list(email_list): logging.info("received list with %s users" % user_list) for user_id in user_list: logging.info("building model for user %s" % user_id) - eamtb.build_user_model(user_id) + model_type = ModelType.GREEDY_SIMILARITY_BINNING + model_storage = ModelStorage.DATABASE + min_trips = 14 + eamur.update_user_label_model(user_id, model_type, model_storage, min_trips) diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py index 
6ce4c7702..723ef4900 100644 --- a/emission/analysis/classification/inference/labels/inferrers.py +++ b/emission/analysis/classification/inference/labels/inferrers.py @@ -6,6 +6,9 @@ import copy import emission.analysis.modelling.tour_model_first_only.load_predict as lp +from emission.analysis.modelling.user_label_model.model_storage import ModelStorage +import emission.analysis.modelling.user_label_model.run_model as eamur +from emission.analysis.modelling.user_label_model.model_type import ModelType # A set of placeholder predictors to allow pipeline development without a real inference algorithm. # For the moment, the system is configured to work with two labels, "mode_confirm" and @@ -140,7 +143,9 @@ def n_to_confidence_coeff(n, max_confidence=None, first_confidence=None, confide # predict_two_stage_bin_cluster but with the above reduction in confidence def predict_cluster_confidence_discounting(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None): - labels, n = lp.predict_labels_with_n(trip) + model_type = ModelType.GREEDY_SIMILARITY_BINNING + model_storage = ModelStorage.DATABASE + labels, n = eamur.predict_labels_with_n(trip, model_type, model_storage) if n <= 0: # No model data or trip didn't match a cluster logging.debug(f"In predict_cluster_confidence_discounting: n={n}; returning as-is") return labels diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index 7c60baf2d..ef80c21c8 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -90,7 +90,7 @@ def update_user_label_model( def predict_labels_with_n( trip: Confirmedtrip, model_type = ModelType.GREEDY_SIMILARITY_BINNING, - model_storage = ModelStorage.FILE_SYSTEM): + model_storage = ModelStorage.DATABASE): """ invoke the user label prediction model to predict labels for a trip. 
From c1540f3a3c7e54f0261bb93d72feccb0236f9c69 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 15:36:23 -0600 Subject: [PATCH 07/46] comments --- .../user_label_model/greedy_similarity_binning.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py index 5f0f00f47..d97cc4fb9 100644 --- a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py @@ -50,15 +50,10 @@ def __init__( the number of predictions is not assumed to be the number of features. :param dir: the model load/save directory - :type dir: Path :param user_id: identity (UUID) of the e-mission user - :type user_id: str :param metric: type of similarity metric to use - :type metric: SimilarityMetric :param sim_thresh: max distance threshold for similarity (assumed meters) - :type sim_thresh: float :param apply_cutoff: ignore clusters which are small, based on a "knee point" heuristic (default False) - :type apply_cutoff: bool """ super().__init__() self.metric = metric @@ -72,7 +67,6 @@ def fit(self, trips: List[Confirmedtrip]): corresponds to a label at the matching index of the label input :param trips: 2D array of features to train from - :type trips: List[Confirmedtrip] """ self.bins = {} self._assign_bins(trips) @@ -101,6 +95,9 @@ def predict(self, trip: Confirmedtrip) -> Tuple[List[Prediction], int]: return labels, n_features def is_incremental(self) -> bool: + """ + greedy similarity binning is not an incremental model + """ return False def to_dict(self) -> Dict: From 91bdfe4b7cca37cf4885ea540da14deedd2974c8 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 15:36:40 -0600 Subject: [PATCH 08/46] comments --- .../modelling/user_label_model/greedy_similarity_binning.py | 2 -- 1 file changed, 2 deletions(-) diff 
--git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py index d97cc4fb9..4f3417439 100644 --- a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py @@ -142,9 +142,7 @@ def _nearest_bin(self, trip: Confirmedtrip) -> Tuple[Optional[int], Optional[Bin none are found, (None, None) is returned. :param trip: incoming trip features to test with - :type trip: Confirmedtrip :return: nearest record and bin number, if found - :rtype: Optional[Tuple[BinRecord, Int]] """ trip_features = self.extract_features(trip) selected_bin = None From 567c4d824bf2a021e6c296df497d9ae55b5537fa Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 21 Jun 2022 16:18:02 -0600 Subject: [PATCH 09/46] add user label model stage --- emission/core/wrapper/pipelinestate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/emission/core/wrapper/pipelinestate.py b/emission/core/wrapper/pipelinestate.py index dc5d882ba..e37f44446 100644 --- a/emission/core/wrapper/pipelinestate.py +++ b/emission/core/wrapper/pipelinestate.py @@ -19,6 +19,7 @@ class PipelineStages(enum.Enum): CLEAN_RESAMPLING = 11 MODE_INFERENCE = 4 LABEL_INFERENCE = 14 + USER_LABEL_MODEL = 16 EXPECTATION_POPULATION = 15 CREATE_CONFIRMED_OBJECTS = 13 TOUR_MODEL = 5 From b6fb2a700cfa71f59afdae30dc17d642d4480c80 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 08:42:43 -0600 Subject: [PATCH 10/46] simplification, cleanup, comments --- bin/build_label_model.py | 1 + .../inference/labels/inferrers.py | 1 + .../modelling/similarity/od_similarity.py | 7 +--- .../modelling/similarity/similarity_metric.py | 14 +++----- .../modelling/user_label_model/bin_record.py | 26 -------------- .../greedy_similarity_binning.py | 36 +++++++------------ .../user_label_model/model_storage.py | 10 +++--- .../modelling/user_label_model/prediction.py | 
12 ------- .../user_label_prediction_model.py | 3 +- 9 files changed, 25 insertions(+), 85 deletions(-) delete mode 100644 emission/analysis/modelling/user_label_model/bin_record.py delete mode 100644 emission/analysis/modelling/user_label_model/prediction.py diff --git a/bin/build_label_model.py b/bin/build_label_model.py index afee0d813..666a811c0 100644 --- a/bin/build_label_model.py +++ b/bin/build_label_model.py @@ -66,6 +66,7 @@ def _email_2_user_list(email_list): logging.info("received list with %s users" % user_list) for user_id in user_list: logging.info("building model for user %s" % user_id) + # these can come from the application config as default values model_type = ModelType.GREEDY_SIMILARITY_BINNING model_storage = ModelStorage.DATABASE min_trips = 14 diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py index 723ef4900..b4f973061 100644 --- a/emission/analysis/classification/inference/labels/inferrers.py +++ b/emission/analysis/classification/inference/labels/inferrers.py @@ -143,6 +143,7 @@ def n_to_confidence_coeff(n, max_confidence=None, first_confidence=None, confide # predict_two_stage_bin_cluster but with the above reduction in confidence def predict_cluster_confidence_discounting(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None): + # these can come from the application config as default values model_type = ModelType.GREEDY_SIMILARITY_BINNING model_storage = ModelStorage.DATABASE labels, n = eamur.predict_labels_with_n(trip, model_type, model_storage) diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index b55e51618..10164aad5 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -19,9 +19,4 @@ def extract_features(self, trip: Confirmedtrip) -> List[float]: def 
similarity(self, a: List[float], b: List[float], thresh: float) -> List[float]: o_dist = ecc.calDistance([a[0], a[1]], [b[0], b[1]]) d_dist = ecc.calDistance([a[2], a[3]], [b[2], b[3]]) - return [o_dist, d_dist] - - def similar(self, a: List[float], b: List[float], thresh: float) -> bool: - o_dist, d_dist = self.similarity(a, b) - is_similar = o_dist <= thresh and d_dist <= thresh - return is_similar \ No newline at end of file + return [o_dist, d_dist] \ No newline at end of file diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 8dfab1902..2996e059b 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -11,22 +11,18 @@ def extract_features(self, trip: Confirmedtrip) -> List[float]: """extracts the features we want to compare for similarity :param trip: a confirmed trip - :type trip: Confirmedtrip :return: the features to compare - :rtype: List[float] """ pass + @abstractmethod def similarity(self, a: List[float], b: List[float]) -> List[float]: """compares the features, producing their similarity as computed by this similarity metric :param a: features for a trip - :type a: List[float] :param b: features for another trip - :type b: List[float] :return: for each feature, the similarity of these features - :rtype: List[float] """ pass @@ -35,12 +31,10 @@ def similar(self, a: List[float], b: List[float], thresh: float) -> bool: within some threshold :param a: features for a trip - :type a: List[float] :param b: features for another trip - :type b: List[float] :param thresh: threshold for similarity - :type thresh: float :return: true if the feature similarity is within some threshold - :rtype: float """ - pass \ No newline at end of file + similarity_values = self.similarity(a, b) + is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) + return is_similar \ No newline at end of 
file diff --git a/emission/analysis/modelling/user_label_model/bin_record.py b/emission/analysis/modelling/user_label_model/bin_record.py deleted file mode 100644 index dc9f57d9f..000000000 --- a/emission/analysis/modelling/user_label_model/bin_record.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import Dict, List -from emission.analysis.modelling.user_label_model.prediction import Prediction - -# something like this: -# bin_data = { -# "predictions": [ -# {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'insurance', 'replaced_mode': 'drove_alone'}, 'p': 0.9333333333333333} -# ], -# "locations": [ -# [-122.00, 39.00, -122.01, 39.00] -# ], -# "labels": [ -# {'mode_confirm': 'shared_ride', 'purpose_confirm': 'insurance_payment', 'replaced_mode': 'drove_alone'} -# ] -# } - -BinRecord = Dict - -# todo: if OpenPATH goes to Python 3.8, we can use this: -# -# class BinRecord(TypedDict): -# predictions: List[Prediction] -# features: List[List[float]] -# labels: List[Dict[str, str]] - - diff --git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py index 4f3417439..255d26c94 100644 --- a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py @@ -1,24 +1,14 @@ import logging -import pandas as pd -from pathlib import Path from typing import Dict, List, Optional, Tuple -from emission.analysis.modelling.user_label_model.bin_record import BinRecord -from emission.analysis.modelling.user_label_model.prediction import ( - Prediction, -) -from emission.analysis.modelling.user_label_model.user_label_prediction_model import ( - UserLabelPredictionModel, -) -from emission.analysis.modelling.similarity.similarity_metric import SimilarityMetric -from emission.analysis.modelling.tour_model.similarity import similarity -from 
emission.analysis.modelling.tour_model_first_only.load_predict import ( - loadModelStage, -) -import emission.analysis.modelling.tour_model.data_preprocessing as preprocess -from emission.core.wrapper.confirmedtrip import Confirmedtrip -import emission.analysis.modelling.similarity.confirmed_trip_feature_extraction as ctfe -import emission.analysis.modelling.user_label_model.util as util + import emission.analysis.modelling.tour_model.label_processing as lp +import emission.analysis.modelling.user_label_model.util as util +import pandas as pd +from emission.analysis.modelling.similarity.similarity_metric import \ + SimilarityMetric +from emission.analysis.modelling.user_label_model.user_label_prediction_model import \ + UserLabelPredictionModel +from emission.core.wrapper.confirmedtrip import Confirmedtrip class GreedySimilarityBinning(UserLabelPredictionModel): @@ -59,7 +49,7 @@ def __init__( self.metric = metric self.sim_thresh = sim_thresh self.apply_cutoff = apply_cutoff - self.bins: Dict[int, BinRecord] = {} + self.bins: Dict[int, Dict] = {} self.loaded = False def fit(self, trips: List[Confirmedtrip]): @@ -75,7 +65,7 @@ def fit(self, trips: List[Confirmedtrip]): self._generate_predictions() logging.info(f"model fit to trip data") - def predict(self, trip: Confirmedtrip) -> Tuple[List[Prediction], int]: + def predict(self, trip: Confirmedtrip) -> Tuple[List[Dict], int]: if not self.loaded: msg = ( "predict called on unloaded model " @@ -135,14 +125,14 @@ def _assign_bins(self, trips: List[Confirmedtrip]): } self.bins[new_bin_id] = new_bin_record - def _nearest_bin(self, trip: Confirmedtrip) -> Tuple[Optional[int], Optional[BinRecord]]: + def _nearest_bin(self, trip: Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: """ finds a bin which contains at least one matching feature. the first record matching by similarity measure is returned. if none are found, (None, None) is returned. 
:param trip: incoming trip features to test with - :return: nearest record and bin number, if found + :return: nearest bin record, if found """ trip_features = self.extract_features(trip) selected_bin = None @@ -207,4 +197,4 @@ def _generate_predictions(self): one_set_labels['p'] = unique_labels.iloc[i]['p'] # e.g. one_set_labels = {'labels': {'mode_confirm': 'walk', 'replaced_mode': 'walk', 'purpose_confirm': 'exercise'}, 'p': 1.0} bin_label_combo_list.append(one_set_labels) - bin_record['predictions'] = bin_label_combo_list \ No newline at end of file + bin_record['predictions'] = bin_label_combo_list diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/user_label_model/model_storage.py index b04db7366..f8ce6cef8 100644 --- a/emission/analysis/modelling/user_label_model/model_storage.py +++ b/emission/analysis/modelling/user_label_model/model_storage.py @@ -1,17 +1,15 @@ from enum import Enum from typing import Dict, Optional -from emission.analysis.modelling.user_label_model.model_type import ModelType + +import emission.analysis.modelling.user_label_model.util as util import emission.core.wrapper.user_label_prediction_model as ecwu +import emission.storage.decorations.analysis_timeseries_queries as esda import emission.storage.pipeline_queries as epq import emission.storage.timeseries.abstract_timeseries as esta -import emission.storage.decorations.analysis_timeseries_queries as esda -import emission.analysis.modelling.user_label_model.util as util -import logging -import arrow import pymongo +from emission.analysis.modelling.user_label_model.model_type import ModelType from emission.storage.timeseries.builtin_timeseries import BuiltinTimeSeries -from emission.storage.timeseries.timequery import TimeQuery class ModelStorage(Enum): FILE_SYSTEM = 0 diff --git a/emission/analysis/modelling/user_label_model/prediction.py b/emission/analysis/modelling/user_label_model/prediction.py deleted file mode 100644 index 
f037f2f28..000000000 --- a/emission/analysis/modelling/user_label_model/prediction.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Dict - -# something like this: -# x = {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, 'p': 0.9333333333333333} - -Prediction = Dict - -# todo: if OpenPATH goes to Python 3.8, we can use this: -# -# class Prediction(TypedDict): -# labels: Dict[str, str] -# p: float diff --git a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py b/emission/analysis/modelling/user_label_model/user_label_prediction_model.py index f9ff4e21c..c6ca40a45 100644 --- a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py +++ b/emission/analysis/modelling/user_label_model/user_label_prediction_model.py @@ -1,7 +1,6 @@ from abc import ABCMeta, abstractmethod from typing import Dict, List, Tuple -from emission.analysis.modelling.user_label_model.prediction import Prediction from emission.core.wrapper.confirmedtrip import Confirmedtrip @@ -18,7 +17,7 @@ def fit(data: List[List[float]]): pass @abstractmethod - def predict(self, data: List[float]) -> Tuple[List[Prediction], int]: + def predict(self, data: List[float]) -> Tuple[List[Dict], int]: """use this model to predict labels for some data :param data: a single row of features in the model's feature space From dd90f1a662c9fa213dc629065d3bf09ebf501808 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 08:51:11 -0600 Subject: [PATCH 11/46] cleanup, naming --- .../user_label_model/model_storage.py | 4 +- .../modelling/user_label_model/run_model.py | 81 +++++++++---------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/user_label_model/model_storage.py index f8ce6cef8..6a17dcb99 100644 --- a/emission/analysis/modelling/user_label_model/model_storage.py +++ 
b/emission/analysis/modelling/user_label_model/model_storage.py @@ -23,7 +23,7 @@ def create_filename(user_id, model_type: ModelType) -> str: return f"user_label_model_{model_type.name}_{str(user_id)}" -def load(user_id, model_type: ModelType, model_storage: ModelStorage) -> Optional[Dict]: +def load_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> Optional[Dict]: """load a user label model from a model storage location :param user_id: the user to request a model for @@ -65,7 +65,7 @@ def load(user_id, model_type: ModelType, model_storage: ModelStorage) -> Optiona ) raise TypeError(msg) -def save( +def save_model( user_id, model_type: ModelType, model_data: Dict, diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index ef80c21c8..45cfefa26 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -19,34 +19,6 @@ SIMILARITY_THRESHOLD_METERS = 500 -def _model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: - """ - instantiates the requested user model type with the configured - parameters. if future model types are created, they should be - added here. 
- - :param model_type: internally-used model name - :return: a user label prediction model - :raises KeyError: if the requested model name does not exist - """ - MODELS = { - ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( - metric=eamso.OriginDestinationSimilarity(), - sim_thresh=SIMILARITY_THRESHOLD_METERS, - apply_cutoff=False - ) - } - model = MODELS.get(model_type) - if model is None: - if not isinstance(model_type, ModelType): - raise TypeError(f"provided model type {model_type} is not an instance of ModelType") - else: - model_names = list(lambda e: e.name, MODELS.keys()) - models = ",".join(model_names) - raise KeyError(f"user label model {model_type.name} not found in factory, must be one of {{{models}}}") - return model - - def update_user_label_model( user_id, model_type: ModelType, @@ -68,10 +40,10 @@ def update_user_label_model( # this timestamp is used for recording the state of the updated model timestamp = arrow.now() - model = _model_factory(model_type) + model = model_factory(model_type) # if a previous model exists, deserialize the stored model - model_data_prev = eamum.load(user_id, model_type, model_storage) + model_data_prev = eamum.load_model(user_id, model_type, model_storage) if model_data_prev is not None: model.from_dict(model_data_prev) @@ -82,7 +54,7 @@ def update_user_label_model( # train and store the model model.fit(trips) model_data_next = model.to_dict() - eamum.save(user_id, model_type, model_data_next, timestamp, model_storage) + eamum.save_model(user_id, model_type, model_data_next, timestamp, model_storage) logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") @@ -108,17 +80,43 @@ def predict_labels_with_n( return predictions, n -def _get_trips_for_user(user_id, time_query: Optional[TimeQuery]=None, min_trips: int=14): +def model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: + """ + instantiates the requested user model type with 
the configured + parameters. + + hey YOU! if future model types are created, they should be added here! + + :param model_type: internally-used model name (an enum) + :return: a user label prediction model + :raises KeyError: if the requested model name does not exist + """ + MODELS = { + ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( + metric=eamso.OriginDestinationSimilarity(), + sim_thresh=SIMILARITY_THRESHOLD_METERS, + apply_cutoff=False + ) + } + model = MODELS.get(model_type) + if model is None: + if not isinstance(model_type, ModelType): + raise TypeError(f"provided model type {model_type} is not an instance of ModelType") + else: + model_names = list(lambda e: e.name, MODELS.keys()) + models = ",".join(model_names) + raise KeyError(f"user label model {model_type.name} not found in factory, must be one of {{{models}}}") + return model + + +def _get_trips_for_user(user_id, time_query: Optional[TimeQuery], min_trips: int): """ load the labeled trip data for this user, subject to a time query. if the user does not have at least $min_trips trips with labels, then return an empty list. 
:param user_id: user to collect trips from - :type user_id: _type_ :param time_query: query to restrict the time (optional) - :type time_query: Optional[TimeQuery] :param min_trips: minimum number of labeled trips required to train - :type min_trips: int """ trips = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=user_id, time_query=time_query) labeled_trips = [trip for trip in trips if trip['data']['user_input'] != {}] @@ -144,24 +142,23 @@ def _load_user_label_model( :param model_storage: storage type :return: model, or None if no model is stored for this user """ - model_dict = eamum.load(user_id, model_type, model_storage) + model_dict = eamum.load_model(user_id, model_type, model_storage) if model_dict is None: return None else: - model = _model_factory(model_type) + model = model_factory(model_type) model.from_dict(model_dict) return model if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', - level=logging.DEBUG) + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.DEBUG) all_users = esta.TimeSeries.get_uuid_list() # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round user_id = all_users[0] update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) - filter_trips = _get_trips_for_user(user_id) + filter_trips = _get_trips_for_user(user_id, None, 0) new_trip = filter_trips[4] # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', @@ -174,7 +171,7 @@ def _load_user_label_model( # 2. 
the user doesn't have common trips user_id = all_users[1] update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) - filter_trips = _get_trips_for_user(user_id) + filter_trips = _get_trips_for_user(user_id, None, 0) new_trip = filter_trips[0] # result is [] pl, _ = predict_labels_with_n(new_trip) @@ -183,7 +180,7 @@ def _load_user_label_model( # case3: the new trip is novel trip(doesn't fall in any 1st round bins) user = all_users[0] update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) - filter_trips = _get_trips_for_user(user_id) + filter_trips = _get_trips_for_user(user_id, None, 0) new_trip = filter_trips[0] # result is [] pl = predict_labels_with_n(new_trip) From 580498f0a3e0be72657c1a4e2402b430fc72899a Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 09:00:10 -0600 Subject: [PATCH 12/46] logging --- emission/analysis/modelling/user_label_model/run_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index 45cfefa26..d33fb754c 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -46,6 +46,9 @@ def update_user_label_model( model_data_prev = eamum.load_model(user_id, model_type, model_storage) if model_data_prev is not None: model.from_dict(model_data_prev) + logging.debug(f"loaded {model_type.name} user label model for user {user_id}") + else: + logging.debug(f"building first {model_type.name} user label model for user {user_id}") # get all relevant trips time_query = epq.get_time_query_for_user_label_model(user_id) if model.is_incremental else None From 22bb4191f06d10c23eefa31643a0344e15c276da Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 09:01:24 -0600 Subject: [PATCH 13/46] remove 'data' field --- 
emission/analysis/modelling/user_label_model/model_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/user_label_model/model_storage.py index 6a17dcb99..2a538cec8 100644 --- a/emission/analysis/modelling/user_label_model/model_storage.py +++ b/emission/analysis/modelling/user_label_model/model_storage.py @@ -54,7 +54,7 @@ def load_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> O f"but was expected to have model type {model_type}" ) raise TypeError(msg) - model = latest_model_entry['data']['model'] if latest_model_entry is not None else None + model = latest_model_entry.model if latest_model_entry is not None else None return model else: From 92aa9c9e9162bbc3651b8a22f9bfe44c65952b44 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 14:47:43 -0600 Subject: [PATCH 14/46] invalid file checked in --- user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 | 1 - 1 file changed, 1 deletion(-) delete mode 100644 user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 diff --git a/user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 b/user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 deleted file mode 100644 index 9e26dfeeb..000000000 --- a/user_label_model_greedy_c46b0f38-fc47-4e09-a8de-f15f382f7121 +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file From 2f3be9a7c99d51d0e902cd22447299d0cdf6c6aa Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 15:35:02 -0600 Subject: [PATCH 15/46] code review --- .../confirmed_trip_feature_extraction.py | 44 +++++++++---- .../modelling/similarity/od_similarity.py | 9 ++- .../modelling/similarity/similarity_metric.py | 2 +- .../greedy_similarity_binning.py | 32 +++++----- .../user_label_model/model_storage.py | 41 ++++--------- .../modelling/user_label_model/run_model.py | 53 ++++++++-------- .../user_label_prediction_model.py | 4 +- 
.../modelling/user_label_model/util.py | 61 +------------------ 8 files changed, 93 insertions(+), 153 deletions(-) diff --git a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py index 1ff3594ab..3ee4eba78 100644 --- a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py +++ b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py @@ -1,28 +1,48 @@ -from typing import Dict, List -from emission.core.wrapper.confirmedtrip import Confirmedtrip +from typing import List +import emission.core.wrapper.confirmedtrip as ecwc import emission.analysis.modelling.tour_model.label_processing as lp -def origin_features(trip: Confirmedtrip) -> List[float]: +def origin_features(trip: ecwc.Confirmedtrip) -> List[float]: + """extract the trip origin coordinates. + + :param trip: trip to extract features from + :return: origin coordinates + """ origin = trip.data.start_loc["coordinates"] return origin -def destination_features(trip: Confirmedtrip) -> List[float]: +def destination_features(trip: ecwc.Confirmedtrip) -> List[float]: + """extract the trip destination coordinates. + + :param trip: trip to extract features from + :return: destination coordinates + """ destination = trip.data.end_loc["coordinates"] return destination -def od_features(trip: Confirmedtrip) -> List[float]: +def od_features(trip: ecwc.Confirmedtrip) -> List[float]: + """extract both origin and destination coordinates. + + :param trip: trip to extract features from + :return: od coordinates + """ o_lat, o_lon = origin_features(trip) d_lat, d_lon = destination_features(trip) return [o_lat, o_lon, d_lat, d_lon] -def distance_feature(trip: Confirmedtrip) -> List[float]: +def distance_feature(trip: ecwc.Confirmedtrip) -> List[float]: + """provided for forward compatibility. 
+ + :param trip: trip to extract features from + :return: distance feature + """ return [trip.data.distance] -def duration_feature(trip: Confirmedtrip) -> List[float]: - return [trip.data.duration] +def duration_feature(trip: ecwc.Confirmedtrip) -> List[float]: + """provided for forward compatibility. -def label_features(trip: Confirmedtrip) -> Dict: - labels = trip.data.user_input - labels_normalized = lp.map_labels(labels) # could be replaced by localization logic - return labels_normalized + :param trip: trip to extract features from + :return: duration feature + """ + return [trip.data.duration] diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 10164aad5..5a0f7d1d2 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -1,19 +1,18 @@ from typing import List -from emission.analysis.modelling.similarity.similarity_metric import SimilarityMetric +import emission.analysis.modelling.similarity.similarity_metric as eamss import emission.analysis.modelling.similarity.confirmed_trip_feature_extraction as ctfe -from emission.analysis.modelling.tour_model.similarity import similarity -from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.core.wrapper.confirmedtrip as ecwc import emission.core.common as ecc -class OriginDestinationSimilarity(SimilarityMetric): +class OriginDestinationSimilarity(eamss.SimilarityMetric): """ similarity metric which compares, for two trips, the distance for origin to origin, and destination to destination, in meters. 
""" - def extract_features(self, trip: Confirmedtrip) -> List[float]: + def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: return ctfe.od_features(trip) def similarity(self, a: List[float], b: List[float], thresh: float) -> List[float]: diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 2996e059b..9521f1d1c 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -37,4 +37,4 @@ def similar(self, a: List[float], b: List[float], thresh: float) -> bool: """ similarity_values = self.similarity(a, b) is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) - return is_similar \ No newline at end of file + return is_similar diff --git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py index 255d26c94..0ff7be648 100644 --- a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py @@ -1,27 +1,28 @@ import logging from typing import Dict, List, Optional, Tuple +import emission.analysis.modelling.similarity.similarity_metric as eamss import emission.analysis.modelling.tour_model.label_processing as lp +import emission.analysis.modelling.user_label_model.user_label_prediction_model as eamuu import emission.analysis.modelling.user_label_model.util as util +import emission.core.wrapper.confirmedtrip as ecwc import pandas as pd -from emission.analysis.modelling.similarity.similarity_metric import \ - SimilarityMetric -from emission.analysis.modelling.user_label_model.user_label_prediction_model import \ - UserLabelPredictionModel -from emission.core.wrapper.confirmedtrip import Confirmedtrip -class GreedySimilarityBinning(UserLabelPredictionModel): +class 
GreedySimilarityBinning(eamuu.UserLabelPredictionModel): def __init__( self, - metric: SimilarityMetric, + metric: eamss.SimilarityMetric, sim_thresh: float, apply_cutoff: bool = False, ) -> None: """ instantiate a clustering model for a user. + replaces the original similarity class + [https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L67] + this technique employs a greedy similarity heuristic to associate trips with collections of probabilistic class labels. in pseudocode: @@ -52,7 +53,7 @@ def __init__( self.bins: Dict[int, Dict] = {} self.loaded = False - def fit(self, trips: List[Confirmedtrip]): + def fit(self, trips: List[ecwc.Confirmedtrip]): """train the model by passing data, where each row in the data corresponds to a label at the matching index of the label input @@ -65,12 +66,9 @@ def fit(self, trips: List[Confirmedtrip]): self._generate_predictions() logging.info(f"model fit to trip data") - def predict(self, trip: Confirmedtrip) -> Tuple[List[Dict], int]: + def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: if not self.loaded: - msg = ( - "predict called on unloaded model " - f"for user {self.user_id}" - ) + msg = f"predict called on unloaded model for user {self.user_id}" raise IOError(msg) logging.debug(f"running greedy similarity clustering") @@ -96,13 +94,14 @@ def to_dict(self) -> Dict: def from_dict(self, model: Dict): self.bins = model - def extract_features(self, trip: Confirmedtrip) -> List[float]: + def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: features = self.metric.extract_features(trip) return features - def _assign_bins(self, trips: List[Confirmedtrip]): + def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): """ assigns each trip to a bin by greedy similarity search + [see 
https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L118] :param data: trips to assign to bins :type data: List[Confirmedtrip] @@ -125,7 +124,7 @@ def _assign_bins(self, trips: List[Confirmedtrip]): } self.bins[new_bin_id] = new_bin_record - def _nearest_bin(self, trip: Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: + def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: """ finds a bin which contains at least one matching feature. the first record matching by similarity measure is returned. if @@ -174,6 +173,7 @@ def _apply_cutoff(self): def _generate_predictions(self): """ helper function to transform binned features and labels into predictions. + taken from [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/build_save_model.py#L40] for each bin, the unique label combinations are counted. their probability is estimated with label_count / total_labels. 
diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/user_label_model/model_storage.py index 2a538cec8..7f4f32308 100644 --- a/emission/analysis/modelling/user_label_model/model_storage.py +++ b/emission/analysis/modelling/user_label_model/model_storage.py @@ -1,29 +1,27 @@ from enum import Enum from typing import Dict, Optional -import emission.analysis.modelling.user_label_model.util as util +import emission.analysis.modelling.user_label_model.model_type as eamum import emission.core.wrapper.user_label_prediction_model as ecwu import emission.storage.decorations.analysis_timeseries_queries as esda import emission.storage.pipeline_queries as epq import emission.storage.timeseries.abstract_timeseries as esta +import emission.storage.timeseries.builtin_timeseries as estb import pymongo -from emission.analysis.modelling.user_label_model.model_type import ModelType -from emission.storage.timeseries.builtin_timeseries import BuiltinTimeSeries class ModelStorage(Enum): - FILE_SYSTEM = 0 - DATABASE = 1 + """ + enumeration of model storage destinations. currently restricted to + DATABASE only. 
+ """ + DATABASE = 0 @classmethod def names(cls): return list(map(lambda e: e.name, list(cls))) -def create_filename(user_id, model_type: ModelType) -> str: - return f"user_label_model_{model_type.name}_{str(user_id)}" - - -def load_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> Optional[Dict]: +def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]: """load a user label model from a model storage location :param user_id: the user to request a model for @@ -32,16 +30,12 @@ def load_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> O :return: the model representation as a Python Dict or None :raises: TypeError if loaded model has different type than expected type """ - if model_storage == ModelStorage.FILE_SYSTEM: - filename = create_filename(user_id, model_type) - model_data = util.load_fs(filename) - return model_data - elif model_storage == ModelStorage.DATABASE: + if model_storage == ModelStorage.DATABASE: # retrieve stored model with timestamp that matches/exceeds the most # recent PipelineState.USER_LABEL_MODEL entry ts = esda.get_timeseries_for_user(user_id) - if not isinstance(ts, BuiltinTimeSeries): + if not isinstance(ts, estb.BuiltinTimeSeries): raise Exception('user model storage requires BuiltInTimeSeries') latest_model_entry = ts.get_first_entry( key=esda.USER_LABEL_MODEL_STORE_KEY, @@ -67,7 +61,7 @@ def load_model(user_id, model_type: ModelType, model_storage: ModelStorage) -> O def save_model( user_id, - model_type: ModelType, + model_type: eamum.ModelType, model_data: Dict, model_timestamp: int, model_storage: ModelStorage = ModelStorage.DATABASE): @@ -81,18 +75,7 @@ def save_model( :raises IOError: failure when writing to storage medium """ - if model_storage == ModelStorage.FILE_SYSTEM: - try: - filename = create_filename(user_id, model_type) - util.save_fs(filename, model_data) - except IOError as e: - msg = ( - f"failure storing model for user {user_id}, 
model {model_type.name} " - f"to the file system" - ) - raise IOError(msg) from e - - elif model_storage == ModelStorage.DATABASE: + if model_storage == ModelStorage.DATABASE: row = ecwu.UserLabelPredictionModel() row.user_id = user_id diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index d33fb754c..54191986b 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -2,27 +2,24 @@ from typing import Optional import arrow -import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline -import emission.storage.decorations.analysis_timeseries_queries as esda -import emission.storage.pipeline_queries as epq -import emission.storage.timeseries.abstract_timeseries as esta import emission.analysis.modelling.similarity.od_similarity as eamso import emission.analysis.modelling.user_label_model.greedy_similarity_binning as eamug -import emission.analysis.modelling.user_label_model.model_storage as eamum -from emission.analysis.modelling.user_label_model.model_storage import ModelStorage -from emission.analysis.modelling.user_label_model.model_type import ModelType +import emission.analysis.modelling.user_label_model.model_storage as eamums +import emission.analysis.modelling.user_label_model.model_type as eamumt import emission.analysis.modelling.user_label_model.user_label_prediction_model as eamuu -from emission.core.wrapper.confirmedtrip import Confirmedtrip -from emission.storage.timeseries.timequery import TimeQuery -from numpy import isin +import emission.core.wrapper.confirmedtrip as ecwc +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.storage.pipeline_queries as epq +import emission.storage.timeseries.abstract_timeseries as esta +import emission.storage.timeseries.timequery as estt -SIMILARITY_THRESHOLD_METERS = 500 +SIMILARITY_THRESHOLD_METERS = 500 # 
should come from app config def update_user_label_model( user_id, - model_type: ModelType, - model_storage: ModelStorage = ModelStorage.DATABASE, + model_type: eamumt.ModelType, + model_storage: eamums.ModelStorage = eamums.ModelStorage.DATABASE, min_trips: int = 14): """ create/update a user label model for a user. @@ -43,7 +40,7 @@ def update_user_label_model( model = model_factory(model_type) # if a previous model exists, deserialize the stored model - model_data_prev = eamum.load_model(user_id, model_type, model_storage) + model_data_prev = eamums.load_model(user_id, model_type, model_storage) if model_data_prev is not None: model.from_dict(model_data_prev) logging.debug(f"loaded {model_type.name} user label model for user {user_id}") @@ -57,15 +54,15 @@ def update_user_label_model( # train and store the model model.fit(trips) model_data_next = model.to_dict() - eamum.save_model(user_id, model_type, model_data_next, timestamp, model_storage) + eamums.save_model(user_id, model_type, model_data_next, timestamp, model_storage) logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") def predict_labels_with_n( - trip: Confirmedtrip, - model_type = ModelType.GREEDY_SIMILARITY_BINNING, - model_storage = ModelStorage.DATABASE): + trip: ecwc.Confirmedtrip, + model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING, + model_storage = eamums.ModelStorage.DATABASE): """ invoke the user label prediction model to predict labels for a trip. @@ -83,7 +80,7 @@ def predict_labels_with_n( return predictions, n -def model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: +def model_factory(model_type: eamumt.ModelType) -> eamuu.UserLabelPredictionModel: """ instantiates the requested user model type with the configured parameters. 
@@ -95,7 +92,7 @@ def model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: :raises KeyError: if the requested model name does not exist """ MODELS = { - ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( + eamumt.ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( metric=eamso.OriginDestinationSimilarity(), sim_thresh=SIMILARITY_THRESHOLD_METERS, apply_cutoff=False @@ -103,7 +100,7 @@ def model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: } model = MODELS.get(model_type) if model is None: - if not isinstance(model_type, ModelType): + if not isinstance(model_type, eamumt.ModelType): raise TypeError(f"provided model type {model_type} is not an instance of ModelType") else: model_names = list(lambda e: e.name, MODELS.keys()) @@ -112,7 +109,7 @@ def model_factory(model_type: ModelType) -> eamuu.UserLabelPredictionModel: return model -def _get_trips_for_user(user_id, time_query: Optional[TimeQuery], min_trips: int): +def _get_trips_for_user(user_id, time_query: Optional[estt.TimeQuery], min_trips: int): """ load the labeled trip data for this user, subject to a time query. if the user does not have at least $min_trips trips with labels, then return an empty list. @@ -135,8 +132,8 @@ def _get_trips_for_user(user_id, time_query: Optional[TimeQuery], min_trips: int def _load_user_label_model( user_id, - model_type: ModelType, - model_storage: ModelStorage) -> Optional[eamuu.UserLabelPredictionModel]: + model_type: eamumt.ModelType, + model_storage: eamums.ModelStorage) -> Optional[eamuu.UserLabelPredictionModel]: """helper to build a user label prediction model class with the contents of a stored model for some user. 
@@ -145,7 +142,7 @@ def _load_user_label_model( :param model_storage: storage type :return: model, or None if no model is stored for this user """ - model_dict = eamum.load_model(user_id, model_type, model_storage) + model_dict = eamums.load_model(user_id, model_type, model_storage) if model_dict is None: return None else: @@ -160,7 +157,7 @@ def _load_user_label_model( # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round user_id = all_users[0] - update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) + update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) filter_trips = _get_trips_for_user(user_id, None, 0) new_trip = filter_trips[4] # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, @@ -173,7 +170,7 @@ def _load_user_label_model( # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) # 2. 
the user doesn't have common trips user_id = all_users[1] - update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) + update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) filter_trips = _get_trips_for_user(user_id, None, 0) new_trip = filter_trips[0] # result is [] @@ -182,7 +179,7 @@ def _load_user_label_model( # case3: the new trip is novel trip(doesn't fall in any 1st round bins) user = all_users[0] - update_user_label_model(user_id, ModelType.GREEDY_SIMILARITY_BINNING, ModelStorage.FILE_SYSTEM) + update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) filter_trips = _get_trips_for_user(user_id, None, 0) new_trip = filter_trips[0] # result is [] diff --git a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py b/emission/analysis/modelling/user_label_model/user_label_prediction_model.py index c6ca40a45..452b7133b 100644 --- a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py +++ b/emission/analysis/modelling/user_label_model/user_label_prediction_model.py @@ -1,7 +1,7 @@ from abc import ABCMeta, abstractmethod from typing import Dict, List, Tuple -from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.core.wrapper.confirmedtrip as ecwc class UserLabelPredictionModel(metaclass=ABCMeta): @@ -68,7 +68,7 @@ def is_incremental(self) -> bool: pass @abstractmethod - def extract_features(self, trip: Confirmedtrip) -> List[float]: + def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: """ extract the relevant features for learning from a trip for this model instance diff --git a/emission/analysis/modelling/user_label_model/util.py b/emission/analysis/modelling/user_label_model/util.py index cc4d82406..f5aebf85c 100644 --- a/emission/analysis/modelling/user_label_model/util.py +++ b/emission/analysis/modelling/user_label_model/util.py @@ 
-1,67 +1,8 @@ -from typing import Dict, List, Optional, Tuple -import jsonpickle as jpickle -import logging +from typing import List, Tuple from past.utils import old_div import numpy from numpy.linalg import norm -import emission.storage.decorations.analysis_timeseries_queries as esda - - -def load_fs(filename: str, numpy_decode: bool = True) -> Optional[dict]: - """loads model state as a pickled object on the file system. - if the file is not found, returns an empty dict. - - :param filename: file name to load - :type filename: str - :param numpy_decode: if part of the data is numpy encoded - :type numpy_decode: bool - :return: json object parsed, or, an empty list - :rtype: Dict - """ - raise Exception("deprecated, use db instead") - logging.debug(f"At stage: loading model") - try: - with open(filename, "r") as f: - contents = f.read() - except FileNotFoundError: - logging.info(f"No model found at {filename}, no prediction") - return None - - try: - if numpy_decode: - # see https://jsonpickle.github.io/extensions.html - import jsonpickle.ext.numpy as jsonpickle_numpy - jsonpickle_numpy.register_handlers() - result = jpickle.loads(contents) - return result - except Exception as e: - msg = ( - f"failure decoding stored model at {filename}, " - f"numpy_decode={numpy_decode}" - ) - raise IOError(msg) from e - - -def save_fs(filename: str, obj: object): - """save model state as a pickled object on the file system - - :param filename: filename to write - :type filename: str - :param obj: the object to pickle + store - :type obj: object - - """ - raise Exception("deprecated, use db instead") - try: - logging.debug("At stage: saving model") - obj_capsule = jpickle.dumps(obj) - with open(filename, "w") as fd: - fd.write(obj_capsule) - except Exception as e: - msg = f"failed writing clustering model contents to file system" - raise IOError(msg) from e - def find_knee_point(values: List[float]) -> Tuple[float, int]: """for a list of values, find the value which represents 
the cut-off point From de122821cbc4a0c6b7c742abffc02765bccd9232 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 22 Jun 2022 15:45:23 -0600 Subject: [PATCH 16/46] relocate tour model test code --- .../modelling/user_label_model/run_model.py | 39 ------------ emission/tests/modellingTests/TestRunModel.py | 61 +++++++++++++++++++ 2 files changed, 61 insertions(+), 39 deletions(-) create mode 100644 emission/tests/modellingTests/TestRunModel.py diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/user_label_model/run_model.py index 54191986b..21124bcfa 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/user_label_model/run_model.py @@ -150,42 +150,3 @@ def _load_user_label_model( model.from_dict(model_dict) return model - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.DEBUG) - all_users = esta.TimeSeries.get_uuid_list() - - # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round - user_id = all_users[0] - update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - filter_trips = _get_trips_for_user(user_id, None, 0) - new_trip = filter_trips[4] - # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, - # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', - # 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] - pl, _ = predict_labels_with_n(new_trip) - assert len(pl) > 0, f"Invalid prediction {pl}" - - # case 2: no existing files for the user who has the new trip: - # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) - # 2. 
the user doesn't have common trips - user_id = all_users[1] - update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - filter_trips = _get_trips_for_user(user_id, None, 0) - new_trip = filter_trips[0] - # result is [] - pl, _ = predict_labels_with_n(new_trip) - assert len(pl) == 0, f"Invalid prediction {pl}" - - # case3: the new trip is novel trip(doesn't fall in any 1st round bins) - user = all_users[0] - update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - filter_trips = _get_trips_for_user(user_id, None, 0) - new_trip = filter_trips[0] - # result is [] - pl = predict_labels_with_n(new_trip) - assert len(pl) == 0, f"Invalid prediction {pl}" - - # case 4: the new trip falls in a 1st round bin, but predict to be a new cluster in the 2nd round - # result is [] - # no example for now diff --git a/emission/tests/modellingTests/TestRunModel.py b/emission/tests/modellingTests/TestRunModel.py new file mode 100644 index 000000000..c08e87132 --- /dev/null +++ b/emission/tests/modellingTests/TestRunModel.py @@ -0,0 +1,61 @@ +import unittest + + +import emission.analysis.modelling.user_label_model.model_storage as eamums +import emission.analysis.modelling.user_label_model.model_type as eamumt +import emission.analysis.modelling.user_label_model.run_model as eamur +import emission.storage.timeseries.abstract_timeseries as esta + + +class TestSimilarityAux(unittest.TestCase): + """these tests were copied forward during a refactor of the tour model + [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] + + it's uncertain what condition they are in besides having been refactored to + use the more recent tour modeling code. 
+ """ + def setUp(self): + self.all_users = esta.TimeSeries.get_uuid_list() + if len(self.all_users) == 0: + self.fail('test invariant failed: no users found') + + def testTrip1(self): + + # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round + user_id = self.all_users[0] + eamur.update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + filter_trips = eamur._get_trips_for_user(user_id, None, 0) + new_trip = filter_trips[4] + # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, + # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', + # 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] + pl, _ = eamur.predict_labels_with_n(new_trip) + assert len(pl) > 0, f"Invalid prediction {pl}" + + def testTrip2(self): + + # case 2: no existing files for the user who has the new trip: + # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) + # 2. 
the user doesn't have common trips + user_id = self.all_users[1] + eamur.update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + filter_trips = eamur._get_trips_for_user(user_id, None, 0) + new_trip = filter_trips[0] + # result is [] + pl, _ = eamur.predict_labels_with_n(new_trip) + assert len(pl) == 0, f"Invalid prediction {pl}" + + def testTrip3(self): + + # case3: the new trip is novel trip(doesn't fall in any 1st round bins) + user_id = self.all_users[0] + eamur.update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + filter_trips = eamur._get_trips_for_user(user_id, None, 0) + new_trip = filter_trips[0] + # result is [] + pl = eamur.predict_labels_with_n(new_trip) + assert len(pl) == 0, f"Invalid prediction {pl}" + + # case 4: the new trip falls in a 1st round bin, but predict to be a new cluster in the 2nd round + # result is [] + # no example for now From c795a55772e943372e4518f27e7a2082b46ed388 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 23 Jun 2022 11:02:02 -0600 Subject: [PATCH 17/46] cleanup --- bin/build_label_model.py | 9 ++++----- emission/analysis/modelling/user_label_model/util.py | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/build_label_model.py b/bin/build_label_model.py index 666a811c0..4fe2fac8b 100644 --- a/bin/build_label_model.py +++ b/bin/build_label_model.py @@ -5,9 +5,8 @@ import argparse import uuid -import copy -from emission.analysis.modelling.user_label_model.model_storage import ModelStorage -from emission.analysis.modelling.user_label_model.model_type import ModelType +import emission.analysis.modelling.user_label_model.model_storage as eamums +import emission.analysis.modelling.user_label_model.model_type as eamumt import emission.pipeline.reset as epr import emission.core.get_database as edb @@ -67,7 +66,7 @@ def _email_2_user_list(email_list): for user_id in user_list: logging.info("building 
model for user %s" % user_id) # these can come from the application config as default values - model_type = ModelType.GREEDY_SIMILARITY_BINNING - model_storage = ModelStorage.DATABASE + model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING + model_storage = eamums.ModelStorage.DATABASE min_trips = 14 eamur.update_user_label_model(user_id, model_type, model_storage, min_trips) diff --git a/emission/analysis/modelling/user_label_model/util.py b/emission/analysis/modelling/user_label_model/util.py index f5aebf85c..662137ba5 100644 --- a/emission/analysis/modelling/user_label_model/util.py +++ b/emission/analysis/modelling/user_label_model/util.py @@ -8,6 +8,9 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]: """for a list of values, find the value which represents the cut-off point or "elbow" in the function when values are sorted. + copied from original similarity algorithm. permalink: + [https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L256] + based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 And summarized by the statement: "A quick way of finding the elbow is to draw a line from the first to the last point of the curve and then find the data point From b98d5dcf4820a6953bc03878b37d2c921c00ddca Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 23 Jun 2022 11:21:33 -0600 Subject: [PATCH 18/46] cleanup and documentation --- .../confirmed_trip_feature_extraction.py | 1 - .../modelling/similarity/similarity_metric.py | 4 +-- .../greedy_similarity_binning.py | 26 +++++++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py index 3ee4eba78..d1d2b4e77 100644 --- a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py +++ 
b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py @@ -1,6 +1,5 @@ from typing import List import emission.core.wrapper.confirmedtrip as ecwc -import emission.analysis.modelling.tour_model.label_processing as lp def origin_features(trip: ecwc.Confirmedtrip) -> List[float]: diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 9521f1d1c..b6490793f 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -1,13 +1,13 @@ from abc import ABCMeta, abstractmethod from typing import List -from emission.core.wrapper.confirmedtrip import Confirmedtrip +import emission.core.wrapper.confirmedtrip as ecwc class SimilarityMetric(metaclass=ABCMeta): @abstractmethod - def extract_features(self, trip: Confirmedtrip) -> List[float]: + def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: """extracts the features we want to compare for similarity :param trip: a confirmed trip diff --git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py index 0ff7be648..444ab46b0 100644 --- a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py @@ -40,6 +40,32 @@ def __init__( the number of predictions is not assumed to be the number of features. + the original similarity class (link above) used a nested List data + structure to capture the notion of binning. this was then copied into + a Dict when the model needed to be saved. the same technique can be + written to work directly on nested Dicts with no loss in performance. + the data takes the form: + { + bin_id: { + "features": [ + [f1, f2, .., fn], + ... + ], + "labels": [ + { label1: value1, ... 
} + ], + "predictions": [ + { "labels": { label1: value1, ... }, 'p': p_val } + ] + } + } + where + - bin_id: int index of a bin containing similar trips + - f_x: float feature value (an ordinate such as origin.x) + - label_x: str OpenPATH user label category such as "mode_confirm" + - value_x: str user-provided label for a category + - p_val: float probability of a prediction, real number in [0, 1] + :param dir: the model load/save directory :param user_id: identity (UUID) of the e-mission user :param metric: type of similarity metric to use From a183b221a4405a646b96ebfb5f9e4fe4ab7dccfd Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 23 Jun 2022 15:57:41 -0600 Subject: [PATCH 19/46] module rename trip_model, begin unit tests --- bin/build_label_model.py | 8 +- .../inference/labels/inferrers.py | 6 +- .../modelling/similarity/od_similarity.py | 2 +- .../__init__.py | 0 .../greedy_similarity_binning.py | 28 ++-- .../model_storage.py | 14 +- .../model_type.py | 0 .../run_model.py | 18 +-- .../trip_model.py} | 2 +- .../{user_label_model => trip_model}/util.py | 0 emission/core/wrapper/entry.py | 2 + emission/core/wrapper/pipelinestate.py | 2 +- ...label_prediction_model.py => tripmodel.py} | 8 +- .../analysis_timeseries_queries.py | 2 +- emission/storage/pipeline_queries.py | 8 +- .../storage/timeseries/builtin_timeseries.py | 2 +- .../TestGreedySimilarityBinning.py | 29 ++++ emission/tests/modellingTests/TestRunModel.py | 12 +- .../modellingTests/TestSimilarityMetric.py | 33 +++++ emission/tests/modellingTests/__init__.py | 0 .../modellingTests/modellingTestAssets.py | 124 ++++++++++++++++++ 21 files changed, 250 insertions(+), 50 deletions(-) rename emission/analysis/modelling/{user_label_model => trip_model}/__init__.py (100%) rename emission/analysis/modelling/{user_label_model => trip_model}/greedy_similarity_binning.py (89%) rename emission/analysis/modelling/{user_label_model => trip_model}/model_storage.py (89%) rename 
emission/analysis/modelling/{user_label_model => trip_model}/model_type.py (100%) rename emission/analysis/modelling/{user_label_model => trip_model}/run_model.py (88%) rename emission/analysis/modelling/{user_label_model/user_label_prediction_model.py => trip_model/trip_model.py} (97%) rename emission/analysis/modelling/{user_label_model => trip_model}/util.py (100%) rename emission/core/wrapper/{user_label_prediction_model.py => tripmodel.py} (64%) create mode 100644 emission/tests/modellingTests/TestGreedySimilarityBinning.py create mode 100644 emission/tests/modellingTests/TestSimilarityMetric.py create mode 100644 emission/tests/modellingTests/__init__.py create mode 100644 emission/tests/modellingTests/modellingTestAssets.py diff --git a/bin/build_label_model.py b/bin/build_label_model.py index 4fe2fac8b..caaf21d65 100644 --- a/bin/build_label_model.py +++ b/bin/build_label_model.py @@ -5,14 +5,14 @@ import argparse import uuid -import emission.analysis.modelling.user_label_model.model_storage as eamums -import emission.analysis.modelling.user_label_model.model_type as eamumt +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt import emission.pipeline.reset as epr import emission.core.get_database as edb import emission.core.wrapper.user as ecwu import emission.storage.timeseries.abstract_timeseries as esta -import emission.analysis.modelling.user_label_model.run_model as eamur +import emission.analysis.modelling.trip_model.run_model as eamur def _get_user_list(args): if args.all: @@ -69,4 +69,4 @@ def _email_2_user_list(email_list): model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING model_storage = eamums.ModelStorage.DATABASE min_trips = 14 - eamur.update_user_label_model(user_id, model_type, model_storage, min_trips) + eamur.update_trip_model(user_id, model_type, model_storage, min_trips) diff --git a/emission/analysis/classification/inference/labels/inferrers.py 
b/emission/analysis/classification/inference/labels/inferrers.py index b4f973061..ab06a2cd2 100644 --- a/emission/analysis/classification/inference/labels/inferrers.py +++ b/emission/analysis/classification/inference/labels/inferrers.py @@ -6,9 +6,9 @@ import copy import emission.analysis.modelling.tour_model_first_only.load_predict as lp -from emission.analysis.modelling.user_label_model.model_storage import ModelStorage -import emission.analysis.modelling.user_label_model.run_model as eamur -from emission.analysis.modelling.user_label_model.model_type import ModelType +from emission.analysis.modelling.trip_model.model_storage import ModelStorage +import emission.analysis.modelling.trip_model.run_model as eamur +from emission.analysis.modelling.trip_model.model_type import ModelType # A set of placeholder predictors to allow pipeline development without a real inference algorithm. # For the moment, the system is configured to work with two labels, "mode_confirm" and diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 5a0f7d1d2..3b84bd764 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -15,7 +15,7 @@ class OriginDestinationSimilarity(eamss.SimilarityMetric): def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: return ctfe.od_features(trip) - def similarity(self, a: List[float], b: List[float], thresh: float) -> List[float]: + def similarity(self, a: List[float], b: List[float]) -> List[float]: o_dist = ecc.calDistance([a[0], a[1]], [b[0], b[1]]) d_dist = ecc.calDistance([a[2], a[3]], [b[2], b[3]]) return [o_dist, d_dist] \ No newline at end of file diff --git a/emission/analysis/modelling/user_label_model/__init__.py b/emission/analysis/modelling/trip_model/__init__.py similarity index 100% rename from emission/analysis/modelling/user_label_model/__init__.py rename to 
emission/analysis/modelling/trip_model/__init__.py diff --git a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py similarity index 89% rename from emission/analysis/modelling/user_label_model/greedy_similarity_binning.py rename to emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 444ab46b0..bc5be67fa 100644 --- a/emission/analysis/modelling/user_label_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -1,15 +1,16 @@ import logging +from tokenize import group from typing import Dict, List, Optional, Tuple import emission.analysis.modelling.similarity.similarity_metric as eamss import emission.analysis.modelling.tour_model.label_processing as lp -import emission.analysis.modelling.user_label_model.user_label_prediction_model as eamuu -import emission.analysis.modelling.user_label_model.util as util +import emission.analysis.modelling.trip_model.trip_model as eamuu +import emission.analysis.modelling.trip_model.util as util import emission.core.wrapper.confirmedtrip as ecwc import pandas as pd -class GreedySimilarityBinning(eamuu.UserLabelPredictionModel): +class GreedySimilarityBinning(eamuu.TripModel): def __init__( self, @@ -85,6 +86,10 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): :param trips: 2D array of features to train from """ + unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips)) + if len(unlabeled) > 0: + msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' + raise Exception(msg) self.bins = {} self._assign_bins(trips) if len(self.bins) > 1 and self.apply_cutoff: @@ -132,6 +137,7 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): :param data: trips to assign to bins :type data: List[Confirmedtrip] """ + logging.debug(f"_assign_bins called with trips {trips}") for trip in trips: trip_features = 
self.extract_features(trip) trip_labels = trip['data']['user_input'] @@ -159,14 +165,19 @@ def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optiona :param trip: incoming trip features to test with :return: nearest bin record, if found """ + logging.debug(f"_nearest_bin called with trip {trip}") + trip_features = self.extract_features(trip) selected_bin = None selected_record = None for bin_id, bin_record in self.bins.items(): - if self.metric.similar(trip_features, bin_record['features'], self.sim_thresh): - selected_bin = bin_id - selected_record = bin_record + for bin_features in bin_record['features']: + if self.metric.similar(trip_features, bin_features, self.sim_thresh): + selected_bin = bin_id + selected_record = bin_record + break + if selected_bin is not None: break return selected_bin, selected_record @@ -204,14 +215,15 @@ def _generate_predictions(self): for each bin, the unique label combinations are counted. their probability is estimated with label_count / total_labels. 
""" - for _, bin_record in self.bins: + for _, bin_record in self.bins.items(): user_label_df = pd.DataFrame(bin_record['labels']) user_label_df = lp.map_labels(user_label_df).dropna() # compute the sum of trips in this cluster sum_trips = len(user_label_df) # compute unique label sets and their probabilities in one cluster # 'p' refers to probability - unique_labels = user_label_df.groupby(user_label_df.columns.tolist()).size().reset_index(name='uniqcount') + group_cols = user_label_df.columns.tolist() + unique_labels = user_label_df.groupby(group_cols).size().reset_index(name='uniqcount') unique_labels['p'] = unique_labels.uniqcount / sum_trips labels_columns = user_label_df.columns.to_list() bin_label_combo_list = [] diff --git a/emission/analysis/modelling/user_label_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py similarity index 89% rename from emission/analysis/modelling/user_label_model/model_storage.py rename to emission/analysis/modelling/trip_model/model_storage.py index 7f4f32308..1404fbf75 100644 --- a/emission/analysis/modelling/user_label_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -1,8 +1,8 @@ from enum import Enum from typing import Dict, Optional -import emission.analysis.modelling.user_label_model.model_type as eamum -import emission.core.wrapper.user_label_prediction_model as ecwu +import emission.analysis.modelling.trip_model.model_type as eamum +import emission.core.wrapper.tripmodel as ecwu import emission.storage.decorations.analysis_timeseries_queries as esda import emission.storage.pipeline_queries as epq import emission.storage.timeseries.abstract_timeseries as esta @@ -33,12 +33,12 @@ def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage if model_storage == ModelStorage.DATABASE: # retrieve stored model with timestamp that matches/exceeds the most - # recent PipelineState.USER_LABEL_MODEL entry + # recent PipelineState.TRIP_MODEL entry 
ts = esda.get_timeseries_for_user(user_id) if not isinstance(ts, estb.BuiltinTimeSeries): raise Exception('user model storage requires BuiltInTimeSeries') latest_model_entry = ts.get_first_entry( - key=esda.USER_LABEL_MODEL_STORE_KEY, + key=esda.TRIP_MODEL_STORE_KEY, field='data.model_ts', sort_order=pymongo.DESCENDING ) @@ -77,7 +77,7 @@ def save_model( if model_storage == ModelStorage.DATABASE: - row = ecwu.UserLabelPredictionModel() + row = ecwu.Tripmodel() row.user_id = user_id row.model_ts = model_timestamp row.model_type = model_type @@ -85,7 +85,7 @@ def save_model( try: ts = esta.TimeSeries.get_time_series(user_id) - ts.insert_data(user_id, esda.USER_LABEL_MODEL_STORE_KEY, row) + ts.insert_data(user_id, esda.TRIP_MODEL_STORE_KEY, row) except Exception as e: msg = ( f"failure storing model for user {user_id}, model {model_type.name} " @@ -94,7 +94,7 @@ def save_model( raise IOError(msg) from e try: - epq.mark_user_label_model_done(user_id, model_timestamp) + epq.mark_trip_model_done(user_id, model_timestamp) except Exception as e: msg = ( f"failure updating user label pipeline state for user {user_id}" diff --git a/emission/analysis/modelling/user_label_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py similarity index 100% rename from emission/analysis/modelling/user_label_model/model_type.py rename to emission/analysis/modelling/trip_model/model_type.py diff --git a/emission/analysis/modelling/user_label_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py similarity index 88% rename from emission/analysis/modelling/user_label_model/run_model.py rename to emission/analysis/modelling/trip_model/run_model.py index 21124bcfa..8726f0b6e 100644 --- a/emission/analysis/modelling/user_label_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -3,10 +3,10 @@ import arrow import emission.analysis.modelling.similarity.od_similarity as eamso -import 
emission.analysis.modelling.user_label_model.greedy_similarity_binning as eamug -import emission.analysis.modelling.user_label_model.model_storage as eamums -import emission.analysis.modelling.user_label_model.model_type as eamumt -import emission.analysis.modelling.user_label_model.user_label_prediction_model as eamuu +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.core.wrapper.confirmedtrip as ecwc import emission.storage.decorations.analysis_timeseries_queries as esda import emission.storage.pipeline_queries as epq @@ -16,7 +16,7 @@ SIMILARITY_THRESHOLD_METERS = 500 # should come from app config -def update_user_label_model( +def update_trip_model( user_id, model_type: eamumt.ModelType, model_storage: eamums.ModelStorage = eamums.ModelStorage.DATABASE, @@ -48,7 +48,7 @@ def update_user_label_model( logging.debug(f"building first {model_type.name} user label model for user {user_id}") # get all relevant trips - time_query = epq.get_time_query_for_user_label_model(user_id) if model.is_incremental else None + time_query = epq.get_time_query_for_trip_model(user_id) if model.is_incremental else None trips = _get_trips_for_user(user_id, time_query, min_trips) # train and store the model @@ -80,7 +80,7 @@ def predict_labels_with_n( return predictions, n -def model_factory(model_type: eamumt.ModelType) -> eamuu.UserLabelPredictionModel: +def model_factory(model_type: eamumt.ModelType) -> eamuu.TripModel: """ instantiates the requested user model type with the configured parameters. 
@@ -133,12 +133,12 @@ def _get_trips_for_user(user_id, time_query: Optional[estt.TimeQuery], min_trips def _load_user_label_model( user_id, model_type: eamumt.ModelType, - model_storage: eamums.ModelStorage) -> Optional[eamuu.UserLabelPredictionModel]: + model_storage: eamums.ModelStorage) -> Optional[eamuu.TripModel]: """helper to build a user label prediction model class with the contents of a stored model for some user. :param user_id: user to retrieve the model for - :param model_type: UserLabelPredictionModel type configured for this OpenPATH server + :param model_type: TripModel type configured for this OpenPATH server :param model_storage: storage type :return: model, or None if no model is stored for this user """ diff --git a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py b/emission/analysis/modelling/trip_model/trip_model.py similarity index 97% rename from emission/analysis/modelling/user_label_model/user_label_prediction_model.py rename to emission/analysis/modelling/trip_model/trip_model.py index 452b7133b..b0ad2ab3a 100644 --- a/emission/analysis/modelling/user_label_model/user_label_prediction_model.py +++ b/emission/analysis/modelling/trip_model/trip_model.py @@ -4,7 +4,7 @@ import emission.core.wrapper.confirmedtrip as ecwc -class UserLabelPredictionModel(metaclass=ABCMeta): +class TripModel(metaclass=ABCMeta): @abstractmethod def fit(data: List[List[float]]): diff --git a/emission/analysis/modelling/user_label_model/util.py b/emission/analysis/modelling/trip_model/util.py similarity index 100% rename from emission/analysis/modelling/user_label_model/util.py rename to emission/analysis/modelling/trip_model/util.py diff --git a/emission/core/wrapper/entry.py b/emission/core/wrapper/entry.py index a421279c5..635a03947 100644 --- a/emission/core/wrapper/entry.py +++ b/emission/core/wrapper/entry.py @@ -131,6 +131,8 @@ def _getData2Wrapper(): "inference/prediction": "modeprediction", # the predicted labels for a particular 
trip (one entry per algorithm) "inference/labels": "labelprediction", + # the serialized trip model for user label prediction + "inference/trip_model": "tripmodel", # equivalent of cleaned_section, but with the mode set to the # inferred mode instead of just walk/bike/motorized # used for consistency and to make the client work whether or not we were diff --git a/emission/core/wrapper/pipelinestate.py b/emission/core/wrapper/pipelinestate.py index e37f44446..1622675a9 100644 --- a/emission/core/wrapper/pipelinestate.py +++ b/emission/core/wrapper/pipelinestate.py @@ -19,7 +19,7 @@ class PipelineStages(enum.Enum): CLEAN_RESAMPLING = 11 MODE_INFERENCE = 4 LABEL_INFERENCE = 14 - USER_LABEL_MODEL = 16 + TRIP_MODEL = 16 EXPECTATION_POPULATION = 15 CREATE_CONFIRMED_OBJECTS = 13 TOUR_MODEL = 5 diff --git a/emission/core/wrapper/user_label_prediction_model.py b/emission/core/wrapper/tripmodel.py similarity index 64% rename from emission/core/wrapper/user_label_prediction_model.py rename to emission/core/wrapper/tripmodel.py index 1326eaa0a..17ae94c73 100644 --- a/emission/core/wrapper/user_label_prediction_model.py +++ b/emission/core/wrapper/tripmodel.py @@ -1,13 +1,13 @@ # Based on modeprediction.py -from emission.analysis.modelling.user_label_model.model_type import ModelType +from emission.analysis.modelling.trip_model.model_type import ModelType import emission.core.wrapper.wrapperbase as ecwb -class UserLabelPredictionModel(ecwb.WrapperBase): +class Tripmodel(ecwb.WrapperBase): props = {"user_id": ecwb.WrapperBase.Access.WORM, # the trip that this is part of - "model_type": ecwb.WrapperBase.Access.WORM, # emission.analysis.modelling.user_label_model.model_type.py + "model_type": ecwb.WrapperBase.Access.WORM, # emission.analysis.modelling.trip_model.model_type.py "model": ecwb.WrapperBase.Access.WORM, # the (serialized) state of the model for this trip - "model_ts": ecwb.WrapperBase.Access.WORM, # time that this model was stored + "model_ts": 
ecwb.WrapperBase.Access.WORM, # timestamp that model is "current" to wrt input data } enums = { diff --git a/emission/storage/decorations/analysis_timeseries_queries.py b/emission/storage/decorations/analysis_timeseries_queries.py index d63d32ab4..9f8ab6a70 100644 --- a/emission/storage/decorations/analysis_timeseries_queries.py +++ b/emission/storage/decorations/analysis_timeseries_queries.py @@ -37,7 +37,7 @@ METRICS_DAILY_USER_MEDIAN_SPEED = "metrics/daily_user_median_speed" METRICS_DAILY_MEAN_MEDIAN_SPEED = "metrics/daily_mean_median_speed" INFERRED_LABELS_KEY = "inference/labels" -USER_LABEL_MODEL_STORE_KEY = "inference/user_label_model" +TRIP_MODEL_STORE_KEY = "inference/trip_model" # General methods diff --git a/emission/storage/pipeline_queries.py b/emission/storage/pipeline_queries.py index a7fdfe7bb..29fc665c8 100644 --- a/emission/storage/pipeline_queries.py +++ b/emission/storage/pipeline_queries.py @@ -121,13 +121,13 @@ def mark_mode_inference_complete(user_id): def mark_mode_inference_failed(user_id): mark_stage_failed(user_id, ps.PipelineStages.MODE_INFERENCE) -def get_time_query_for_user_label_model(user_id): # TODO: here - tq = get_time_range_for_stage(user_id, ps.PipelineStages.USER_LABEL_MODEL) +def get_time_query_for_trip_model(user_id): # TODO: here + tq = get_time_range_for_stage(user_id, ps.PipelineStages.TRIP_MODEL) tq.timeType = 'data.model_ts' return tq -def mark_user_label_model_done(user_id, last_ts=None): - mark_stage_done(user_id, ps.PipelineStages.USER_LABEL_MODEL, last_ts) +def mark_trip_model_done(user_id, last_ts=None): + mark_stage_done(user_id, ps.PipelineStages.TRIP_MODEL, last_ts) def get_time_range_for_confirmed_object_creation(user_id): tq = get_time_range_for_stage(user_id, ps.PipelineStages.CREATE_CONFIRMED_OBJECTS) diff --git a/emission/storage/timeseries/builtin_timeseries.py b/emission/storage/timeseries/builtin_timeseries.py index d64e158b2..6a4cc164b 100644 --- a/emission/storage/timeseries/builtin_timeseries.py +++ 
b/emission/storage/timeseries/builtin_timeseries.py @@ -86,7 +86,7 @@ def __init__(self, user_id): "metrics/daily_mean_median_speed": self.analysis_timeseries_db, "inference/prediction": self.analysis_timeseries_db, "inference/labels": self.analysis_timeseries_db, - "inference/user_label_model": self.analysis_timeseries_db, + "inference/trip_model": self.analysis_timeseries_db, "analysis/inferred_section": self.analysis_timeseries_db, "analysis/inferred_labels": self.analysis_timeseries_db, "analysis/inferred_trip": self.analysis_timeseries_db, diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py new file mode 100644 index 000000000..ed2d86333 --- /dev/null +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -0,0 +1,29 @@ +import unittest +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.analysis.modelling.similarity.od_similarity as eamso +import json +import logging + + +class TestGreedySimilarityBinning(unittest.TestCase): + + def setUp(self) -> None: + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + def testBinning(self): + label_data = { + "mode_labels": ['walk', 'bike', 'transit'], + "purpose_labels": ['work', 'home', 'school'], + "replaced_mode_labels": ['drive'] + } + trips = etmm.generate_mock_trips("joe", 14, [0, 0], [1, 1], label_data, 6, has_label_p=1.0) + model = eamtg.GreedySimilarityBinning( + metric=eamso.OriginDestinationSimilarity(), + sim_thresh=500, # meters, + apply_cutoff=False # currently unused + ) + + model.fit(trips) + print(json.dumps(model.bins, sort_keys=True, indent=4)) \ No newline at end of file diff --git a/emission/tests/modellingTests/TestRunModel.py b/emission/tests/modellingTests/TestRunModel.py index c08e87132..f54ffd196 100644 --- 
a/emission/tests/modellingTests/TestRunModel.py +++ b/emission/tests/modellingTests/TestRunModel.py @@ -1,9 +1,9 @@ import unittest -import emission.analysis.modelling.user_label_model.model_storage as eamums -import emission.analysis.modelling.user_label_model.model_type as eamumt -import emission.analysis.modelling.user_label_model.run_model as eamur +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur import emission.storage.timeseries.abstract_timeseries as esta @@ -23,7 +23,7 @@ def testTrip1(self): # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round user_id = self.all_users[0] - eamur.update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) filter_trips = eamur._get_trips_for_user(user_id, None, 0) new_trip = filter_trips[4] # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, @@ -38,7 +38,7 @@ def testTrip2(self): # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) # 2. 
the user doesn't have common trips user_id = self.all_users[1] - eamur.update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) filter_trips = eamur._get_trips_for_user(user_id, None, 0) new_trip = filter_trips[0] # result is [] @@ -49,7 +49,7 @@ def testTrip3(self): # case3: the new trip is novel trip(doesn't fall in any 1st round bins) user_id = self.all_users[0] - eamur.update_user_label_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) filter_trips = eamur._get_trips_for_user(user_id, None, 0) new_trip = filter_trips[0] # result is [] diff --git a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py new file mode 100644 index 000000000..54a80fbcf --- /dev/null +++ b/emission/tests/modellingTests/TestSimilarityMetric.py @@ -0,0 +1,33 @@ +import unittest +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.analysis.modelling.similarity.od_similarity as eamso + +class TestSimilarityMetric(unittest.TestCase): + def setUp(self) -> None: + pass + + def testODsAreSimilar(self): + generate_points_thresh = 0.001 # approx. 111 meters + similarity_threshold = 500 # + # random, but, points are sampled within a circle and should always be < sim threshold + trips = etmm.generate_mock_trips('bob', 2, [0, 0], [1, 1], threshold=generate_points_thresh) + metric = eamso.OriginDestinationSimilarity() + coords0 = metric.extract_features(trips[0]) + coords1 = metric.extract_features(trips[1]) + similar = metric.similar(coords0, coords1, similarity_threshold) + self.assertTrue(similar) + + def testODsAreNotSimilar(self): + generate_points_thresh = 0.001 # approx. 
111 meters + similarity_threshold = 500 # + + trips0 = etmm.generate_mock_trips('bob', 1, [0, 0], [1, 1], threshold=generate_points_thresh) + trips1 = etmm.generate_mock_trips('alice', 1, [2, 2], [3, 3], threshold=generate_points_thresh) + metric = eamso.OriginDestinationSimilarity() + coords0 = metric.extract_features(trips0[0]) + coords1 = metric.extract_features(trips1[0]) + similar = metric.similar(coords0, coords1, similarity_threshold) + self.assertFalse(similar) + +if __name__ == '__main__': + unittest.main() diff --git a/emission/tests/modellingTests/__init__.py b/emission/tests/modellingTests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py new file mode 100644 index 000000000..d31190bc8 --- /dev/null +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -0,0 +1,124 @@ +import random +from typing import Tuple, List, Dict + +import emission.core.wrapper.entry as ecwe +import arrow +import math + + +def generate_trip_coordinates( + ref_coords: Tuple[float, float], + within_threshold: bool, + threshold: float, + max: float = 0.1 # approx. 10km in WGS84 + ) -> Tuple[float, float]: + """generates trip coordinate data to use when mocking a set of trip data. + + :param origin: origin coordinates + :param destination: destination coordinates + :param trips: number of nearby coordinate pairs to generate + :param within_threshold: how many of these trips are within some distance threshold + :param threshold: the distance threshold, in WGS84 + :param max: max distance, in WGS84, defaults to 0.1 (approx. 
10km) + :return: generated coordinate pairs sampled in a + circle from some coordinates up to some threshold + """ + angle = 2 * math.pi * random.random() + radius_threshold = threshold / 2 + radius = random.uniform(0, radius_threshold) if within_threshold else random.uniform(radius_threshold, max) + x = radius * math.cos(angle) + ref_coords[0] + y = radius * math.sin(angle) + ref_coords[1] + return (x, y) + + +def sample_trip_labels( + mode_labels, + purpose_labels, + replaced_mode_labels, + mode_weights=None, + purpose_weights=None, + replaced_mode_weights=None): + """samples trip labels + + :param mode_labels: labels for mode_confirm + :param purpose_labels: labels for purpose_confirm + :param mode_weights: sample weights, defaults to None, see random.choices "weights" + :param purpose_weights: sample weights, defaults to None for uniform sampling + :return: sampled trip labels + """ + mw = [1.0 / len(mode_labels) for i in range(len(mode_labels))] \ + if mode_weights is not None else mode_weights + rw = [1.0 / len(replaced_mode_labels) for i in range(len(replaced_mode_labels))] \ + if replaced_mode_weights is not None else replaced_mode_weights + pw = [1.0 / len(purpose_labels) for i in range(len(purpose_labels))] \ + if purpose_weights is not None else purpose_weights + mode_label_samples = random.choices(population=mode_labels, k=1, weights=mw) + replaced_mode_label_samples = random.choices(population=replaced_mode_labels, k=1, weights=rw) + purpose_label_samples = random.choices(population=purpose_labels, k=1, weights=pw) + user_input = { + "mode_confirm": mode_label_samples[0], + "replaced_mode": replaced_mode_label_samples[0], + "purpose_confirm": purpose_label_samples[0] + } + return user_input + + +def build_mock_trip(user_id, origin, destination, labels) -> Dict: + key = "analysis/confirmed_trip" + data = { + "start_loc": { + "coordinates": origin + }, + "end_loc": { + "coordinates": destination + }, + "user_input": labels + } + + return 
ecwe.Entry.create_fake_entry(user_id, key, data, write_ts=arrow.now()) + + +def generate_mock_trips( + user_id, + trips, + origin, + destination, + label_data = None, + within_threshold = None, + threshold = 0.01, + max = 0.1, + has_label_p = 0.7, + seed = 0): + + random.seed(seed) + within = within_threshold if within_threshold is not None else trips + trips_within_threshold = [i < within for i in range(trips)] + result = [] + for within in trips_within_threshold: + o = generate_trip_coordinates(origin, within, threshold, max) + d = generate_trip_coordinates(destination, within, threshold, max) + labels = {} if label_data is None or random.random() > has_label_p \ + else sample_trip_labels( + mode_labels=label_data.get('mode_labels'), + replaced_mode_labels=label_data.get('replaced_mode_labels'), + purpose_labels=label_data.get('purpose_labels'), + mode_weights=label_data.get('mode_weights'), + replaced_mode_weights=label_data.get('replaced_mode_weights'), + purpose_weights=label_data.get('purpose_weights') + ) + trip = build_mock_trip(user_id, o, d, labels) + result.append(trip) + + random.shuffle(result) + return result + + +if __name__ == '__main__': + label_data = { + "mode_labels": ['walk', 'bike', 'drive'], + "purpose_labels": ['work', 'home', 'school'], + "replaced_mode_labels": ['walk', 'bike', 'drive'] + } + result = generate_mock_trips('joe-bob', 14, [0, 0], [1,1], label_data, 6) + for r in result: + print(r) \ No newline at end of file From f318bea3800a0ae43932110fe45d45739215f359 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 23 Jun 2022 16:13:19 -0600 Subject: [PATCH 20/46] fixes from testing --- .../trip_model/greedy_similarity_binning.py | 9 ++-- .../TestGreedySimilarityBinning.py | 51 ++++++++++++++++++- .../modellingTests/TestSimilarityMetric.py | 2 - 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py 
b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index bc5be67fa..4b93867f4 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -95,11 +95,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): if len(self.bins) > 1 and self.apply_cutoff: self._apply_cutoff() self._generate_predictions() + self.loaded = True logging.info(f"model fit to trip data") def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: if not self.loaded: - msg = f"predict called on unloaded model for user {self.user_id}" + msg = f"predict called on unloaded model" raise IOError(msg) logging.debug(f"running greedy similarity clustering") @@ -108,7 +109,7 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: logging.debug(f"unable to predict bin for trip {trip}") return [], -1 else: - labels = bin_record['prediction'] + labels = bin_record['predictions'] n_features = len(bin_record['features']) logging.debug(f"found cluster {predicted_bin} with labels {labels}") return labels, n_features @@ -137,7 +138,7 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): :param data: trips to assign to bins :type data: List[Confirmedtrip] """ - logging.debug(f"_assign_bins called with trips {trips}") + logging.debug(f"_assign_bins called with {len(trips)} trips") for trip in trips: trip_features = self.extract_features(trip) trip_labels = trip['data']['user_input'] @@ -165,7 +166,7 @@ def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optiona :param trip: incoming trip features to test with :return: nearest bin record, if found """ - logging.debug(f"_nearest_bin called with trip {trip}") + logging.debug(f"_nearest_bin called") trip_features = self.extract_features(trip) selected_bin = None diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py 
index ed2d86333..706184d98 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -18,7 +18,19 @@ def testBinning(self): "purpose_labels": ['work', 'home', 'school'], "replaced_mode_labels": ['drive'] } - trips = etmm.generate_mock_trips("joe", 14, [0, 0], [1, 1], label_data, 6, has_label_p=1.0) + + n = 20 + should_be_grouped = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=should_be_grouped, + threshold=0.001, # ~ 111 meters in degrees WGS84 + has_label_p=1.0 + ) model = eamtg.GreedySimilarityBinning( metric=eamso.OriginDestinationSimilarity(), sim_thresh=500, # meters, @@ -26,4 +38,39 @@ def testBinning(self): ) model.fit(trips) - print(json.dumps(model.bins, sort_keys=True, indent=4)) \ No newline at end of file + + # 5 trip features should appear together in one bin + at_least_one_large_bin = any(map(lambda b: len(b['features']) >= should_be_grouped, model.bins.values())) + self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") + + def testPrediction(self): + label_data = { + "mode_labels": ['skipping'], + "purpose_labels": ['pizza_party'], + "replaced_mode_labels": ['crabwalking'] + } + + n = 6 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + threshold=0.001, # ~ 111 meters in degrees WGS84 + has_label_p=1.0 + ) + model = eamtg.GreedySimilarityBinning( + metric=eamso.OriginDestinationSimilarity(), + sim_thresh=500, # meters, + apply_cutoff=False # currently unused + ) + + train = trips[0:5] + test = trips[5] + + model.fit(train) + results, n = model.predict(test) + + self.assertEqual(len(results), 1, "should have found a matching bin") + self.assertEqual(n, len(train), "that bin should have had the whole train set in it") diff --git 
a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py index 54a80fbcf..ae37fc39a 100644 --- a/emission/tests/modellingTests/TestSimilarityMetric.py +++ b/emission/tests/modellingTests/TestSimilarityMetric.py @@ -3,8 +3,6 @@ import emission.analysis.modelling.similarity.od_similarity as eamso class TestSimilarityMetric(unittest.TestCase): - def setUp(self) -> None: - pass def testODsAreSimilar(self): generate_points_thresh = 0.001 # approx. 111 meters From 5e37bc77466bc27faa8f9e0cf3cbd183508b0605 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 23 Jun 2022 16:25:44 -0600 Subject: [PATCH 21/46] comments, modify default sampling rate --- .../TestGreedySimilarityBinning.py | 2 - .../modellingTests/modellingTestAssets.py | 49 +++++++++++++++++-- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 706184d98..2e8ba0d90 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -29,7 +29,6 @@ def testBinning(self): label_data=label_data, within_threshold=should_be_grouped, threshold=0.001, # ~ 111 meters in degrees WGS84 - has_label_p=1.0 ) model = eamtg.GreedySimilarityBinning( metric=eamso.OriginDestinationSimilarity(), @@ -58,7 +57,6 @@ def testPrediction(self): destination=(1, 1), label_data=label_data, threshold=0.001, # ~ 111 meters in degrees WGS84 - has_label_p=1.0 ) model = eamtg.GreedySimilarityBinning( metric=eamso.OriginDestinationSimilarity(), diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index d31190bc8..1353e40fe 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -1,5 +1,6 @@ import random from typing import Tuple, List, Dict 
+import emission.core.wrapper.confirmedtrip as ecwc import emission.core.wrapper.entry as ecwe import arrow @@ -14,9 +15,7 @@ def generate_trip_coordinates( ) -> Tuple[float, float]: """generates trip coordinate data to use when mocking a set of trip data. - :param origin: origin coordinates - :param destination: destination coordinates - :param trips: number of nearby coordinate pairs to generate + :param ref_coords: reference coordinates to use as the center of the sampling circle :param within_threshold: how many of these trips are within some distance threshold :param threshold: the distance threshold, in WGS84 :param max: max distance, in WGS84, defaults to 0.1 (approx. 10km) @@ -42,8 +41,10 @@ def sample_trip_labels( :param mode_labels: labels for mode_confirm :param purpose_labels: labels for purpose_confirm + :param replaced_mode_labels: labels for replaced_mode :param mode_weights: sample weights, defaults to None, see random.choices "weights" :param purpose_weights: sample weights, defaults to None for uniform sampling + :param replaced_mode_weights: sample weights, defaults to None :return: sampled trip labels """ mw = [1.0 / len(mode_labels) for i in range(len(mode_labels))] \ @@ -63,7 +64,15 @@ def sample_trip_labels( return user_input -def build_mock_trip(user_id, origin, destination, labels) -> Dict: +def build_mock_trip(user_id, origin, destination, labels) -> ecwc.Confirmedtrip: + """repackages mock data as a Confirmedtrip Entry type + + :param user_id: the user id UUID + :param origin: trip origin coordinates + :param destination: trip destination coordinates + :param labels: user labels for the trip + :return: a Confirmedtrip entry + """ key = "analysis/confirmed_trip" data = { "start_loc": { @@ -87,8 +96,38 @@ def generate_mock_trips( within_threshold = None, threshold = 0.01, max = 0.1, - has_label_p = 0.7, + has_label_p = 1.0, seed = 0): + """mocking function that generates multiple trips for a user. 
some are sampled + within a threshold from the provided o/d pair, and some have labels. some other + ones can be sampled to appear outside of the threshold of the o/d locations. + + label_data is an optional dictionary with labels and sample weights, for example: + { + "mode_labels": ['walk', 'bike'], + "replaced_mode_labels": ['drive', 'tnc'], + "purpose_labels": ['home', 'work'], + "mode_weights": [0.8, 0.2], + "replaced_mode_weights": [0.4, 0.6], + "purpose_weights": [0.1, 0.9] + } + + weights entries are optional and result in uniform sampling. + + :param user_id: user UUID + :param trips: number of trips + :param origin: origin coordinates + :param destination: destination coordinates + :param label_data: dictionary of label data, see above, defaults to None + :param within_threshold: number of trips that should fall within the provided + distance threshold in degrees WGS84, defaults to None + :param threshold: distance threshold in WGS84 for sampling, defaults to 0.01 + :param max: maximum distance beyond the threshold for trips sampled that + are not within the threshold, defaults to 0.1 degrees WGS84 + :param has_label_p: probability a trip has labels, defaults to 1.0 + :param seed: random seed, defaults to 0 + :return: randomly sampled trips + """ random.seed(seed) within = within_threshold if within_threshold is not None else trips From 6637b70f1f5b1eead3d67877bd3c1ac59ba25638 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Mon, 27 Jun 2022 10:15:54 -0600 Subject: [PATCH 22/46] adding missing python dependencies --- setup/environment36.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup/environment36.yml b/setup/environment36.yml index b65e2daf1..c4de01fdf 100644 --- a/setup/environment36.yml +++ b/setup/environment36.yml @@ -8,11 +8,14 @@ dependencies: - cheroot=8.4.2 - future=0.18.0 - geojson=2.4.1 +- geocoder=1.38.1 +- geopy=2.2.0 - google-auth=1.20.1 - jsonpickle=1.4.1 - numpy=1.19.1 - pandas=1.1.0 - pip=20.2.2 +- polyline=1.4.0 - 
python-dateutil=2.8.1 - pytz=2020.1 - requests=2.24.0 From fbae1822b34c50d0eaf5c76e187fe29f2c0da44d Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 28 Jun 2022 13:40:40 -0600 Subject: [PATCH 23/46] fixes related to e2e test --- bin/build_label_model.py | 2 +- .../inference/labels/inferrers.py | 2 +- .../modelling/similarity/similarity_metric.py | 2 + .../trip_model/greedy_similarity_binning.py | 21 ++- .../modelling/trip_model/model_storage.py | 47 +++-- .../modelling/trip_model/model_type.py | 60 ++++++- .../modelling/trip_model/run_model.py | 109 +++++------- emission/core/common.py | 8 +- emission/storage/pipeline_queries.py | 15 +- .../storage/timeseries/builtin_timeseries.py | 8 +- emission/storage/timeseries/timequery.py | 2 + .../TestGreedySimilarityBinning.py | 48 ++++- emission/tests/modellingTests/TestRunModel.py | 168 +++++++++++++----- .../modellingTests/modellingTestAssets.py | 8 +- 14 files changed, 354 insertions(+), 146 deletions(-) diff --git a/bin/build_label_model.py b/bin/build_label_model.py index caaf21d65..ed3c4a1d1 100644 --- a/bin/build_label_model.py +++ b/bin/build_label_model.py @@ -67,6 +67,6 @@ def _email_2_user_list(email_list): logging.info("building model for user %s" % user_id) # these can come from the application config as default values model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING - model_storage = eamums.ModelStorage.DATABASE + model_storage = eamums.ModelStorage.DOCUMENT_DATABASE min_trips = 14 eamur.update_trip_model(user_id, model_type, model_storage, min_trips) diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py index ab06a2cd2..7b784c38e 100644 --- a/emission/analysis/classification/inference/labels/inferrers.py +++ b/emission/analysis/classification/inference/labels/inferrers.py @@ -145,7 +145,7 @@ def n_to_confidence_coeff(n, max_confidence=None, first_confidence=None, confide def 
predict_cluster_confidence_discounting(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None): # these can come from the application config as default values model_type = ModelType.GREEDY_SIMILARITY_BINNING - model_storage = ModelStorage.DATABASE + model_storage = ModelStorage.DOCUMENT_DATABASE labels, n = eamur.predict_labels_with_n(trip, model_type, model_storage) if n <= 0: # No model data or trip didn't match a cluster logging.debug(f"In predict_cluster_confidence_discounting: n={n}; returning as-is") diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index b6490793f..82d3513f8 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -1,5 +1,6 @@ from abc import ABCMeta, abstractmethod from typing import List +import logging import emission.core.wrapper.confirmedtrip as ecwc @@ -37,4 +38,5 @@ def similar(self, a: List[float], b: List[float], thresh: float) -> bool: """ similarity_values = self.similarity(a, b) is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) + logging.debug(f"is_similar: {is_similar} | similarity: {similarity_values}") return is_similar diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 4b93867f4..7ac3c7fad 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -12,6 +12,8 @@ class GreedySimilarityBinning(eamuu.TripModel): + is_incremental = False + def __init__( self, metric: eamss.SimilarityMetric, @@ -61,7 +63,8 @@ def __init__( } } where - - bin_id: int index of a bin containing similar trips + - bin_id: str index of a bin containing similar trips, as a string + (string type for bin_id comes from mongodb object key type 
requirements) - f_x: float feature value (an ordinate such as origin.x) - label_x: str OpenPATH user label category such as "mode_confirm" - value_x: str user-provided label for a category @@ -77,7 +80,7 @@ def __init__( self.metric = metric self.sim_thresh = sim_thresh self.apply_cutoff = apply_cutoff - self.bins: Dict[int, Dict] = {} + self.bins: Dict[str, Dict] = {} self.loaded = False def fit(self, trips: List[ecwc.Confirmedtrip]): @@ -96,7 +99,10 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): self._apply_cutoff() self._generate_predictions() self.loaded = True + binned_features = sum([len(b['features']) for b in self.bins.values() ]) logging.info(f"model fit to trip data") + logging.info(f'source data: {len(trips)} rows') + logging.info(f'stored model: {binned_features} entries') def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: if not self.loaded: @@ -114,12 +120,6 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: logging.debug(f"found cluster {predicted_bin} with labels {labels}") return labels, n_features - def is_incremental(self) -> bool: - """ - greedy similarity binning is not an incremental model - """ - return False - def to_dict(self) -> Dict: return self.bins @@ -149,12 +149,13 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): bin_record['labels'].append(trip_labels) else: # create new bin - new_bin_id = len(self.bins) + new_bin_id = str(len(self.bins)) new_bin_record = { "features": [trip_features], "labels": [trip_labels], "predictions": [] } + logging.debug(f"creating new bin {new_bin_id} at location {trip_features}") self.bins[new_bin_id] = new_bin_record def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: @@ -175,6 +176,8 @@ def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optiona for bin_id, bin_record in self.bins.items(): for bin_features in bin_record['features']: if self.metric.similar(trip_features, bin_features, 
self.sim_thresh): + logging.debug(f"found nearest bin id {bin_id}") + logging.debug(f"similar: {trip_features}, {bin_features}") selected_bin = bin_id selected_record = bin_record break diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py index 1404fbf75..b251fb3db 100644 --- a/emission/analysis/modelling/trip_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -1,5 +1,6 @@ from enum import Enum from typing import Dict, Optional +import logging import emission.analysis.modelling.trip_model.model_type as eamum import emission.core.wrapper.tripmodel as ecwu @@ -13,9 +14,9 @@ class ModelStorage(Enum): """ enumeration of model storage destinations. currently restricted to - DATABASE only. + DOCUMENT_DATABASE only. """ - DATABASE = 0 + DOCUMENT_DATABASE = 0 @classmethod def names(cls): return list(map(lambda e: e.name, list(cls))) @@ -29,8 +30,9 @@ def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage :param model_storage: storage format :return: the model representation as a Python Dict or None :raises: TypeError if loaded model has different type than expected type + KeyError if the ModelType is not known """ - if model_storage == ModelStorage.DATABASE: + if model_storage == ModelStorage.DOCUMENT_DATABASE: # retrieve stored model with timestamp that matches/exceeds the most # recent PipelineState.TRIP_MODEL entry @@ -42,14 +44,31 @@ def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage field='data.model_ts', sort_order=pymongo.DESCENDING ) - if latest_model_entry.model_type != model_type: + + if latest_model_entry is None: + logging.debug(f'no {model_type.name} model found for user {user_id}') + return None + + write_ts = latest_model_entry['metadata']['write_ts'] + logging.debug(f'retrieved latest trip model recorded at timestamp {write_ts}') + logging.debug(latest_model_entry) + + # parse str to enum for 
ModelType + latest_model_type_str = latest_model_entry.get('data', {}).get('model_type') + if latest_model_type_str is None: + raise TypeError('stored model does not have a model type') + latest_model_type = eamum.ModelType.from_str(latest_model_type_str) + + if latest_model_entry is None: + return None + elif latest_model_type != model_type: msg = ( - f"loading model for user {user_id} has model type {latest_model_entry.model_type} " - f"but was expected to have model type {model_type}" + f"loading model for user {user_id} has model type '{latest_model_type.name}' " + f"but was expected to have model type {model_type.name}" ) raise TypeError(msg) - model = latest_model_entry.model if latest_model_entry is not None else None - return model + else: + return latest_model_entry['data']['model'] else: storage_types_str = ",".join(ModelStorage.names()) @@ -64,7 +83,7 @@ def save_model( model_type: eamum.ModelType, model_data: Dict, model_timestamp: int, - model_storage: ModelStorage = ModelStorage.DATABASE): + model_storage: ModelStorage = ModelStorage.DOCUMENT_DATABASE): """saves a model to storage :param user_id: user associated with this model @@ -75,7 +94,7 @@ def save_model( :raises IOError: failure when writing to storage medium """ - if model_storage == ModelStorage.DATABASE: + if model_storage == ModelStorage.DOCUMENT_DATABASE: row = ecwu.Tripmodel() row.user_id = user_id @@ -92,14 +111,6 @@ def save_model( f"to the database" ) raise IOError(msg) from e - - try: - epq.mark_trip_model_done(user_id, model_timestamp) - except Exception as e: - msg = ( - f"failure updating user label pipeline state for user {user_id}" - ) - raise IOError(msg) from e else: storage_types_str = ",".join(ModelStorage.names()) diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py index 43aad7c1f..0c88f0fc7 100644 --- a/emission/analysis/modelling/trip_model/model_type.py +++ 
b/emission/analysis/modelling/trip_model/model_type.py @@ -1,9 +1,47 @@ +from __future__ import annotations from enum import Enum +import emission.analysis.modelling.trip_model.trip_model as eamuu +import emission.analysis.modelling.similarity.od_similarity as eamso +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug + + +SIMILARITY_THRESHOLD_METERS=500 class ModelType(Enum): GREEDY_SIMILARITY_BINNING = 'greedy' + @classmethod + def build(cls, model_type: ModelType) -> eamuu.TripModel: + """ + instantiates the requested user model type with the configured + parameters. + + hey YOU! if future model types are created, they should be added here! + + :param model_type: internally-used model name (an enum) + :return: a user label prediction model + :raises KeyError: if the requested model name does not exist + """ + # Dict[ModelType, TripModel] + # inject default values here at construction + MODELS = { + cls.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( + metric=eamso.OriginDestinationSimilarity(), + sim_thresh=SIMILARITY_THRESHOLD_METERS, + apply_cutoff=False + ) + } + model = MODELS.get(model_type) + if model is None: + if not isinstance(model_type, ModelType): + raise TypeError(f"provided model type {model_type} is not an instance of ModelType") + else: + model_names = list(lambda e: e.name, MODELS.keys()) + models = ",".join(model_names) + raise KeyError(f"user label model {model_type.name} not found in factory, must be one of {{{models}}}") + return model + @classmethod def names(cls): return list(map(lambda e: e.name, list(cls))) @@ -20,4 +58,24 @@ def model_name(self): :return: a simple name for this model type :rtype: str """ - return self.value \ No newline at end of file + return self.value + + @classmethod + def from_str(cls, str): + """attempts to match the provided string to a known ModelType + since a short name is 'nicer', we attempt to match on the enum + value first (for example, 'greedy'). 
as a fallback, we attempt + to match on the full ModelType name (for example, + 'GREEDY_SIMILARITY_BINNING'). + + :param str: a string name of a ModelType + """ + try: + return cls(str) + except ValueError: + try: + return cls[str] + except KeyError: + names_list = '{' + ','.join(cls.names) + '}' + msg = f'{str} is not a known ModelType, should be one of {names_list}' + raise KeyError(msg) \ No newline at end of file diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index 8726f0b6e..38e674db0 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -1,5 +1,5 @@ import logging -from typing import Optional +from typing import List, Optional import arrow import emission.analysis.modelling.similarity.od_similarity as eamso @@ -13,13 +13,11 @@ import emission.storage.timeseries.abstract_timeseries as esta import emission.storage.timeseries.timequery as estt -SIMILARITY_THRESHOLD_METERS = 500 # should come from app config - def update_trip_model( user_id, model_type: eamumt.ModelType, - model_storage: eamums.ModelStorage = eamums.ModelStorage.DATABASE, + model_storage: eamums.ModelStorage = eamums.ModelStorage.DOCUMENT_DATABASE, min_trips: int = 14): """ create/update a user label model for a user. 
@@ -34,35 +32,44 @@ def update_trip_model( :param model_storage: storage destination for built model (default DATABASE) :param min_trips: minimum number of labeled trips per user to apply prediction (default 14) """ - - # this timestamp is used for recording the state of the updated model - timestamp = arrow.now() - model = model_factory(model_type) - - # if a previous model exists, deserialize the stored model - model_data_prev = eamums.load_model(user_id, model_type, model_storage) - if model_data_prev is not None: - model.from_dict(model_data_prev) - logging.debug(f"loaded {model_type.name} user label model for user {user_id}") - else: - logging.debug(f"building first {model_type.name} user label model for user {user_id}") - - # get all relevant trips - time_query = epq.get_time_query_for_trip_model(user_id) if model.is_incremental else None - trips = _get_trips_for_user(user_id, time_query, min_trips) - - # train and store the model - model.fit(trips) - model_data_next = model.to_dict() - eamums.save_model(user_id, model_type, model_data_next, timestamp, model_storage) - - logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") + try: + # this timestamp is used for recording the state of the updated model + timestamp = arrow.now().timestamp + model = eamumt.ModelType.build(model_type) + + # if a previous model exists, deserialize the stored model + model_data_prev = eamums.load_model(user_id, model_type, model_storage) + if model_data_prev is not None: + model.from_dict(model_data_prev) + logging.debug(f"loaded {model_type.name} user label model for user {user_id}") + else: + logging.debug(f"building first {model_type.name} user label model for user {user_id}") + + logging.debug(f'model type {model_type.name} is incremental? 
{model.is_incremental}') + trips = _get_training_data(user_id, min_trips, model.is_incremental) + logging.debug(f'found {len(trips)} for trip model') + + # train and store the model + model.fit(trips) + model_data_next = model.to_dict() + + eamums.save_model(user_id, model_type, model_data_next, timestamp, model_storage) + logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") + + epq.mark_trip_model_done(user_id, timestamp) + + except Exception as e: + epq.mark_trip_model_failed(user_id) + msg = ( + f"failure updating user label pipeline state for user {user_id}" + ) + raise IOError(msg) from e def predict_labels_with_n( trip: ecwc.Confirmedtrip, model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING, - model_storage = eamums.ModelStorage.DATABASE): + model_storage = eamums.ModelStorage.DOCUMENT_DATABASE): """ invoke the user label prediction model to predict labels for a trip. @@ -72,7 +79,7 @@ def predict_labels_with_n( :return: a list of predictions """ user_id = trip['user_id'] - model = _load_user_label_model(user_id, model_type, model_storage) + model = _load_stored_trip_model(user_id, model_type, model_storage) if model is None: return [], -1 else: @@ -80,36 +87,7 @@ def predict_labels_with_n( return predictions, n -def model_factory(model_type: eamumt.ModelType) -> eamuu.TripModel: - """ - instantiates the requested user model type with the configured - parameters. - - hey YOU! if future model types are created, they should be added here! 
- - :param model_type: internally-used model name (an enum) - :return: a user label prediction model - :raises KeyError: if the requested model name does not exist - """ - MODELS = { - eamumt.ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( - metric=eamso.OriginDestinationSimilarity(), - sim_thresh=SIMILARITY_THRESHOLD_METERS, - apply_cutoff=False - ) - } - model = MODELS.get(model_type) - if model is None: - if not isinstance(model_type, eamumt.ModelType): - raise TypeError(f"provided model type {model_type} is not an instance of ModelType") - else: - model_names = list(lambda e: e.name, MODELS.keys()) - models = ",".join(model_names) - raise KeyError(f"user label model {model_type.name} not found in factory, must be one of {{{models}}}") - return model - - -def _get_trips_for_user(user_id, time_query: Optional[estt.TimeQuery], min_trips: int): +def _get_training_data(user_id, min_trips: int, incremental: bool): """ load the labeled trip data for this user, subject to a time query. if the user does not have at least $min_trips trips with labels, then return an empty list. 
@@ -117,8 +95,15 @@ def _get_trips_for_user(user_id, time_query: Optional[estt.TimeQuery], min_trips :param user_id: user to collect trips from :param time_query: query to restrict the time (optional) :param min_trips: minimum number of labeled trips required to train + :param incremental: if true, only collect trips which have arrived since the + last time this model was trained, otherwise, collect all + historical data for this user """ + time_query = epq.get_time_query_for_trip_model(user_id) if incremental else None + logging.debug(f'time query for training data collection: {time_query}') + trips = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=user_id, time_query=time_query) + print(f'found {len(trips)} training rows') labeled_trips = [trip for trip in trips if trip['data']['user_input'] != {}] if not len(labeled_trips) >= min_trips: msg = ( @@ -127,10 +112,12 @@ def _get_trips_for_user(user_id, time_query: Optional[estt.TimeQuery], min_trips ) logging.debug(msg) return [] + + logging.debug(f'found {len(labeled_trips)} labeled trips for user {user_id}') return labeled_trips -def _load_user_label_model( +def _load_stored_trip_model( user_id, model_type: eamumt.ModelType, model_storage: eamums.ModelStorage) -> Optional[eamuu.TripModel]: @@ -146,7 +133,7 @@ def _load_user_label_model( if model_dict is None: return None else: - model = model_factory(model_type) + model = eamumt.ModelType.build(model_type) model.from_dict(model_dict) return model diff --git a/emission/core/common.py b/emission/core/common.py index a5d7c777b..4d97ce681 100644 --- a/emission/core/common.py +++ b/emission/core/common.py @@ -40,8 +40,14 @@ def travel_date_time(time1,time2): return travel_time.seconds def calDistance(point1, point2, coordinates=False): + """haversine distance - earthRadius = 6371000 + :param point1: a coordinate in degrees WGS84 + :param point2: another coordinate in degrees WGS84 + :param coordinates: if false, expect a list of coordinates, defaults to False + 
:return: distance approximately in meters + """ + earthRadius = 6371000 # meters # SHANKARI: Why do we have two calDistance() functions? # Need to combine into one # points are now in geojson format (lng,lat) diff --git a/emission/storage/pipeline_queries.py b/emission/storage/pipeline_queries.py index 29fc665c8..b8a326d3c 100644 --- a/emission/storage/pipeline_queries.py +++ b/emission/storage/pipeline_queries.py @@ -121,13 +121,20 @@ def mark_mode_inference_complete(user_id): def mark_mode_inference_failed(user_id): mark_stage_failed(user_id, ps.PipelineStages.MODE_INFERENCE) -def get_time_query_for_trip_model(user_id): # TODO: here +def get_time_query_for_trip_model(user_id): tq = get_time_range_for_stage(user_id, ps.PipelineStages.TRIP_MODEL) - tq.timeType = 'data.model_ts' - return tq + if tq.startTs is None: + return None + else: + tq.timeType = 'data.model_ts' + return tq def mark_trip_model_done(user_id, last_ts=None): - mark_stage_done(user_id, ps.PipelineStages.TRIP_MODEL, last_ts) + last_processed_ts = last_ts + END_FUZZ_AVOID_LTE if last_ts is not None else None + mark_stage_done(user_id, ps.PipelineStages.TRIP_MODEL, last_processed_ts) + +def mark_trip_model_failed(user_id): + mark_stage_failed(user_id, ps.PipelineStages.TRIP_MODEL) def get_time_range_for_confirmed_object_creation(user_id): tq = get_time_range_for_stage(user_id, ps.PipelineStages.CREATE_CONFIRMED_OBJECTS) diff --git a/emission/storage/timeseries/builtin_timeseries.py b/emission/storage/timeseries/builtin_timeseries.py index 6a4cc164b..9a6f756c3 100644 --- a/emission/storage/timeseries/builtin_timeseries.py +++ b/emission/storage/timeseries/builtin_timeseries.py @@ -308,13 +308,15 @@ def get_first_entry(self, key, field, sort_order, time_query=None): :param time_query: the time range in which to search the stream :return: a database row, or None if no match is found """ - result_it = self.get_timeseries_db(key).find(self._get_query([key], time_query), - {"_id": False, field: 
True}).sort(field, sort_order).limit(1) + find_query = self._get_query([key], time_query) + result_it = self.get_timeseries_db(key).find(find_query).sort(field, sort_order).limit(1) result_list = list(result_it) if len(result_list) == 0: return None else: - return result_list[0] + first_entry = result_list[0] + del first_entry['_id'] + return first_entry def get_first_value_for_field(self, key, field, sort_order, time_query=None): diff --git a/emission/storage/timeseries/timequery.py b/emission/storage/timeseries/timequery.py index c3378417b..950e0adb0 100644 --- a/emission/storage/timeseries/timequery.py +++ b/emission/storage/timeseries/timequery.py @@ -22,3 +22,5 @@ def get_query(self): ret_query[time_key].update({"$gte": self.startTs}) return ret_query + def __repr__(self): + return f"TimeQuery {self.timeType} with range [{self.startTs}, {self.endTs})" diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 2e8ba0d90..22de885d3 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -13,6 +13,9 @@ def setUp(self) -> None: level=logging.DEBUG) def testBinning(self): + """ + when $should_be_grouped trips are the same, they should appear in a bin + """ label_data = { "mode_labels": ['walk', 'bike', 'transit'], "purpose_labels": ['work', 'home', 'school'], @@ -38,11 +41,14 @@ def testBinning(self): model.fit(trips) - # 5 trip features should appear together in one bin + # $should_be_grouped trip features should appear together in one bin at_least_one_large_bin = any(map(lambda b: len(b['features']) >= should_be_grouped, model.bins.values())) self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") def testPrediction(self): + """ + training and testing with similar trips should lead to a positive bin match + """ label_data = { "mode_labels": 
['skipping'], "purpose_labels": ['pizza_party'], @@ -72,3 +78,43 @@ def testPrediction(self): self.assertEqual(len(results), 1, "should have found a matching bin") self.assertEqual(n, len(train), "that bin should have had the whole train set in it") + + def testNoPrediction(self): + """ + when trained on trips in Colorado, shouldn't have a prediction for a trip in Alaska + """ + label_data = { + "mode_labels": ['skipping'], + "purpose_labels": ['pizza_party'], + "replaced_mode_labels": ['crabwalking'] + } + + n = 5 + train = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(39.7645187, -104.9951944), # Denver, CO + destination=(39.7435206, -105.2369292), # Golden, CO + label_data=label_data, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + test = etmm.generate_mock_trips( + user_id="joe", + trips=1, + origin=(61.1042262, -150.5611644), # Anchorage, AK + destination=(62.2721466, -150.3233046), # Talkeetna, AK + label_data=label_data, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + + model = eamtg.GreedySimilarityBinning( + metric=eamso.OriginDestinationSimilarity(), + sim_thresh=500, # meters, + apply_cutoff=False # currently unused + ) + + model.fit(train) + results, n = model.predict(test[0]) + + self.assertEqual(len(results), 0, "should have found a matching bin") + self.assertEqual(n, -1, "that bin should have had the whole train set in it") diff --git a/emission/tests/modellingTests/TestRunModel.py b/emission/tests/modellingTests/TestRunModel.py index f54ffd196..c7ba8cd8e 100644 --- a/emission/tests/modellingTests/TestRunModel.py +++ b/emission/tests/modellingTests/TestRunModel.py @@ -1,60 +1,142 @@ import unittest - +import logging import emission.analysis.modelling.trip_model.model_storage as eamums import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.run_model as eamur import emission.storage.timeseries.abstract_timeseries as esta +import 
emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.decorations.analysis_timeseries_queries as esda +import uuid - -class TestSimilarityAux(unittest.TestCase): +class TestRunModel(unittest.TestCase): """these tests were copied forward during a refactor of the tour model [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] it's uncertain what condition they are in besides having been refactored to use the more recent tour modeling code. """ - def setUp(self): - self.all_users = esta.TimeSeries.get_uuid_list() - if len(self.all_users) == 0: - self.fail('test invariant failed: no users found') - def testTrip1(self): - - # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round - user_id = self.all_users[0] - eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - filter_trips = eamur._get_trips_for_user(user_id, None, 0) - new_trip = filter_trips[4] - # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, - # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', - # 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] - pl, _ = eamur.predict_labels_with_n(new_trip) - assert len(pl) > 0, f"Invalid prediction {pl}" - - def testTrip2(self): - - # case 2: no existing files for the user who has the new trip: - # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) - # 2. 
the user doesn't have common trips - user_id = self.all_users[1] - eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - filter_trips = eamur._get_trips_for_user(user_id, None, 0) - new_trip = filter_trips[0] - # result is [] - pl, _ = eamur.predict_labels_with_n(new_trip) - assert len(pl) == 0, f"Invalid prediction {pl}" - - def testTrip3(self): - - # case3: the new trip is novel trip(doesn't fall in any 1st round bins) - user_id = self.all_users[0] - eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - filter_trips = eamur._get_trips_for_user(user_id, None, 0) - new_trip = filter_trips[0] - # result is [] - pl = eamur.predict_labels_with_n(new_trip) - assert len(pl) == 0, f"Invalid prediction {pl}" + def setUp(self) -> None: + """ + sets up the end-to-end run model test with Confirmedtrip data + """ + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + # configuration for randomly-generated test data + self.user_id = user_id = 'TestRunModel-TestData' + self.origin = (-105.1705977, 39.7402654,) + self.destination = (-105.1755606, 39.7673075) + self.min_trips = 14 + total_trips = 100 + clustered_trips = 33 # bins must have at least self.min_trips similar trips by default + has_label_percent = 0.9 # let's make a few that don't have a label, but invariant + # $clustered_trips * $has_label_percent > self.min_trips + # must be correct or else this test could fail under some random test cases. 
+ + ts = esta.TimeSeries.get_time_series(user_id) + test_data = list(ts.find_entries(["analysis/confirmed_trip"])) + if len(test_data) == 0: + # generate test data for the database + + logging.debug(f"inserting mock Confirmedtrips into database") + + + train = etmm.generate_mock_trips( + user_id=user_id, + trips=total_trips, + origin=self.origin, + destination=self.destination, + label_data={ + "mode_labels": ['ebike', 'bike'], + "purpose_labels": ['happy-hour', 'dog-park'], + "replaced_mode_labels": ['walk'] + }, + within_threshold=clustered_trips, + has_label_p=has_label_percent + # + ) + + ts.bulk_insert(train) + + # confirm data write did not fail + test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) + if len(test_data) != total_trips: + logging.debug(f'test invariant failed after generating test data') + self.fail() + else: + logging.debug(f'found {total_trips} trips in database') + + def testRoundTrip(self): + """ + train a model, save it, load it, and use it for prediction + """ + + logging.debug(f'(TRAIN) creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips + ) + + logging.debug(f'(TEST) testing prediction of stored model') + test = etmm.build_mock_trip( + user_id=self.user_id, + origin=self.origin, + destination=self.destination + ) + prediction, n = eamur.predict_labels_with_n(test) + logging.debug(prediction) + + self.assertNotEqual(len(prediction), 0, "should have a prediction") + + + + + + # def setUp(self): + # self.all_users = esta.TimeSeries.get_uuid_list() + # if len(self.all_users) == 0: + # self.fail('test invariant failed: no users found') + + # def testTrip1(self): + + # # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round + # user_id = self.all_users[0] + # 
eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + # filter_trips = eamur._get_trips_for_user(user_id, None, 0) + # new_trip = filter_trips[4] + # # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, + # # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', + # # 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] + # pl, _ = eamur.predict_labels_with_n(new_trip) + # assert len(pl) > 0, f"Invalid prediction {pl}" + + # def testTrip2(self): + + # # case 2: no existing files for the user who has the new trip: + # # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) + # # 2. the user doesn't have common trips + # user_id = self.all_users[1] + # eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + # filter_trips = eamur._get_trips_for_user(user_id, None, 0) + # new_trip = filter_trips[0] + # # result is [] + # pl, _ = eamur.predict_labels_with_n(new_trip) + # assert len(pl) == 0, f"Invalid prediction {pl}" + + # def testTrip3(self): + + # # case3: the new trip is novel trip(doesn't fall in any 1st round bins) + # user_id = self.all_users[0] + # eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) + # filter_trips = eamur._get_trips_for_user(user_id, None, 0) + # new_trip = filter_trips[0] + # # result is [] + # pl = eamur.predict_labels_with_n(new_trip) + # assert len(pl) == 0, f"Invalid prediction {pl}" # case 4: the new trip falls in a 1st round bin, but predict to be a new cluster in the 2nd round # result is [] diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 1353e40fe..f630bf66a 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ 
b/emission/tests/modellingTests/modellingTestAssets.py @@ -64,27 +64,29 @@ def sample_trip_labels( return user_input -def build_mock_trip(user_id, origin, destination, labels) -> ecwc.Confirmedtrip: +def build_mock_trip(user_id, origin, destination, labels = {}) -> ecwc.Confirmedtrip: """repackages mock data as a Confirmedtrip Entry type :param user_id: the user id UUID :param origin: trip origin coordinates :param destination: trip destination coordinates - :param labels: user labels for the trip + :param labels: user labels for the trip, optional, default none :return: a Confirmedtrip entry """ key = "analysis/confirmed_trip" data = { "start_loc": { + "type": "Point", "coordinates": origin }, "end_loc": { + "type": "Point", "coordinates": destination }, "user_input": labels } - return ecwe.Entry.create_fake_entry(user_id, key, data, write_ts=arrow.now()) + return ecwe.Entry.create_fake_entry(user_id, key, data, write_ts=arrow.now().timestamp) def generate_mock_trips( From 1320e96096cee80e15b925a3c25868ac8ae5b6cf Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 29 Jun 2022 14:34:07 -0600 Subject: [PATCH 24/46] integrate config file --- bin/build_label_model.py | 10 +- conf/analysis/trip_model.conf.json.sample | 13 ++ .../inference/labels/inferrers.py | 9 +- .../modelling/similarity/similarity_metric.py | 1 - .../similarity/similarity_metric_type.py | 49 ++++++ .../modelling/tour_model/label_processing.py | 16 +- .../analysis/modelling/trip_model/config.py | 79 ++++++++++ .../trip_model/greedy_similarity_binning.py | 63 ++++---- .../modelling/trip_model/model_storage.py | 15 ++ .../modelling/trip_model/model_type.py | 32 ++-- .../modelling/trip_model/run_model.py | 24 ++- .../modellingTests/TestRunGreedyModel.py | 135 +++++++++++++++++ emission/tests/modellingTests/TestRunModel.py | 143 ------------------ 13 files changed, 374 insertions(+), 215 deletions(-) create mode 100644 conf/analysis/trip_model.conf.json.sample create mode 100644 
emission/analysis/modelling/similarity/similarity_metric_type.py create mode 100644 emission/analysis/modelling/trip_model/config.py create mode 100644 emission/tests/modellingTests/TestRunGreedyModel.py delete mode 100644 emission/tests/modellingTests/TestRunModel.py diff --git a/bin/build_label_model.py b/bin/build_label_model.py index ed3c4a1d1..aa2623281 100644 --- a/bin/build_label_model.py +++ b/bin/build_label_model.py @@ -5,14 +5,13 @@ import argparse import uuid -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.analysis.modelling.trip_model.model_type as eamumt import emission.pipeline.reset as epr import emission.core.get_database as edb import emission.core.wrapper.user as ecwu import emission.storage.timeseries.abstract_timeseries as esta import emission.analysis.modelling.trip_model.run_model as eamur +import emission.analysis.modelling.trip_model.config as eamtc def _get_user_list(args): if args.all: @@ -66,7 +65,8 @@ def _email_2_user_list(email_list): for user_id in user_list: logging.info("building model for user %s" % user_id) # these can come from the application config as default values - model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING - model_storage = eamums.ModelStorage.DOCUMENT_DATABASE - min_trips = 14 + + model_type = eamtc.get_model_type() + model_storage = eamtc.get_model_storage() + min_trips = eamtc.get_minimum_trips() eamur.update_trip_model(user_id, model_type, model_storage, min_trips) diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample new file mode 100644 index 000000000..845e67a6a --- /dev/null +++ b/conf/analysis/trip_model.conf.json.sample @@ -0,0 +1,13 @@ +{ + "model_type": "greedy", + "model_storage": "document_database", + "minimum_trips": 14, + "model_parameters": { + "greedy": { + "metric": "od_similarity", + "similarity_threshold_meters": 500, + "apply_cutoff": false, + "incremental_evaluation": false + } + } +} \ No newline at 
end of file diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py index 7b784c38e..c6b939671 100644 --- a/emission/analysis/classification/inference/labels/inferrers.py +++ b/emission/analysis/classification/inference/labels/inferrers.py @@ -6,9 +6,8 @@ import copy import emission.analysis.modelling.tour_model_first_only.load_predict as lp -from emission.analysis.modelling.trip_model.model_storage import ModelStorage import emission.analysis.modelling.trip_model.run_model as eamur -from emission.analysis.modelling.trip_model.model_type import ModelType +import emission.analysis.modelling.trip_model.config as eamtc # A set of placeholder predictors to allow pipeline development without a real inference algorithm. # For the moment, the system is configured to work with two labels, "mode_confirm" and @@ -143,9 +142,9 @@ def n_to_confidence_coeff(n, max_confidence=None, first_confidence=None, confide # predict_two_stage_bin_cluster but with the above reduction in confidence def predict_cluster_confidence_discounting(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None): - # these can come from the application config as default values - model_type = ModelType.GREEDY_SIMILARITY_BINNING - model_storage = ModelStorage.DOCUMENT_DATABASE + # load application config + model_type = eamtc.get_model_type() + model_storage = eamtc.get_model_storage() labels, n = eamur.predict_labels_with_n(trip, model_type, model_storage) if n <= 0: # No model data or trip didn't match a cluster logging.debug(f"In predict_cluster_confidence_discounting: n={n}; returning as-is") diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 82d3513f8..6be00216f 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -38,5 +38,4 
@@ def similar(self, a: List[float], b: List[float], thresh: float) -> bool: """ similarity_values = self.similarity(a, b) is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) - logging.debug(f"is_similar: {is_similar} | similarity: {similarity_values}") return is_similar diff --git a/emission/analysis/modelling/similarity/similarity_metric_type.py b/emission/analysis/modelling/similarity/similarity_metric_type.py new file mode 100644 index 000000000..6f3f4c776 --- /dev/null +++ b/emission/analysis/modelling/similarity/similarity_metric_type.py @@ -0,0 +1,49 @@ +from __future__ import annotations +import enum + + +import emission.analysis.modelling.similarity.od_similarity as eamso +import emission.analysis.modelling.similarity.similarity_metric as eamss + +class SimilarityMetricType(enum.Enum): + OD_SIMILARITY = 0 + + def build(self) -> eamss.SimilarityMetric: + """ + + hey YOU! add future similarity metric types here please! + + :raises KeyError: if the SimilarityMetricType isn't found in the below dictionary + :return: the associated similarity metric + """ + metrics = { + SimilarityMetricType.OD_SIMILARITY: eamso.OriginDestinationSimilarity() + } + + metric = metrics.get(self) + if metric is None: + names = "{" + ",".join(SimilarityMetricType.names) + "}" + msg = f"unknown metric type {metric}, must be one of {names}" + raise KeyError(msg) + else: + return metric + + + @classmethod + def names(cls): + return list(map(lambda e: e.name, list(cls))) + + @classmethod + def from_str(cls, str): + """attempts to match the provided string to a known SimilarityMetricType. + not case sensitive. 
+ + :param str: a string name of a SimilarityMetricType + """ + try: + str_caps = str.upper() + return cls[str_caps] + except KeyError: + names = "{" + ",".join(cls.names) + "}" + msg = f"{str} is not a known SimilarityMetricType, must be one of {names}" + raise KeyError(msg) \ No newline at end of file diff --git a/emission/analysis/modelling/tour_model/label_processing.py b/emission/analysis/modelling/tour_model/label_processing.py index e69707305..1384b6ebb 100644 --- a/emission/analysis/modelling/tour_model/label_processing.py +++ b/emission/analysis/modelling/tour_model/label_processing.py @@ -34,14 +34,14 @@ def map_labels_mode(user_input_df): # convert mode if "replaced_mode" in user_input_df.columns: same_mode_df = user_input_df[user_input_df.replaced_mode == "same_mode"] - logging.debug("The following rows will be changed %s" % - same_mode_df.index) - for a in range(len(user_input_df)): - if user_input_df.iloc[a]["replaced_mode"] == "same_mode": - # to see which row will be converted - # logging.debug("The following rows will be changed: %s", user_input_df.iloc[a]) - user_input_df.iloc[a]["replaced_mode"] = user_input_df.iloc[a]['mode_confirm'] - logging.debug("Finished changing all rows") + if len(same_mode_df) > 0: + logging.debug("The following rows will be changed %s" % same_mode_df.index) + for a in range(len(user_input_df)): + if user_input_df.iloc[a]["replaced_mode"] == "same_mode": + # to see which row will be converted + # logging.debug("The following rows will be changed: %s", user_input_df.iloc[a]) + user_input_df.iloc[a]["replaced_mode"] = user_input_df.iloc[a]['mode_confirm'] + logging.debug("Finished changing all rows") else: logging.info("map_labels_mode: no replaced mode column found, early return") return user_input_df diff --git a/emission/analysis/modelling/trip_model/config.py b/emission/analysis/modelling/trip_model/config.py new file mode 100644 index 000000000..76b3c6e6d --- /dev/null +++ 
b/emission/analysis/modelling/trip_model/config.py @@ -0,0 +1,79 @@ +import json +import re +from this import d +from typing import Optional +import logging +from numpy import isin + +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt + +config_filename = "" + +def load_config(): + global config_filename + try: + config_filename = 'conf/analysis/trip_model.conf.json' + config_file = open(config_filename) + except: + print("analysis.trip_model.conf.json not configured, falling back to sample, default configuration") + config_filename = 'conf/analysis/trip_model.conf.json.sample' + config_file = open('conf/analysis/trip_model.conf.json.sample') + ret_val = json.load(config_file) + config_file.close() + return ret_val + +config_data = load_config() + +def reload_config(): + global config_data + config_data = load_config() + +def get_config(): + return config_data + +def get_optional_config_value(key) -> Optional[str]: + """ + get a config value at the provided path/key + + :param key: a key name or a dot-delimited path to some key within the config object + :return: the value at the key, or, None if not found + """ + cursor = config_data + path = key.split(".") + for k in path: + cursor = cursor.get(k) + if cursor is None: + return None + return cursor + +def get_config_value_or_raise(key): + logging.debug(f'getting key {key} in config') + value = get_optional_config_value(key) + if value is None: + logging.debug('config object:') + logging.debug(json.dumps(config_data, indent=2)) + msg = f"expected config key {key} not found in config file {config_filename}" + raise KeyError(msg) + else: + return value + +def get_model_type(): + model_type_str = get_config_value_or_raise('model_type') + model_type = eamumt.ModelType.from_str(model_type_str) + return model_type + +def get_model_storage(): + model_storage_str = get_config_value_or_raise('model_storage') + model_storage = 
eamums.ModelStorage.from_str(model_storage_str) + return model_storage + +def get_minimum_trips(): + minimum_trips = get_config_value_or_raise('minimum_trips') + if not isinstance(minimum_trips, int): + msg = f"config key 'minimum_trips' not an integer in config file {config_filename}" + raise TypeError(msg) + return minimum_trips + + + diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 7ac3c7fad..43206cd11 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -2,24 +2,21 @@ from tokenize import group from typing import Dict, List, Optional, Tuple +import emission.analysis.modelling.similarity.similarity_metric_type as eamssmt import emission.analysis.modelling.similarity.similarity_metric as eamss import emission.analysis.modelling.tour_model.label_processing as lp import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.util as util +import emission.analysis.modelling.trip_model.config as eamtc import emission.core.wrapper.confirmedtrip as ecwc import pandas as pd class GreedySimilarityBinning(eamuu.TripModel): - is_incremental = False + is_incremental: bool = False # overwritten during __init__ - def __init__( - self, - metric: eamss.SimilarityMetric, - sim_thresh: float, - apply_cutoff: bool = False, - ) -> None: + def __init__(self, config=None): """ instantiate a clustering model for a user. 
@@ -70,18 +67,35 @@ def __init__( - value_x: str user-provided label for a category - p_val: float probability of a prediction, real number in [0, 1] - :param dir: the model load/save directory - :param user_id: identity (UUID) of the e-mission user - :param metric: type of similarity metric to use - :param sim_thresh: max distance threshold for similarity (assumed meters) - :param apply_cutoff: ignore clusters which are small, based on a "knee point" heuristic (default False) + :param config: if provided, a manual configuration for testing purposes. these + values should be provided by the config file when running OpenPATH. + see config.py for more details. """ - super().__init__() - self.metric = metric - self.sim_thresh = sim_thresh - self.apply_cutoff = apply_cutoff + + if config is None: + config = eamtc.get_config_value_or_raise('model_parameters.greedy') + logging.debug(f'GreedySimilarityBinning loaded model config from file') + else: + logging.debug(f'GreedySimilarityBinning using model config argument') + + expected_keys = [ + 'metric', + 'similarity_threshold_meters', + 'apply_cutoff', + 'incremental_evaluation' + ] + for k in expected_keys: + if config.get(k) is None: + msg = f"greedy trip model config missing expected key {k}" + raise KeyError(msg) + + self.metric = eamssmt.SimilarityMetricType.from_str(config['metric']).build() + self.sim_thresh = config['similarity_threshold_meters'] + self.apply_cutoff = config['apply_cutoff'] + self.is_incremental = config['incremental_evaluation'] + self.bins: Dict[str, Dict] = {} - self.loaded = False + def fit(self, trips: List[ecwc.Confirmedtrip]): """train the model by passing data, where each row in the data @@ -98,16 +112,11 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): if len(self.bins) > 1 and self.apply_cutoff: self._apply_cutoff() self._generate_predictions() - self.loaded = True + binned_features = sum([len(b['features']) for b in self.bins.values() ]) - logging.info(f"model fit to trip data") - 
logging.info(f'source data: {len(trips)} rows') - logging.info(f'stored model: {binned_features} entries') + logging.info(f"greedy binning model fit to {len(trips)} rows of trip data") def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: - if not self.loaded: - msg = f"predict called on unloaded model" - raise IOError(msg) logging.debug(f"running greedy similarity clustering") predicted_bin, bin_record = self._nearest_bin(trip) @@ -115,10 +124,10 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: logging.debug(f"unable to predict bin for trip {trip}") return [], -1 else: - labels = bin_record['predictions'] + predictions = bin_record['predictions'] n_features = len(bin_record['features']) - logging.debug(f"found cluster {predicted_bin} with labels {labels}") - return labels, n_features + logging.debug(f"found cluster {predicted_bin} with predictions {predictions}") + return predictions, n_features def to_dict(self) -> Dict: return self.bins diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py index b251fb3db..26b2d4cc3 100644 --- a/emission/analysis/modelling/trip_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -21,6 +21,21 @@ class ModelStorage(Enum): def names(cls): return list(map(lambda e: e.name, list(cls))) + @classmethod + def from_str(cls, str): + """ + attempts to match the provided string to a known ModelStorage type. + not case sensitive. 
+ + :param str: a string name of a ModelType + """ + try: + str_caps = str.upper() + return cls[str_caps] + except KeyError: + names = "{" + ",".join(cls.names) + "}" + msg = f"{str} is not a known ModelStorage, must be one of {names}" + raise KeyError(msg) def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]: """load a user label model from a model storage location diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py index 0c88f0fc7..b5e761fb0 100644 --- a/emission/analysis/modelling/trip_model/model_type.py +++ b/emission/analysis/modelling/trip_model/model_type.py @@ -9,10 +9,10 @@ class ModelType(Enum): - GREEDY_SIMILARITY_BINNING = 'greedy' + # ENUM_NAME_CAPS = 'SHORTHAND_NAME_CAPS' + GREEDY_SIMILARITY_BINNING = 'GREEDY' - @classmethod - def build(cls, model_type: ModelType) -> eamuu.TripModel: + def build(self, config=None) -> eamuu.TripModel: """ instantiates the requested user model type with the configured parameters. 
@@ -24,22 +24,15 @@ def build(cls, model_type: ModelType) -> eamuu.TripModel: :raises KeyError: if the requested model name does not exist """ # Dict[ModelType, TripModel] - # inject default values here at construction MODELS = { - cls.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning( - metric=eamso.OriginDestinationSimilarity(), - sim_thresh=SIMILARITY_THRESHOLD_METERS, - apply_cutoff=False - ) + ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config) } - model = MODELS.get(model_type) + model = MODELS.get(self) if model is None: - if not isinstance(model_type, ModelType): - raise TypeError(f"provided model type {model_type} is not an instance of ModelType") - else: - model_names = list(lambda e: e.name, MODELS.keys()) - models = ",".join(model_names) - raise KeyError(f"user label model {model_type.name} not found in factory, must be one of {{{models}}}") + model_names = list(lambda e: e.name, MODELS.keys()) + models = ",".join(model_names) + raise KeyError(f"ModelType {self.name} not found in factory, please add to build method") + return model @classmethod @@ -66,15 +59,16 @@ def from_str(cls, str): since a short name is 'nicer', we attempt to match on the enum value first (for example, 'greedy'). as a fallback, we attempt to match on the full ModelType name (for example, - 'GREEDY_SIMILARITY_BINNING'). + 'GREEDY_SIMILARITY_BINNING'). not case sensitive. 
:param str: a string name of a ModelType """ try: - return cls(str) + str_caps = str.upper() + return cls(str_caps) except ValueError: try: - return cls[str] + return cls[str_caps] except KeyError: names_list = '{' + ','.join(cls.names) + '}' msg = f'{str} is not a known ModelType, should be one of {names_list}' diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index 38e674db0..485294a5e 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -18,7 +18,9 @@ def update_trip_model( user_id, model_type: eamumt.ModelType, model_storage: eamums.ModelStorage = eamums.ModelStorage.DOCUMENT_DATABASE, - min_trips: int = 14): + min_trips: int = 14, + model_config = None + ): """ create/update a user label model for a user. @@ -31,11 +33,12 @@ def update_trip_model( there is a mismatch, an exception is thrown :param model_storage: storage destination for built model (default DATABASE) :param min_trips: minimum number of labeled trips per user to apply prediction (default 14) + :param model_config: optional configuration for model, for debugging purposes """ try: # this timestamp is used for recording the state of the updated model timestamp = arrow.now().timestamp - model = eamumt.ModelType.build(model_type) + model = model_type.build(model_config) # if a previous model exists, deserialize the stored model model_data_prev = eamums.load_model(user_id, model_type, model_storage) @@ -69,17 +72,19 @@ def update_trip_model( def predict_labels_with_n( trip: ecwc.Confirmedtrip, model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING, - model_storage = eamums.ModelStorage.DOCUMENT_DATABASE): + model_storage = eamums.ModelStorage.DOCUMENT_DATABASE, + model_config = None): """ invoke the user label prediction model to predict labels for a trip. 
:param trip: the trip to predict labels for :param model_type: type of prediction model to run :param model_storage: location to read/write models + :param model_config: optional configuration for model, for debugging purposes :return: a list of predictions """ user_id = trip['user_id'] - model = _load_stored_trip_model(user_id, model_type, model_storage) + model = _load_stored_trip_model(user_id, model_type, model_storage, model_config) if model is None: return [], -1 else: @@ -99,7 +104,10 @@ def _get_training_data(user_id, min_trips: int, incremental: bool): last time this model was trained, otherwise, collect all historical data for this user """ - time_query = epq.get_time_query_for_trip_model(user_id) if incremental else None + # must call this regardless of whether model is incremental or not as it has + # the side effect of marking the start state of the pipeline execution + time_query_from_pipeline = epq.get_time_query_for_trip_model(user_id) + time_query = time_query_from_pipeline if incremental else None logging.debug(f'time query for training data collection: {time_query}') trips = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=user_id, time_query=time_query) @@ -120,20 +128,22 @@ def _get_training_data(user_id, min_trips: int, incremental: bool): def _load_stored_trip_model( user_id, model_type: eamumt.ModelType, - model_storage: eamums.ModelStorage) -> Optional[eamuu.TripModel]: + model_storage: eamums.ModelStorage, + model_config = None) -> Optional[eamuu.TripModel]: """helper to build a user label prediction model class with the contents of a stored model for some user. 
:param user_id: user to retrieve the model for :param model_type: TripModel type configured for this OpenPATH server :param model_storage: storage type + :param model_config: optional configuration for model, for debugging purposes :return: model, or None if no model is stored for this user """ model_dict = eamums.load_model(user_id, model_type, model_storage) if model_dict is None: return None else: - model = eamumt.ModelType.build(model_type) + model = model_type.build(model_config) model.from_dict(model_dict) return model diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py new file mode 100644 index 000000000..f3709475c --- /dev/null +++ b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -0,0 +1,135 @@ +import unittest +import logging + +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur +import emission.storage.timeseries.abstract_timeseries as esta +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.analysis.modelling.trip_model.config as eamtc + + +class TestRunGreedyModel(unittest.TestCase): + """these tests were copied forward during a refactor of the tour model + [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] + + it's uncertain what condition they are in besides having been refactored to + use the more recent tour modeling code. 
+ """ + + def setUp(self): + """ + sets up the end-to-end run model test with Confirmedtrip data + """ + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + # configuration for randomly-generated test data + self.user_id = user_id = 'TestRunGreedyModel-TestData' + self.origin = (-105.1705977, 39.7402654,) + self.destination = (-105.1755606, 39.7673075) + self.min_trips = 14 + self.total_trips = 100 + self.clustered_trips = 33 # bins must have at least self.min_trips similar trips by default + self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant + # $clustered_trips * $has_label_percent > self.min_trips + # must be correct or else this test could fail under some random test cases. + + # test data can be saved between test invocations, check if data exists before generating + ts = esta.TimeSeries.get_time_series(user_id) + test_data = list(ts.find_entries(["analysis/confirmed_trip"])) + if len(test_data) == 0: + # generate test data for the database + logging.debug(f"inserting mock Confirmedtrips into database") + + # generate labels with a known sample weight that we can rely on in the test + label_data = { + "mode_labels": ['ebike', 'bike'], + "purpose_labels": ['happy-hour', 'dog-park'], + "replaced_mode_labels": ['walk'], + "mode_weights": [0.9, 0.1], + "purpose_weights": [0.1, 0.9] + } + + train = etmm.generate_mock_trips( + user_id=user_id, + trips=self.total_trips, + origin=self.origin, + destination=self.destination, + label_data=label_data, + within_threshold=self.clustered_trips, + threshold=0.004, # ~400m + has_label_p=self.has_label_percent + ) + + ts.bulk_insert(train) + + # confirm data write did not fail + test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) + if len(test_data) != self.total_trips: + logging.debug(f'test invariant failed after generating test data') + self.fail() + else: + logging.debug(f'found {self.total_trips} trips 
in database') + + def tearDown(self): + """ + delete entries for user self.user_id in the database, not + yet implemented in database operations, so these test entries will + have to stick around for now. + """ + pass + + def testBuildGreedyModelFromConfig(self): + """ + greedy model takes config arguments via the constructor for testing + purposes but will load from a file in /conf/analysis/ which is tested here + """ + + eamumt.ModelType.GREEDY_SIMILARITY_BINNING.build() + # success if it didn't throw + + def test1RoundTripGreedySimilarityBinning(self): + """ + train a model, save it, load it, and use it for prediction, using + the high-level training/testing API provided via + run_model.py:update_trip_model() # train + run_model.py:predict_labels_with_n() # test + + for clustering, use the default greedy similarity binning model + """ + + # pass along debug model configuration + greedy_model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, + "apply_cutoff": False, + "incremental_evaluation": False + } + + logging.debug(f'(TRAIN) creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=greedy_model_config + ) + + logging.debug(f'(TEST) testing prediction of stored model') + test = etmm.build_mock_trip( + user_id=self.user_id, + origin=self.origin, + destination=self.destination + ) + prediction, n = eamur.predict_labels_with_n( + trip = test, + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=greedy_model_config + ) + + [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] + + self.assertNotEqual(len(prediction), 0, "should have a prediction") diff --git a/emission/tests/modellingTests/TestRunModel.py 
b/emission/tests/modellingTests/TestRunModel.py deleted file mode 100644 index c7ba8cd8e..000000000 --- a/emission/tests/modellingTests/TestRunModel.py +++ /dev/null @@ -1,143 +0,0 @@ -import unittest -import logging - -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.analysis.modelling.trip_model.model_type as eamumt -import emission.analysis.modelling.trip_model.run_model as eamur -import emission.storage.timeseries.abstract_timeseries as esta -import emission.tests.modellingTests.modellingTestAssets as etmm -import emission.storage.decorations.analysis_timeseries_queries as esda -import uuid - -class TestRunModel(unittest.TestCase): - """these tests were copied forward during a refactor of the tour model - [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] - - it's uncertain what condition they are in besides having been refactored to - use the more recent tour modeling code. - """ - - def setUp(self) -> None: - """ - sets up the end-to-end run model test with Confirmedtrip data - """ - logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', - level=logging.DEBUG) - - # configuration for randomly-generated test data - self.user_id = user_id = 'TestRunModel-TestData' - self.origin = (-105.1705977, 39.7402654,) - self.destination = (-105.1755606, 39.7673075) - self.min_trips = 14 - total_trips = 100 - clustered_trips = 33 # bins must have at least self.min_trips similar trips by default - has_label_percent = 0.9 # let's make a few that don't have a label, but invariant - # $clustered_trips * $has_label_percent > self.min_trips - # must be correct or else this test could fail under some random test cases. 
- - ts = esta.TimeSeries.get_time_series(user_id) - test_data = list(ts.find_entries(["analysis/confirmed_trip"])) - if len(test_data) == 0: - # generate test data for the database - - logging.debug(f"inserting mock Confirmedtrips into database") - - - train = etmm.generate_mock_trips( - user_id=user_id, - trips=total_trips, - origin=self.origin, - destination=self.destination, - label_data={ - "mode_labels": ['ebike', 'bike'], - "purpose_labels": ['happy-hour', 'dog-park'], - "replaced_mode_labels": ['walk'] - }, - within_threshold=clustered_trips, - has_label_p=has_label_percent - # - ) - - ts.bulk_insert(train) - - # confirm data write did not fail - test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) - if len(test_data) != total_trips: - logging.debug(f'test invariant failed after generating test data') - self.fail() - else: - logging.debug(f'found {total_trips} trips in database') - - def testRoundTrip(self): - """ - train a model, save it, load it, and use it for prediction - """ - - logging.debug(f'(TRAIN) creating a model based on trips in database') - eamur.update_trip_model( - user_id=self.user_id, - model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING, - model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, - min_trips=self.min_trips - ) - - logging.debug(f'(TEST) testing prediction of stored model') - test = etmm.build_mock_trip( - user_id=self.user_id, - origin=self.origin, - destination=self.destination - ) - prediction, n = eamur.predict_labels_with_n(test) - logging.debug(prediction) - - self.assertNotEqual(len(prediction), 0, "should have a prediction") - - - - - - # def setUp(self): - # self.all_users = esta.TimeSeries.get_uuid_list() - # if len(self.all_users) == 0: - # self.fail('test invariant failed: no users found') - - # def testTrip1(self): - - # # case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round - # user_id = self.all_users[0] - # 
eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - # filter_trips = eamur._get_trips_for_user(user_id, None, 0) - # new_trip = filter_trips[4] - # # result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, - # # 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', - # # 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] - # pl, _ = eamur.predict_labels_with_n(new_trip) - # assert len(pl) > 0, f"Invalid prediction {pl}" - - # def testTrip2(self): - - # # case 2: no existing files for the user who has the new trip: - # # 1. the user is invalid(< 10 existing fully labeled trips, or < 50% of trips that fully labeled) - # # 2. the user doesn't have common trips - # user_id = self.all_users[1] - # eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - # filter_trips = eamur._get_trips_for_user(user_id, None, 0) - # new_trip = filter_trips[0] - # # result is [] - # pl, _ = eamur.predict_labels_with_n(new_trip) - # assert len(pl) == 0, f"Invalid prediction {pl}" - - # def testTrip3(self): - - # # case3: the new trip is novel trip(doesn't fall in any 1st round bins) - # user_id = self.all_users[0] - # eamur.update_trip_model(user_id, eamumt.ModelType.GREEDY_SIMILARITY_BINNING, eamums.ModelStorage.DATABASE) - # filter_trips = eamur._get_trips_for_user(user_id, None, 0) - # new_trip = filter_trips[0] - # # result is [] - # pl = eamur.predict_labels_with_n(new_trip) - # assert len(pl) == 0, f"Invalid prediction {pl}" - - # case 4: the new trip falls in a 1st round bin, but predict to be a new cluster in the 2nd round - # result is [] - # no example for now From c8d733d1324efc71cd5dc0704ce6876c4726a815 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 29 Jun 2022 14:54:29 -0600 Subject: [PATCH 25/46] fix edge cases for empty training sets --- 
.../modelling/trip_model/model_storage.py | 8 +++- .../modelling/trip_model/run_model.py | 41 +++++++++++-------- .../modellingTests/TestRunGreedyModel.py | 28 +++++++++++++ 3 files changed, 58 insertions(+), 19 deletions(-) diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py index 26b2d4cc3..13177cb3a 100644 --- a/emission/analysis/modelling/trip_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -108,7 +108,13 @@ def save_model( :raises TypeError: unknown ModelType :raises IOError: failure when writing to storage medium """ - + if len(model_data) == 0: + # this wouldn't be good, esp for incremental models, because it can + # wipe out all of a model's history. save_model should be avoided at the + # call site when the model is empty. + msg = f'trip model for user {user_id} is empty but save_model called' + raise Exception(msg) + if model_storage == ModelStorage.DOCUMENT_DATABASE: row = ecwu.Tripmodel() diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index 485294a5e..e3480b464 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -50,16 +50,29 @@ def update_trip_model( logging.debug(f'model type {model_type.name} is incremental? 
{model.is_incremental}') trips = _get_training_data(user_id, min_trips, model.is_incremental) - logging.debug(f'found {len(trips)} for trip model') - - # train and store the model - model.fit(trips) - model_data_next = model.to_dict() - eamums.save_model(user_id, model_type, model_data_next, timestamp, model_storage) - logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") - - epq.mark_trip_model_done(user_id, timestamp) + if not len(trips) >= min_trips: + msg = ( + f"Total: {len(trips)}, labeled: {len(trips)}, user " + f"{user_id} doesn't have enough valid trips for further analysis." + ) + logging.debug(msg) + epq.mark_trip_model_failed(user_id) + else: + + # train and store the model + model.fit(trips) + model_data_next = model.to_dict() + + if len(model_data_next) == 0: + epq.mark_trip_model_failed(user_id) + msg = f"trip model for user {user_id} is empty" + raise Exception(msg) + + eamums.save_model(user_id, model_type, model_data_next, timestamp, model_storage) + logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") + + epq.mark_trip_model_done(user_id, timestamp) except Exception as e: epq.mark_trip_model_failed(user_id) @@ -92,14 +105,13 @@ def predict_labels_with_n( return predictions, n -def _get_training_data(user_id, min_trips: int, incremental: bool): +def _get_training_data(user_id, int, incremental: bool): """ load the labeled trip data for this user, subject to a time query. if the user does not have at least $min_trips trips with labels, then return an empty list. 
:param user_id: user to collect trips from :param time_query: query to restrict the time (optional) - :param min_trips: minimum number of labeled trips required to train :param incremental: if true, only collect trips which have arrived since the last time this model was trained, otherwise, collect all historical data for this user @@ -113,13 +125,6 @@ def _get_training_data(user_id, min_trips: int, incremental: bool): trips = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=user_id, time_query=time_query) print(f'found {len(trips)} training rows') labeled_trips = [trip for trip in trips if trip['data']['user_input'] != {}] - if not len(labeled_trips) >= min_trips: - msg = ( - f"Total: {len(trips)}, labeled: {len(labeled_trips)}, user " - f"{user_id} doesn't have enough valid trips for further analysis." - ) - logging.debug(msg) - return [] logging.debug(f'found {len(labeled_trips)} labeled trips for user {user_id}') return labeled_trips diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py index f3709475c..d73238837 100644 --- a/emission/tests/modellingTests/TestRunGreedyModel.py +++ b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -90,6 +90,34 @@ def testBuildGreedyModelFromConfig(self): eamumt.ModelType.GREEDY_SIMILARITY_BINNING.build() # success if it didn't throw + def testTrainGreedyModelWithZeroTrips(self): + """ + greedy model takes config arguments via the constructor for testing + purposes but will load from a file in /conf/analysis/ which is tested here + """ + + # making an assumption here... 
+ unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' + + # pass along debug model configuration + greedy_model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, + "apply_cutoff": False, + "incremental_evaluation": False + } + + logging.debug(f'~~~~ do nothing ~~~~') + eamur.update_trip_model( + user_id=unused_user_id, + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=greedy_model_config + ) + + # todo: check the pipeline for this user to confirm they don't have a current timestamp + def test1RoundTripGreedySimilarityBinning(self): """ train a model, save it, load it, and use it for prediction, using From 4483a14bf89d2982e15734dafb8d6b687474aa24 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 7 Jul 2022 15:56:55 -0600 Subject: [PATCH 26/46] add test of incremental training --- .../confirmed_trip_feature_extraction.py | 29 +++- .../TestRunGreedyIncrementalModel.py | 162 ++++++++++++++++++ .../modellingTests/TestRunGreedyModel.py | 7 +- .../modellingTests/modellingTestAssets.py | 68 +++++--- 4 files changed, 233 insertions(+), 33 deletions(-) create mode 100644 emission/tests/modellingTests/TestRunGreedyIncrementalModel.py diff --git a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py index d1d2b4e77..1d743097e 100644 --- a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py +++ b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py @@ -8,8 +8,12 @@ def origin_features(trip: ecwc.Confirmedtrip) -> List[float]: :param trip: trip to extract features from :return: origin coordinates """ - origin = trip.data.start_loc["coordinates"] - return origin + try: + origin = trip['data']['start_loc']["coordinates"] + return origin + except KeyError as e: + msg = 'Confirmedtrip expected to 
have path data.start_loc.coordinates' + raise KeyError(msg) from e def destination_features(trip: ecwc.Confirmedtrip) -> List[float]: """extract the trip destination coordinates. @@ -17,8 +21,13 @@ def destination_features(trip: ecwc.Confirmedtrip) -> List[float]: :param trip: trip to extract features from :return: destination coordinates """ - destination = trip.data.end_loc["coordinates"] - return destination + try: + destination = trip['data']['start_loc']["coordinates"] + return destination + except KeyError as e: + msg = 'Confirmedtrip expected to have path data.start_loc.coordinates' + raise KeyError(msg) from e + def od_features(trip: ecwc.Confirmedtrip) -> List[float]: """extract both origin and destination coordinates. @@ -36,7 +45,11 @@ def distance_feature(trip: ecwc.Confirmedtrip) -> List[float]: :param trip: trip to extract features from :return: distance feature """ - return [trip.data.distance] + try: + return [trip['data']['distance']] + except KeyError as e: + msg = 'Confirmedtrip expected to have path data.distance' + raise KeyError(msg) from e def duration_feature(trip: ecwc.Confirmedtrip) -> List[float]: """provided for forward compatibility. 
@@ -44,4 +57,8 @@ def duration_feature(trip: ecwc.Confirmedtrip) -> List[float]: :param trip: trip to extract features from :return: duration feature """ - return [trip.data.duration] + try: + return [trip['data']['duration']] + except KeyError as e: + msg = 'Confirmedtrip expected to have path data.duration' + raise KeyError(msg) from e diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py new file mode 100644 index 000000000..a28a0fce4 --- /dev/null +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -0,0 +1,162 @@ +from this import d +import unittest +import logging +import json +import numpy as np +import uuid + +import bson.json_util as bju + +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.similarity.od_similarity as eamso +import emission.analysis.modelling.trip_model.run_model as eamur +import emission.storage.timeseries.abstract_timeseries as esta +import emission.storage.decorations.analysis_timeseries_queries as esdatq +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.analysis.modelling.trip_model.config as eamtc +import emission.core.wrapper.entry as ecwe + + +class TestRunGreedyModel(unittest.TestCase): + + def setUp(self): + """ + sets up the end-to-end run model test with Confirmedtrip data from a + test set of Confirmedtrip entries + """ + logging.basicConfig( + format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + # emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips + self.user_id = uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3') + self.ts = esta.TimeSeries.get_time_series(self.user_id) + self.new_trips_per_invocation = 3 + self.model_type = 
eamumt.ModelType.GREEDY_SIMILARITY_BINNING + self.model_storage = eamums.ModelStorage.DOCUMENT_DATABASE + sim_threshold = 500 # meters + self.greedy_model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": sim_threshold, + "apply_cutoff": False, + "incremental_evaluation": True + } + + # test data can be saved between test invocations, check if data exists before generating + self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + if len(self.initial_data) == 0: + + # first time running against this database instance: + # 1. load trips from source file into database + # 2. create an initial entry for the incremental binning model + + # load in existing trips + input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips' + with open(input_file, 'r') as f: + trips_json = json.loads(f.read(), object_hook=bju.object_hook) + trips = [ecwe.Entry(r) for r in trips_json] + logging.debug(f'loaded {len(trips)} trips from {input_file}') + self.ts.bulk_insert(trips) + + # confirm write to database succeeded + self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + if len(self.initial_data) == 0: + logging.debug(f'test setup failed while loading trips from file') + self.fail() + + eamur.update_trip_model( + user_id=self.user_id, + model_type=self.model_type, + model_storage=self.model_storage, + min_trips=5, # there are 5 similar trips in the file + model_config=self.greedy_model_config + ) + + logging.debug(f'setup: found {len(self.initial_data)} trips in database') + + # determine which trips are similar, and find the + # centroid of their origins and destinations to build + # new similar trips from + metric = eamso.OriginDestinationSimilarity() + features = [] + + for trip in self.initial_data: + f = metric.extract_features(trip) + features.append(f) + + # 2022-07-07 rjf: the Confirmedtrip dataset used here has 6 trips (initially) + # but only 5 are "similar" within 500 meters. 
here we dynamically dis- + # include trip 6. set up like this in case we have to switch datasets + # in the future (as long as the outliers are not similar!) + similar_matrix = [[metric.similar(t1, t2, sim_threshold) + for t1 in features] + for t2 in features] + similar_trips = [] + similar_features = [] + for idx, f in enumerate(self.initial_data): + sim = [similar_matrix[idx][i] for i in range(len(features)) if i != idx] + similar = any(sim) + if similar: + similar_trips.append(self.initial_data[idx]) + similar_features.append(features[idx]) + + # after running, how many trips should be stored together in a similar bin? + self.initial_similar_trips = len(similar_trips) + self.expected_trips = self.initial_similar_trips + self.new_trips_per_invocation + + # find the centroid of the similar trip data + src_x, src_y, dst_x, dst_y = np.mean(similar_features, axis=0) + self.origin = [src_x, src_y] + self.destination = [dst_x, dst_y] + + + def tearDown(self): + """ + delete entries for user self.user_id in the database, not + yet implemented in database operations, so these test entries will + have to stick around for now. 
+ """ + pass + + def testIncrementalRun(self): + + # create a new trip sampling from the centroid and the existing + # set of user input data + label_data = etmm.extract_trip_labels(self.similar_trips) + new_trips = etmm.generate_mock_trips( + user_id=self.user_id, + trips=self.new_trips_per_invocation, + origin=self.origin, + destination=self.destination, + label_data=label_data, + threshold=0.0005 # ~50m + ) + self.ts.bulk_insert(new_trips) + + # train the new model on the complete collection of trips + eamur.update_trip_model( + user_id=self.user_id, + model_type=self.model_type, + model_storage=self.model_storage, + min_trips=self.initial_similar_trips, + model_config=self.greedy_model_config + ) + updated_model = eamur._load_stored_trip_model( + self.user_id, + model_type=self.model_type, + model_storage=self.model_storage, + model_config=self.greedy_model_config + ) + + self.assertEqual(len(updated_model.bins), 2, + 'there should be two bins, one with similar trips, one with an outlier') + + trips_in_bin = len(updated_model.bins['0']) + self.assertEqual(trips_in_bin, self.expected_trips, + 'expected number of trips stored in bin') + + self.assertEqual(len(updated_model.bins['1']), 1, + 'the second bin should have exactly one entry (an outlier)') + \ No newline at end of file diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py index d73238837..84e066e02 100644 --- a/emission/tests/modellingTests/TestRunGreedyModel.py +++ b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -45,9 +45,9 @@ def setUp(self): # generate labels with a known sample weight that we can rely on in the test label_data = { - "mode_labels": ['ebike', 'bike'], - "purpose_labels": ['happy-hour', 'dog-park'], - "replaced_mode_labels": ['walk'], + "mode_confirm": ['ebike', 'bike'], + "purpose_confirm": ['happy-hour', 'dog-park'], + "replaced_mode": ['walk'], "mode_weights": [0.9, 0.1], "purpose_weights": [0.1, 0.9] } @@ 
-161,3 +161,4 @@ def test1RoundTripGreedySimilarityBinning(self): [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] self.assertNotEqual(len(prediction), 0, "should have a prediction") + diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index f630bf66a..83eca458c 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -30,6 +30,26 @@ def generate_trip_coordinates( return (x, y) +def extract_trip_labels(trips: List[ecwc.Confirmedtrip]) -> Dict: + """ + helper to build the `label_data` argument for the generate_mock_trips + function below. reads all entries from a list of Confirmedtrip entries. + + :param trips: the trips to read from + :return: label_data + """ + keys = ['mode_confirm', 'purpose_confirm', 'replaced_mode'] + result = {k: set() for k in keys} + for k in keys: + for t in trips: + entry = t['data']['user_input'].get(k) + if entry is not None: + result[k].add(entry) + for k in result.keys(): + result[k] = list(result[k]) + return result + + def sample_trip_labels( mode_labels, purpose_labels, @@ -47,21 +67,21 @@ def sample_trip_labels( :param replaced_mode_weights: sample weights, defaults to None :return: sampled trip labels """ - mw = [1.0 / len(mode_labels) for i in range(len(mode_labels))] \ - if mode_weights is not None else mode_weights - rw = [1.0 / len(replaced_mode_labels) for i in range(len(replaced_mode_labels))] \ - if replaced_mode_weights is not None else replaced_mode_weights - pw = [1.0 / len(purpose_labels) for i in range(len(purpose_labels))] \ - if purpose_weights is not None else purpose_weights - mode_label_samples = random.choices(population=mode_labels, k=1, weights=mw) - replaced_mode_label_samples = random.choices(population=replaced_mode_labels, k=1, weights=rw) - purpose_label_samples = random.choices(population=purpose_labels, k=1, weights=pw) - 
user_input = { - "mode_confirm": mode_label_samples[0], - "replaced_mode": replaced_mode_label_samples[0], - "purpose_confirm": purpose_label_samples[0] - } - return user_input + user_inputs = [ + ('mode_confirm', mode_labels, mode_weights), + ('replaced_mode', replaced_mode_labels, replaced_mode_weights), + ('purpose_confirm', purpose_labels, purpose_weights) + ] + + result = {} + for key, labels, weights in user_inputs: + if len(labels) > 0: + if weights is None: + weights = [1.0 / len(labels) for i in range(len(labels))] + samples = random.choices(population=labels,k=1,weights=weights) + result[key] = samples[0] + + return result def build_mock_trip(user_id, origin, destination, labels = {}) -> ecwc.Confirmedtrip: @@ -106,9 +126,9 @@ def generate_mock_trips( label_data is an optional dictionary with labels and sample weights, for example: { - "mode_labels": ['walk', 'bike'], - "replaced_mode_labels": ['drive', 'tnc'], - "purpose_labels": ['home', 'work'], + "mode_confirm": ['walk', 'bike'], + "replaced_mode": ['drive', 'tnc'], + "purpose_confirm": ['home', 'work'], "mode_weights": [0.8, 0.2], "replaced_mode_weights": [0.4, 0.6], "purpose_weights": [0.1, 0.9] @@ -140,9 +160,9 @@ def generate_mock_trips( d = generate_trip_coordinates(destination, within, threshold, max) labels = {} if label_data is None or random.random() > has_label_p \ else sample_trip_labels( - mode_labels=label_data.get('mode_labels'), - replaced_mode_labels=label_data.get('replaced_mode_labels'), - purpose_labels=label_data.get('purpose_labels'), + mode_labels=label_data.get('mode_confirm'), + replaced_mode_labels=label_data.get('replaced_mode'), + purpose_labels=label_data.get('purpose_confirm'), mode_weights=label_data.get('mode_weights'), replaced_mode_weights=label_data.get('replaced_mode_weights'), purpose_weights=label_data.get('purpose_weights') @@ -156,9 +176,9 @@ def generate_mock_trips( if __name__ == '__main__': label_data = { - "mode_labels": ['walk', 'bike', 'drive'], - 
"purpose_labels": ['work', 'home', 'school'], - "replaced_mode_labels": ['walk', 'bike', 'drive'] + "mode_confirm": ['walk', 'bike', 'drive'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['walk', 'bike', 'drive'] } result = generate_mock_trips('joe-bob', 14, [0, 0], [1,1], label_data, 6) for r in result: From 6aae056dce81879eb0c64125f96e74482e79acf4 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 7 Jul 2022 16:27:28 -0600 Subject: [PATCH 27/46] incremental model testing --- .../trip_model/greedy_similarity_binning.py | 2 ++ .../modelling/trip_model/run_model.py | 3 ++- .../TestRunGreedyIncrementalModel.py | 19 +++++++++++-------- .../modellingTests/modellingTestAssets.py | 1 + 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 43206cd11..0bab1d26b 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -103,6 +103,8 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): :param trips: 2D array of features to train from """ + + logging.debug(f'fit called with {len(trips)} trips') unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips)) if len(unlabeled) > 0: msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index e3480b464..9ba90e641 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -122,7 +122,8 @@ def _get_training_data(user_id, int, incremental: bool): time_query = time_query_from_pipeline if incremental else None logging.debug(f'time query for training data collection: {time_query}') - trips = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, 
user_id=user_id, time_query=time_query) + ts = esta.TimeSeries.get_time_series(user_id) + trips = list(ts.find_entries([esda.CONFIRMED_TRIP_KEY], time_query=time_query)) print(f'found {len(trips)} training rows') labeled_trips = [trip for trip in trips if trip['data']['user_input'] != {}] diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index a28a0fce4..17214c07f 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -46,6 +46,7 @@ def setUp(self): # test data can be saved between test invocations, check if data exists before generating self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + if len(self.initial_data) == 0: # first time running against this database instance: @@ -66,11 +67,12 @@ def setUp(self): logging.debug(f'test setup failed while loading trips from file') self.fail() + logging.debug('writing initial trip model') eamur.update_trip_model( user_id=self.user_id, model_type=self.model_type, model_storage=self.model_storage, - min_trips=5, # there are 5 similar trips in the file + min_trips=4, # there are 4 similar and labelled trips in the file model_config=self.greedy_model_config ) @@ -93,21 +95,21 @@ def setUp(self): similar_matrix = [[metric.similar(t1, t2, sim_threshold) for t1 in features] for t2 in features] - similar_trips = [] - similar_features = [] + self.similar_trips = [] + self.similar_features = [] for idx, f in enumerate(self.initial_data): sim = [similar_matrix[idx][i] for i in range(len(features)) if i != idx] similar = any(sim) if similar: - similar_trips.append(self.initial_data[idx]) - similar_features.append(features[idx]) + self.similar_trips.append(self.initial_data[idx]) + self.similar_features.append(features[idx]) # after running, how many trips should be stored together in a similar bin? 
- self.initial_similar_trips = len(similar_trips) + self.initial_similar_trips = len(self.similar_trips) self.expected_trips = self.initial_similar_trips + self.new_trips_per_invocation # find the centroid of the similar trip data - src_x, src_y, dst_x, dst_y = np.mean(similar_features, axis=0) + src_x, src_y, dst_x, dst_y = np.mean(self.similar_features, axis=0) self.origin = [src_x, src_y] self.destination = [dst_x, dst_y] @@ -131,7 +133,7 @@ def testIncrementalRun(self): origin=self.origin, destination=self.destination, label_data=label_data, - threshold=0.0005 # ~50m + threshold=0.0001 # ~10m ) self.ts.bulk_insert(new_trips) @@ -154,6 +156,7 @@ def testIncrementalRun(self): 'there should be two bins, one with similar trips, one with an outlier') trips_in_bin = len(updated_model.bins['0']) + print(f'trips in bins: {[len(x) for x in updated_model.bins.values()]}') self.assertEqual(trips_in_bin, self.expected_trips, 'expected number of trips stored in bin') diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 83eca458c..90d5e154f 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -1,5 +1,6 @@ import random from typing import Tuple, List, Dict +import emission.analysis.modelling.trip_model.trip_model as eamtm import emission.core.wrapper.confirmedtrip as ecwc import emission.core.wrapper.entry as ecwe From 1cd57d80cdc5c58b4e8cf683a63cf16ea4d376df Mon Sep 17 00:00:00 2001 From: rfitzger Date: Mon, 11 Jul 2022 11:45:17 -0600 Subject: [PATCH 28/46] update/finish e2e testing of greedy trip model --- .../trip_model/greedy_similarity_binning.py | 1 - .../modelling/trip_model/model_storage.py | 6 +- .../modelling/trip_model/run_model.py | 51 +++++++++++------ emission/core/wrapper/tripmodel.py | 8 +-- emission/storage/pipeline_queries.py | 9 +-- .../TestGreedySimilarityBinning.py | 56 +++++++++++-------- 
.../TestRunGreedyIncrementalModel.py | 22 ++++++-- .../modellingTests/modellingTestAssets.py | 30 ++++++++-- 8 files changed, 120 insertions(+), 63 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 0bab1d26b..751bbaccd 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -109,7 +109,6 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): if len(unlabeled) > 0: msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' raise Exception(msg) - self.bins = {} self._assign_bins(trips) if len(self.bins) > 1 and self.apply_cutoff: self._apply_cutoff() diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py index 13177cb3a..61bd4eb93 100644 --- a/emission/analysis/modelling/trip_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -1,6 +1,7 @@ from enum import Enum from typing import Dict, Optional import logging +import json import emission.analysis.modelling.trip_model.model_type as eamum import emission.core.wrapper.tripmodel as ecwu @@ -56,7 +57,7 @@ def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage raise Exception('user model storage requires BuiltInTimeSeries') latest_model_entry = ts.get_first_entry( key=esda.TRIP_MODEL_STORE_KEY, - field='data.model_ts', + field='metadata.write_ts', sort_order=pymongo.DESCENDING ) @@ -74,6 +75,7 @@ def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage raise TypeError('stored model does not have a model type') latest_model_type = eamum.ModelType.from_str(latest_model_type_str) + # validate and return if latest_model_entry is None: return None elif latest_model_type != model_type: @@ -104,6 +106,7 @@ def save_model( :param user_id: 
user associated with this model :param model_type: type of model stored :param model_data: data for this model to store, should be a dict + :param model_timestamp: time that model is current to :param model_storage: type of storage to load from, defaults to ModelStorage.DATABASE :raises TypeError: unknown ModelType :raises IOError: failure when writing to storage medium @@ -118,7 +121,6 @@ def save_model( if model_storage == ModelStorage.DOCUMENT_DATABASE: row = ecwu.Tripmodel() - row.user_id = user_id row.model_ts = model_timestamp row.model_type = model_type row.model = model_data diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index 9ba90e641..c625ee361 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -1,7 +1,9 @@ import logging from typing import List, Optional +from uuid import UUID -import arrow +import time +import emission.storage.timeseries.timequery as estt import emission.analysis.modelling.similarity.od_similarity as eamso import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug import emission.analysis.modelling.trip_model.model_storage as eamums @@ -37,21 +39,34 @@ def update_trip_model( """ try: # this timestamp is used for recording the state of the updated model - timestamp = arrow.now().timestamp + timestamp = time.time() model = model_type.build(model_config) # if a previous model exists, deserialize the stored model model_data_prev = eamums.load_model(user_id, model_type, model_storage) - if model_data_prev is not None: + stored_model_exists = model_data_prev is not None + if stored_model_exists: model.from_dict(model_data_prev) logging.debug(f"loaded {model_type.name} user label model for user {user_id}") else: logging.debug(f"building first {model_type.name} user label model for user {user_id}") + # must call this regardless of whether model is incremental or not as it has + 
# the additional effect of marking the start state of the pipeline execution + time_query_from_pipeline = epq.get_time_query_for_trip_model(user_id) + time_query = time_query_from_pipeline if model.is_incremental else None logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}') - trips = _get_training_data(user_id, min_trips, model.is_incremental) + logging.debug(f'time query for training data collection: {time_query}') + + trips = _get_training_data(user_id, time_query) - if not len(trips) >= min_trips: + # don't start training for a user that doesn't have at least $trips many trips + # (assume if a stored model exists for the user, that they met this requirement previously) + if len(trips) == 0: + msg = f"no new confirmed trips found in database to train model for user {user_id}" + logging.debug(msg) + epq.mark_trip_model_failed(user_id) + elif not stored_model_exists and not len(trips) >= min_trips: msg = ( f"Total: {len(trips)}, labeled: {len(trips)}, user " f"{user_id} doesn't have enough valid trips for further analysis." 
@@ -69,10 +84,11 @@ def update_trip_model( msg = f"trip model for user {user_id} is empty" raise Exception(msg) - eamums.save_model(user_id, model_type, model_data_next, timestamp, model_storage) - logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {timestamp}") + last_done_ts = _latest_timestamp(trips) + eamums.save_model(user_id, model_type, model_data_next, last_done_ts, model_storage) + logging.debug(f"{model_type.name} label prediction model built for user {user_id} with timestamp {last_done_ts}") - epq.mark_trip_model_done(user_id, timestamp) + epq.mark_trip_model_done(user_id, last_done_ts) except Exception as e: epq.mark_trip_model_failed(user_id) @@ -105,22 +121,14 @@ def predict_labels_with_n( return predictions, n -def _get_training_data(user_id, int, incremental: bool): +def _get_training_data(user_id: UUID, time_query: Optional[estt.TimeQuery]): """ load the labeled trip data for this user, subject to a time query. if the user does not have at least $min_trips trips with labels, then return an empty list. 
:param user_id: user to collect trips from :param time_query: query to restrict the time (optional) - :param incremental: if true, only collect trips which have arrived since the - last time this model was trained, otherwise, collect all - historical data for this user """ - # must call this regardless of whether model is incremental or not as it has - # the side effect of marking the start state of the pipeline execution - time_query_from_pipeline = epq.get_time_query_for_trip_model(user_id) - time_query = time_query_from_pipeline if incremental else None - logging.debug(f'time query for training data collection: {time_query}') ts = esta.TimeSeries.get_time_series(user_id) trips = list(ts.find_entries([esda.CONFIRMED_TRIP_KEY], time_query=time_query)) @@ -153,3 +161,12 @@ def _load_stored_trip_model( model.from_dict(model_dict) return model + +def _latest_timestamp(trips: List[ecwc.Confirmedtrip]) -> float: + """extract the latest timestamp observed from a list of trips + + :param trips: the trips to review + :return: the latest timestamp + """ + ts = sorted(trips, key=lambda r: r['data']['end_ts'], reverse=True)[0]['data']['end_ts'] + return ts \ No newline at end of file diff --git a/emission/core/wrapper/tripmodel.py b/emission/core/wrapper/tripmodel.py index 17ae94c73..d2c16198a 100644 --- a/emission/core/wrapper/tripmodel.py +++ b/emission/core/wrapper/tripmodel.py @@ -4,10 +4,10 @@ class Tripmodel(ecwb.WrapperBase): - props = {"user_id": ecwb.WrapperBase.Access.WORM, # the trip that this is part of - "model_type": ecwb.WrapperBase.Access.WORM, # emission.analysis.modelling.trip_model.model_type.py - "model": ecwb.WrapperBase.Access.WORM, # the (serialized) state of the model for this trip - "model_ts": ecwb.WrapperBase.Access.WORM, # timestamp that model is "current" to wrt input data + props = { + "model_type": ecwb.WrapperBase.Access.WORM, # emission.analysis.modelling.trip_model.model_type.py + "model": ecwb.WrapperBase.Access.WORM, # the (serialized) 
state of the model for this trip + "model_ts": ecwb.WrapperBase.Access.WORM, # timestamp that model is "current" to wrt input data } enums = { diff --git a/emission/storage/pipeline_queries.py b/emission/storage/pipeline_queries.py index b8a326d3c..b154b5f47 100644 --- a/emission/storage/pipeline_queries.py +++ b/emission/storage/pipeline_queries.py @@ -124,14 +124,15 @@ def mark_mode_inference_failed(user_id): def get_time_query_for_trip_model(user_id): tq = get_time_range_for_stage(user_id, ps.PipelineStages.TRIP_MODEL) if tq.startTs is None: + # time_query=None, request all confirmed trips for user return None else: - tq.timeType = 'data.model_ts' + # key off of Confirmedtrip end timestamp for the provided time range + tq.timeType = 'data.end_ts' return tq -def mark_trip_model_done(user_id, last_ts=None): - last_processed_ts = last_ts + END_FUZZ_AVOID_LTE if last_ts is not None else None - mark_stage_done(user_id, ps.PipelineStages.TRIP_MODEL, last_processed_ts) +def mark_trip_model_done(user_id, last_ts): + mark_stage_done(user_id, ps.PipelineStages.TRIP_MODEL, last_ts + END_FUZZ_AVOID_LTE) def mark_trip_model_failed(user_id): mark_stage_failed(user_id, ps.PipelineStages.TRIP_MODEL) diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 22de885d3..af0e0bce7 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -17,9 +17,9 @@ def testBinning(self): when $should_be_grouped trips are the same, they should appear in a bin """ label_data = { - "mode_labels": ['walk', 'bike', 'transit'], - "purpose_labels": ['work', 'home', 'school'], - "replaced_mode_labels": ['drive'] + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] } n = 20 @@ -33,11 +33,14 @@ def testBinning(self): within_threshold=should_be_grouped, 
threshold=0.001, # ~ 111 meters in degrees WGS84 ) - model = eamtg.GreedySimilarityBinning( - metric=eamso.OriginDestinationSimilarity(), - sim_thresh=500, # meters, - apply_cutoff=False # currently unused - ) + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "incremental_evaluation": False + } + model = eamtg.GreedySimilarityBinning(model_config) model.fit(trips) @@ -50,9 +53,9 @@ def testPrediction(self): training and testing with similar trips should lead to a positive bin match """ label_data = { - "mode_labels": ['skipping'], - "purpose_labels": ['pizza_party'], - "replaced_mode_labels": ['crabwalking'] + "mode_confirm": ['skipping'], + "purpose_confirm": ['pizza_party'], + "replaced_mode": ['crabwalking'] } n = 6 @@ -64,11 +67,14 @@ def testPrediction(self): label_data=label_data, threshold=0.001, # ~ 111 meters in degrees WGS84 ) - model = eamtg.GreedySimilarityBinning( - metric=eamso.OriginDestinationSimilarity(), - sim_thresh=500, # meters, - apply_cutoff=False # currently unused - ) + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "incremental_evaluation": False + } + model = eamtg.GreedySimilarityBinning(model_config) train = trips[0:5] test = trips[5] @@ -84,9 +90,9 @@ def testNoPrediction(self): when trained on trips in Colorado, shouldn't have a prediction for a trip in Alaska """ label_data = { - "mode_labels": ['skipping'], - "purpose_labels": ['pizza_party'], - "replaced_mode_labels": ['crabwalking'] + "mode_confirm": ['skipping'], + "purpose_confirm": ['pizza_party'], + "replaced_mode": ['crabwalking'] } n = 5 @@ -107,11 +113,13 @@ def testNoPrediction(self): threshold=0.001, # ~ 111 meters in degrees WGS84 ) - model = eamtg.GreedySimilarityBinning( - metric=eamso.OriginDestinationSimilarity(), - sim_thresh=500, # meters, - apply_cutoff=False # currently unused - ) + model_config = { + "metric": 
"od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "incremental_evaluation": False + } + model = eamtg.GreedySimilarityBinning(model_config) model.fit(train) results, n = model.predict(test[0]) diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index 17214c07f..386e73648 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -4,6 +4,7 @@ import json import numpy as np import uuid +import time import bson.json_util as bju @@ -92,21 +93,24 @@ def setUp(self): # but only 5 are "similar" within 500 meters. here we dynamically dis- # include trip 6. set up like this in case we have to switch datasets # in the future (as long as the outliers are not similar!) + # 2022-07-11 rjf: ooh, let's remove the ones without labels too similar_matrix = [[metric.similar(t1, t2, sim_threshold) for t1 in features] for t2 in features] self.similar_trips = [] self.similar_features = [] for idx, f in enumerate(self.initial_data): + has_labels = len(self.initial_data[idx]['data']['user_input']) > 0 sim = [similar_matrix[idx][i] for i in range(len(features)) if i != idx] similar = any(sim) - if similar: + if has_labels and similar: self.similar_trips.append(self.initial_data[idx]) self.similar_features.append(features[idx]) # after running, how many trips should be stored together in a similar bin? 
self.initial_similar_trips = len(self.similar_trips) self.expected_trips = self.initial_similar_trips + self.new_trips_per_invocation + logging.debug(f"end of test, expecting {self.expected_trips} trips") # find the centroid of the similar trip data src_x, src_y, dst_x, dst_y = np.mean(self.similar_features, axis=0) @@ -126,6 +130,9 @@ def testIncrementalRun(self): # create a new trip sampling from the centroid and the existing # set of user input data + # timestamps for these rows cannot be within the last 5 seconds + # based on invariant set in pipeline_queries.py by the + # END_FUZZ_AVOID_LTE constant. label_data = etmm.extract_trip_labels(self.similar_trips) new_trips = etmm.generate_mock_trips( user_id=self.user_id, @@ -133,9 +140,14 @@ def testIncrementalRun(self): origin=self.origin, destination=self.destination, label_data=label_data, - threshold=0.0001 # ~10m + threshold=0.0001, # ~10m, + start_ts=time.time() - 20, + end_ts=time.time() - 10 ) + self.ts.bulk_insert(new_trips) + all_trips = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + logging.debug(f'total of {len(all_trips)} now stored in database') # train the new model on the complete collection of trips eamur.update_trip_model( @@ -155,11 +167,11 @@ def testIncrementalRun(self): self.assertEqual(len(updated_model.bins), 2, 'there should be two bins, one with similar trips, one with an outlier') - trips_in_bin = len(updated_model.bins['0']) - print(f'trips in bins: {[len(x) for x in updated_model.bins.values()]}') + trips_in_bin = len(updated_model.bins['0']['features']) + print(f'trips in bins: {[len(x["features"]) for x in updated_model.bins.values()]}') self.assertEqual(trips_in_bin, self.expected_trips, 'expected number of trips stored in bin') - self.assertEqual(len(updated_model.bins['1']), 1, + self.assertEqual(len(updated_model.bins['1']['features']), 1, 'the second bin should have exactly one entry (an outlier)') \ No newline at end of file diff --git 
a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 90d5e154f..879a3a2ca 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -1,10 +1,11 @@ import random -from typing import Tuple, List, Dict +from typing import Optional, Tuple, List, Dict +from uuid import UUID import emission.analysis.modelling.trip_model.trip_model as eamtm import emission.core.wrapper.confirmedtrip as ecwc import emission.core.wrapper.entry as ecwe -import arrow +import time import math @@ -85,21 +86,36 @@ def sample_trip_labels( return result -def build_mock_trip(user_id, origin, destination, labels = {}) -> ecwc.Confirmedtrip: +def build_mock_trip( + user_id: UUID, + origin, + destination, + labels: Optional[Dict] = {}, + start_ts: Optional[float] = None, + end_ts: Optional[float] = None) -> ecwc.Confirmedtrip: """repackages mock data as a Confirmedtrip Entry type + NOTE: these mock objects **do not** include all fields. see Trip and Confirmedtrip + classes for the complete list and expand if necessary. 
+ :param user_id: the user id UUID :param origin: trip origin coordinates :param destination: trip destination coordinates :param labels: user labels for the trip, optional, default none - :return: a Confirmedtrip entry + :param start_ts: optional timestamp for trip start, otherwise NOW + :param end_ts: optional timestamp for trip end, otherwise NOW + :return: a mock Confirmedtrip entry """ + start_ts = start_ts if start_ts is not None else time.time() + end_ts = end_ts if end_ts is not None else time.time() key = "analysis/confirmed_trip" data = { + "start_ts": start_ts, "start_loc": { "type": "Point", "coordinates": origin }, + "end_ts": end_ts, "end_loc": { "type": "Point", "coordinates": destination @@ -107,7 +123,7 @@ def build_mock_trip(user_id, origin, destination, labels = {}) -> ecwc.Confirmed "user_input": labels } - return ecwe.Entry.create_fake_entry(user_id, key, data, write_ts=arrow.now().timestamp) + return ecwe.Entry.create_fake_entry(user_id, key, data, write_ts=time.time()) def generate_mock_trips( @@ -117,6 +133,8 @@ def generate_mock_trips( destination, label_data = None, within_threshold = None, + start_ts: None = None, + end_ts: None = None, threshold = 0.01, max = 0.1, has_label_p = 1.0, @@ -168,7 +186,7 @@ def generate_mock_trips( replaced_mode_weights=label_data.get('replaced_mode_weights'), purpose_weights=label_data.get('purpose_weights') ) - trip = build_mock_trip(user_id, o, d, labels) + trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts) result.append(trip) random.shuffle(result) From 3f0a551aadbd4aec0b5d49a4f2d34aa671b6ee5a Mon Sep 17 00:00:00 2001 From: rfitzger Date: Mon, 11 Jul 2022 11:50:46 -0600 Subject: [PATCH 29/46] comments --- .../modellingTests/TestRunGreedyIncrementalModel.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index 386e73648..84aeecfdd 100644 
--- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -127,7 +127,16 @@ def tearDown(self): pass def testIncrementalRun(self): - + """ + incremental trip models train from Confirmedtrip entries at most + once. to test this behavior, a model is built based on a small + Confirmedtrip dataset stored at a file location (See setUp, above). + this happens once and is not repeated when the test is re-run, + unless a new database instance is spun up. within the test method, + an additional few mock trips are generated with a later timestamp. + the training model should 1) only see the new trips, 2) have been + trained on the expected number of trips at completion. + """ # create a new trip sampling from the centroid and the existing # set of user input data # timestamps for these rows cannot be within the last 5 seconds From 0b7649f564733117d4218694c21f9747adc694ce Mon Sep 17 00:00:00 2001 From: Shankari Date: Sun, 7 Aug 2022 10:21:05 -0700 Subject: [PATCH 30/46] Revert "adding missing python dependencies" This reverts commit 6637b70f1f5b1eead3d67877bd3c1ac59ba25638. 
Ran the new tests, and they seem to pass even without these additions --- setup/environment36.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup/environment36.yml b/setup/environment36.yml index c4de01fdf..b65e2daf1 100644 --- a/setup/environment36.yml +++ b/setup/environment36.yml @@ -8,14 +8,11 @@ dependencies: - cheroot=8.4.2 - future=0.18.0 - geojson=2.4.1 -- geocoder=1.38.1 -- geopy=2.2.0 - google-auth=1.20.1 - jsonpickle=1.4.1 - numpy=1.19.1 - pandas=1.1.0 - pip=20.2.2 -- polyline=1.4.0 - python-dateutil=2.8.1 - pytz=2020.1 - requests=2.24.0 From 75c1a022680c1ff4b9e4a567e538ad24740f8fcb Mon Sep 17 00:00:00 2001 From: Shankari Date: Sun, 7 Aug 2022 11:29:30 -0700 Subject: [PATCH 31/46] Added backwards compat test to showcase the change in behavior from "all" to "any" This test is currently expected to fail, until we change the new implementation to match the old implementation This is a test to validate https://github.com/e-mission/e-mission-server/pull/852#discussion_r939611260 --- .../modellingTests/TestBackwardsCompat.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 emission/tests/modellingTests/TestBackwardsCompat.py diff --git a/emission/tests/modellingTests/TestBackwardsCompat.py b/emission/tests/modellingTests/TestBackwardsCompat.py new file mode 100644 index 000000000..878c3fd27 --- /dev/null +++ b/emission/tests/modellingTests/TestBackwardsCompat.py @@ -0,0 +1,73 @@ +import unittest +import emission.analysis.modelling.tour_model_first_only.load_predict as lp +import emission.analysis.modelling.tour_model.similarity as oursim +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.analysis.modelling.similarity.od_similarity as eamso +import json +import logging +import numpy as np +import pandas as pd +import emission.core.common as ecc +import emission.core.wrapper.entry as ecwe + +# +# Test to see if 
the new implementations are consistent with the old implementations +# + +class TestBackwardsCompat(unittest.TestCase): + def setUp(self) -> None: + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + def testAnyVsAllWhilePredicting(self): + trip_coords = (8,12) + trips = [] + for i in range(trip_coords[0], trip_coords[1], 1): + trips.append(ecwe.Entry({"data": {"start_loc": {"coordinates": [i/10,i/10]}, + "end_loc": {"coordinates": [i/10+0.1, i/10+0.1]}, + "user_input": {"mode_confirm": "walk", "purpose_confirm": "exercise"}}, + "metadata": {"key": "analysis/confirmed_trip"}})) + distanceMatrix = np.zeros((len(trips), len(trips))) + for i, trip1 in enumerate(trips): + for j, trip2 in enumerate(trips): + distanceMatrix[i][j] = ecc.calDistance( + trip1.data.start_loc["coordinates"], + trip2.data.start_loc["coordinates"]) + logging.debug("For the test trips, distance matrix is") + logging.debug("%s" % pd.DataFrame(distanceMatrix)) + +# 0 1 2 3 4 +# 0 0.000000 15724.471142 31448.726739 47172.742840 62896.495491 +# 1 15724.471142 0.000000 15724.255604 31448.271720 47172.024395 +# 2 31448.726739 15724.255604 0.000000 15724.016124 31447.768817 +# 3 47172.742840 31448.271720 15724.016124 0.000000 15723.752703 +# 4 62896.495491 47172.024395 31447.768817 15723.752703 0.000000 +# . +# So let's pick a threshold of 16000. 
With the "any" approach, all of them will +# be in one bin, with the "all" approach, we will end up with multiple bins + old_builder = oursim.similarity(trips, 16000, + shouldFilter=False, cutoff=False) + old_builder.bin_data() + old_bins = old_builder.bins + logging.debug("old bins = %s" % old_bins) +# old bins = [[0, 1], [2, 3]] + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 16000, # meters, + "apply_cutoff": False, + "incremental_evaluation": False + } + new_builder = eamtg.GreedySimilarityBinning(model_config) + new_builder.fit(trips) + new_bins = new_builder.bins + logging.debug("new bins = %s" % new_bins) + self.assertEqual(len(old_bins), len(new_bins), + f"old bins = {old_bins} but new_bins = {new_bins}") + + +if __name__ == '__main__': + unittest.main() + + From 2ce8e06926e958b6996209a817f9d7a268e0caf4 Mon Sep 17 00:00:00 2001 From: Shankari Date: Sun, 7 Aug 2022 13:35:30 -0700 Subject: [PATCH 32/46] Add a monkeytest to check backwards compat For randomly generated trips that will end up in two clusters. 
This now fails as expected https://github.com/e-mission/e-mission-server/pull/872#issuecomment-1207480908 --- .../modellingTests/TestBackwardsCompat.py | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/emission/tests/modellingTests/TestBackwardsCompat.py b/emission/tests/modellingTests/TestBackwardsCompat.py index 878c3fd27..b81b5f529 100644 --- a/emission/tests/modellingTests/TestBackwardsCompat.py +++ b/emission/tests/modellingTests/TestBackwardsCompat.py @@ -4,6 +4,8 @@ import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg import emission.tests.modellingTests.modellingTestAssets as etmm import emission.analysis.modelling.similarity.od_similarity as eamso +import emission.analysis.modelling.tour_model_first_only.build_save_model as eamtb +import emission.analysis.modelling.tour_model_first_only.load_predict as eamtl import json import logging import numpy as np @@ -66,7 +68,139 @@ def testAnyVsAllWhilePredicting(self): self.assertEqual(len(old_bins), len(new_bins), f"old bins = {old_bins} but new_bins = {new_bins}") + @staticmethod + def old_predict_with_n(trip, bin_locations, user_labels, cluster_sizes, RADIUS): + logging.debug(f"At stage: first round prediction") + pred_bin = eamtl.find_bin(trip, bin_locations, RADIUS) + logging.debug(f"At stage: matched with bin {pred_bin}") + if pred_bin == -1: + logging.info(f"No match found for {trip['data']['start_loc']} early return") + return [], 0 + + user_input_pred_list = user_labels[pred_bin] + this_cluster_size = cluster_sizes[pred_bin] + logging.debug(f"At stage: looked up user input {user_input_pred_list}") + return user_input_pred_list, this_cluster_size + + def testRandomTripsWithinTheSameThreshold(self): + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + n = 60 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + 
label_data=label_data, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + + # These fields should ignored for the first round, but are extracted anyway + # So let's fill them in with dummy values + for t in trips: + t["data"]["distance"] = 1000 + t["data"]["duration"] = 10 + + train = trips[0:50] + test = trips[50:60] + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "incremental_evaluation": False + } + new_model = eamtg.GreedySimilarityBinning(model_config) + new_model.fit(train) + + old_builder = oursim.similarity(train, 500, + shouldFilter=False, cutoff=False) + old_builder.fit() + + self.assertEqual(len(old_builder.bins), len(new_model.bins), + f"old bins = {old_builder.bins} but new_bins = {new_model.bins}") + + self.assertEqual(len(old_builder.bins), 1, + f"all trips within threshold, so expected one bin, found {len(old_builder.bins)}") + + old_user_inputs = eamtb.create_user_input_map(train, old_builder.bins) + old_location_map = eamtb.create_location_map(train, old_builder.bins) + old_cluster_sizes = {k: len(old_location_map[k]) for k in old_location_map} + + for test_trip in test: + new_results, new_n = new_model.predict(test_trip) + old_results, old_n = TestBackwardsCompat.old_predict_with_n(test_trip, + old_location_map, old_user_inputs, old_cluster_sizes, 500) + + self.assertEqual(old_n, new_n, + f"for test trip {test_trip} old n = {old_n} and new_n = {new_n}") + + self.assertEqual(old_results, new_results, + f"for test trip {test_trip} old result = {old_results} and new result = {new_results}") + + def testRandomTripsOutsideTheSameThreshold(self): + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + n = 60 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + threshold=0.1, # Much bigger than the 500m 
threshold, so we will get multiple bins + ) + + # These fields should ignored for the first round, but are extracted anyway + # So let's fill them in with dummy values + for t in trips: + t["data"]["distance"] = 1000 + t["data"]["duration"] = 10 + + train = trips[0:50] + test = trips[50:60] + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "incremental_evaluation": False + } + new_model = eamtg.GreedySimilarityBinning(model_config) + new_model.fit(train) + + old_builder = oursim.similarity(train, 500, + shouldFilter=False, cutoff=False) + old_builder.fit() + + logging.debug(f"old bins = {len(old_builder.bins)} but new_bins = {len(new_model.bins)}") + + self.assertEqual(len(old_builder.bins), len(new_model.bins), + f"old bins = {old_builder.bins} but new_bins = {new_model.bins}") + + old_user_inputs = eamtb.create_user_input_map(train, old_builder.bins) + old_location_map = eamtb.create_location_map(train, old_builder.bins) + old_cluster_sizes = {k: len(old_location_map[k]) for k in old_location_map} + + for test_trip in test: + new_results, new_n = new_model.predict(test_trip) + old_results, old_n = TestBackwardsCompat.old_predict_with_n(test_trip, + old_location_map, old_user_inputs, old_cluster_sizes, 500) + + self.assertEqual(old_n, new_n, + f"for test trip {test_trip} old n = {old_n} and new_n = {new_n}") + + self.assertEqual(old_results, new_results, + f"for test trip {test_trip} old result = {old_results} and new result = {new_results}") if __name__ == '__main__': unittest.main() From d1e9af600c5058443cb8866af9d452cdc94c48b8 Mon Sep 17 00:00:00 2001 From: Rob Fitzgerald Date: Mon, 8 Aug 2022 09:20:17 -0600 Subject: [PATCH 33/46] Update bin/build_label_model.py comments Co-authored-by: shankari --- bin/build_label_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/build_label_model.py b/bin/build_label_model.py index aa2623281..3afd62901 100644 --- a/bin/build_label_model.py 
+++ b/bin/build_label_model.py @@ -69,4 +69,5 @@ def _email_2_user_list(email_list): model_type = eamtc.get_model_type() model_storage = eamtc.get_model_storage() min_trips = eamtc.get_minimum_trips() + ## Rebuild and save the trip model with the specified parameters eamur.update_trip_model(user_id, model_type, model_storage, min_trips) From 29cdc15ac765256c8fd067896600b7a5eac7716d Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 9 Aug 2022 10:48:46 -0600 Subject: [PATCH 34/46] checkpoint: addressing review --- .../trip_model/greedy_similarity_binning.py | 12 ++++++++---- .../modellingTests/TestRunGreedyIncrementalModel.py | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 751bbaccd..2f911a686 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -201,18 +201,22 @@ def _apply_cutoff(self): removes small clusters by an "elbow search" heuristic. see https://stackoverflow.com/a/2022348/4803266. """ - num_bins = len(self.bins) - bin_sizes = [len(bin_rec['features']) for bin_rec in self.bins.values()] + # the cutoff point is an index along the sorted bins. any bin with a gte + # index value is removed, as that bin has been found to be smaller than the cutoff. 
+ bins_sorted = self.bins.sort(key=lambda bin: len(bin['features']), reverse=True) + + num_bins = len(bins_sorted) + bin_sizes = [len(bin_rec['features']) for bin_rec in bins_sorted.values()] _, cutoff_bin_size = util.find_knee_point(bin_sizes) logging.debug( "bins = %s, elbow distance = %s" % (num_bins, cutoff_bin_size) ) updated_bins = {bin_id: bin_rec - for bin_id, bin_rec in self.bins.items() + for bin_id, bin_rec in bins_sorted.items() if len(bin_rec['features']) >= cutoff_bin_size} - removed = len(self.bins) - len(updated_bins) + removed = len(bins_sorted) - len(updated_bins) logging.debug( f"removed %s bins with less than %s entries" % (removed, cutoff_bin_size) diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index 84aeecfdd..4063f7fa3 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -173,6 +173,7 @@ def testIncrementalRun(self): model_config=self.greedy_model_config ) + # the 6th trip in the original dataset was an outlier and should form it's own cluster self.assertEqual(len(updated_model.bins), 2, 'there should be two bins, one with similar trips, one with an outlier') From a5cb123afd7530e422796df463b44c0cce066b2b Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 9 Aug 2022 12:24:22 -0600 Subject: [PATCH 35/46] apply revisions from review, comments --- .../trip_model/greedy_similarity_binning.py | 1 - .../modelling/trip_model/model_storage.py | 1 - .../modelling/trip_model/run_model.py | 2 -- .../TestGreedySimilarityBinning.py | 15 ++++++------ .../TestRunGreedyIncrementalModel.py | 10 +++----- .../modellingTests/TestRunGreedyModel.py | 23 +++++++++++-------- 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 
2f911a686..f34f0faf8 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -3,7 +3,6 @@ from typing import Dict, List, Optional, Tuple import emission.analysis.modelling.similarity.similarity_metric_type as eamssmt -import emission.analysis.modelling.similarity.similarity_metric as eamss import emission.analysis.modelling.tour_model.label_processing as lp import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.util as util diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py index 61bd4eb93..8e89f2419 100644 --- a/emission/analysis/modelling/trip_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -6,7 +6,6 @@ import emission.analysis.modelling.trip_model.model_type as eamum import emission.core.wrapper.tripmodel as ecwu import emission.storage.decorations.analysis_timeseries_queries as esda -import emission.storage.pipeline_queries as epq import emission.storage.timeseries.abstract_timeseries as esta import emission.storage.timeseries.builtin_timeseries as estb import pymongo diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index c625ee361..e3e2b1c4e 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -4,8 +4,6 @@ import time import emission.storage.timeseries.timequery as estt -import emission.analysis.modelling.similarity.od_similarity as eamso -import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug import emission.analysis.modelling.trip_model.model_storage as eamums import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.trip_model as eamuu diff --git 
a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index af0e0bce7..fcdde51f1 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -1,8 +1,6 @@ import unittest import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg import emission.tests.modellingTests.modellingTestAssets as etmm -import emission.analysis.modelling.similarity.od_similarity as eamso -import json import logging @@ -22,21 +20,24 @@ def testBinning(self): "replaced_mode": ['drive'] } + # generate $n trips. $m of them should have origin and destinations sampled + # within a radius that should have them binned. n = 20 - should_be_grouped = 5 + m = 5 trips = etmm.generate_mock_trips( user_id="joe", trips=n, origin=(0, 0), destination=(1, 1), label_data=label_data, - within_threshold=should_be_grouped, + within_threshold=m, threshold=0.001, # ~ 111 meters in degrees WGS84 ) + # pass in a test configuration to the binning algorithm model_config = { "metric": "od_similarity", - "similarity_threshold_meters": 500, # meters, + "similarity_threshold_meters": 500, # meters, "apply_cutoff": False, "incremental_evaluation": False } @@ -44,8 +45,8 @@ def testBinning(self): model.fit(trips) - # $should_be_grouped trip features should appear together in one bin - at_least_one_large_bin = any(map(lambda b: len(b['features']) >= should_be_grouped, model.bins.values())) + # $m trip features should appear together in one bin + at_least_one_large_bin = any(map(lambda b: len(b['features']) == m, model.bins.values())) self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") def testPrediction(self): diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index 4063f7fa3..430b65810 100644 --- 
a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -1,4 +1,3 @@ -from this import d import unittest import logging import json @@ -15,9 +14,8 @@ import emission.storage.timeseries.abstract_timeseries as esta import emission.storage.decorations.analysis_timeseries_queries as esdatq import emission.tests.modellingTests.modellingTestAssets as etmm -import emission.storage.decorations.analysis_timeseries_queries as esda -import emission.analysis.modelling.trip_model.config as eamtc import emission.core.wrapper.entry as ecwe +import emission.core.get_database as edb class TestRunGreedyModel(unittest.TestCase): @@ -120,11 +118,9 @@ def setUp(self): def tearDown(self): """ - delete entries for user self.user_id in the database, not - yet implemented in database operations, so these test entries will - have to stick around for now. + clean up database entries related to this test """ - pass + edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) def testIncrementalRun(self): """ diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py index 84e066e02..07d12d429 100644 --- a/emission/tests/modellingTests/TestRunGreedyModel.py +++ b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -7,7 +7,8 @@ import emission.storage.timeseries.abstract_timeseries as esta import emission.tests.modellingTests.modellingTestAssets as etmm import emission.storage.decorations.analysis_timeseries_queries as esda -import emission.analysis.modelling.trip_model.config as eamtc +import emission.core.get_database as edb +import emission.storage.pipeline_queries as epq class TestRunGreedyModel(unittest.TestCase): @@ -36,6 +37,9 @@ def setUp(self): # $clustered_trips * $has_label_percent > self.min_trips # must be correct or else this test could fail under some random test cases. 
+ # for a negative test, below + self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' + # test data can be saved between test invocations, check if data exists before generating ts = esta.TimeSeries.get_time_series(user_id) test_data = list(ts.find_entries(["analysis/confirmed_trip"])) @@ -75,11 +79,9 @@ def setUp(self): def tearDown(self): """ - delete entries for user self.user_id in the database, not - yet implemented in database operations, so these test entries will - have to stick around for now. + clean up database """ - pass + edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) def testBuildGreedyModelFromConfig(self): """ @@ -96,9 +98,6 @@ def testTrainGreedyModelWithZeroTrips(self): purposes but will load from a file in /conf/analysis/ which is tested here """ - # making an assumption here... - unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' - # pass along debug model configuration greedy_model_config = { "metric": "od_similarity", @@ -109,14 +108,18 @@ def testTrainGreedyModelWithZeroTrips(self): logging.debug(f'~~~~ do nothing ~~~~') eamur.update_trip_model( - user_id=unused_user_id, + user_id=self.unused_user_id, model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING, model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, min_trips=self.min_trips, model_config=greedy_model_config ) - # todo: check the pipeline for this user to confirm they don't have a current timestamp + # user had no entries so their pipeline state should not have been set + # if it was set, the time query here would + time_query = epq.get_time_query_for_trip_model(self.unused_user_id) + self.assertIsNone(time_query, "should not have a pipeline state entry") + def test1RoundTripGreedySimilarityBinning(self): """ From e00559ea1bec132293fc505b33ab1453cab3aed0 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 9 Aug 2022 16:11:49 -0600 Subject: [PATCH 36/46] greedy binning should fit trip to all binned trips --- 
.../trip_model/greedy_similarity_binning.py | 26 ++++++++++++++++--- .../modellingTests/TestRunGreedyModel.py | 8 ++++-- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index f34f0faf8..c6ed041ba 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -113,7 +113,6 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): self._apply_cutoff() self._generate_predictions() - binned_features = sum([len(b['features']) for b in self.bins.values() ]) logging.info(f"greedy binning model fit to {len(trips)} rows of trip data") def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: @@ -151,11 +150,13 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): for trip in trips: trip_features = self.extract_features(trip) trip_labels = trip['data']['user_input'] - bin_id, bin_record = self._nearest_bin(trip) + + bin_id = self._find_matching_bin_id(trip_features) if bin_id is not None: # add to existing bin - bin_record['features'].append(trip_features) - bin_record['labels'].append(trip_labels) + logging.debug(f"adding trip to bin {bin_id} with features {trip_features}") + self.bins[bin_id]['features'].append(trip_features) + self.bins[bin_id]['labels'].append(trip_labels) else: # create new bin new_bin_id = str(len(self.bins)) @@ -167,12 +168,29 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): logging.debug(f"creating new bin {new_bin_id} at location {trip_features}") self.bins[new_bin_id] = new_bin_record + def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: + """ + finds an existing bin where all bin features are "similar" to the incoming + trip features. 
+ + :param trip_features: feature row for the incoming trip + :return: the id of a bin if a match was found, otherwise None + """ + for bin_id, bin_record in self.bins.items(): + matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh) + for bin_sample in bin_record['features']]) + if matches_bin: + return bin_id + return None + def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: """ finds a bin which contains at least one matching feature. the first record matching by similarity measure is returned. if none are found, (None, None) is returned. + [see https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L46] + :param trip: incoming trip features to test with :return: nearest bin record, if found """ diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py index 07d12d429..3baf8143f 100644 --- a/emission/tests/modellingTests/TestRunGreedyModel.py +++ b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -9,6 +9,7 @@ import emission.storage.decorations.analysis_timeseries_queries as esda import emission.core.get_database as edb import emission.storage.pipeline_queries as epq +import emission.core.wrapper.pipelinestate as ecwp class TestRunGreedyModel(unittest.TestCase): @@ -117,8 +118,11 @@ def testTrainGreedyModelWithZeroTrips(self): # user had no entries so their pipeline state should not have been set # if it was set, the time query here would - time_query = epq.get_time_query_for_trip_model(self.unused_user_id) - self.assertIsNone(time_query, "should not have a pipeline state entry") + stage = ecwp.PipelineStages.TRIP_MODEL + pipeline_state = epq.get_current_state(self.unused_user_id, stage) + self.assertIsNone( + pipeline_state['curr_run_ts'], + "pipeline should not have a current timestamp for the test user") def 
test1RoundTripGreedySimilarityBinning(self): From b3be3d6b91e64179ae17ac8ec69856d1cf7679ad Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 9 Aug 2022 19:58:20 -0600 Subject: [PATCH 37/46] typo --- .../modelling/similarity/confirmed_trip_feature_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py index 1d743097e..04a83fe5e 100644 --- a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py +++ b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py @@ -22,7 +22,7 @@ def destination_features(trip: ecwc.Confirmedtrip) -> List[float]: :return: destination coordinates """ try: - destination = trip['data']['start_loc']["coordinates"] + destination = trip['data']['end_loc']["coordinates"] return destination except KeyError as e: msg = 'Confirmedtrip expected to have path data.start_loc.coordinates' From 05278f2535deb2db1995aaf2493e9d12af20f260 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 9 Aug 2022 20:01:27 -0600 Subject: [PATCH 38/46] fix log typo --- .../modelling/similarity/confirmed_trip_feature_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py index 04a83fe5e..029359424 100644 --- a/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py +++ b/emission/analysis/modelling/similarity/confirmed_trip_feature_extraction.py @@ -25,7 +25,7 @@ def destination_features(trip: ecwc.Confirmedtrip) -> List[float]: destination = trip['data']['end_loc']["coordinates"] return destination except KeyError as e: - msg = 'Confirmedtrip expected to have path data.start_loc.coordinates' + msg = 'Confirmedtrip expected to have path data.end_loc.coordinates' 
raise KeyError(msg) from e From 351a75ac8140efc60aec1dfea6be19155da88826 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Tue, 9 Aug 2022 20:18:03 -0600 Subject: [PATCH 39/46] failed "predict" should return num=0 --- .../trip_model/greedy_similarity_binning.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index c6ed041ba..a3f75a7ae 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -118,14 +118,14 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: logging.debug(f"running greedy similarity clustering") - predicted_bin, bin_record = self._nearest_bin(trip) - if predicted_bin is None: + predicted_bin_id, predicted_bin_record = self._nearest_bin(trip) + if predicted_bin_id is None: logging.debug(f"unable to predict bin for trip {trip}") - return [], -1 + return [], 0 else: - predictions = bin_record['predictions'] - n_features = len(bin_record['features']) - logging.debug(f"found cluster {predicted_bin} with predictions {predictions}") + predictions = predicted_bin_record['predictions'] + n_features = len(predicted_bin_record['features']) + logging.debug(f"found cluster {predicted_bin_id} with predictions {predictions}") return predictions, n_features def to_dict(self) -> Dict: @@ -197,21 +197,15 @@ def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optiona logging.debug(f"_nearest_bin called") trip_features = self.extract_features(trip) - selected_bin = None - selected_record = None for bin_id, bin_record in self.bins.items(): for bin_features in bin_record['features']: if self.metric.similar(trip_features, bin_features, self.sim_thresh): logging.debug(f"found nearest bin id {bin_id}") 
logging.debug(f"similar: {trip_features}, {bin_features}") - selected_bin = bin_id - selected_record = bin_record - break - if selected_bin is not None: - break - - return selected_bin, selected_record + return bin_id, bin_record + + return None, None def _apply_cutoff(self): """ From 696b999c8caff82ab29d3066bb25b636d9103aa9 Mon Sep 17 00:00:00 2001 From: rfitzger Date: Wed, 10 Aug 2022 09:11:50 -0600 Subject: [PATCH 40/46] update tests after sim metric fix --- .../TestGreedySimilarityBinning.py | 4 +- .../TestRunGreedyIncrementalModel.py | 101 ++++++++++-------- .../modellingTests/TestRunGreedyModel.py | 1 + 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index fcdde51f1..0c0f1a26b 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -125,5 +125,5 @@ def testNoPrediction(self): model.fit(train) results, n = model.predict(test[0]) - self.assertEqual(len(results), 0, "should have found a matching bin") - self.assertEqual(n, -1, "that bin should have had the whole train set in it") + self.assertEqual(len(results), 0, "should not have found a matching bin") + self.assertEqual(n, 0, "the number of features in an empty bin is zero") diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index 430b65810..b9e51495f 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -4,7 +4,7 @@ import numpy as np import uuid import time - +import pandas as pd import bson.json_util as bju import emission.analysis.modelling.trip_model.model_storage as eamums @@ -43,37 +43,34 @@ def setUp(self): "incremental_evaluation": True } - # test data can be saved between test invocations, check 
if data exists before generating - self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + existing_entries_for_user = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + if len(existing_entries_for_user) != 0: + raise Exception(f"test invariant failed, there should be no entries for user {self.user_id}") + + # load in trips from a test file source + input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips' + with open(input_file, 'r') as f: + trips_json = json.loads(f.read(), object_hook=bju.object_hook) + trips = [ecwe.Entry(r) for r in trips_json] + logging.debug(f'loaded {len(trips)} trips from {input_file}') + self.ts.bulk_insert(trips) + # confirm write to database succeeded + self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) if len(self.initial_data) == 0: + logging.debug(f'test setup failed while loading trips from file') + self.fail() - # first time running against this database instance: - # 1. load trips from source file into database - # 2. 
create an initial entry for the incremental binning model - - # load in existing trips - input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips' - with open(input_file, 'r') as f: - trips_json = json.loads(f.read(), object_hook=bju.object_hook) - trips = [ecwe.Entry(r) for r in trips_json] - logging.debug(f'loaded {len(trips)} trips from {input_file}') - self.ts.bulk_insert(trips) - - # confirm write to database succeeded - self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) - if len(self.initial_data) == 0: - logging.debug(f'test setup failed while loading trips from file') - self.fail() - - logging.debug('writing initial trip model') - eamur.update_trip_model( - user_id=self.user_id, - model_type=self.model_type, - model_storage=self.model_storage, - min_trips=4, # there are 4 similar and labelled trips in the file - model_config=self.greedy_model_config - ) + logging.debug('writing initial trip model') + # there are 4 labelled trips in the file. 2 of these trips are "similar" + # within 500 meters, the other two are not. + eamur.update_trip_model( + user_id=self.user_id, + model_type=self.model_type, + model_storage=self.model_storage, + min_trips=4, + model_config=self.greedy_model_config + ) logging.debug(f'setup: found {len(self.initial_data)} trips in database') @@ -81,25 +78,43 @@ def setUp(self): # centroid of their origins and destinations to build # new similar trips from metric = eamso.OriginDestinationSimilarity() - features = [] - + features = [] for trip in self.initial_data: f = metric.extract_features(trip) features.append(f) # 2022-07-07 rjf: the Confirmedtrip dataset used here has 6 trips (initially) - # but only 5 are "similar" within 500 meters. here we dynamically dis- + # but only 2 are "similar" within 500 meters. here we dynamically dis- # include trip 6. set up like this in case we have to switch datasets # in the future (as long as the outliers are not similar!) 
# 2022-07-11 rjf: ooh, let's remove the ones without labels too - similar_matrix = [[metric.similar(t1, t2, sim_threshold) + similarity_matrix = [[metric.similar(t1, t2, sim_threshold) for t1 in features] for t2 in features] + + # let's see what's going on here + trips_df = pd.DataFrame(similarity_matrix) + trips_df['labels?'] = [len(t['data']['user_input']) > 0 for t in self.initial_data] + logging.debug("test data similarity matrix") + logging.debug("\n%s" % trips_df) + + # 0 1 2 3 4 5 labels? + # 0 True True True True False False True + # 1 True True True True False False False + # 2 True True True True False False False + # 3 True True True True False False True + # 4 False False False False True False True + # 5 False False False False False True True + + # trip 0 and 3 are similar and will form bin 0 + # trip 1 and 2 have no labels and will be ignored + # trips 4 and 5 are both dis-similar from the rest and will form singleton bins + self.similar_trips = [] self.similar_features = [] for idx, f in enumerate(self.initial_data): has_labels = len(self.initial_data[idx]['data']['user_input']) > 0 - sim = [similar_matrix[idx][i] for i in range(len(features)) if i != idx] + sim = [similarity_matrix[idx][i] for i in range(len(features)) if i != idx] similar = any(sim) if has_labels and similar: self.similar_trips.append(self.initial_data[idx]) @@ -121,6 +136,7 @@ def tearDown(self): clean up database entries related to this test """ edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) + edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) def testIncrementalRun(self): """ @@ -133,11 +149,8 @@ def testIncrementalRun(self): the training model should 1) only see the new trips, 2) have been trained on the expected number of trips at completion. 
""" - # create a new trip sampling from the centroid and the existing - # set of user input data - # timestamps for these rows cannot be within the last 5 seconds - # based on invariant set in pipeline_queries.py by the - # END_FUZZ_AVOID_LTE constant. + # create a new trip sampling from the centroid of the trips that + # are in bin '0', which has two similar and labeled trips. label_data = etmm.extract_trip_labels(self.similar_trips) new_trips = etmm.generate_mock_trips( user_id=self.user_id, @@ -149,7 +162,7 @@ def testIncrementalRun(self): start_ts=time.time() - 20, end_ts=time.time() - 10 ) - + self.ts.bulk_insert(new_trips) all_trips = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) logging.debug(f'total of {len(all_trips)} now stored in database') @@ -169,9 +182,9 @@ def testIncrementalRun(self): model_config=self.greedy_model_config ) - # the 6th trip in the original dataset was an outlier and should form it's own cluster - self.assertEqual(len(updated_model.bins), 2, - 'there should be two bins, one with similar trips, one with an outlier') + # the 5th and 6th trip in the original dataset were outliers and should form their own cluster + self.assertEqual(len(updated_model.bins), 3, + 'there should be three bins, one with 2 similar trips, and two singleton bins') trips_in_bin = len(updated_model.bins['0']['features']) print(f'trips in bins: {[len(x["features"]) for x in updated_model.bins.values()]}') @@ -180,4 +193,6 @@ def testIncrementalRun(self): self.assertEqual(len(updated_model.bins['1']['features']), 1, 'the second bin should have exactly one entry (an outlier)') + self.assertEqual(len(updated_model.bins['2']['features']), 1, + 'the third bin should have exactly one entry (an outlier)') \ No newline at end of file diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py index 3baf8143f..c5b20deb0 100644 --- a/emission/tests/modellingTests/TestRunGreedyModel.py +++ 
b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -83,6 +83,7 @@ def tearDown(self): clean up database """ edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) + edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) def testBuildGreedyModelFromConfig(self): """ From 76b4f2e7d9b6b9b96f344dc65b3c53e6971787f3 Mon Sep 17 00:00:00 2001 From: shankari Date: Wed, 10 Aug 2022 21:37:30 -0700 Subject: [PATCH 41/46] Apply suggestions from code review --- .../modelling/trip_model/greedy_similarity_binning.py | 4 +++- emission/analysis/modelling/trip_model/util.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index a3f75a7ae..673a6d2ae 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -185,7 +185,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: """ - finds a bin which contains at least one matching feature. the + finds a bin which contains at least all matching features. the first record matching by similarity measure is returned. if none are found, (None, None) is returned. @@ -211,9 +211,11 @@ def _apply_cutoff(self): """ removes small clusters by an "elbow search" heuristic. see https://stackoverflow.com/a/2022348/4803266. + Copied over from https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L158 """ # the cutoff point is an index along the sorted bins. any bin with a gte # index value is removed, as that bin has been found to be smaller than the cutoff. 
+ # This was the last line of calc_cutoff_bins in the old code, and is moved to the equivalent of delete_bins in the new code bins_sorted = self.bins.sort(key=lambda bin: len(bin['features']), reverse=True) num_bins = len(bins_sorted) diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index 662137ba5..7d22b5d22 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -11,6 +11,7 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]: copied from original similarity algorithm. permalink: [https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L256] + with `y` passed in as `values` based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 And summarized by the statement: "A quick way of finding the elbow is to draw a line from the first to the last point of the curve and then find the data point From c3837401e5419fef9507c2f657bb8e0482dbc197 Mon Sep 17 00:00:00 2001 From: shankari Date: Wed, 10 Aug 2022 21:48:01 -0700 Subject: [PATCH 42/46] Add more links to the old code --- .../modelling/trip_model/greedy_similarity_binning.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 673a6d2ae..68f402c2f 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -218,6 +218,12 @@ def _apply_cutoff(self): # This was the last line of calc_cutoff_bins in the old code, and is moved to the equivalent of delete_bins in the new code bins_sorted = self.bins.sort(key=lambda bin: len(bin['features']), reverse=True) + + +# The first two lines below correspond to the original lines below in the original 
elbow_distance +# y = [0] * len(self.bins) +# for i in range(len(self.bins)): +# y[i] = len(self.bins[i]) num_bins = len(bins_sorted) bin_sizes = [len(bin_rec['features']) for bin_rec in bins_sorted.values()] _, cutoff_bin_size = util.find_knee_point(bin_sizes) From d032601722b9dd24771ab02323947dedade1a8de Mon Sep 17 00:00:00 2001 From: shankari Date: Wed, 10 Aug 2022 23:14:59 -0700 Subject: [PATCH 43/46] More documentation and clarification --- .../modelling/trip_model/greedy_similarity_binning.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 68f402c2f..030fbe604 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -220,6 +220,12 @@ def _apply_cutoff(self): +# The first two lines below correspond to the original lines below in the original elbow_distance +# y = [0] * len(self.bins) +# for i in range(len(self.bins)): +# y[i] = len(self.bins[i]) + + # The first two lines below correspond to the original lines below in the original elbow_distance # y = [0] * len(self.bins) # for i in range(len(self.bins)): From 6901b99a60fd9281118f337774f14f919c1db1a9 Mon Sep 17 00:00:00 2001 From: shankari Date: Wed, 10 Aug 2022 23:18:16 -0700 Subject: [PATCH 44/46] Revert duplicate commit https://github.com/e-mission/e-mission-server/pull/852/commits/d032601722b9dd24771ab02323947dedade1a8de --- .../modelling/trip_model/greedy_similarity_binning.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 030fbe604..68f402c2f 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -220,12 
+220,6 @@ def _apply_cutoff(self): -# The first two lines below correspond to the original lines below in the original elbow_distance -# y = [0] * len(self.bins) -# for i in range(len(self.bins)): -# y[i] = len(self.bins[i]) - - # The first two lines below correspond to the original lines below in the original elbow_distance # y = [0] * len(self.bins) # for i in range(len(self.bins)): From b252622d38dd0b06872ae006d3e30fd4f6deb26e Mon Sep 17 00:00:00 2001 From: rfitzger Date: Thu, 11 Aug 2022 14:25:22 -0600 Subject: [PATCH 45/46] comments and naming changes --- .../trip_model/greedy_similarity_binning.py | 75 +++++++++++++------ .../TestGreedySimilarityBinning.py | 2 +- .../TestRunGreedyIncrementalModel.py | 8 +- 3 files changed, 56 insertions(+), 29 deletions(-) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 68f402c2f..d750a451e 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -23,30 +23,57 @@ def __init__(self, config=None): [https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L67] this technique employs a greedy similarity heuristic to associate - trips with collections of probabilistic class labels. in pseudocode: + trips with collections of probabilistic class labels. new bins are + created when the next feature vector is not similar to any existing bins. + for a new feature vector to be similar to an existing bin, it must be + similar to all of the previous feature vectors found in that bin, by way + of a provided similarity metric and threshold value. 
+ + in pseudocode: # fit for each bin_id, bin in bins: - for each bin_trip in bin.trips: - if similar(trip, bin_trip): - append trip to bin.trips + for each bin_feature_row in bin.feature_rows: + if not similar(trip.feature_row, bin_feature_row): + return + append trip to bin + + the prediction of labels for some input trip takes a similar form, + where the first bin that is found to be similar is treated as the + class label to apply: # prediction for each bin_id, bin in bins: - for each bin_trip in bin.trips: - if similar(trip, bin_trip): - return bin.predictions: List[Prediction] - - the number of predictions is not assumed to be the number of features. - - the original similarity class (link above) used a nested List data + for each bin_feature_row in bin.feature_rows: + if not similar(trip.feature_row, bin_feature_row): + break + return bin_id + + to train the predictions, label sets are aggregated within a bin so that + the occurences of some unique label combination is counted. the probability + of a specific unique label combination is assigned by the proportion + of counts of this unique label set to the total number of trips stored at + this bin. the set of unique label sets and their prediction value are then + returned during prediction. + + in terms of the data structure of the model, each bin is a Dictionary with + three fields, "feature_rows", "labels", and "predictions", each a list. + whereas the number and index of "feature_rows" and "labels" are assumed to + match and be idempotent across multiple training calls, the "predictions" + are over-written at each call of "fit" and are not assumed to match the number + of "feature_rows" or "labels" stored in a bin. + + historical note: the original similarity class (link above) used a nested list data structure to capture the notion of binning. this was then copied into - a Dict when the model needed to be saved. 
the same technique can be - written to work directly on nested Dicts with no loss in performance. + a Dict when the model needed to be saved. the same technique can be re-written to + work directly on Dictionaries with no loss in the algorithm's time complexity. this + also helps when running in incremental mode to persist relevant training data and to + minimize codec + serialization errors. + the data takes the form: { bin_id: { - "features": [ + "feature_rows": [ [f1, f2, .., fn], ... ], @@ -124,7 +151,7 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: return [], 0 else: predictions = predicted_bin_record['predictions'] - n_features = len(predicted_bin_record['features']) + n_features = len(predicted_bin_record['feature_rows']) logging.debug(f"found cluster {predicted_bin_id} with predictions {predictions}") return predictions, n_features @@ -155,15 +182,15 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): if bin_id is not None: # add to existing bin logging.debug(f"adding trip to bin {bin_id} with features {trip_features}") - self.bins[bin_id]['features'].append(trip_features) + self.bins[bin_id]['feature_rows'].append(trip_features) self.bins[bin_id]['labels'].append(trip_labels) else: # create new bin new_bin_id = str(len(self.bins)) new_bin_record = { - "features": [trip_features], - "labels": [trip_labels], - "predictions": [] + 'feature_rows': [trip_features], + 'labels': [trip_labels], + 'predictions': [] } logging.debug(f"creating new bin {new_bin_id} at location {trip_features}") self.bins[new_bin_id] = new_bin_record @@ -178,7 +205,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: """ for bin_id, bin_record in self.bins.items(): matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh) - for bin_sample in bin_record['features']]) + for bin_sample in bin_record['feature_rows']]) if matches_bin: return bin_id return None @@ -199,7 +226,7 @@ def _nearest_bin(self, 
trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optiona trip_features = self.extract_features(trip) for bin_id, bin_record in self.bins.items(): - for bin_features in bin_record['features']: + for bin_features in bin_record['feature_rows']: if self.metric.similar(trip_features, bin_features, self.sim_thresh): logging.debug(f"found nearest bin id {bin_id}") logging.debug(f"similar: {trip_features}, {bin_features}") @@ -216,7 +243,7 @@ def _apply_cutoff(self): # the cutoff point is an index along the sorted bins. any bin with a gte # index value is removed, as that bin has been found to be smaller than the cutoff. # This was the last line of calc_cutoff_bins in the old code, and is moved to the equivalent of delete_bins in the new code - bins_sorted = self.bins.sort(key=lambda bin: len(bin['features']), reverse=True) + bins_sorted = self.bins.sort(key=lambda bin: len(bin['feature_rows']), reverse=True) @@ -225,7 +252,7 @@ def _apply_cutoff(self): # for i in range(len(self.bins)): # y[i] = len(self.bins[i]) num_bins = len(bins_sorted) - bin_sizes = [len(bin_rec['features']) for bin_rec in bins_sorted.values()] + bin_sizes = [len(bin_rec['feature_rows']) for bin_rec in bins_sorted.values()] _, cutoff_bin_size = util.find_knee_point(bin_sizes) logging.debug( "bins = %s, elbow distance = %s" % (num_bins, cutoff_bin_size) @@ -233,7 +260,7 @@ def _apply_cutoff(self): updated_bins = {bin_id: bin_rec for bin_id, bin_rec in bins_sorted.items() - if len(bin_rec['features']) >= cutoff_bin_size} + if len(bin_rec['feature_rows']) >= cutoff_bin_size} removed = len(bins_sorted) - len(updated_bins) logging.debug( diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 0c0f1a26b..32bed47aa 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -46,7 +46,7 @@ def testBinning(self): model.fit(trips) # $m trip 
features should appear together in one bin - at_least_one_large_bin = any(map(lambda b: len(b['features']) == m, model.bins.values())) + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") def testPrediction(self): diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index b9e51495f..e03a4046b 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -186,13 +186,13 @@ def testIncrementalRun(self): self.assertEqual(len(updated_model.bins), 3, 'there should be three bins, one with 2 similar trips, and two singleton bins') - trips_in_bin = len(updated_model.bins['0']['features']) - print(f'trips in bins: {[len(x["features"]) for x in updated_model.bins.values()]}') + trips_in_bin = len(updated_model.bins['0']['feature_rows']) + print(f'trips in bins: {[len(x["feature_vectors"]) for x in updated_model.bins.values()]}') self.assertEqual(trips_in_bin, self.expected_trips, 'expected number of trips stored in bin') - self.assertEqual(len(updated_model.bins['1']['features']), 1, + self.assertEqual(len(updated_model.bins['1']['feature_rows']), 1, 'the second bin should have exactly one entry (an outlier)') - self.assertEqual(len(updated_model.bins['2']['features']), 1, + self.assertEqual(len(updated_model.bins['2']['feature_rows']), 1, 'the third bin should have exactly one entry (an outlier)') \ No newline at end of file From dab7295cf8354581b9bfea051af08c18f11e1251 Mon Sep 17 00:00:00 2001 From: Rob Fitzgerald Date: Thu, 11 Aug 2022 15:01:26 -0600 Subject: [PATCH 46/46] Update emission/tests/modellingTests/TestRunGreedyIncrementalModel.py Co-authored-by: shankari --- emission/tests/modellingTests/TestRunGreedyIncrementalModel.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index e03a4046b..9f8b78254 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -187,7 +187,7 @@ def testIncrementalRun(self): 'there should be three bins, one with 2 similar trips, and two singleton bins') trips_in_bin = len(updated_model.bins['0']['feature_rows']) - print(f'trips in bins: {[len(x["feature_vectors"]) for x in updated_model.bins.values()]}') + print(f'trips in bins: {[len(x["feature_rows"]) for x in updated_model.bins.values()]}') self.assertEqual(trips_in_bin, self.expected_trips, 'expected number of trips stored in bin')