comments and naming changes
robfitzgerald committed Aug 11, 2022
1 parent 6901b99 commit b252622
Showing 3 changed files with 56 additions and 29 deletions.
75 changes: 51 additions & 24 deletions emission/analysis/modelling/trip_model/greedy_similarity_binning.py
@@ -23,30 +23,57 @@ def __init__(self, config=None):
[https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L67]

this technique employs a greedy similarity heuristic to associate
- trips with collections of probabilistic class labels. in pseudocode:
+ trips with collections of probabilistic class labels. new bins are
+ created when the next feature vector is not similar to any existing bins.
+ for a new feature vector to be similar to an existing bin, it must be
+ similar to all of the previous feature vectors found in that bin, by way
+ of a provided similarity metric and threshold value.
+
+ in pseudocode:

# fit
for each bin_id, bin in bins:
-     for each bin_trip in bin.trips:
-         if similar(trip, bin_trip):
-             append trip to bin.trips
+     for each bin_feature_row in bin.feature_rows:
+         if not similar(trip.feature_row, bin_feature_row):
+             return
+     append trip to bin
the prediction of labels for some input trip takes a similar form,
where the first bin that is found to be similar is treated as the
class label to apply:

# prediction
for each bin_id, bin in bins:
-     for each bin_trip in bin.trips:
-         if similar(trip, bin_trip):
-             return bin.predictions: List[Prediction]
- the number of predictions is not assumed to be the number of features.
- the original similarity class (link above) used a nested List data
+     for each bin_feature_row in bin.feature_rows:
+         if not similar(trip.feature_row, bin_feature_row):
+             break
+     return bin_id
+
+ to train the predictions, label sets are aggregated within a bin so that
+ the occurrences of each unique label combination are counted. the probability
+ of a specific unique label combination is assigned by the proportion
+ of counts of this unique label set to the total number of trips stored at
+ this bin. the set of unique label sets and their prediction value are then
+ returned during prediction.
+
+ in terms of the data structure of the model, each bin is a Dictionary with
+ three fields, "feature_rows", "labels", and "predictions", each a list.
+ whereas the number and index of "feature_rows" and "labels" are assumed to
+ match and be idempotent across multiple training calls, the "predictions"
+ are over-written at each call of "fit" and are not assumed to match the number
+ of "feature_rows" or "labels" stored in a bin.
+
+ historical note: the original similarity class (link above) used a nested list data
structure to capture the notion of binning. this was then copied into
- a Dict when the model needed to be saved. the same technique can be
- written to work directly on nested Dicts with no loss in performance.
+ a Dict when the model needed to be saved. the same technique can be re-written to
+ work directly on Dictionaries with no loss in the algorithm's time complexity. this
+ also helps when running in incremental mode to persist relevant training data and to
+ minimize codec + serialization errors.

the data takes the form:
{
    bin_id: {
-         "features": [
+         "feature_rows": [
            [f1, f2, .., fn],
            ...
        ],
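
To make the docstring's pseudocode concrete, a minimal, self-contained Python sketch of the greedy assignment step follows. It is not the project's implementation: the `similar` helper, the flat feature rows, the threshold value, and the label dicts are illustrative stand-ins for the model's configured similarity metric, its `extract_features` output, and confirmed-trip user inputs.

from typing import Dict, List

SIM_THRESHOLD = 500.0  # illustrative threshold; the real value comes from the model config


def similar(a: List[float], b: List[float], thresh: float = SIM_THRESHOLD) -> bool:
    # stand-in metric: every feature must be within `thresh` of its counterpart
    return all(abs(x - y) <= thresh for x, y in zip(a, b))


def assign_to_bins(feature_rows: List[List[float]], label_rows: List[Dict]) -> Dict[str, Dict]:
    # greedy binning as described above: a row joins the first bin where it is
    # similar to *every* feature row already stored in that bin
    bins: Dict[str, Dict] = {}
    for features, labels in zip(feature_rows, label_rows):
        matched_bin_id = None
        for bin_id, bin_record in bins.items():
            if all(similar(features, row) for row in bin_record["feature_rows"]):
                matched_bin_id = bin_id
                break
        if matched_bin_id is not None:
            bins[matched_bin_id]["feature_rows"].append(features)
            bins[matched_bin_id]["labels"].append(labels)
        else:
            bins[str(len(bins))] = {
                "feature_rows": [features],
                "labels": [labels],
                "predictions": [],  # filled in by a later training step
            }
    return bins


rows = [[0.0, 0.0], [100.0, 50.0], [5000.0, 5000.0]]
labels = [{"mode": "bike"}, {"mode": "bike"}, {"mode": "car"}]
print(assign_to_bins(rows, labels))  # two bins: rows 0 and 1 together, row 2 alone
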
@@ -124,7 +151,7 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]:
            return [], 0
        else:
            predictions = predicted_bin_record['predictions']
-             n_features = len(predicted_bin_record['features'])
+             n_features = len(predicted_bin_record['feature_rows'])
            logging.debug(f"found cluster {predicted_bin_id} with predictions {predictions}")
            return predictions, n_features
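
The docstring above says the "predictions" stored in a bin are built by counting unique label combinations and converting the counts to proportions. A hedged sketch of that aggregation follows; the record shape ({"labels": ..., "p": ...}) and the example label values are assumptions for illustration, not taken from the displayed diff.

from collections import Counter
from typing import Dict, List


def build_predictions(bin_labels: List[Dict]) -> List[Dict]:
    # count each unique label combination, then express the count as a
    # proportion of all trips stored in the bin
    counts = Counter(tuple(sorted(lbl.items())) for lbl in bin_labels)  # dicts are unhashable, so key by sorted items
    total = len(bin_labels)
    return [{"labels": dict(key), "p": count / total} for key, count in counts.items()]


example = [
    {"mode": "bike", "purpose": "work"},
    {"mode": "bike", "purpose": "work"},
    {"mode": "walk", "purpose": "work"},
]
print(build_predictions(example))  # two records: bike/work with p = 2/3, walk/work with p = 1/3
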

@@ -155,15 +182,15 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]):
            if bin_id is not None:
                # add to existing bin
                logging.debug(f"adding trip to bin {bin_id} with features {trip_features}")
-                 self.bins[bin_id]['features'].append(trip_features)
+                 self.bins[bin_id]['feature_rows'].append(trip_features)
                self.bins[bin_id]['labels'].append(trip_labels)
            else:
                # create new bin
                new_bin_id = str(len(self.bins))
                new_bin_record = {
-                     "features": [trip_features],
-                     "labels": [trip_labels],
-                     "predictions": []
+                     'feature_rows': [trip_features],
+                     'labels': [trip_labels],
+                     'predictions': []
                }
                logging.debug(f"creating new bin {new_bin_id} at location {trip_features}")
                self.bins[new_bin_id] = new_bin_record
@@ -178,7 +205,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]:
"""
for bin_id, bin_record in self.bins.items():
matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh)
for bin_sample in bin_record['features']])
for bin_sample in bin_record['feature_rows']])
if matches_bin:
return bin_id
return None
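
`_find_matching_bin_id` delegates the actual comparison to `self.metric.similar(trip_features, bin_sample, self.sim_thresh)`. The metric itself is not part of this diff, so the following is only an illustrative stand-in: it assumes the feature row is [origin_lat, origin_lon, dest_lat, dest_lon] and declares two rows similar when both endpoints fall within a distance threshold.

import math
from typing import List


def od_similar(a: List[float], b: List[float], thresh_meters: float) -> bool:
    # illustrative stand-in for a similarity metric; assumes rows are
    # [origin_lat, origin_lon, dest_lat, dest_lon]
    def dist_m(lat1, lon1, lat2, lon2):
        # small-distance equirectangular approximation, adequate for a threshold check
        dx = math.radians(lon2 - lon1) * math.cos(math.radians((lat1 + lat2) / 2.0))
        dy = math.radians(lat2 - lat1)
        return 6371000.0 * math.hypot(dx, dy)

    return (dist_m(a[0], a[1], b[0], b[1]) <= thresh_meters
            and dist_m(a[2], a[3], b[2], b[3]) <= thresh_meters)


# mirrors the all(...) check above: a trip matches a bin only if it is similar
# to every feature row already stored in that bin
row_a = [39.7400, -104.9900, 39.7500, -105.0000]
row_b = [39.7405, -104.9905, 39.7502, -105.0003]
print(od_similar(row_a, row_b, thresh_meters=500))  # True
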
@@ -199,7 +226,7 @@ def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]:
        trip_features = self.extract_features(trip)

        for bin_id, bin_record in self.bins.items():
-             for bin_features in bin_record['features']:
+             for bin_features in bin_record['feature_rows']:
                if self.metric.similar(trip_features, bin_features, self.sim_thresh):
                    logging.debug(f"found nearest bin id {bin_id}")
                    logging.debug(f"similar: {trip_features}, {bin_features}")
@@ -216,7 +243,7 @@ def _apply_cutoff(self):
        # the cutoff point is an index along the sorted bins. any bin with a gte
        # index value is removed, as that bin has been found to be smaller than the cutoff.
        # This was the last line of calc_cutoff_bins in the old code, and is moved to the equivalent of delete_bins in the new code
-         bins_sorted = self.bins.sort(key=lambda bin: len(bin['features']), reverse=True)
+         bins_sorted = self.bins.sort(key=lambda bin: len(bin['feature_rows']), reverse=True)

@@ -225,15 +252,15 @@ def _apply_cutoff(self):
        # for i in range(len(self.bins)):
        #     y[i] = len(self.bins[i])
        num_bins = len(bins_sorted)
-         bin_sizes = [len(bin_rec['features']) for bin_rec in bins_sorted.values()]
+         bin_sizes = [len(bin_rec['feature_rows']) for bin_rec in bins_sorted.values()]
        _, cutoff_bin_size = util.find_knee_point(bin_sizes)
        logging.debug(
            "bins = %s, elbow distance = %s" % (num_bins, cutoff_bin_size)
        )

        updated_bins = {bin_id: bin_rec
                        for bin_id, bin_rec in bins_sorted.items()
-                         if len(bin_rec['features']) >= cutoff_bin_size}
+                         if len(bin_rec['feature_rows']) >= cutoff_bin_size}

        removed = len(bins_sorted) - len(updated_bins)
        logging.debug(
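
One detail worth noting in the `_apply_cutoff` hunk above: `self.bins` is a Dictionary in this model, and Python dicts have no `.sort` method, so the sorting would have to go through `sorted()` over the bin items. A hedged sketch of the sort-plus-cutoff step follows; the knee-point search (`util.find_knee_point` in the diff) is replaced here by a plain `cutoff_bin_size` argument, so this illustrates the dictionary handling rather than the repository's function.

from typing import Dict


def apply_cutoff_sketch(bins: Dict[str, Dict], cutoff_bin_size: int) -> Dict[str, Dict]:
    # sort bin records by size (descending) via sorted(), since dicts cannot be
    # sorted in place, then keep only bins at or above the cutoff size
    bins_sorted = dict(
        sorted(bins.items(), key=lambda kv: len(kv[1]["feature_rows"]), reverse=True)
    )
    return {
        bin_id: bin_rec
        for bin_id, bin_rec in bins_sorted.items()
        if len(bin_rec["feature_rows"]) >= cutoff_bin_size
    }


example_bins = {
    "0": {"feature_rows": [[0.0, 0.0]] * 5, "labels": [{}] * 5, "predictions": []},
    "1": {"feature_rows": [[9.0, 9.0]], "labels": [{}], "predictions": []},
}
print(list(apply_cutoff_sketch(example_bins, cutoff_bin_size=2)))  # ['0']
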
@@ -46,7 +46,7 @@ def testBinning(self):
        model.fit(trips)

        # $m trip features should appear together in one bin
-         at_least_one_large_bin = any(map(lambda b: len(b['features']) == m, model.bins.values()))
+         at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values()))
        self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it")

    def testPrediction(self):
@@ -186,13 +186,13 @@ def testIncrementalRun(self):
        self.assertEqual(len(updated_model.bins), 3,
            'there should be three bins, one with 2 similar trips, and two singleton bins')

-         trips_in_bin = len(updated_model.bins['0']['features'])
-         print(f'trips in bins: {[len(x["features"]) for x in updated_model.bins.values()]}')
+         trips_in_bin = len(updated_model.bins['0']['feature_rows'])
+         print(f'trips in bins: {[len(x["feature_rows"]) for x in updated_model.bins.values()]}')
        self.assertEqual(trips_in_bin, self.expected_trips,
            'expected number of trips stored in bin')

-         self.assertEqual(len(updated_model.bins['1']['features']), 1,
+         self.assertEqual(len(updated_model.bins['1']['feature_rows']), 1,
            'the second bin should have exactly one entry (an outlier)')
-         self.assertEqual(len(updated_model.bins['2']['features']), 1,
+         self.assertEqual(len(updated_model.bins['2']['feature_rows']), 1,
            'the third bin should have exactly one entry (an outlier)')