Merge pull request #847 from shankari/improve_has_label

Create a dataframe version of has_final_labels
e-mission · Jan 15, 2022 · 1f6d6c8 · 1f6d6c8
2 parents 6bfecc9 + e9fe2ed
commit 1f6d6c8
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 0 deletions.
diff --git a/emission/storage/decorations/trip_queries.py b/emission/storage/decorations/trip_queries.py
@@ -236,6 +236,15 @@ def has_final_labels(confirmed_trip_data):
     return (confirmed_trip_data["user_input"] != {}
             or confirmed_trip_data["expectation"]["to_label"] == False)
 
+# Column-wise counterpart of has_final_labels: keeps the rows of df whose user_input
+# is not {} or whose expectation["to_label"] is False, avoiding a per-row apply.
+def has_final_labels_df(df):
+    # Expand each row's expectation dict into columns (indexed like df) and take to_label,
+    # despite the variable's name. NOTE(review): an empty df likely has no to_label column - confirm.
+    to_list_series = pd.DataFrame(df.expectation.to_list(), index=df.index).to_label
+    return df[(df.user_input != {})
+            | (to_list_series == False)]
+
 def get_max_prob_label(inferred_label_list):
     # Two columns: "labels" and "p"
     label_prob_df = pd.DataFrame(inferred_label_list)

diff --git a/emission/tests/storageTests/TestTripQueries.py b/emission/tests/storageTests/TestTripQueries.py
@@ -399,6 +399,39 @@ def testHasFinalLabels(self):
             "expectation": {"to_label": False}
         })))
 
+    def testHasFinalLabelsDataFrame(self):
+        test_mixed_df = pd.DataFrame(
+            [{"user_input": {"mode_confirm": "bike", "purpose_confirm": "shopping"},
+            "expectation": {"to_label": True}}] * 3 +
+            [{"user_input": {}, "expectation": {"to_label": False},
+            "inferred_labels":
+                [{"labels": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, "p": 0.1},
+                {"labels": {"mode_confirm": "walk", "purpose_confirm": "exercise"}, "p": 0.9}]
+            }] * 3 +
+            [{"user_input": {}, "expectation": {"to_label": True},
+            "inferred_labels":
+                [{"labels": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, "p": 0.2},
+                {"labels": {"mode_confirm": "walk", "purpose_confirm": "exercise"}, "p": 0.4},
+                {"labels": {"mode_confirm": "drove_alone", "purpose_confirm": "work"}, "p": 0.4}]
+            }] * 3 +
+            [{"user_input": {}, "expectation": {"to_label": True}}] * 3)
+
+        has_user_labels_df = test_mixed_df[test_mixed_df.user_input != {}]
+        # filtering on user_input alone finds only the 3 rows with actual user labels
+        self.assertEqual(has_user_labels_df.shape[0], 3)
+
+        # row-wise has_final_labels also counts the 3 rows with to_label = False, giving 6
+        self.assertEqual(np.count_nonzero(test_mixed_df.apply(
+            lambda row: esdt.has_final_labels(row), axis=1)), 6)
+
+        has_final_labels_df = test_mixed_df[test_mixed_df.apply(
+            lambda row: esdt.has_final_labels(row), axis=1)]
+        # selecting with the row-wise mask keeps those 6 rows; the column-wise version must match
+        self.assertEqual(has_final_labels_df.shape[0], 6)
+
+        self.assertEqual(esdt.has_final_labels_df(test_mixed_df).shape[0], 6)
+
+
     def testGetMaxProbLabel(self):
         self.assertEqual(esdt.get_max_prob_label([
             {'labels': {'mc': 30, 'pc': 40}, 'p': 0.9}]), {'mc': 30, 'pc': 40})