
Added count_documents() implementation to builtin_timeseries #935

Merged: 11 commits, Sep 13, 2023
60 changes: 60 additions & 0 deletions emission/storage/timeseries/builtin_timeseries.py
@@ -440,3 +440,63 @@ def update_data(user_id, key, obj_id, data):
        logging.debug("updating entry %s into timeseries" % new_entry)
        edb.save(ts.get_timeseries_db(key), new_entry)

    def find_entries_count(self, key_list = None, time_query = None, geo_query = None, extra_query_list = None):
        """
        Returns the total number of documents for the given key_list, reported separately for each of the two timeseries DBs.

        Input: Key list with keys from both timeseries DBs = [key1, key2, key3, key4, ...]
            Suppose (key1, key2) are orig_tsdb keys and (key3, key4) are analysis_tsdb keys
        Output: Tuple of lists = (orig_tsdb_count, analysis_tsdb_count)
                               = ([count_key1, count_key2, ...], [count_key3, count_key4, ...])
        orig_tsdb_count and analysis_tsdb_count are lists containing the counts of matching documents
        for each key, considered separately for the specific timeseries DB.

        :param key_list: list of metadata keys we are querying for.
        :param time_query: the time range in which to search the stream
        :param geo_query: the query for a geographical area
        :param extra_query_list: any additional queries to filter out data

        For key_list = None, the total count of all documents is returned for each of the matching timeseries DBs.
        """
        logging.debug("builtin_timeseries.find_entries_count() called")

        orig_tsdb = self.timeseries_db
        analysis_tsdb = self.analysis_timeseries_db

        if key_list == [] or key_list is None:
            key_list = None

        # Segregate orig_tsdb and analysis_tsdb keys
        (orig_tsdb_keys, analysis_tsdb_keys) = self._split_key_list(key_list)

        orig_tsdb_counts = self._get_entries_counts_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list)
        analysis_tsdb_counts = self._get_entries_counts_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list)

Contributor:
I am not opposed to this, but I thought that the plan was to only support one key for now so that we could make progress on other server tasks. Can you clarify why that changed?

Contributor Author:

The blank-key input scenario prompted me to support multiple keys.
In this case, I thought of returning the total count of all documents for the specific user, not filtered by keys.

Since this is functionality of the abstract_timeseries class and its subclasses, any call to count documents would happen through objects of this class and should not directly involve any timeseries database.

Now, since we don't know which timeseries database is to be used for counting, my thought was to return the counts of documents from both timeseries databases for the user (original and analysis).

I do realize now that this could still have been achieved with a single key instead of a key_list, but a series of code changes happened in my thought process to accommodate the new behavior and test cases.

The extra time spent modifying the initial requirement could have been used for the other pending tasks.

        return (orig_tsdb_counts, analysis_tsdb_counts)


    def _get_entries_counts_for_timeseries(self, tsdb, key_list, time_query, geo_query, extra_query_list):
        tsdb_queries = []
        tsdb_counts = []

        # For each key in the key list, create a separate query
        if key_list is not None:
            for key in key_list:
                tsdb_query = self._get_query([key], time_query, geo_query, extra_query_list)
                tsdb_queries.append(tsdb_query)
            # For each per-key query, fetch the count of matching documents
            for query in tsdb_queries:
                entries_count = tsdb.count_documents(query)
                tsdb_counts.append(entries_count)
        else:
            tsdb_query = self._get_query(key_list, time_query, geo_query, extra_query_list)
            entries_count = tsdb.count_documents(tsdb_query)
            tsdb_counts = [entries_count]

        return tsdb_counts


70 changes: 70 additions & 0 deletions emission/tests/storageTests/TestTimeSeries.py
@@ -81,6 +81,76 @@ def testExtraQueries(self):
        with self.assertRaises(AttributeError):
            list(ts.find_entries(time_query=tq, extra_query_list=[ignored_phones]))

    def testFindEntriesCount(self):
        '''
        Test: Specific keys, with the other parameters left at their default values.
        Input: For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
            - Tested with the sample datasets: "shankari_2015-aug-21", "shankari_2015-aug-27"
        Output: Aug_21: ([738, 508], [0]), Aug_27: ([555, 327], [0])
            - Each element of the actual output is a single number: the count of entries for one key.
            - Validated using grep counts of occurrences for the keys: 1) "background/location" 2) "background/filtered_location"
            - $ grep -c <key> <dataset>.json
Contributor:

can you please add the grep outputs here?


        For the Aggregate Timeseries test case:
            - The expected output is the summed-up values for the respective keys from the individual users' outputs mentioned above.
            - Output: ([1293, 835], [0])
            - For each of the 3 input keys from key_list1:
                - 1293 = 738 (UUID1) + 555 (UUID2)
                - 835 = 508 (UUID1) + 327 (UUID2)
                - 0 = 0 (UUID1) + 0 (UUID2)
        '''

        ts1_aug_21 = esta.TimeSeries.get_time_series(self.testUUID1)
        ts2_aug_27 = esta.TimeSeries.get_time_series(self.testUUID)

        # Test case: Combination of original and analysis timeseries DB keys for Aug-21 dataset
        key_list1 = ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
        count_ts1 = ts1_aug_21.find_entries_count(key_list=key_list1)
        self.assertEqual(count_ts1, ([738, 508], [0]))
shankari marked this conversation as resolved.

        # Test case: Combination of original and analysis timeseries DB keys for Aug-27 dataset
        key_list1 = ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
        count_ts2 = ts2_aug_27.find_entries_count(key_list=key_list1)
        self.assertEqual(count_ts2, ([555, 327], [0]))
shankari marked this conversation as resolved.

        # Test case: Only original timeseries DB keys for Aug-27 dataset
        key_list2 = ["background/location", "background/filtered_location"]
        count_ts3 = ts2_aug_27.find_entries_count(key_list=key_list2)
        self.assertEqual(count_ts3, ([555, 327], []))
Contributor:

And here it's an empty array. Why the inconsistency?

Contributor Author:

An empty array is returned when there were no keys pertaining to the respective timeseries database.
In this specific test case, only background keys are present, hence only their counts are returned.
No analysis keys are present in the input, hence an empty array is returned for that database.

This is to differentiate it from the [0] case, where a key is present in the input but no matching documents are found.
In the [] case, no key for that database was present in the input at all.

Contributor (@shankari, Sep 7, 2023):

That is an interesting design choice, but I am not sure I agree with it. This seems:

  1. Very tied to this specific implementation: e.g. the difference between timeseries and analysis_timeseries. Whenever you are designing an abstract interface, think about what it would look like if the underlying implementation were completely different. What if we stored everything in giant CSV files instead? What if we used a real timeseries database instead? What would the meaning of timeseries and analysis_timeseries be in that case?
  2. Unnecessarily complex to use: again, think of the user of the interface, who is using the timeseries abstraction that we have outlined in chapter 5 of my thesis. They want to know how many entries there are in each stream. Do they want to know where the entries came from? Do they want to know the details of our implementation? Do they want to know the difference between the key being present but there being zero entries and the key being absent? No. We shouldn't make them do that. Note that with this implementation, we don't even support just adding the two sets of counts together because they may not have the same number of entries.

Can you articulate the pros (if any) of this interface?

Contributor Author (@MukuFlash03, Sep 7, 2023):

Thank you for your feedback on the design.

Yes, I do agree that it is too tightly based on the current implementation, especially with regard to the two types of timeseries that we currently have.

In terms of reducing the complexity, it again comes down to supporting just a single key as per the initial requirement. This would resolve many of the varying output scenarios that the current implementation has, such as the [] vs [0] case.

Motivation for this design:
In the original issue's sample code usage here, I saw that the data was fetched first and then count was called. However, I designed this keeping in mind that the user would not have to explicitly state which timeseries they want a count for.
I noticed that the key could be used to identify which timeseries is required and hence went ahead with it.

Pending issue:
Now, in this design scenario, there was the case of blank keys being passed.
How would we decide which timeseries count was to be returned to the user in this case?
Hence, I went ahead with returning the total count of the two available timeseries.
This changed my output from a single value to two separate values, one for each timeseries.

Possible resolution:

  • Stick to a single key and return its count.
  • Need to decide how blank keys are handled AND,
  • Which timeseries data is to be considered if the user wants a count of all documents without specifying a key?

Also, I am a bit unclear what you mean by this - "Note that with this implementation, we don't even support just adding the two sets of counts together because they may not have the same number of entries."

Contributor (@shankari, Sep 7, 2023):

The design should be driven by what is easy for users to use and generalizable, not by what is easy to implement.

> Motivation for this design:

Motivation for which design? Single key or multi-key?

> How would we decide what timeseries count was to be returned to the user for this case?

I don't see why this is a problem. What does find_entries do if the user doesn't specify a key? Do the same thing for counts.

> Also, I am a bit unclear what you mean by this - "Note that with this implementation, we don't even support just adding the two sets of counts together because they may not have the same number of entries."

If you always returned the same number of entries (e.g. if ['background/location', 'background/filtered_location'] returned ([x, y], [0, 0])), then we could do np.array(retVal[0]) + np.array(retVal[1]) to get the total counts. But we cannot do that with your current design.

Concretely, after this change, what will the op-admin-dashboard code that is changed to use this look like?
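The element-wise summing described here can be sketched in a few lines (a minimal sketch; the count values and the equal-length return shape are illustrative assumptions, not the PR's current behavior):

```python
import numpy as np

# Hypothetical return value where both per-collection lists have one
# entry per requested key, padding missing keys with 0
ret_val = ([738, 508], [0, 0])

# Element-wise total across the two internal collections
total_counts = np.array(ret_val[0]) + np.array(ret_val[1])
print(total_counts.tolist())  # [738, 508]
```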

Contributor:

This is why it is good, when you come to a decision point like this, to document the alternatives, think through the pros and cons, and document your decision instead of just refactoring.

Contributor Author (@MukuFlash03, Sep 8, 2023):

> Motivation for which design? single key or multi-key?

Single key, initially.
The approach of fetching timeseries data using keys is valid even in the multi-key case.
I moved to multi-key since I followed how find_entries was handling multiple keys.


> > How would we decide what timeseries count was to be returned to the user for this case?
>
> I don't see why this is a problem. What does find_entries do if the user doesn't specify a key? Do the same things for counts.

Right, so I have done what find_entries is doing - in the case of blank keys, fetch the total count of each specific dataset, which itself is done via a helper function _get_entries_for_timeseries.
Here, _get_entries_for_timeseries (used in find_entries) adds up all counts for keys belonging to a dataset to give a single count.
Ex. ['background/location', 'background/filtered_location'] output would be [x+y]

The difference in my implementation of _get_entries_counts_for_timeseries is that I am returning individual counts for each key separately, as elements of a combined list.
Ex. ['background/location', 'background/filtered_location'] = [x, y]

This is done for each of the two timeseries, and their respective query results are returned in find_entries.
Similarly, I am returning the respective counts in find_entries_count.
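The contrast described here, between the summed count computed inside find_entries and the per-key counts this PR returns, can be illustrated with placeholder values (x and y are illustrative, not real dataset counts):

```python
x, y = 555, 327  # placeholder counts for two keys in one timeseries DB

# find_entries-style helper: counts are summed into a single total per DB
summed = [x + y]

# This PR's find_entries_count: one count per key, per DB
per_key = [x, y]

print(summed)   # [882]
print(per_key)  # [555, 327]
```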

Contributor Author (@MukuFlash03, Sep 8, 2023):


> Concretely, after this change, what will the op-admin-dashboard code that is changed to use this look like?

With reference to the sample code in the original issue here:

    total_trips = edb.get_analysis_timeseries_db().count_documents(arguments include userid and keys... )

Now, the count can be fetched by:

    ts_user = esta.get_time_series(UUID(user_id))
    key_list = ['analysis/confirmed_trip']
    total_trips = ts_user.find_entries_count(key_list=key_list)[1][0]

Contributor Author (@MukuFlash03, Sep 8, 2023):


> > Also, I am a bit unclear what you mean by this - "Note that with this implementation, we don't even support just adding the two sets of counts together because they may not have the same number of entries."
>
> If you always returned the same number of entries (e.g. if ['background/location', 'background/filtered_location'] returned ([x, y], [0, 0])), then we could do np.array(retVal[0]) + np.array(retVal[1]) to get the total counts. But we cannot do that with your current design.

Alright, thank you for the clarification. I do get your point.
This would be a small change to match up the number of entries in the output.

However, I want to point out that the two lists [x, y] and [0, 0] would refer to different timeseries datasets with varying keys.
For instance, if the 2nd list also had non-zero values, say [a, b], corresponding to some analysis_timeseries keys, each element of the 1st list refers to a separate key that will not match the key from the 2nd list.
Would this be appropriate: x (= background/location) + a (= analysis/some_key) = x + a?

Contributor (@shankari, Sep 8, 2023):

> Right, so I have done what find_entries is doing - in case of blank keys, fetch the total count of each specific dataset, which itself is done via a helper function _get_entries_for_timeseries.

This is the implementation of find_entries. The interface of find_entries is that if you give it blank keys, you get a combined list of all entries in the time range. The count is not even exposed outside the implementation. The interface of find_entries_count should be the same as the interface of find_entries, which is that it should return the number of matching entries for that time range.

> This is done for each of the two timeseries and their respective query results are returned in find_entries.

The respective query results are not returned in find_entries. A combined query result is returned.
See itertools.chain(orig_ts_db_result, analysis_ts_db_result).
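The combined-result behavior of find_entries referenced here can be illustrated with a minimal itertools.chain sketch (the placeholder dicts stand in for real timeseries documents):

```python
import itertools

# Placeholder query results from the two internal collections
orig_ts_db_result = [{"metadata.key": "background/location"},
                     {"metadata.key": "background/filtered_location"}]
analysis_ts_db_result = [{"metadata.key": "analysis/confirmed_trip"}]

# find_entries exposes one combined iterable, hiding the two-collection split
entries = list(itertools.chain(orig_ts_db_result, analysis_ts_db_result))
print(len(entries))  # 3
```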

> However, I want to point out that the two lists [x,y] and [0,0] would refer to different timeseries datasets with varying keys.

I fail to see who would be interested in the implementation detail that we have two timeseries collections internally, or why they would need to know which key is stored while accessing the result.

    ts_user = esta.get_time_series(UUID(user_id))
    key_list = ['analysis/confirmed_trip']
    total_trips = ts_user.find_entries_count(key_list=key_list)[1][0]

and if it were location, we would need:

    ts_user = esta.get_time_series(UUID(user_id))
    key_list = ['background/location']
    total_trips = ts_user.find_entries_count(key_list=key_list)[0][0]

Can you see how that is leaking the implementation outside the interface?
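A hedged sketch of the single-number interface being argued for here (find_entries_count_simple is a hypothetical helper, not code from this PR; it collapses the per-collection, per-key counts into one total):

```python
def find_entries_count_simple(orig_counts, analysis_counts):
    # Hypothetical: callers get one number, regardless of which
    # internal collection each key happens to live in
    return sum(orig_counts) + sum(analysis_counts)

# The caller no longer indexes into ([...], [...]) by collection
total_trips = find_entries_count_simple([555, 327], [0])
print(total_trips)  # 882
```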


        # Test case: Only analysis timeseries DB keys
        key_list3 = ["analysis/confirmed_trip"]
        count_ts4 = ts2_aug_27.find_entries_count(key_list=key_list3)
        self.assertEqual(count_ts4, ([], [0]))
Contributor:

Expected interface:

Suggested change:
-        self.assertEqual(count_ts4, ([], [0]))
+        self.assertEqual(count_ts4, 0)


        # Test case: Empty key_list, which should return the total count of all documents in the two DBs
        key_list4 = []
        count_ts5 = ts1_aug_21.find_entries_count(key_list=key_list4)
        self.assertEqual(count_ts5, ([2125], [0]))
Contributor:

Suggested change:
-        self.assertEqual(count_ts5, ([2125], [0]))
+        self.assertEqual(count_ts5, 2125)


        # Test case: Invalid or unmatched key in metadata field
        key_list5 = ["randomxyz_123test"]
        with self.assertRaises(KeyError) as ke:
            count_ts6 = ts1_aug_21.find_entries_count(key_list=key_list5)
        self.assertEqual(str(ke.exception), "'randomxyz_123test'")

        # Test case: Aggregate timeseries DB user data passed as input
        ts_agg = esta.TimeSeries.get_aggregate_time_series()
        count_ts7 = ts_agg.find_entries_count(key_list=key_list1)
        self.assertEqual(count_ts7, ([1293, 835], [0]))
Contributor:

Suggested change:
-        self.assertEqual(count_ts7, ([1293, 835], [0]))
+        self.assertEqual(count_ts7, 2125)


        # Test case: New user created with no data to check
        self.testEmail = None
        self.testUUID2 = self.testUUID
        etc.createAndFillUUID(self)
        ts_new_user = esta.TimeSeries.get_time_series(self.testUUID)
        count_ts8 = ts_new_user.find_entries_count(key_list=key_list1)
        self.assertEqual(count_ts8, ([0, 0], [0]))

        print("Assert Test for Count Data successful!")


if __name__ == '__main__':
    import emission.tests.common as etc
    etc.configLogging()