Merge pull request #2 from kunaljubce/kunaljubce/python-script-to-pro…

…cess-1br-v1 Kunaljubce/python script to process 1br v1
kunaljubce · Mar 14, 2024 · bdfecbc · bdfecbc
2 parents a506b4c + 72c006c
commit bdfecbc
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,3 +1,29 @@
 # 1brc
+This is a Python implementation of the wildly popular One Billion Row Challenge (1BRC), which was originally posed as a Java-only challenge - https://github.com/gunnarmorling/1brc
+
+## Experimental Setup
+
+* All executions and runtimes have been noted while running on Python 3.8.18 on an Apple M2 Pro machine with 16 GB of RAM and a 500 GB hard disk.
+* `createMeasurements.py` copied from https://github.com/ifnesi/1brc/blob/main/createMeasurements.py.
+* For every iteration, the runtime noted below is the observed best runtime after multiple trials with different batch sizes and other parameter adjustments.  
+
+### Runtime improvements over iterations
+
+| Iteration Number | Runtime (in seconds) |
+| ---------------- | -------------------- |
+| 1                | 1138.61              |
+| 2                |               |
+
+### Implementation Details and improvements done over iterations
+
+##### Iteration 1
+* Read the file in batches. Call `batch_calculation()` to do the calculation of average batch by batch, as the file is read. Inside this function - 
+* * First, each line in the batch tuple that was read is split on `;` into a list - [Place, Temperature]
+* * This list is then converted to a dict {Place: Temperature} and appended to a List[Dict] variable - `input_batch_list`.
+* * Finally we are passing this `input_batch_list` variable to the `calc_average_over_entire_data()` function.
+* The `calc_average_over_entire_data()` achieves two objectives - 
+* * When called from within `batch_calculation()`, it iterates over the `input_batch_list`, adds each reading to the running per-place totals and counts, and recalculates the averages from those totals.
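+* * When called from `main()` after the last batch has been processed, it returns the final average per place, computed from the totals accumulated across all batches.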
+
+
 
-`createMeasurements.py` copied from https://github.com/ifnesi/1brc/blob/main/createMeasurements.py
diff --git a/calc_avg_native_python.py b/calc_avg_native_python.py
@@ -0,0 +1,71 @@
+import time
+from itertools import islice
+from typing import Tuple, List, Dict
+
+# Path of the measurements file that main() opens.
+filename = 'measurements.txt'
+
+# Maximum number of lines main() reads from the file per batch.
+n = 500000  # Or whatever chunk size you want
+
+# Handed to calc_average_over_entire_data() by main() once all batches are done;
+# nothing in the visible code ever appends to it, so that final call only
+# recomputes the averages from the running totals below.
+temp_per_place_all_batches = [] # List of dicts i.e. [{'Alexandra': -7.6}, {'Malé': 30.6}, {'Pyongyang': 3.4}]
+# Running totals shared across batches: {place: [sum of temps, number of readings]}.
+total_temp_per_place_all_batches = {}
+# Most recently computed average per place, rounded to one decimal.
+avg_temp_per_place_all_batches = {}
+
+def calc_average_over_entire_data(temp_per_place_all_batches: List[Dict[str, float]]) -> Dict[str, float]:
+    '''
+    Function to fold a list of readings into the module-level running totals and refresh the averages
+
+    Each reading is added to `total_temp_per_place_all_batches` (kept as `[sum, count]` per place), after
+    which `avg_temp_per_place_all_batches` is recomputed for every place seen so far. Passing an empty
+    list therefore just recomputes the averages from the totals accumulated by earlier calls.
+
+    :param temp_per_place_all_batches: List of dictionaries, each element in the form `{Name of Place: Temp in floating point}`
+    :returns: The module-level dict in the form - `{Name of Place: Calculated avg temp in floating point}`
+    '''
+
+    for reading in temp_per_place_all_batches:
+        for place, temp in reading.items():
+            running = total_temp_per_place_all_batches.get(place)
+            if running is None:
+                # First reading for this place: start its [sum, count] pair.
+                total_temp_per_place_all_batches[place] = [temp, 1]
+            else:
+                running[0] += temp
+                running[1] += 1
+
+    # Averages are rounded to one decimal place.
+    avg_temp_per_place_all_batches.update({
+        place: round(temp_sum / reading_count, 1)
+        for place, (temp_sum, reading_count) in total_temp_per_place_all_batches.items()
+    })
+
+    return avg_temp_per_place_all_batches
+
+
+def batch_calculation(input_batch: Tuple, batch_counter: int) -> int:
+    '''
+    Function to parse one batch of rows and pass the parsed readings to `calc_average_over_entire_data()`
+
+    :param input_batch: One input batch of measurement rows, each a string in the form `Place;Temperature`
+    :param batch_counter: Number of batches processed before this one
+    :returns: Updated batch_counter.
+    '''
+    batch_counter += 1
+    print("Processing batch - ", batch_counter)
+    input_batch_list = []
+
+    for element in input_batch:
+        # A blank row (e.g. an empty line in the file) has no `;` separated
+        # temperature field; skip it instead of failing with an IndexError below.
+        if not element.strip():
+            continue
+        element_list = element.split(';')
+        input_batch_list.append({element_list[0]: float(element_list[1])})
+
+    calc_average_over_entire_data(input_batch_list)
+    return batch_counter
+
+
+def main():
+    '''
+    Function to read the measurements file in batches of at most `n` lines, process each batch, then print the averages and the elapsed time
+    '''
+    started_at = time.time()
+    batches_done: int = 0
+
+    with open(filename, 'rb') as measurements:
+        while True:
+            # Pull up to `n` raw lines, decoding and trimming each one.
+            batch = tuple(raw.decode('utf-8').strip() for raw in islice(measurements, n))
+            if not batch:
+                # An empty batch means the file is exhausted.
+                break
+            batches_done = batch_calculation(batch, batches_done)
+
+    calc_average_over_entire_data(temp_per_place_all_batches)
+
+    print(avg_temp_per_place_all_batches)
+    print("Time taken:", (time.time() - started_at), "seconds")
+
+
+# Run the aggregation only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()