-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from kunaljubce/kunaljubce/python-script-to-pro…
…cess-1br-v1 Kunaljubce/python script to process 1br v1
- Loading branch information
Showing
2 changed files
with
98 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,29 @@ | ||
# 1brc | ||
This is a Python implementation of the wildly popular One Billion Row Challenge (1BRC), which was originally posed as a Java-only challenge - https://github.com/gunnarmorling/1brc | ||
|
||
## Experimental Setup | ||
|
||
* All executions and runtimes were recorded on Python 3.8.18, on an Apple M2 Pro machine with 16 GB of RAM and a 500 GB hard disk. | ||
* `createMeasurements.py` copied from https://github.com/ifnesi/1brc/blob/main/createMeasurements.py. | ||
* For every iteration, the runtime noted below is the observed best runtime after multiple trials with different batch sizes and other parameter adjustments. | ||
|
||
### Runtime improvements over iterations | ||
|
||
| Iteration Number | Runtime (in seconds) | | ||
| ---------------- | -------------------- | | ||
| 1 | 1138.61 | | ||
| 2 | | | ||
|
||
### Implementation Details and improvements done over iterations | ||
|
||
##### Iteration 1 | ||
* Read the file in batches. Call `batch_calculation()` to do the calculation of average batch by batch, as the file is read. Inside this function - | ||
* * First, each line in the batch tuple is split on `;` into a list - [Place, Temperature] | ||
* * This list is then converted to a dict {Place: Temperature} and appended to a `List[Dict]` variable - `input_batch_list`. | ||
* * Finally we are passing this `input_batch_list` variable to the `calc_average_over_entire_data()` function. | ||
* The `calc_average_over_entire_data()` achieves two objectives - | ||
* * When called from within `batch_calculation()`, it iterates over the `input_batch_list`, adds each reading to the running per-place total and count, and recalculates the averages over all the data read so far. | ||
* * | ||
|
||
|
||
|
||
`createMeasurements.py` copied from https://github.com/ifnesi/1brc/blob/main/createMeasurements.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import time | ||
from itertools import islice | ||
from typing import Tuple, List, Dict | ||
|
||
# Path of the measurements file that is read line by line.
filename = "measurements.txt"

# Number of lines pulled from the file per batch; adjust to taste.
n = 500_000

# List of single-entry dicts, e.g. [{'Alexandra': -7.6}, {'Malé': 30.6}, {'Pyongyang': 3.4}]
temp_per_place_all_batches = list()
# Place -> [running temperature total, number of readings]
total_temp_per_place_all_batches = dict()
# Place -> average temperature, rounded to one decimal place
avg_temp_per_place_all_batches = dict()
|
||
def calc_average_over_entire_data(temp_per_place_all_batches: List[Dict[str, float]]) -> Dict[str, float]:
    '''
    Fold a list of readings into the running per-place totals and refresh the averages.

    The function keeps its state in two module-level dicts: every reading is added to
    `total_temp_per_place_all_batches` (place -> [temperature total, reading count]) and
    `avg_temp_per_place_all_batches` is then recomputed for every place seen so far.
    Calling it with an empty list therefore just recomputes the averages.

    :param temp_per_place_all_batches: List of dictionaries, each element in the form `{Name of Place: Temp in floating point}`
    :returns: The module-level `avg_temp_per_place_all_batches` dict, in the form `{Name of Place: Calculated avg temp rounded to 1 decimal}`
    '''
    totals = total_temp_per_place_all_batches
    averages = avg_temp_per_place_all_batches

    for place_temp_dict in temp_per_place_all_batches:
        for place, temp in place_temp_dict.items():
            # A single lookup per reading; the stored list is mutated in place.
            total_with_count = totals.get(place)
            if total_with_count is None:
                totals[place] = [temp, 1]
            else:
                total_with_count[0] += temp
                total_with_count[1] += 1

    for place, (total_temp, count) in totals.items():
        averages[place] = round(total_temp / count, 1)

    return averages
|
||
|
||
def batch_calculation(input_batch: Tuple, batch_counter: int) -> int:
    '''
    Parse one batch of raw measurement lines and hand it over for aggregation.

    Each non-empty line is split on `;` into place and temperature, wrapped in a
    single-entry dict and collected into a list, which is then passed to
    `calc_average_over_entire_data()`.

    :param input_batch: One input batch containing up to `n` rows of measurements data, each a `Place;Temperature` string
    :param batch_counter: Number of batches processed before this one
    :returns: Updated batch_counter.
    :raises IndexError: if a non-empty line contains no `;` separator
    :raises ValueError: if the temperature field cannot be converted to float
    '''
    batch_counter += 1
    print("Processing batch - ", batch_counter)
    input_batch_list = []

    for element in input_batch:
        if not element:
            # A blank line carries no measurement; skip it instead of crashing on the split.
            continue
        element_list = element.split(';')
        input_batch_list.append({element_list[0]: float(element_list[1])})

    calc_average_over_entire_data(input_batch_list)
    return batch_counter
|
||
|
||
def main():
    '''
    Process the measurements file batch by batch and print the per-place averages.

    The file named by `filename` is opened in binary mode and consumed `n` lines at a
    time; each line is decoded as UTF-8 and stripped before the batch is passed to
    `batch_calculation()`. Once the file is exhausted the averages and the elapsed
    time are printed.
    '''
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by clock adjustments.
    start_time = time.perf_counter()
    batch_counter: int = 0

    with open(filename, 'rb') as f:
        while True:
            n_lines = tuple(line.decode('utf-8').strip() for line in islice(f, n))
            if not n_lines:
                # islice yielded nothing: end of file reached.
                break
            batch_counter = batch_calculation(n_lines, batch_counter)

    # Final pass over the (module-level) list to obtain the up-to-date averages.
    averages = calc_average_over_entire_data(temp_per_place_all_batches)

    print(averages)
    print("Time taken:", (time.perf_counter() - start_time), "seconds")
|
||
|
||
# Run the processing only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()