diff --git a/.gitignore b/.gitignore
index 8b69396a..eabcd78f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+__pycache__/
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 release.zip
diff --git a/public/port-0.0.0-py3-none-any.whl b/public/port-0.0.0-py3-none-any.whl
index 4b6f4ac8..2032c421 100644
Binary files a/public/port-0.0.0-py3-none-any.whl and b/public/port-0.0.0-py3-none-any.whl differ
diff --git a/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl b/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl
index 4b6f4ac8..2032c421 100644
Binary files a/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl and b/src/framework/processing/py/dist/port-0.0.0-py3-none-any.whl differ
diff --git a/src/framework/processing/py/port/script.py b/src/framework/processing/py/port/script.py
index 7abb7d71..22ee8b7f 100644
--- a/src/framework/processing/py/port/script.py
+++ b/src/framework/processing/py/port/script.py
@@ -1,62 +1,154 @@
+import fnmatch
+import json
+from datetime import datetime
+from collections import namedtuple
+
 import port.api.props as props
-from port.api.commands import (CommandSystemDonate, CommandUIRender)
+from port.api.commands import CommandSystemDonate, CommandUIRender
 import pandas as pd
 import zipfile

+ExtractionResult = namedtuple("ExtractionResult", ["id", "title", "data_frame"])
+
+
+def get_in(dct, *key_path):
+    # Walk a nested dict along key_path; return None as soon as a key is missing.
+    for key in key_path:
+        dct = dct.get(key)
+        if dct is None:
+            return None
+    return dct
+
+
+def parse_json_to_dataframe(parsed_dict):
+    data = []
+    for obj in parsed_dict["timelineObjects"]:
+        if "activitySegment" not in obj:
+            continue
+
+        segment = obj["activitySegment"]
+        activity_type = segment["activityType"]
+
+        if activity_type not in {"WALKING", "CYCLING", "RUNNING"}:
+            continue
+
+        start_timestamp_str = segment["duration"]["startTimestamp"]
+        start_timestamp = datetime.fromisoformat(
+            start_timestamp_str[:-1]
+        )  # remove the 'Z'
+
+        if meters := get_in(segment, "waypointPath", "distanceMeters"):
+            distance_meters = meters
+        elif meters := get_in(segment, "simplifiedRawPath", "distanceMeters"):
+            distance_meters = meters
+        elif meters := segment.get("distance"):
+            distance_meters = meters
+        else:
+            continue
+
+        data.append([start_timestamp, activity_type, distance_meters])
+
+    return pd.DataFrame(
+        data, columns=["startTimestamp", "activityType", "distanceMeters"]
+    )
+
+
+def aggregate_distance_by_day_activity(df):
+    # Format the startTimestamp to "year-month-day" format
+    df["startTimestamp"] = df["startTimestamp"].dt.strftime("%Y-%m-%d")
+
+    # Group by formatted date and activityType, then aggregate the distance
+    aggregated_df = (
+        df.groupby(["startTimestamp", "activityType"])["distanceMeters"]
+        .sum()
+        .reset_index()
+    )
+
+    return aggregated_df
+
+
+def extract(df):
+    aggregated_df = aggregate_distance_by_day_activity(df)
+    aggregated_df["Afstand in km"] = aggregated_df["distanceMeters"] / 1000
+
+    results = []
+    for activity_type, title in [
+        ("WALKING", {"en": "Walking", "nl": "Gewandeld"}),
+        ("CYCLING", {"en": "Cycling", "nl": "Gefietst"}),
+        ("RUNNING", {"en": "Running", "nl": "Hardgelopen"}),
+    ]:
+        # copy() so that adding the "Datum" column below does not trigger
+        # pandas' SettingWithCopyWarning on a slice of aggregated_df
+        df = aggregated_df.loc[aggregated_df["activityType"] == activity_type].copy()
+        if len(df) == 0:
+            continue
+
+        df["Datum"] = df["startTimestamp"]
+        df = (
+            df.drop(columns=["distanceMeters", "activityType", "startTimestamp"])
+            .reset_index(drop=True)
+            .reindex(columns=["Datum", "Afstand in km"])
+        )
+        results.append(
+            ExtractionResult(
+                id=activity_type.lower(),
+                title=props.Translatable(title),
+                data_frame=df,
+            )
+        )
+    return results
+
+
 def process(sessionId):
     yield donate(f"{sessionId}-tracking", '[{ "message": "user entered script" }]')
-    platforms = ["Twitter", "Facebook", "Instagram", "Youtube"]
-
-    subflows = len(platforms)
-    steps = 2
-    step_percentage = (100/subflows)/steps
-
-    # progress in %
-    progress = 0
-
-    for index, platform in enumerate(platforms):
-        meta_data = []
-        meta_data.append(("debug", f"{platform}: start"))
-
-        # STEP 1: select the file
-        progress += step_percentage
-        data = None
-        while True:
-            meta_data.append(("debug", f"{platform}: prompt file"))
-            promptFile = prompt_file(platform, "application/zip, text/plain")
-            fileResult = yield render_donation_page(platform, promptFile, progress)
-            if fileResult.__type__ == 'PayloadString':
-                meta_data.append(("debug", f"{platform}: extracting file"))
-                extractionResult = doSomethingWithTheFile(platform, fileResult.value)
-                if extractionResult != 'invalid':
-                    meta_data.append(("debug", f"{platform}: extraction successful, go to consent form"))
-                    data = extractionResult
+    meta_data = []
+    meta_data.append(("debug", "start"))
+
+    # STEP 1: select the file
+    data = None
+    while True:
+        promptFile = prompt_file()
+        fileResult = yield render_donation_page(promptFile, 33)
+        if fileResult.__type__ == "PayloadString":
+            meta_data.append(("debug", "extracting file"))
+            extractionResult = extract_data_from_zip(fileResult.value)
+            if extractionResult == "invalid":
+                meta_data.append(
+                    ("debug", "prompt confirmation to retry file selection")
+                )
+                retry_result = yield render_donation_page(retry_confirmation(), 33)
+                if retry_result.__type__ == "PayloadTrue":
+                    meta_data.append(("debug", "skip due to invalid file"))
+                    continue
+                else:
+                    meta_data.append(("debug", "retry prompt file"))
                     break
+            if extractionResult == "no-data":
+                retry_result = yield render_donation_page(retry_no_data_confirmation(), 33)
+                if retry_result.__type__ == "PayloadTrue":
+                    continue
                 else:
-                    meta_data.append(("debug", f"{platform}: prompt confirmation to retry file selection"))
-                    retry_result = yield render_donation_page(platform, retry_confirmation(platform), progress)
-                    if retry_result.__type__ == 'PayloadTrue':
-                        meta_data.append(("debug", f"{platform}: skip due to invalid file"))
-                        continue
-                    else:
-                        meta_data.append(("debug", f"{platform}: retry prompt file"))
-                        break
+                    break
             else:
-                meta_data.append(("debug", f"{platform}: skip to next step"))
+                meta_data.append(
+                    ("debug", "extraction successful, go to consent form")
+                )
+                data = extractionResult
                 break
-
-        # STEP 2: ask for consent
-        progress += step_percentage
-        if data is not None:
-            meta_data.append(("debug", f"{platform}: prompt consent"))
-            prompt = prompt_consent(platform, data, meta_data)
-            consent_result = yield render_donation_page(platform, prompt, progress)
-            if consent_result.__type__ == "PayloadJSON":
-                meta_data.append(("debug", f"{platform}: donate consent data"))
-                yield donate(f"{sessionId}-{platform}", consent_result.value)
+        else:
+            meta_data.append(("debug", "skip to next step"))
+            break
+
+    # STEP 2: ask for consent
+    if data is not None:
+        meta_data.append(("debug", "prompt consent"))
+        prompt = prompt_consent(data, meta_data)
+        consent_result = yield render_donation_page(prompt, 67)
+        if consent_result.__type__ == "PayloadJSON":
+            meta_data.append(("debug", "donate consent data"))
+            yield donate(f"{sessionId}", consent_result.value)

     yield render_end_page()
@@ -66,78 +158,92 @@ def render_end_page():
     return CommandUIRender(page)


-def render_donation_page(platform, body, progress):
+def render_donation_page(body, progress):
     header = props.PropsUIHeader(props.Translatable({
-        "en": platform,
-        "nl": platform
+        "en": "Google activity",
+        "nl": "Google activity"
     }))
     footer = props.PropsUIFooter(progress)
-    page = props.PropsUIPageDonation(platform, header, body, footer)
+    page = props.PropsUIPageDonation("google-activity", header, body, footer)
     return CommandUIRender(page)


-def retry_confirmation(platform):
-    text = props.Translatable({
-        "en": f"Unfortunately, we cannot process your {platform} file. Continue, if you are sure that you selected the right file. Try again to select a different file.",
-        "nl": f"Helaas, kunnen we uw {platform} bestand niet verwerken. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen."
-    })
-    ok = props.Translatable({
-        "en": "Try again",
-        "nl": "Probeer opnieuw"
-    })
-    cancel = props.Translatable({
-        "en": "Continue",
-        "nl": "Verder"
-    })
+def retry_confirmation():
+    text = props.Translatable(
+        {
+            "en": "Unfortunately, we cannot process your file. Continue if you are sure that you selected the right file, or try again to select a different file.",
+            "nl": "Helaas kunnen we uw bestand niet verwerken. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen.",
+        }
+    )
+    ok = props.Translatable({"en": "Try again", "nl": "Probeer opnieuw"})
+    cancel = props.Translatable({"en": "Continue", "nl": "Verder"})
+    return props.PropsUIPromptConfirm(text, ok, cancel)
+
+def retry_no_data_confirmation():
+    text = props.Translatable(
+        {
+            "en": "There does not seem to be location information in your file. Continue if you are sure that you selected the right file, or try again to select a different file.",
+            "nl": "Helaas, er lijkt geen locatie-informatie in uw bestand te zitten. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen.",
+        }
+    )
+    ok = props.Translatable({"en": "Try again", "nl": "Probeer opnieuw"})
+    cancel = props.Translatable({"en": "Continue", "nl": "Verder"})
     return props.PropsUIPromptConfirm(text, ok, cancel)


-def prompt_file(platform, extensions):
+def prompt_file():
     description = props.Translatable({
-        "en": f"Please follow the download instructions and choose the file that you stored on your device. Click “Skip” at the right bottom, if you do not have a {platform} file. ",
-        "nl": f"Volg de download instructies en kies het bestand dat u opgeslagen heeft op uw apparaat. Als u geen {platform} bestand heeft klik dan op “Overslaan” rechts onder."
+        "en": "Please follow the download instructions and choose the file that you stored on your device. Click 'Skip' at the bottom right if you do not have a file.",
+        "nl": "Volg de download instructies en kies het bestand dat u opgeslagen heeft op uw apparaat. Als u geen bestand heeft, klik dan op 'Overslaan' rechts onder.",
     })

-    return props.PropsUIPromptFileInput(description, extensions)
+    return props.PropsUIPromptFileInput(description, "application/zip")


-def doSomethingWithTheFile(platform, filename):
-    return extract_zip_contents(filename)
+def prompt_consent(tables, meta_data):
+    log_title = props.Translatable({"en": "Log messages", "nl": "Log berichten"})
+    tables = [
+        props.PropsUIPromptConsentFormTable(table.id, table.title, table.data_frame)
+        for table in tables
+    ]
+    meta_frame = pd.DataFrame(meta_data, columns=["type", "message"])
+    meta_table = props.PropsUIPromptConsentFormTable(
+        "log_messages", log_title, meta_frame
+    )
+    return props.PropsUIPromptConsentForm(tables, [meta_table])


-def extract_zip_contents(filename):
-    names = []
-    try:
-        file = zipfile.ZipFile(filename)
-        data = []
-        for name in file.namelist():
-            names.append(name)
-            info = file.getinfo(name)
-            data.append((name, info.compress_size, info.file_size))
-        return data
-    except zipfile.error:
-        return "invalid"
+def filter_json_files(file_list):
+    pattern = "**/Semantic Location History/*/*_*.json"
+    return [f for f in file_list if fnmatch.fnmatch(f, pattern)]


-def prompt_consent(id, data, meta_data):
-    table_title = props.Translatable({
-        "en": "Zip file contents",
-        "nl": "Inhoud zip bestand"
-    })
+def load_and_process_file(z, file, callback):
+    with z.open(file) as f:
+        return callback(json.load(f))

-    log_title = props.Translatable({
-        "en": "Log messages",
-        "nl": "Log berichten"
-    })
-
-    data_frame = pd.DataFrame(data, columns=["filename", "compressed size", "size"])
-    table = props.PropsUIPromptConsentFormTable("zip_content", table_title, data_frame)
-    meta_frame = pd.DataFrame(meta_data, columns=["type", "message"])
-    meta_table = props.PropsUIPromptConsentFormTable("log_messages", log_title, meta_frame)
-    return props.PropsUIPromptConsentForm([table], [meta_table])
+
+def extract_data_from_zip(zip_filepath):
+    with zipfile.ZipFile(zip_filepath, "r") as z:
+        files = filter_json_files(z.namelist())
+        dfs = [load_and_process_file(z, f, parse_json_to_dataframe) for f in files]
+
+    if not dfs:
+        return "no-data"
+
+    df = pd.concat(dfs, ignore_index=True)
+    return extract(df)


 def donate(key, json_string):
     return CommandSystemDonate(key, json_string)
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1:
+        print(extract_data_from_zip(sys.argv[1]))
+    else:
+        print("please provide a zip file as argument")
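
Note on the new extraction entry points in script.py: filter_json_files targets the per-month JSON files of a Google Takeout "Semantic Location History" export, and the __main__ block gives a quick way to run the extraction outside the donation flow. A minimal sketch of both (the Takeout member paths and the zip name below are illustrative, not taken from this change):

    import fnmatch

    names = [
        "Takeout/Location History/Semantic Location History/2023/2023_APRIL.json",
        "Takeout/Location History/Records.json",
    ]
    pattern = "**/Semantic Location History/*/*_*.json"
    # fnmatch keeps only the first entry; Records.json does not match.
    print([n for n in names if fnmatch.fnmatch(n, pattern)])

    # Smoke test, run from src/framework/processing/py with the package's
    # dependencies installed:
    #   python -m port.script Takeout.zip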
{"startTimestamp": "2023-04-01T20:13:27.023Z"}, + "activityType": "CYCLING", + "waypointPath": {"distanceMeters": 1400.0}, + } + }, + { + "activitySegment": { + "duration": {"startTimestamp": "2023-04-02T08:13:27.023Z"}, + "activityType": "WALKING", + "waypointPath": {"distanceMeters": 800.5}, + } + }, + { + "activitySegment": { + "duration": {"startTimestamp": "2023-04-01T19:13:27.023Z"}, + "activityType": "RUNNING", + "waypointPath": {"distanceMeters": 3600.33}, + } + }, + { + "activitySegment": { + "duration": {"startTimestamp": "2023-04-01T20:13:27.023Z"}, + "activityType": "RUNNING", + "waypointPath": {"distanceMeters": 1400.0}, + } + }, + ] + } + + +def test_parse_json_to_dataframe(sample_data): + df = parse_json_to_dataframe(sample_data) + assert len(df) == 1 + assert df.iloc[0]["activityType"] == "CYCLING" + assert df.iloc[0]["distanceMeters"] == 3600.33 + assert isinstance(df.iloc[0]["startTimestamp"], datetime) + + +def test_parse_json_to_dataframe_skips_non_walking_or_cycling(): + parsed_dict = { + "timelineObjects": [ + { + "activitySegment": { + "activityType": "WALKING", + "duration": {"startTimestamp": "2023-09-17T10:00:00Z"}, + "waypointPath": {"distanceMeters": 1000}, + } + }, + { + "activitySegment": { + "activityType": "CYCLING", + "duration": {"startTimestamp": "2023-09-17T11:00:00Z"}, + "waypointPath": {"distanceMeters": 5000}, + } + }, + { + "activitySegment": { + "activityType": "DRIVING", + "duration": {"startTimestamp": "2023-09-17T12:00:00Z"}, + "waypointPath": {"distanceMeters": 20000}, + } + }, + ] + } + + df = parse_json_to_dataframe(parsed_dict) + assert "DRIVING" not in df.activityType.values + + +def test_aggregate_distance_by_day_activity(sample_data): + df = parse_json_to_dataframe(sample_data) + aggregated_df = aggregate_distance_by_day_activity(df) + + assert len(aggregated_df) == 1 + assert aggregated_df.iloc[0]["startTimestamp"] == "2023-04-01" + assert aggregated_df.iloc[0]["activityType"] == "CYCLING" + assert aggregated_df.iloc[0]["distanceMeters"] == 3600.33 + + +def test_aggregation_over_multiple_activities(sample_data_multiple_activities): + df = parse_json_to_dataframe(sample_data_multiple_activities) + aggregated_df = aggregate_distance_by_day_activity(df) + + # Verify that there are 2 aggregated entries (one for each day) + assert len(aggregated_df) == 3 + + # For 2023-04-01, there were two cycling activities. We sum their distances. + cycling_data = aggregated_df[(aggregated_df["activityType"] == "CYCLING")] + assert len(cycling_data) == 1 + assert cycling_data.iloc[0]["distanceMeters"] == (3600.33 + 1400.0) + + # For 2023-04-02, there was one walking activity. + walking_data = aggregated_df[aggregated_df["activityType"] == "WALKING"] + assert len(walking_data) == 1 + assert walking_data.iloc[0]["distanceMeters"] == 800.5 + + # For 2023-05-02, there was one running activity. 
+    running_data = aggregated_df[aggregated_df["activityType"] == "RUNNING"]
+    assert len(running_data) == 1
+    assert running_data.iloc[0]["distanceMeters"] == (3600.33 + 1400.0)
+
+
+def test_extract_sample_data(sample_data):
+    results = extract(parse_json_to_dataframe(sample_data))
+    # Verify the results
+    assert len(results) == 1
+    assert results[0].id == "cycling"
+    assert results[0].title.translations["nl"] == "Gefietst"
+    for result in results:
+        assert "distanceMeters" not in result.data_frame.columns
+        assert "Afstand in km" in result.data_frame.columns
+
+
+def test_empty_zip(tmp_path):
+    path = tmp_path.joinpath("test.zip")
+    z = zipfile.ZipFile(path, "w")
+    z.close()
+    assert extract_data_from_zip(path) == "no-data"
\ No newline at end of file
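
The zip-handling tests above only exercise the empty-archive path of extract_data_from_zip. A sketch of a happy-path counterpart (hypothetical test name; it reuses the sample_data fixture from this file and an illustrative Takeout-style member path):

    import json
    import zipfile

    from port.script import extract_data_from_zip


    def test_zip_with_semantic_location_history(tmp_path, sample_data):
        path = tmp_path.joinpath("test.zip")
        with zipfile.ZipFile(path, "w") as z:
            z.writestr(
                "Takeout/Location History/Semantic Location History/2023/2023_APRIL.json",
                json.dumps(sample_data),
            )
        results = extract_data_from_zip(path)
        # The single CYCLING segment in sample_data should yield exactly one table.
        assert [result.id for result in results] == ["cycling"]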