Fix missing history
Two issues: First, when generating history from scratch (one store at a
time), each store's pass would overwrite the previously merged history,
dropping every other store's items. Second, the GitHub action did not
correctly download the latest history, so it would not extend the
existing history but rather start fresh.

This fixes both issues by making it an error if history is missing and
by ensuring history is correctly filled in when generating from
scratch.
Javex committed Feb 23, 2024
1 parent d947000 commit 2b2760e
Showing 3 changed files with 26 additions and 11 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/scrape-groceries.yml
@@ -93,8 +93,9 @@ jobs:
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
-      # Sync copies the file down if it exists, skips otherwise
-      - run: aws s3 sync s3://grocery-scrape-au/latest-canonical.json.gz ./output/
+      # Use "cp" because it is an error if the file doesn't exist. It means history
+      # is broken and needs to be built from scratch locally.
+      - run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/
       - run: python3 main.py analysis --compress
       - uses: actions/upload-artifact@v3
         with:
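For reference, the fail-loudly behaviour the workflows now get from "aws s3 cp" can also be sketched in Python. This is not part of the repository; it is a minimal boto3 equivalent of the step above, with the bucket and key taken from the workflow and everything else assumed:

import pathlib

import boto3
from botocore.exceptions import ClientError

BUCKET = 'grocery-scrape-au'           # from the workflow step above
KEY = 'latest-canonical.json.gz'       # from the workflow step above
DEST = pathlib.Path('output') / KEY


def download_history():
    """Fail loudly if the history file is missing, like "aws s3 cp" does."""
    s3 = boto3.client('s3')
    DEST.parent.mkdir(parents=True, exist_ok=True)
    try:
        s3.download_file(BUCKET, KEY, str(DEST))
    except ClientError as exc:
        # A missing object means history is broken and needs to be rebuilt
        # from scratch locally, so surface the error instead of starting fresh.
        raise RuntimeError(
            f's3://{BUCKET}/{KEY} not found; rebuild history locally'
        ) from exc


if __name__ == '__main__':
    download_history()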
5 changes: 3 additions & 2 deletions .github/workflows/scrape-test.yml
@@ -54,8 +54,9 @@ jobs:
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
-      # Sync copies the file down if it exists, skips otherwise
-      - run: aws s3 sync s3://grocery-scrape-au/latest-canonical.json.gz ./output/
+      # Use "cp" because it is an error if the file doesn't exist. It means history
+      # is broken and needs to be built from scratch locally.
+      - run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/
       - run: python3 main.py analysis
       - uses: actions/upload-artifact@v3
         with:
27 changes: 20 additions & 7 deletions hotprices_au/analysis.py
@@ -47,13 +47,16 @@ def dedup_items(items):
     return dedup_items


-def merge_price_history(old_items, new_items):
+def merge_price_history(old_items, new_items, store_filter):
     if old_items is None:
         return new_items

     lookup = {}
     for old_item in old_items:
-        lookup[(old_item['store'], old_item['id'])] = old_item
+        if store_filter is not None and old_item['store'] != store_filter:
+            new_items.append(old_item)
+        else:
+            lookup[(old_item['store'], old_item['id'])] = old_item

     new_prices = {}
     for new_item in new_items:
@@ -100,7 +103,17 @@ def copy_items_to_site(latest_canonical_file, data_dir: pathlib.Path, compress):
         latest_canonical_file_store.write_text(store_data)


-def transform_data(day, output_dir, data_dir, store_filter=None, compress=False):
+def transform_data(
+    day, output_dir, data_dir,
+    store_filter=None,
+    compress=False,
+    require_history=True
+):
+    """
+    require_history: Whether to expect the "latest-canonical.json.gz" to
+        already exist. Default is true (updating history) but can be set
+        to false if doing a full history build.
+    """
     all_items = []
     for store in sites.sites.keys():
         if store_filter is not None and store_filter != store:
@@ -133,10 +146,10 @@ def transform_data(day, output_dir, data_dir, store_filter=None, compress=False)
         all_items += store_items

     latest_canonical_file = pathlib.Path(output_dir / "latest-canonical.json.gz")
-    if latest_canonical_file.exists():
+    if latest_canonical_file.exists() or require_history:
         with gzip.open(latest_canonical_file, 'rt') as fp:
             old_items = json.loads(fp.read())
-        all_items = merge_price_history(old_items, all_items)
+        all_items = merge_price_history(old_items, all_items, store_filter)

     with gzip.open(latest_canonical_file, 'wt') as fp:
         fp.write(json.dumps(all_items))
@@ -146,7 +159,7 @@ def transform_data(day, output_dir, data_dir, store_filter=None, compress=False)


 def parse_full_history(output_dir: pathlib.Path, data_dir, store_filter=None, compress=False):
-    # First remote canonical data
+    # First remove canonical data
     latest_canonical_file = output_dir / "latest-canonical.json.gz"
     if latest_canonical_file.exists():
         latest_canonical_file.unlink()
@@ -161,4 +174,4 @@ def parse_full_history(output_dir: pathlib.Path, data_dir, store_filter=None, compress=False):
         for data_file in sorted(store.iterdir()):
             fname = data_file.name
             day = datetime.strptime(fname.split('.')[0], '%Y-%m-%d')
-            transform_data(day, output_dir, data_dir, store_filter=store.name, compress=compress)
+            transform_data(day, output_dir, data_dir, store_filter=store.name, compress=compress, require_history=False)
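The effect of the new store_filter argument can be illustrated in isolation with toy data (hypothetical, not from the repository); only the filtering loop from the hunk above is reproduced, the rest of merge_price_history is elided:

# Previously merged history for two stores (toy records, assumed schema).
old_items = [
    {'store': 'coles', 'id': '1', 'price_history': [1.00, 1.20]},
    {'store': 'woolies', 'id': '9', 'price_history': [2.00]},
]
# A per-store rebuild only produces items for a single store at a time.
new_items = [{'store': 'coles', 'id': '1', 'price_history': [1.10]}]
store_filter = 'coles'

lookup = {}
for old_item in old_items:
    if store_filter is not None and old_item['store'] != store_filter:
        # Carry other stores' items forward instead of dropping them.
        new_items.append(old_item)
    else:
        lookup[(old_item['store'], old_item['id'])] = old_item

# Before this commit the 'woolies' item never made it back into new_items,
# so rebuilding store by store wiped every other store's history.
assert any(item['store'] == 'woolies' for item in new_items)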

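Similarly, the new require_history flag makes a missing history file an error on the Python side: with require_history=True the merge branch is entered even when the file is absent, so gzip.open raises instead of silently starting a fresh history. A small sketch under the same assumptions (hypothetical path):

import gzip
import json
import pathlib

# Hypothetical location, mirroring the output layout used above.
latest_canonical_file = pathlib.Path('output') / 'latest-canonical.json.gz'
require_history = True  # the default, used when updating existing history

if latest_canonical_file.exists() or require_history:
    # With require_history=True and no file on disk this raises
    # FileNotFoundError, which is exactly the "history is broken,
    # rebuild locally" signal the commit message describes.
    with gzip.open(latest_canonical_file, 'rt') as fp:
        old_items = json.loads(fp.read())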
