Fix missing history
Two issues: First, when generating history from scratch (one store at a
time), each store's pass would overwrite the previously merged history,
dropping every other store's items. Second, the GitHub action did not
correctly download the latest history, so it would not extend the
existing history but rather start fresh.

This fixes both issues by making it an error if history is missing and
by ensuring history is correctly filled in when generating from
scratch.
Javex committed Feb 23, 2024
1 parent d947000 commit 2b2760e
Showing 3 changed files with 26 additions and 11 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/scrape-groceries.yml
@@ -93,8 +93,9 @@ jobs:
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
-      # Sync copies the file down if it exists, skips otherwise
-      - run: aws s3 sync s3://grocery-scrape-au/latest-canonical.json.gz ./output/
+      # Use "cp" because it is an error if the file doesn't exist. It means history
+      # is broken and needs to be built from scratch locally.
+      - run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/
       - run: python3 main.py analysis --compress
       - uses: actions/upload-artifact@v3
         with:
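For reference, the fail-loudly behaviour the workflows now get from "aws s3 cp" can also be sketched in Python. This is not part of the repository; it is a minimal boto3 equivalent of the step above, with the bucket and key taken from the workflow and everything else assumed:

import pathlib

import boto3
from botocore.exceptions import ClientError

BUCKET = 'grocery-scrape-au'           # from the workflow step above
KEY = 'latest-canonical.json.gz'       # from the workflow step above
DEST = pathlib.Path('output') / KEY


def download_history():
    """Fail loudly if the history file is missing, like "aws s3 cp" does."""
    s3 = boto3.client('s3')
    DEST.parent.mkdir(parents=True, exist_ok=True)
    try:
        s3.download_file(BUCKET, KEY, str(DEST))
    except ClientError as exc:
        # A missing object means history is broken and needs to be rebuilt
        # from scratch locally, so surface the error instead of starting fresh.
        raise RuntimeError(
            f's3://{BUCKET}/{KEY} not found; rebuild history locally'
        ) from exc


if __name__ == '__main__':
    download_history()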
5 changes: 3 additions & 2 deletions .github/workflows/scrape-test.yml
@@ -54,8 +54,9 @@ jobs:
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
-      # Sync copies the file down if it exists, skips otherwise
-      - run: aws s3 sync s3://grocery-scrape-au/latest-canonical.json.gz ./output/
+      # Use "cp" because it is an error if the file doesn't exist. It means history
+      # is broken and needs to be built from scratch locally.
+      - run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/
       - run: python3 main.py analysis
       - uses: actions/upload-artifact@v3
         with:
27 changes: 20 additions & 7 deletions hotprices_au/analysis.py
@@ -47,13 +47,16 @@ def dedup_items(items):
     return dedup_items


-def merge_price_history(old_items, new_items):
+def merge_price_history(old_items, new_items, store_filter):
     if old_items is None:
         return new_items

     lookup = {}
     for old_item in old_items:
-        lookup[(old_item['store'], old_item['id'])] = old_item
+        if store_filter is not None and old_item['store'] != store_filter:
+            new_items.append(old_item)
+        else:
+            lookup[(old_item['store'], old_item['id'])] = old_item

     new_prices = {}
     for new_item in new_items:
@@ -100,7 +103,17 @@ def copy_items_to_site(latest_canonical_file, data_dir: pathlib.Path, compress):
         latest_canonical_file_store.write_text(store_data)


-def transform_data(day, output_dir, data_dir, store_filter=None, compress=False):
+def transform_data(
+    day, output_dir, data_dir,
+    store_filter=None,
+    compress=False,
+    require_history=True
+):
+    """
+    require_history: Whether to expect the "latest-canonical.json.gz" to
+        already exist. Default is true (updating history) but can be set
+        to false if doing a full history build.
+    """
     all_items = []
     for store in sites.sites.keys():
         if store_filter is not None and store_filter != store:
@@ -133,10 +146,10 @@ def transform_data(day, output_dir, data_dir, store_filter=None, compress=False)
         all_items += store_items

     latest_canonical_file = pathlib.Path(output_dir / "latest-canonical.json.gz")
-    if latest_canonical_file.exists():
+    if latest_canonical_file.exists() or require_history:
         with gzip.open(latest_canonical_file, 'rt') as fp:
             old_items = json.loads(fp.read())
-        all_items = merge_price_history(old_items, all_items)
+        all_items = merge_price_history(old_items, all_items, store_filter)

     with gzip.open(latest_canonical_file, 'wt') as fp:
         fp.write(json.dumps(all_items))
@@ -146,7 +159,7 @@ def transform_data(day, output_dir, data_dir, store_filter=None, compress=False)


 def parse_full_history(output_dir: pathlib.Path, data_dir, store_filter=None, compress=False):
-    # First remote canonical data
+    # First remove canonical data
     latest_canonical_file = output_dir / "latest-canonical.json.gz"
     if latest_canonical_file.exists():
         latest_canonical_file.unlink()
@@ -161,4 +174,4 @@ def parse_full_history(output_dir: pathlib.Path, data_dir, store_filter=None, compress=False):
         for data_file in sorted(store.iterdir()):
             fname = data_file.name
             day = datetime.strptime(fname.split('.')[0], '%Y-%m-%d')
-            transform_data(day, output_dir, data_dir, store_filter=store.name, compress=compress)
+            transform_data(day, output_dir, data_dir, store_filter=store.name, compress=compress, require_history=False)
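The effect of the new store_filter argument can be illustrated in isolation with toy data (hypothetical, not from the repository); only the filtering loop from the hunk above is reproduced, the rest of merge_price_history is elided:

# Previously merged history for two stores (toy records, assumed schema).
old_items = [
    {'store': 'coles', 'id': '1', 'price_history': [1.00, 1.20]},
    {'store': 'woolies', 'id': '9', 'price_history': [2.00]},
]
# A per-store rebuild only produces items for a single store at a time.
new_items = [{'store': 'coles', 'id': '1', 'price_history': [1.10]}]
store_filter = 'coles'

lookup = {}
for old_item in old_items:
    if store_filter is not None and old_item['store'] != store_filter:
        # Carry other stores' items forward instead of dropping them.
        new_items.append(old_item)
    else:
        lookup[(old_item['store'], old_item['id'])] = old_item

# Before this commit the 'woolies' item never made it back into new_items,
# so rebuilding store by store wiped every other store's history.
assert any(item['store'] == 'woolies' for item in new_items)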

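Similarly, the new require_history flag makes a missing history file an error on the Python side: with require_history=True the merge branch is entered even when the file is absent, so gzip.open raises instead of silently starting a fresh history. A small sketch under the same assumptions (hypothetical path):

import gzip
import json
import pathlib

# Hypothetical location, mirroring the output layout used above.
latest_canonical_file = pathlib.Path('output') / 'latest-canonical.json.gz'
require_history = True  # the default, used when updating existing history

if latest_canonical_file.exists() or require_history:
    # With require_history=True and no file on disk this raises
    # FileNotFoundError, which is exactly the "history is broken,
    # rebuild locally" signal the commit message describes.
    with gzip.open(latest_canonical_file, 'rt') as fp:
        old_items = json.loads(fp.read())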
