# Scrape Groceries
#
# (Page residue from the GitHub web UI, kept as comments so the file parses
# as YAML: "Skip to content", "Scrape Groceries #412".)

# Runs the grocery scrape jobs defined under `jobs:` on demand and once a day.
name: Scrape Groceries

on:
  # Allow manual runs.
  workflow_dispatch:
  schedule:
    # Cron schedules are evaluated in UTC.
    # Daily at 01:00 UTC (11:00 AEST / 12:00 AEDT).
    - cron: "0 1 * * *"

# Workflow-level environment, visible to every job and step.
env:
  AWS_REGION: ap-southeast-2
jobs:
  # Scrape the woolies snapshot, then publish it as a workflow artifact and
  # to the grocery-scrape-au bucket.
  scrape-woolies:
    permissions:
      contents: read  # Required for checkout action
      id-token: write  # This is required for requesting the JWT
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install Python dependencies
        run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # It's okay to ignore the exit code on the last line: if the file
      # doesn't exist in S3 yet, the next step scrapes it.
      - name: Fetch existing woolies snapshot from S3 (if any)
        run: |
          save_path="$(python3 main.py sync --print-save-path woolies)"
          echo "Save path is ${save_path}"
          mkdir -p ./output/
          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
      - name: Scrape woolies
        run: python3 main.py sync woolies --skip-existing
      # v3 of the artifact actions is deprecated and no longer accepted on
      # github.com. Uploads and downloads must both be v4: artifacts are not
      # interchangeable between v3 and v4.
      - uses: actions/upload-artifact@v4
        with:
          name: woolies_snapshot
          path: ./output/woolies/
      # Need to refresh credentials, the job can run for a while and they expire
      - name: Refresh AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      - name: Upload woolies snapshot to S3
        run: aws s3 sync ./output/woolies/ s3://grocery-scrape-au/woolies/

  # Scrape the coles snapshot; same steps as scrape-woolies.
  scrape-coles:
    permissions:
      contents: read  # Required for checkout action
      id-token: write  # This is required for requesting the JWT
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install Python dependencies
        run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # It's okay to ignore the exit code on the last line: if the file
      # doesn't exist in S3 yet, the next step scrapes it.
      - name: Fetch existing coles snapshot from S3 (if any)
        run: |
          save_path="$(python3 main.py sync --print-save-path coles)"
          echo "Save path is ${save_path}"
          mkdir -p ./output/
          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
      - name: Scrape coles
        run: python3 main.py sync coles --skip-existing
      # Must stay on the same major version as the download steps in
      # merge-price-history (v3 and v4 artifacts are not interchangeable).
      - uses: actions/upload-artifact@v4
        with:
          name: coles_snapshot
          path: ./output/coles/
      # Need to refresh credentials, the job can run for a while and they expire
      - name: Refresh AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      - name: Upload coles snapshot to S3
        run: aws s3 sync ./output/coles/ s3://grocery-scrape-au/coles/

  # Merge both snapshots into the canonical history and publish the results.
  # Runs only after both scrape jobs have succeeded.
  merge-price-history:
    permissions:
      contents: read  # Required for checkout action
      id-token: write  # This is required for requesting the JWT
    runs-on: ubuntu-latest
    needs:
      - scrape-woolies
      - scrape-coles
    steps:
      - uses: actions/checkout@v4
      - name: Download coles artifact
        uses: actions/download-artifact@v4
        with:
          name: coles_snapshot
          path: ./output/coles/
      - name: Download woolies artifact
        uses: actions/download-artifact@v4
        with:
          name: woolies_snapshot
          path: ./output/woolies/
      - name: Install Python dependencies
        run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # Use "cp" because it is an error if the file doesn't exist. It means
      # history is broken and needs to be built from scratch locally.
      - name: Fetch canonical history from S3
        run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/
      - name: Run analysis
        run: python3 main.py analysis --compress
      - uses: actions/upload-artifact@v4
        with:
          name: latest_canonical
          path: ./output/latest-canonical.json.gz
      # The content-encoding is necessary so that S3 sends the correct
      # content-encoding header on GET.
      - name: Upload all files after finished analysis
        run: |
          aws s3 sync ./output/ s3://grocery-scrape-au/
          aws s3 cp --content-encoding gzip \
            static/data/latest-canonical.woolies.compressed.json.gz \
            s3://hotprices.org/data/
          aws s3 cp --content-encoding gzip \
            static/data/latest-canonical.coles.compressed.json.gz \
            s3://hotprices.org/data/