Scrape Groceries #453
name: Scrape Groceries
on:
  workflow_dispatch:
  schedule:
- cron: "0 1 * * *" # Daily at 1am UTC (12pm AEST) | |
env:
  AWS_REGION: ap-southeast-2
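  # The Configure AWS Credentials steps below hardcode ap-southeast-2; they could
  # reference "${{ env.AWS_REGION }}" instead so the region is only defined once.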

jobs:
  scrape-woolies:
    permissions:
      contents: read # Required for checkout action
      id-token: write # This is required for requesting the JWT
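      # With id-token: write, aws-actions/configure-aws-credentials can request a GitHub
      # OIDC token and exchange it for short-lived credentials for TARGET_ROLE_ARN,
      # so no long-lived AWS access keys have to be stored as secrets.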
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # It's okay to ignore the exit code on the last line; if the snapshot doesn't exist yet, we scrape it from scratch.
      - run: |
          save_path="$(python3 main.py sync --print-save-path woolies)"
          echo "Save path is ${save_path}"
          mkdir -p ./output/
          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
      - run: python3 main.py sync woolies --skip-existing
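      # Presumably --skip-existing tells the scraper to skip data already present in the
      # snapshot fetched above, so a re-run resumes instead of starting over.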
      - uses: actions/upload-artifact@v3
        with:
          name: woolies_snapshot
          path: ./output/woolies/
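      # This artifact is consumed by the merge-price-history job below.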
      # Need to refresh credentials; the job can run for a while and they expire.
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      - run: aws s3 sync ./output/woolies/ s3://grocery-scrape-au/woolies/

  scrape-coles:
    permissions:
      contents: read # Required for checkout action
      id-token: write # This is required for requesting the JWT
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # It's okay to ignore the exit code on the last line; if the snapshot doesn't exist yet, we scrape it from scratch.
      - run: |
          save_path="$(python3 main.py sync --print-save-path coles)"
          echo "Save path is ${save_path}"
          mkdir -p ./output/
          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
      - run: python3 main.py sync coles --skip-existing
      - uses: actions/upload-artifact@v3
        with:
          name: coles_snapshot
          path: ./output/coles/
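      # Refresh credentials again before the final sync, for the same reason as in scrape-woolies.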
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      - run: aws s3 sync ./output/coles/ s3://grocery-scrape-au/coles/

  merge-price-history:
    permissions:
      contents: read # Required for checkout action
      id-token: write # This is required for requesting the JWT
    runs-on: ubuntu-latest
    needs:
      - scrape-woolies
      - scrape-coles
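    # Runs only after both scrape jobs succeed; their snapshot artifacts are downloaded below.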
    steps:
      - uses: actions/checkout@v4
      - name: Download coles artifact
        uses: actions/download-artifact@v3
        with:
          name: coles_snapshot
          path: ./output/coles/
      - name: Download woolies artifact
        uses: actions/download-artifact@v3
        with:
          name: woolies_snapshot
          path: ./output/woolies/
      - run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
# Use "cp" because it is an error if the file doesn't exist. It means history | |
# is broken and needs to be built from scratch locally. | |
- run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/ | |
      - run: python3 main.py analysis --compress
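      # The analysis step presumably merges the new snapshots into latest-canonical.json.gz
      # and writes the per-store compressed files that are uploaded below.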
      - uses: actions/upload-artifact@v3
        with:
          name: latest_canonical
          path: ./output/latest-canonical.json.gz
      - name: Upload all files once analysis has finished
        # --content-encoding is needed so that S3 serves the correct Content-Encoding header on GET
        run: |
          aws s3 sync ./output/ s3://grocery-scrape-au/
          aws s3 cp --content-encoding gzip static/data/latest-canonical.woolies.compressed.json.gz s3://hotprices.org/data/
          aws s3 cp --content-encoding gzip static/data/latest-canonical.coles.compressed.json.gz s3://hotprices.org/data/