diff --git a/.gitignore b/.gitignore
index 6dd1e02..349cd7a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 /build
 /env
 /venv
+/.venv
 *.pyc
 /.vscode
 .coverage
diff --git a/README.md b/README.md
index 83283b5..e9e5727 100644
--- a/README.md
+++ b/README.md
@@ -37,11 +37,15 @@ The new HTTP Archive data pipeline built entirely on GCP
 
 This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves this to the `httparchive` dataset in BigQuery.
 
+A secondary pipeline is responsible for populating the Technology Report Firestore collections.
+
 There are currently two main pipelines:
 
 - The `all` pipeline which saves data to the new `httparchive.all` dataset
 - The `combined` pipline which saves data to the legacy tables. This processes both the `summary` tables (`summary_pages` and `summary_requests`) and `non-summary` pipeline (`pages`, `requests`, `response_bodies`....etc.)
 
+The secondary `tech_report` pipeline saves data to a Firestore database (e.g. `tech-report-apis-prod`) across various collections ([see `TECHNOLOGY_QUERIES` in constants.py](modules/constants.py))
+
 The pipelines are run in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion, based on the code in the `main` branch which is deployed to GCP on each merge.
 
 The [`data-pipeline` workflow](https://console.cloud.google.com/workflows/workflow/us-west1/data-pipeline/executions?project=httparchive) as defined by the [data-pipeline-workflows.yaml](./data-pipeline-workflows.yaml) file, runs the whole process from start to finish, including generating the manifest file for each of the two runs (desktop and mobile) and then starting the four dataflow jobs (desktop all, mobile all, desktop combined, mobile combined) in sequence to upload of the HAR files to the BigQuery tables. This can be rerun in case of failure by [publishing a crawl-complete message](#publishing-a-pubsub-message), providing no data was saved to the final BigQuery tables.
@@ -156,6 +160,7 @@ This method is best used when developing locally, as a convenience for running t
 # run the pipeline using a flex template
 ./run_flex_template all [...]
 ./run_flex_template combined [...]
+./run_flex_template tech_report [...]
 ```
 
 ### Running a flex template from the Cloud Console
diff --git a/build_flex_template.sh b/build_flex_template.sh
index 79d0c8f..a5a0d71 100755
--- a/build_flex_template.sh
+++ b/build_flex_template.sh
@@ -5,9 +5,13 @@ set -u
 
 BUILD_TAG=$(date -u +"%Y-%m-%d_%H-%M-%S")
 
+# all and combined pipelines
 for type in all combined
 do
-  gcloud builds submit --substitutions=_TYPE="${type}",_BUILD_TAG="${BUILD_TAG}" .
+  gcloud builds submit --substitutions=_TYPE="${type}",_BUILD_TAG="${BUILD_TAG}",_WORKER_TYPE=n1-standard-32 .
 done
 
+# tech_report pipeline
+gcloud builds submit --substitutions=_TYPE=tech_report,_BUILD_TAG="${BUILD_TAG}",_WORKER_TYPE=n1-standard-1 .
+ echo "${BUILD_TAG}" diff --git a/flex_template_metadata_tech_report.json b/flex_template_metadata_tech_report.json new file mode 100644 index 0000000..1a03719 --- /dev/null +++ b/flex_template_metadata_tech_report.json @@ -0,0 +1,36 @@ +{ + "name": "Technology Report API pipeline", + "description": "Runs a pipeline to generate firestore API results", + "parameters": [ + { + "name": "query_type", + "label":"Query type", + "helpText": "Technology query type", + "isOptional": true + }, + { + "name": "firestore_project", + "label":"Firestore project", + "helpText": "Google Cloud project with Firestore", + "isOptional": true + }, + { + "name": "firestore_collection", + "label":"Firestore collection", + "helpText": "Firestore collection with HTTPArchive data", + "isOptional": true + }, + { + "name": "firestore_database", + "label":"Firestore database", + "helpText": "Firestore database with HTTPArchive data", + "isOptional": true + }, + { + "name": "date", + "label":"Date", + "helpText": "Date to query", + "isOptional": true + } + ] +} diff --git a/run_flex_template.sh b/run_flex_template.sh index 0c21026..8c20a4c 100755 --- a/run_flex_template.sh +++ b/run_flex_template.sh @@ -12,17 +12,23 @@ DF_JOB_ID="${REPO}-${TYPE}-$(date -u +%Y%m%d-%H%M%S)" DF_TEMP_BUCKET="gs://${PROJECT}-staging/dataflow" TEMPLATE_BASE_PATH="gs://${PROJECT}/dataflow/templates" +# find the latest template if unset +: "${TEMPLATE_PATH:=$(gsutil ls ${TEMPLATE_BASE_PATH}/${REPO}-"${TYPE}"*.json | sort -r | head -n 1)}" + case "${TYPE}~${TEMPLATE_PATH}" in all~|combined~) : ;; all~gs://*all*) : ;; combined~gs://*combined*) : ;; + tech_report~gs://*tech_report*) : ;; *) - echo "Expected an argumment of either [all|combined] and optionally TEMPLATE_PATH to be set (otherwise the latest template will be used)" + echo "Expected an argumment of either [all|combined|tech_report] and optionally TEMPLATE_PATH to be set (otherwise the latest template will be used)" echo "Examples" echo " $(basename "$0") all ..." echo " $(basename "$0") combined ..." + echo " $(basename "$0") tech_report ..." echo " TEMPLATE_PATH=${TEMPLATE_BASE_PATH}/${REPO}-all-2022-10-12_00-19-44.json $(basename "$0") all ..." echo " TEMPLATE_PATH=${TEMPLATE_BASE_PATH}/${REPO}-combined-2022-10-12_00-19-44.json $(basename "$0") combined ..." + echo " TEMPLATE_PATH=${TEMPLATE_BASE_PATH}/${REPO}-tech_report-2022-10-12_00-19-44.json $(basename "$0") tech_report ..." exit 1 ;; esac @@ -30,8 +36,8 @@ esac # drop the first argument shift -# find the latest template if unset -: "${TEMPLATE_PATH:=$(gsutil ls ${TEMPLATE_BASE_PATH}/${REPO}-"${TYPE}"*.json | sort -r | head -n 1)}" +# replace underscores with hyphens in the job id +DF_JOB_ID=${DF_JOB_ID//_/-} set -u diff --git a/run_tech_report.py b/run_tech_report.py new file mode 100644 index 0000000..4906e74 --- /dev/null +++ b/run_tech_report.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +import logging + +from apache_beam.runners import DataflowRunner + +from modules import tech_report_pipeline + + +def run(argv=None): + logging.getLogger().setLevel(logging.INFO) + p = tech_report_pipeline.create_pipeline() + pipeline_result = p.run(argv) + if not isinstance(p.runner, DataflowRunner): + pipeline_result.wait_until_finish() + + +if __name__ == "__main__": + run()