Skip to content

feat: add script to call sm2a promotion pipeline #193

feat: add script to call sm2a promotion pipeline

feat: add script to call sm2a promotion pipeline #193

Workflow file for this run

# This GitHub Actions workflow automates the process of
# publishing dataset collections to the staging environment.
# It is triggered by a pull request against the main branch
# that modifies any file within the ingestion-data/staging/dataset-config/ directory.
# The workflow includes steps to
# - publish the datasets,
# - continuously update the status of the workflow in a PR comment
name: Publish collection to staging

on:
  pull_request:
    branches:
      - main
    paths:
      # Only changes to files directly inside this directory start the workflow
      - ingestion-data/staging/dataset-config/*
  push:
    branches:
      - main

# Token scopes: write access to PR comments, read-only access to the repo
permissions:
  pull-requests: write
  contents: read

jobs:
# Publishes every dataset config file added by the pull request to the staging
# workflows API and keeps a single PR comment updated with the run's progress.
# Runs only for pull_request events with action `opened` or `synchronize`.
publish-new-datasets:
  if: ${{ github.event_name == 'pull_request' && (github.event.action == 'synchronize' || github.event.action == 'opened') }}
  runs-on: ubuntu-latest
  environment: staging
  outputs:
    # Space-separated paths of the config files that were published successfully
    publishedCollections: ${{ steps.publish-collections.outputs.success_collections }}
    # ID of the PR status comment, so later jobs can append to the same comment
    commentId: ${{ steps.init-comment.outputs.COMMENT_ID }}
  steps:
    - uses: actions/checkout@v4

    # Initializes the PR comment.
    # An existing "### Workflow Status" comment is overwritten instead of
    # adding a new one, so the PR carries one status comment, not one per run.
    - name: Initialize PR comment with workflow start
      id: init-comment
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        body="### Workflow Status
        **Starting workflow...** [View action run]($WORKFLOW_URL)"
        # Get the PR number
        PR_NUMBER=${{ github.event.pull_request.number }}
        # Fetch existing comments
        COMMENTS=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments --jq '.[] | select(.body | contains("### Workflow Status")) | {id: .id, body: .body}')
        # Check if a comment already exists
        COMMENT_ID=$(echo "$COMMENTS" | jq -r '.id' | head -n 1)
        if [ -z "$COMMENT_ID" ]; then
          # No existing comment, create a new one
          COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments -f body="$body" --jq '.id')
        else
          # Comment exists, overwrite the existing comment
          gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID -X PATCH -f body="$body"
        fi
        echo "COMMENT_ID=$COMMENT_ID" >> "$GITHUB_OUTPUT"

    # Collects the changed .json files; later steps read only the
    # `added_files` output, i.e. files newly added by the PR.
    - name: Get newly added files
      id: changed-files
      uses: tj-actions/changed-files@v45
      with:
        files: |
          **.json

    - name: List all newly added files
      env:
        ADDED_FILES: ${{ steps.changed-files.outputs.added_files }}
      run: |
        for file in ${ADDED_FILES}; do
          echo "$file was added"
        done

    # Uses service client creds (client_credentials grant) to get a token.
    # No username/password needed.
    - name: Get auth token
      id: get-token
      env:
        # Passed through env so the secret is never pasted into the script text
        COGNITO_DOMAIN: ${{ vars.STAGING_COGNITO_DOMAIN }}
        CLIENT_ID: ${{ vars.STAGING_CLIENT_ID }}
        CLIENT_SECRET: ${{ secrets.STAGING_CLIENT_SECRET }}
      run: |
        # --fail turns an HTTP error status into a non-zero exit, which stops the step
        response=$(curl -sS --fail -X POST \
          "${COGNITO_DOMAIN}/oauth2/token" \
          -H "Content-Type: application/x-www-form-urlencoded" \
          -d "grant_type=client_credentials" \
          --data-urlencode "client_id=${CLIENT_ID}" \
          --data-urlencode "client_secret=${CLIENT_SECRET}"
        )
        # `// empty` prevents the literal string "null" from being used as a token
        access_token=$(echo "$response" | jq -r '.access_token // empty')
        if [ -z "$access_token" ]; then
          echo "Failed to obtain an access token"
          exit 1
        fi
        # Keep the token out of the logs of this and all following steps
        echo "::add-mask::$access_token"
        echo "ACCESS_TOKEN=$access_token" >> "$GITHUB_OUTPUT"

    # Makes a request to the /dataset/publish endpoint for every added file.
    # Outputs only the files that were successfully published (used by other jobs).
    # Updates the PR comment with the per-collection publication status.
    # If none of the requests are successful, the workflow fails.
    - name: Publish all newly added collections to staging
      id: publish-collections
      env:
        ADDED_FILES: ${{ steps.changed-files.outputs.added_files }}
        WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }}
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        AUTH_TOKEN: ${{ steps.get-token.outputs.ACCESS_TOKEN }}
        COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
      run: |
        if [ -z "$WORKFLOWS_URL" ]; then
          echo "WORKFLOWS_URL is not set"
          exit 1
        fi
        if [ -z "$AUTH_TOKEN" ]; then
          echo "AUTH_TOKEN is not set"
          exit 1
        fi
        # Drop a trailing slash from the base URL before appending the route
        publish_url="${WORKFLOWS_URL%/}/dataset/publish"
        # Track successful publications
        all_failed=true
        declare -a success_collections=()
        status_message='### Collection Publication Status
        '
        for file in ${ADDED_FILES}; do
          echo "$file"
          if [ -f "$file" ]; then
            dataset_config=$(jq '.' "$file")
            collection_id=$(jq -r '.collection' "$file")
            echo "Publishing $collection_id"
            rm -f response.txt
            # The body goes to response.txt, so stdout carries only the HTTP code.
            # `|| true`: a transport error (code 000) is recorded as a failed
            # publication instead of aborting the remaining files.
            status_code=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \
              -H "Content-Type: application/json" \
              -H "Authorization: Bearer $AUTH_TOKEN" \
              -d "$dataset_config" || true)
            # Update status message based on response code
            if [ "$status_code" = "200" ] || [ "$status_code" = "201" ]; then
              echo "$collection_id successfully published ✅"
              status_message+="- **$collection_id**: Successfully published ✅
        "
              success_collections+=("$file")
              all_failed=false
            else
              echo "$collection_id failed to publish ❌"
              # Log the API's answer to make the failure diagnosable
              if [ -s response.txt ]; then
                cat response.txt
                echo
              fi
              status_message+="- **$collection_id**: Failed to publish. Error code $status_code. ❌
        "
            fi
          else
            echo "File $file does not exist"
            exit 1
          fi
        done
        # Update PR comment first, so the per-collection results are reported
        # even when every request failed
        CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
        UPDATED_BODY="$CURRENT_BODY
        $status_message"
        gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
        # Exit workflow if all the requests fail
        if [ "$all_failed" = true ]; then
          echo "All collections failed to publish."
          exit 1
        fi
        # Output only successful collections to be used in subsequent steps
        echo "success_collections=${success_collections[*]}" >> "$GITHUB_OUTPUT"

    # NOTE(review): no later step in this job runs Python, and the cache key
    # hashes a root-level requirements.txt while the other jobs install from
    # ./scripts/requirements.txt - confirm whether these two steps are needed.
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.9'

    - uses: actions/cache@v4
      with:
        path: ${{ env.pythonLocation }}
        key: ${{ env.pythonLocation }}-pip-${{ hashFiles('requirements.txt') }}

    # If the job fails at any point, the PR comment will be updated
    - name: Update PR comment on overall workflow failure
      if: failure()
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
      run: |
        WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
        UPDATED_BODY="$CURRENT_BODY
        ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**"
        gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
# Generates a slim dataset MDX file for every collection published by
# `publish-new-datasets`, pushes the files to a new branch of the veda-config
# repository, opens a PR there and links that PR in the status comment.
create-mdx-files-and-open-pr:
  runs-on: ubuntu-latest
  environment: staging
  needs: publish-new-datasets
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Use output from publish-new-datasets
      run: |
        echo "The output from the previous step is: ${{ needs.publish-new-datasets.outputs.publishedCollections }}"

    # Creates a slim dataset mdx file for each collection based on the dataset config json
    - name: Create dataset mdx for given collections
      env:
        PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
      run: |
        echo "$PUBLISHED_COLLECTION_FILES"
        collection_ids=""
        pip install -r ./scripts/requirements.txt
        for file in ${PUBLISHED_COLLECTION_FILES}; do
          # The script's stdout is taken as the collection id
          collection_id=$(python3 ./scripts/generate_mdx.py "$file")
          # Trim surrounding double quotes and whitespace. [[:space:]] is required:
          # inside a bracket expression `\s` means a backslash or the letter "s",
          # which would eat a leading/trailing "s" of the id.
          collection_id=$(echo "$collection_id" | sed 's/^[[:space:]"]*//;s/[[:space:]"]*$//')
          echo "Processed collection ID: $collection_id"
          collection_ids="$collection_ids$collection_id,"
        done
        # Remove trailing comma
        collection_ids=${collection_ids%,}
        echo "Final collection_ids: $collection_ids"
        # Exported through GITHUB_ENV so the PR step can build its title and body
        echo "collection_ids=${collection_ids}" >> "$GITHUB_ENV"

    - name: Set up Variables
      run: |
        echo "VEDA_CONFIG_REPO=${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}" >> "$GITHUB_ENV"

    - name: Clone veda-config repository
      run: |
        git clone https://github.com/${{ env.VEDA_CONFIG_REPO }}.git
        ls

    - name: Copy untracked mdx files to veda-config
      run: |
        echo "Copying untracked .mdx files to veda-config repository"
        ls ./ingestion-data/dataset-mdx/
        # Create the directory that is actually the copy target
        mkdir -p veda-config/datasets
        find ingestion-data/dataset-mdx/ -name '*.mdx' -exec cp {} veda-config/datasets/ \;

    - name: Create veda-config PR with changes
      id: create-pr
      env:
        # Used by the `gh` CLI and for pushing to the veda-config repository
        GITHUB_TOKEN: ${{ secrets.VEDA_CONFIG_REPO_ACCESS_TOKEN }}
        COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
        # Must come from `needs`: the publish step lives in another job, so
        # `steps.publish-collections` is always empty here
        PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
      run: |
        cd veda-config
        git config --global user.name "github-actions[bot]"
        git config --global user.email "github-actions[bot]@users.noreply.github.com"
        git remote set-url origin "https://${GITHUB_TOKEN}@github.com/${{ env.VEDA_CONFIG_REPO }}"
        # Branch name is derived from the published file list, so re-running
        # for the same set of files reuses (and replaces) the same branch
        hash=$(echo -n "$PUBLISHED_COLLECTION_FILES" | md5sum | cut -d ' ' -f 1)
        NEW_BRANCH="add-dataset-$hash"
        git fetch origin
        if git ls-remote --exit-code --heads origin "$NEW_BRANCH"; then
          git push origin --delete "$NEW_BRANCH"
        fi
        git checkout -b "$NEW_BRANCH"
        git status
        git add .
        git commit -m "feat: add MDX files for dataset(s) [Automated workflow]"
        git push origin "$NEW_BRANCH"
        # Convert the comma-separated list into bullet points
        collection_bullet_points=""
        IFS=',' read -ra IDs <<< "$collection_ids"
        # Extract the first collection ID
        first_collection_id="${IDs[0]}"
        for id in "${IDs[@]}"; do
          collection_bullet_points+="- $id\n"
        done
        pr_title="Add dataset(s) - $first_collection_id [Automated PR by ${{ github.actor }}]"
        body="### Add dataset(s) - $first_collection_id [Automated PR by ${{ github.actor }}]\n\n$collection_bullet_points"
        echo "$body"
        # `echo -e` expands the literal \n sequences collected above
        PR_URL=$(gh pr create -R ${{ env.VEDA_CONFIG_REPO }} -H "$NEW_BRANCH" -B develop --title "$pr_title" --body "$(echo -e "$body")")
        echo "PR_URL=$PR_URL" >> "$GITHUB_OUTPUT"
        echo "PR creation succeeded!"

    # Updates the comment with a link to the above PR
    - name: Update PR comment with PR creation result
      if: success()
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
        PR_URL: ${{ steps.create-pr.outputs.PR_URL }}
      run: |
        CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
        UPDATED_BODY="$CURRENT_BODY
        **A PR has been created with the dataset configuration: 🗺️ [PR link]($PR_URL)**"
        gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

    - name: Update PR comment on PR creation failure
      if: failure() && steps.create-pr.outcome == 'failure'
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
      run: |
        CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
        UPDATED_BODY="$CURRENT_BODY
        **Failed ❌ to create a PR with the dataset configuration. 😔 **"
        gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

    # If the job fails anywhere other than PR creation, the PR comment will be updated
    - name: Update PR comment on overall workflow failure
      if: failure() && steps.create-pr.outcome != 'failure'
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
      run: |
        WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        # Output WORKFLOW_URL to logs for verification.
        # This must stay outside the UPDATED_BODY string below: its quotes
        # would otherwise terminate that string early and break the script.
        echo "Workflow URL: $WORKFLOW_URL"
        CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
        UPDATED_BODY="$CURRENT_BODY
        ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**"
        gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
        echo "Updated Comment Body: $UPDATED_BODY"
# Runs scripts/promote_to_production.py once for every dataset config file
# that `publish-new-datasets` published successfully.
publish-to-prod-on-pr-merge:
  # TEMPORARILY COMMENTED OUT TO TEST API REQUEST
  # NOTE(review): before re-enabling, note that the workflow's `pull_request`
  # trigger lists no `closed` activity type and that `publish-new-datasets`
  # (required via `needs`) only runs on `opened`/`synchronize`, so this
  # condition could never be true as written - confirm the intended trigger.
  # if: ${{ github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged == true }}
  runs-on: ubuntu-latest
  # NOTE(review): the job targets production but uses the `staging`
  # environment - confirm this is intended.
  environment: staging
  needs: [publish-new-datasets, create-mdx-files-and-open-pr]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Use output from publish-new-datasets
      run: |
        echo "The output from the previous step is: ${{ needs.publish-new-datasets.outputs.publishedCollections }}"

    - name: Publish to production on PR merge
      env:
        # Space-separated list of config file paths
        PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
        # Presumably read by promote_to_production.py - verify against the script
        SM2A_ADMIN_USERNAME: ${{ secrets.SM2A_ADMIN_USERNAME }}
        SM2A_ADMIN_PASSWORD: ${{ secrets.SM2A_ADMIN_PASSWORD }}
        SM2A_API_URL: ${{ vars.SM2A_API_URL }}
        PROMOTION_DAG: ${{ vars.PROMOTION_DAG_NAME }}
      run: |
        echo "$PUBLISHED_COLLECTION_FILES"
        pip install -r ./scripts/requirements.txt
        # Left unquoted on purpose: the list is split on whitespace
        for file in ${PUBLISHED_COLLECTION_FILES}; do
          python3 ./scripts/promote_to_production.py "$file"
          echo "Processed file: $file"
        done