# Incremental data sync #11679
# Workflow file for this run

# Starts a local Trino container with the trino-rest GitHub plugin and runs the
# plugin's Sync class against it, reading from TRINO_SRC_SCHEMA and writing to
# TRINO_DEST_SCHEMA.
name: Incremental data sync

on:
  # Allow manual runs.
  workflow_dispatch: {}
  # Run at minute 30 of every second hour.
  schedule:
    - cron: '30 */2 * * *'

# One concurrency group per workflow; a run that is already in progress is
# never cancelled by a newer one.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

# The default workflow token is only used by actions/checkout below; the
# GITHUB_TOKEN environment variable of the job is a separate personal token.
permissions:
  contents: read

jobs:
  sync:
    runs-on: ubuntu-latest
    env:
      # Credentials forwarded into the Trino container by the "Start Trino" step.
      GITHUB_TOKEN: ${{ secrets.PERSONAL_GITHUB_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_REGION: ${{ secrets.AWS_REGION }}
      # Tag of the trinodb/trino image, and version of the trino-rest release
      # archive and image. Quoted so they are always handled as strings.
      TRINO_VERSION: "459"
      TRINO_REST_VERSION: "0.155"
      # Settings passed to the Sync containers.
      GITHUB_OWNER: trinodb
      GITHUB_REPO: trino
      TRINO_DEST_SCHEMA: trinocicd.v2
      TRINO_SRC_SCHEMA: github.default
      EMPTY_INSERT_LIMIT: "5"
      # TODO: artifacts are temporarily removed because they cause OOMs;
      # figure out a different way to fetch them.
      SYNC_TABLES: runs,jobs,steps,check_suites,check_runs,check_run_annotations,pulls,pull_commits,pull_stats,review_comments,reviews,issues,issue_comments,commits,teams,members
    steps:
      - uses: actions/checkout@v4

      # Fetch and unpack the trino-rest GitHub plugin release archive.
      - name: Download trino-rest
        run: |
          curl -fLOsS "https://github.com/nineinchnick/trino-rest/releases/download/v$TRINO_REST_VERSION/trino-rest-github-$TRINO_REST_VERSION.zip"
          unzip "trino-rest-github-$TRINO_REST_VERSION.zip"

      # Write a Trino config, start the container with the plugin, catalog
      # files and SQL directory mounted, export its ID and IP address to later
      # steps, then wait for the container to report a healthy status.
      - name: Start Trino
        run: |
          cat <<EOF >config.properties
          coordinator=true
          node-scheduler.include-coordinator=true
          http-server.http.port=8080
          discovery.uri=http://localhost:8080
          query.max-memory-per-node=4086929818B
          EOF
          mkdir -p "$(pwd)/hive-cache" && chmod go+w "$(pwd)/hive-cache"
          # The catalog/ and sql/ paths are presumably provided by the
          # checked-out repository; they are not created in this workflow.
          CONTAINER_ID=$(docker run \
            -v "$(pwd)/config.properties:/etc/trino/config.properties" \
            -v "$(pwd)/trino-rest-github-$TRINO_REST_VERSION:/usr/lib/trino/plugin/github" \
            -v "$(pwd)/catalog/github.properties:/etc/trino/catalog/github.properties" \
            -v "$(pwd)/catalog/trinocicd.properties:/etc/trino/catalog/trinocicd.properties" \
            -v "$(pwd)/hive-cache:/opt/hive-cache" \
            -v "$(pwd)/sql:/sql" \
            -e AWS_ACCESS_KEY_ID \
            -e AWS_SECRET_ACCESS_KEY \
            -e AWS_REGION \
            -e GITHUB_TOKEN \
            -p 8080:8080 \
            --name trino \
            -d \
            "trinodb/trino:$TRINO_VERSION")
          SERVER_IP=$(docker inspect --format '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$CONTAINER_ID")
          # Make the container ID and IP address available to the following steps.
          echo "CONTAINER_ID=$CONTAINER_ID" >> "$GITHUB_ENV"
          echo "SERVER_IP=$SERVER_IP" >> "$GITHUB_ENV"
          # Poll the container health status every 10 seconds and give up
          # once the retry counter reaches 10.
          i=0
          until docker inspect "${CONTAINER_ID}" --format "{{json .State.Health.Status }}" | grep -q '"healthy"'; do
            if [[ $((i++)) -ge 10 ]]; then
              echo "🚨 Too many retries waiting for Trino to start"
              exit 1
            fi
            sleep 10
          done

      # Run the Sync class with the job-level owner, repository and table list.
      - name: Run Sync
        run: |
          docker run \
            -e GITHUB_OWNER \
            -e GITHUB_REPO \
            -e TRINO_DEST_SCHEMA \
            -e TRINO_SRC_SCHEMA \
            -e EMPTY_INSERT_LIMIT \
            -e SYNC_TABLES \
            -e TRINO_URL="jdbc:trino://$SERVER_IP:8080/github/default" \
            --rm \
            "nineinchnick/trino-rest:$TRINO_REST_VERSION" \
            java -cp "/usr/lib/trino/plugin/github/*" pl.net.was.rest.github.Sync

      # Same Sync run, but with the owner overridden to starburstdata and the
      # table list limited to teams and members.
      - name: Run Sync for Starburst Teams
        run: |
          docker run \
            -e GITHUB_OWNER=starburstdata \
            -e GITHUB_REPO \
            -e TRINO_DEST_SCHEMA \
            -e TRINO_SRC_SCHEMA \
            -e EMPTY_INSERT_LIMIT \
            -e SYNC_TABLES=teams,members \
            -e TRINO_URL="jdbc:trino://$SERVER_IP:8080/github/default" \
            --rm \
            "nineinchnick/trino-rest:$TRINO_REST_VERSION" \
            java -cp "/usr/lib/trino/plugin/github/*" pl.net.was.rest.github.Sync

      # Always print the Trino container logs, including after a failed step.
      - name: Dump Trino logs
        if: always()
        run: |
          docker logs trino