# Check Website links #782
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Continuous Integration (CI) GitHub Actions tests broken link checker using https://github.com/lycheeverse/lychee
# Ignores the following status codes to reduce false positives:
#   - 401(Vimeo, 'unauthorized')
#   - 403(OpenVINO, 'forbidden')
#   - 429(Instagram, 'too many requests')
#   - 500(Zenodo, 'cached')
#   - 502(Zenodo, 'bad gateway')
#   - 999(LinkedIn, 'unknown status code')
name: Check Website links

# Triggers: manual dispatch plus a daily scheduled run.
on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # runs at 00:00 UTC every day
jobs:
  # For each site in the matrix: install lychee, mirror every page listed in the
  # site's sitemap.xml, run lychee over the downloaded HTML, and notify Slack
  # when a scheduled run reports broken links.
  Links:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # This ensures that if one job fails, the others will still run
      matrix:
        website: [www.ultralytics.com, docs.ultralytics.com]
    steps:
      - name: Download and install lychee
        run: |
          # Resolve the x86_64 Linux tarball URL of the latest lychee release (first match only)
          LYCHEE_URL=$(curl -s https://api.github.com/repos/lycheeverse/lychee/releases/latest | grep "browser_download_url" | grep "x86_64-unknown-linux-gnu.tar.gz" | cut -d '"' -f 4 | head -n 1)
          # Fail early with a clear message if the release lookup returned nothing
          if [ -z "$LYCHEE_URL" ]; then
            echo "Could not determine lychee download URL from the GitHub API response." >&2
            exit 1
          fi
          curl -L "$LYCHEE_URL" -o lychee.tar.gz
          tar xzf lychee.tar.gz
          sudo mv lychee /usr/local/bin

      - name: Download Website
        run: |
          # Download sitemap.xml
          wget -O sitemap.xml https://${{ matrix.website }}/sitemap.xml

          # Parse URLs using a combination of tr, sed, and grep
          tr '\n' ' ' < sitemap.xml | \
            sed 's/<loc>/\n<loc>/g' | \
            grep -oP '(?<=<loc>).*?(?=</loc>)' | \
            sed 's/^[[:space:]]*//;s/[[:space:]]*$//' > urls.txt

          # Count total URLs to be downloaded
          total_urls=$(wc -l < urls.txt)
          echo "Total URLs to be downloaded: $total_urls"

          # Download all URLs
          wget \
            --adjust-extension \
            --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
            --input-file=urls.txt \
            --no-clobber \
            --no-parent \
            --wait=0.001 \
            --random-wait \
            --tries=3 \
            --no-verbose \
            --force-directories

      - name: Run Broken Link Checks on Website
        id: lychee
        uses: ultralytics/actions/retry@main
        with:
          timeout_minutes: 60
          retry_delay_seconds: 300
          retries: 2
          run: |
            # Count successfully downloaded files
            downloaded_files=$(find ${{ matrix.website }} -type f | wc -l)
            echo "Scanning $downloaded_files downloaded pages for broken links..."

            # Create summary.txt with the total page count
            echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
            echo "" >> summary.txt

            rm -rf .lycheecache
            lychee \
              --scheme 'https' \
              --timeout 60 \
              --insecure \
              --accept 401,403,429,500,502,999 \
              --exclude-all-private \
              --exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
              --exclude-path '**/ci.yaml' \
              --github-token ${{ secrets.GITHUB_TOKEN }} \
              --header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
              './${{ matrix.website }}/**/*.html' | tee -a summary.txt

            # Add the summary to GitHub step summary
            cat summary.txt >> $GITHUB_STEP_SUMMARY

            # Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape special characters)
            ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
            echo "SUMMARY<<EOF" >> $GITHUB_ENV
            echo "$ESCAPED_SUMMARY" >> $GITHUB_ENV
            echo "EOF" >> $GITHUB_ENV

            # Check if lychee found any broken links.
            # The zero must not be preceded by another digit, otherwise counts such as
            # "10 Errors" or "120 Errors" would be mistaken for "0 Errors".
            if grep -qE '(^|[^0-9])0 Errors' summary.txt; then
              echo "No broken links found."
              exit 0
            else
              echo "Broken links found."
              exit 1
            fi

      - name: Check for failure and notify
        # Only notify on the first attempt of a scheduled run whose link check failed
        if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
        # NOTE(review): the action reference was garbled in the source ("slackapi/[email protected]").
        # The SLACK_WEBHOOK_URL env style below matches the v1 interface of
        # slackapi/slack-github-action; confirm the exact version tag to pin.
        uses: slackapi/slack-github-action@v1.27.0
        with:
          payload: |
            {"text": "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"}
        env:
          # Route the alert to the webhook that corresponds to the site being checked
          SLACK_WEBHOOK_URL: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}