Skip to content

Check Website links #784

Check Website links

Check Website links #784

Workflow file for this run

# Ultralytics YOLO 🚀, AGPL-3.0 license
# Continuous Integration (CI) GitHub Actions tests broken link checker using https://github.com/lycheeverse/lychee
# Ignores the following status codes to reduce false positives:
# - 401(Vimeo, 'unauthorized')
# - 403(OpenVINO, 'forbidden')
# - 429(Instagram, 'too many requests')
# - 500(Zenodo, 'cached')
# - 502(Zenodo, 'bad gateway')
# - 999(LinkedIn, 'unknown status code')
name: Check Website links
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *" # runs at 00:00 UTC every day
jobs:
Links:
runs-on: ubuntu-latest
strategy:
fail-fast: false # This ensures that if one job fails, the others will still run
matrix:
website: [www.ultralytics.com, docs.ultralytics.com]
steps:
- name: Download and install lychee
run: |
LYCHEE_URL=$(curl -s https://api.github.com/repos/lycheeverse/lychee/releases/latest | grep "browser_download_url" | grep "x86_64-unknown-linux-gnu.tar.gz" | cut -d '"' -f 4)
curl -L $LYCHEE_URL -o lychee.tar.gz
tar xzf lychee.tar.gz
sudo mv lychee /usr/local/bin
- name: Download Website
run: |
# Download sitemap.xml
wget -O sitemap.xml https://${{ matrix.website }}/sitemap.xml
# Parse URLs using a combination of tr, sed, and grep
tr '\n' ' ' < sitemap.xml | \
sed 's/<loc>/\n<loc>/g' | \
grep -oP '(?<=<loc>).*?(?=</loc>)' | \
sed 's/^[[:space:]]*//;s/[[:space:]]*$//' > urls.txt
# Count total URLs to be downloaded
total_urls=$(wc -l < urls.txt)
echo "Total URLs to be downloaded: $total_urls"
# Download all URLs
wget \
--adjust-extension \
--reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
--input-file=urls.txt \
--no-clobber \
--no-parent \
--wait=0.001 \
--random-wait \
--tries=3 \
--no-verbose \
--force-directories
- name: Run Broken Link Checks on Website
id: lychee
uses: ultralytics/actions/retry@main
with:
timeout_minutes: 60
retry_delay_seconds: 300
retries: 2
run: |
# Count successfully downloaded files
downloaded_files=$(find ${{ matrix.website }} -type f | wc -l)
echo "Scanning $downloaded_files downloaded pages for broken links..."
# Create summary.txt with the total page count
echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
echo "" >> summary.txt
rm -rf .lycheecache
lychee \
--scheme 'https' \
--timeout 60 \
--insecure \
--accept 401,403,429,500,502,999 \
--exclude-all-private \
--exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
--exclude-path '**/ci.yaml' \
--github-token ${{ secrets.GITHUB_TOKEN }} \
--header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
'./${{ matrix.website }}/**/*.html' | tee -a summary.txt
# Add the summary to GitHub step summary
cat summary.txt >> $GITHUB_STEP_SUMMARY
# Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape special characters)
ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
echo "SUMMARY<<EOF" >> $GITHUB_ENV
echo "$ESCAPED_SUMMARY" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
# Check if lychee found any broken links
if grep -q "0 Errors" summary.txt; then
echo "No broken links found."
exit 0
else
echo "Broken links found."
exit 1
fi
- name: Check for failure and notify
if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
uses: slackapi/[email protected]
with:
payload: |
{"text": "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"}
env:
SLACK_WEBHOOK_URL: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}