Skip to content

Commit

Permalink
Add markdown link checker (#605)
Browse files Browse the repository at this point in the history
Co-authored-by: Akuli <[email protected]>
  • Loading branch information
littlewhitecloud and Akuli authored Jan 13, 2025
1 parent acbd5ed commit 0206836
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 2 deletions.
13 changes: 12 additions & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ jobs:
version: 2.7.0
- run: editorconfig-checker

markdownlink-checker:
timeout-minutes: 1
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: '3.13'
- run: pip install requests
- name: "Check the links in the markdown files"
run: python check-markdown-links.py

test:
timeout-minutes: 5
runs-on: ubuntu-22.04
Expand Down Expand Up @@ -59,7 +71,6 @@ jobs:
- uses: actions/checkout@v3
- run: sudo apt update
- run: sudo apt install -y llvm-{11,13,14}-dev clang-{11,13,14} make

- run: LLVM_CONFIG=llvm-config-11 ./doctest.sh
- run: make clean
- run: LLVM_CONFIG=llvm-config-13 ./doctest.sh
Expand Down
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ If you have any trouble with this, please create an issue!

<details> <summary>Linux</summary>

Following the [instructions in the README](README.md#setup-linux) is enough.
Following the [instructions in the README](README.md#setup) is enough.

To edit the C code, you can use any editor that uses `clangd`.
The `make` command creates a file `compile_flags.txt`
Expand Down
141 changes: 141 additions & 0 deletions check-markdown-links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Check that links in markdown files point to reasonable places."""
# Taken from https://github.com/Akuli/porcupine/

import argparse
import os
import re
import subprocess
import sys
from functools import cache
from http.client import responses as status_code_names
from pathlib import Path

import requests

# Directory that contains this script. The assert checks that README.md is in it too.
PROJECT_ROOT = Path(__file__).absolute().parent
assert (PROJECT_ROOT / "README.md").is_file()
# Make PROJECT_ROOT the working directory, so that relative paths and the
# commands run by this script are resolved against it.
os.chdir(PROJECT_ROOT)


def find_markdown_files():
    """Return the paths of all markdown files tracked by git.

    Runs ``git ls-files`` in the current working directory, so the returned
    paths are relative to it.

    Returns:
        A list of ``Path`` objects, one for each tracked ``*.md`` file.

    Raises:
        subprocess.CalledProcessError: If the git command fails.
    """
    # -z makes git print file names verbatim, separated by NUL characters.
    # Without it, names containing unusual characters (e.g. non-ASCII letters)
    # are printed quoted and escaped, and would not name any existing file.
    output = subprocess.check_output(
        ["git", "ls-files", "-z", "*.md"], text=True, encoding="utf-8"
    )
    # The output ends with a NUL, so splitting leaves an empty string to drop.
    return [Path(name) for name in output.split("\0") if name]


# Patterns that extract link targets from one line of markdown.
# They are compiled once here instead of being rebuilt for every line.
_LINK_REGEXES = (
    # [text](target)
    # Only the "](target)" part is matched, so the text can contain spaces,
    # and it can even be split across many lines.
    # This also detects images ![text](target) and that's a good thing.
    # The target may end with one parenthesized part, as in "Foo_(bar)".
    re.compile(r"\]\(([^\s()]+(?:\([^\s()]+\))?)\)"),
    # [blah blah]: target
    re.compile(r"^\[[^\[\]]+\]: (.+)$"),
)


def find_links(markdown_file_path):
    """Yield the link targets found in a markdown file.

    The file is read as UTF-8 and scanned line by line.

    Args:
        markdown_file_path: ``Path`` of the markdown file to scan.

    Yields:
        ``(lineno, link_target)`` tuples, where ``lineno`` is the 1-based
        number of the line that contains the target. Within a line, all
        ``[text](target)`` links come before a ``[name]: target`` definition.
    """
    content = markdown_file_path.read_text(encoding="utf-8")

    for lineno, line in enumerate(content.splitlines(), start=1):
        for regex in _LINK_REGEXES:
            for link_target in regex.findall(line):
                yield (lineno, link_target)


@cache
def check_https_url(url):
    """Check that an https:// URL responds with the expected status code.

    Sends an HTTP HEAD request without following redirects. Results are
    cached, so each distinct URL is requested at most once per run.

    Args:
        url: The URL to check.

    Returns:
        ``None`` if the URL looks fine, otherwise a string describing the problem.
    """
    try:
        # Many sites redirect to front page for bad URLs. Let's not treat that as ok.
        response = requests.head(url, timeout=10, allow_redirects=False)
    except requests.exceptions.RequestException as e:
        return f"HTTP HEAD request failed: {e}"

    if url == "https://github.com/Akuli/jou/issues/new":
        # It returns 302 Found, because it redirects to login page
        expected_status = 302
    else:
        expected_status = 200

    if response.status_code != expected_status:
        # Not every status code a server may send has an entry in the table
        # of standard names, so fall back to a placeholder instead of
        # crashing with KeyError.
        status_name = status_code_names.get(response.status_code, "Unknown Status")
        return f"site returns {response.status_code} {status_name}"

    return None


def get_all_refs(path):
    """Return the anchors that can be used to link to headings of a markdown file.

    A heading is a line that starts with one or more ``#`` characters followed
    by a space. Its title is lowercased, apostrophes are removed, and the
    remaining runs of ASCII letters and digits are joined with ``-``.
    For example, the heading "## Loading order" gives "#loading-order".

    Args:
        path: ``Path`` of the markdown file, read as UTF-8.

    Returns:
        A list of anchor strings (each starting with ``#``), in the order the
        headings appear in the file.
    """
    text = path.read_text(encoding="utf-8")

    result = []
    # With re.MULTILINE, ^ matches at the start of every line, including the
    # very first line of the file, which is where the main title usually is.
    for title in re.findall(r"^#+ (.*)", text, flags=re.MULTILINE):
        words = re.findall(r"[a-z0-9]+", title.lower().replace("'", ""))
        result.append("#" + "-".join(words))
    return result


def check_link(markdown_file_path, link_target, offline_mode=False):
    """Check one link target found in a markdown file.

    Args:
        markdown_file_path: ``Path`` of the markdown file that contains the link.
            Relative link targets are resolved against its parent directory.
        link_target: The link target, e.g. ``https://example.com``,
            ``doc/foo.md``, ``doc/foo.md#some-heading`` or ``#some-heading``.
        offline_mode: If true, the caller must not pass ``https://`` links,
            because checking them needs HTTP requests.

    Returns:
        ``None`` if the link looks fine, otherwise a string describing the problem.
    """
    if link_target.startswith("http://"):
        return "this link should probably use https instead of http"

    if link_target.startswith("https://"):
        # The caller is expected to skip https:// links in offline mode.
        assert not offline_mode
        return check_https_url(link_target)

    if "//" in link_target:
        return "double slashes are allowed only in http:// and https:// links"

    if "\\" in link_target:
        return "use forward slashes instead of backslashes"

    file_part, has_anchor, anchor = link_target.partition("#")
    if link_target.startswith("#"):
        # A bare "#heading" refers to a heading in the same markdown file,
        # not to the directory that contains it.
        path = markdown_file_path
    else:
        path = markdown_file_path.parent / file_part

    # A link to the project root folder itself is still inside the project.
    resolved = path.resolve()
    if resolved != PROJECT_ROOT and PROJECT_ROOT not in resolved.parents:
        return "link points outside of the Jou project folder"

    if not path.exists():
        return "link points to a file or folder that doesn't exist"

    if has_anchor:
        # Reference to title within markdown file.
        # For example: architecture-and-design.md#loading-order
        if (not path.is_file()) or path.suffix != ".md":
            return "hashtag '#' can only be used with markdown files"

        refs = get_all_refs(path)
        ref = "#" + anchor
        if ref not in refs:
            return f"no heading in {path} matches {ref} (should be one of: {' '.join(refs)})"

    return None


def main():
    """Check every link in the tracked markdown files and report the bad ones.

    Each problem is printed as ``path:lineno: problem``. If any problem was
    found, the process exits with status 1. Otherwise a summary line is
    printed. With ``--offline``, ``https://`` links are skipped entirely.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--offline",
        action="store_true",
        help="don't do HTTP requests, just assume that https:// links are fine",
    )
    options = arg_parser.parse_args()

    markdown_files = find_markdown_files()
    assert markdown_files

    ok_count = 0
    failure_count = 0

    for markdown_file in markdown_files:
        for lineno, target in find_links(markdown_file):
            # In offline mode, https:// links are neither checked nor counted.
            if options.offline and target.startswith("https://"):
                continue

            error = check_link(markdown_file, target, offline_mode=options.offline)
            if not error:
                ok_count += 1
                continue

            print(f"{markdown_file}:{lineno}: {error}")
            failure_count += 1

    # Finding no links at all most likely means that something is broken.
    assert ok_count + failure_count > 0

    if failure_count > 0:
        sys.exit(1)

    print(f"checked {ok_count} links, no errors :)")


# Run the link check. main() exits with status 1 if it found any bad links.
main()

0 comments on commit 0206836

Please sign in to comment.