-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(docs): add CI Spell checks (#949)
Co-authored-by: Joshua Croft <[email protected]>
- Loading branch information
1 parent
d8cc924
commit 076ec5d
Showing
76 changed files
with
2,346 additions
and
877 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Runs the repository's custom spell checker on every pull request.
name: Spell Checking

on:
  pull_request:
    branches:
      - "**"

jobs:
  spellcheck:
    name: Check Spelling with custom Python script
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML keeps the version as a string; an unquoted value
        # such as 3.10 would be parsed as the number 3.1.
        python-version: ["3.8"]
    steps:
      # Step 1: Checkout the code
      - name: Checkout code
        uses: actions/checkout@v4

      # Step 2: Set up Python
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # Step 3: Install dependencies
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y aspell aspell-en  # Install aspell and the English dictionary
          python -m pip install --upgrade pip
          pip install spacy markdown-it-py pyspellchecker
          python -m spacy download en_core_web_sm  # Install spaCy language model
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      # Step 4: Run the custom Python spell-checking script
      # (a non-zero exit status from the script fails the job)
      - name: Run spell-check script
        run: |
          python check_spelling.py || exit 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
import os | ||
import re | ||
import spacy | ||
from markdown_it import MarkdownIt | ||
from spellchecker import SpellChecker | ||
import sys | ||
|
||
# Load the spaCy model and spell checker
# NOTE(review): `nlp` is not referenced in the visible part of this script.
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

# Load custom dictionary: one allowed word per line, compared case-insensitively.
# The encoding is explicit so decoding does not depend on the runner's locale,
# and blank lines are skipped so they do not add an empty entry.
custom_dict_path = 'dictionaries/custom_dict.txt'
with open(custom_dict_path, 'r', encoding='utf-8') as f:
    custom_words = {line.strip().lower() for line in f if line.strip()}

# Regex patterns for MDX constructs that are stripped before spell checking
import_pattern = re.compile(r'import\s*{\s*([\s\S]*?)\s*}\s*from\s*["\']([^"\']+)["\'];', re.MULTILINE)
jsx_like_tags_pattern = re.compile(r'<[^>]*>[\s\S]*?<\/[^>]*>|<[^>]*?/>', re.DOTALL)
path_pattern = re.compile(r'path:\s*"/[^"]*"')
guidebox_pattern = re.compile(r'<GuideBox[\s\S]*?/>', re.IGNORECASE)
# NOTE(review): `hex_colours` is not referenced in the visible part of this script.
hex_colours = re.compile(r'([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})')

# Corrected word pattern to allow apostrophes in valid words
word_pattern = re.compile(r"\b\w+(?:'\w+)?\b")

# Pattern to exclude words containing escape sequences (\n, \u, etc.)
escape_sequence_pattern = re.compile(r'\\[nu][0-9a-fA-F]+|u[0-9a-fA-F]{4}')
|
||
# Function to extract text while ignoring specified components and handling code blocks differently
def extract_text_from_mdx(file_path):
    """Split an MDX file into spell-checkable prose and fenced code blocks.

    Import statements, ``path: "/..."`` entries, ``<GuideBox ... />``
    components and JSX-like tags are removed before the remaining content is
    parsed as Markdown.

    Args:
        file_path: Path of the ``.mdx`` file to read.

    Returns:
        A ``(text, code_blocks)`` tuple: ``text`` is the content of every
        Markdown text token joined with newlines (inline code is skipped),
        and ``code_blocks`` is a list with the content of each fenced block.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Remove import statements
    content = import_pattern.sub('', content)

    # Remove paths and GuideBox components
    content = path_pattern.sub('', content)
    content = guidebox_pattern.sub('', content)

    # Remove JSX components and JSX-like tags
    content = jsx_like_tags_pattern.sub('', content)

    # Initialize the Markdown parser
    md = MarkdownIt()
    parsed = md.parse(content)

    # Extract text while separating code blocks for warnings
    text = []
    code_blocks = []

    def traverse(node):
        # markdown-it emits one self-contained `fence` token per fenced block
        # (its code is in `content`), so there is no open/close state to
        # track. Toggling a flag here used to skip all prose after an odd
        # number of fences and drop every second code block.
        if node.type == 'fence':
            code_blocks.append(node.content)
            return
        # Inline code is neither prose nor a reported code block.
        if node.type == 'code_inline':
            return
        if node.type == 'text':
            text.append(node.content)

        for child in node.children or []:
            traverse(child)

    for node in parsed:
        traverse(node)

    return '\n'.join(text), code_blocks
|
||
# Function to check for spelling errors
def check_spelling(text, is_code_block=False):
    """Return the words in *text* that the spell checker does not recognise.

    Words are found with ``word_pattern``; any word containing an underscore
    is split into its parts. A word is skipped when it is in the custom
    dictionary, matches the escape-sequence pattern, contains an apostrophe,
    or matches one of the skip patterns below. The remaining words are
    lower-cased and passed to ``spell.unknown``.

    Args:
        text: The text to check.
        is_code_block: When true, the result is wrapped as
            ``{'warnings': <unknown words>}`` instead of being returned
            directly.

    Returns:
        The result of ``spell.unknown`` for the remaining words, or a dict
        with that result under ``'warnings'`` when *is_code_block* is true.
    """
    # Tokens matching any of these (anchored at the token start) are skipped.
    skip_patterns = (
        # "n-prefixed" words.
        # NOTE(review): this matches every token starting with a lowercase
        # "n" followed by a word character, so ordinary words like
        # "network" are never checked - confirm this is intended.
        re.compile(r'\bn\w+'),
        re.compile(r'^\d+(px|%|em|rem|vh|vw|pt|cm|mm|in|s|ms|deg)?$'),  # CSS values
        re.compile(r'^(#?[A-Fa-f0-9]{3}|#?[A-Fa-f0-9]{6})$'),  # Hex colors
        re.compile(r'^0x[a-fA-F0-9]{40}$'),  # Ethereum addresses
        re.compile(r'^[a-f0-9]{40}$'),  # Hash-like strings (40 hex characters)
    )

    # Collect candidate tokens, breaking snake_case words into their parts.
    tokens = []
    for found in word_pattern.findall(text):
        if '_' in found:
            tokens.extend(re.split(r'[_\s]+', found))
        else:
            tokens.append(found)

    def should_check(token):
        # Decide whether a token is passed on to the spell checker.
        if token.lower() in custom_words:
            return False
        if escape_sequence_pattern.search(token):
            return False
        if "'" in token:  # Words with apostrophes are not checked
            return False
        if any(pattern.match(token) for pattern in skip_patterns):
            return False
        return bool(token.strip())  # Drop empty strings left by the split

    candidates = [token.lower() for token in tokens if should_check(token)]
    unknown = spell.unknown(candidates)

    # Code-block findings are wrapped so the caller can report them as warnings.
    if is_code_block:
        return {'warnings': unknown}
    return unknown
|
||
# Function to check all .mdx files in a directory
def check_directory(directory):
    """Spell-check every ``.mdx`` file found under *directory*.

    Findings are printed per file. Unknown words in prose are reported as
    errors; unknown words in fenced code blocks are reported as warnings and
    do not affect the return value.

    Args:
        directory: Root directory that is walked recursively.

    Returns:
        True if at least one file has spelling errors in its prose,
        otherwise False.
    """
    has_errors = False

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.mdx'):
                continue

            file_path = os.path.join(root, file)
            print(f'========== Checking file: {file_path} ==========')

            # Extract text and code blocks from the MDX file
            text, code_blocks = extract_text_from_mdx(file_path)

            # Check for spelling errors in text
            errors = check_spelling(text)
            if errors:
                print(f'Spelling errors in {file_path}:')
                # Sorted so the log order is stable between runs.
                for error in sorted(errors):
                    print(f' - {error}')
                has_errors = True

            # Check for spelling errors in code blocks (warnings).
            # Accumulate across all blocks: reassigning the result on each
            # iteration would report only the last code block's warnings.
            warnings = set()
            for code_block in code_blocks:
                result = check_spelling(code_block, is_code_block=True)
                warnings.update(result.get('warnings', []))

            if warnings:
                print(f'Warnings (spelling errors in code block) in {file_path}:')
                for warning in sorted(warnings):
                    print(f' - {warning}')

    return has_errors
|
||
# Directory to check
directory_path = 'pages'
has_errors = check_directory(directory_path)

# Exit with status 1 when spelling errors were found, 0 otherwise,
# so the calling CI step fails on errors.
if has_errors:
    sys.exit(1)
sys.exit(0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
10px |
Oops, something went wrong.