Skip to content

Commit

Permalink
feat(docs): add CI Spell checks (#949)
Browse files Browse the repository at this point in the history
Co-authored-by: Joshua Croft <[email protected]>
  • Loading branch information
FelixNicolaeBucsa and Joshua Croft authored Sep 23, 2024
1 parent d8cc924 commit 076ec5d
Show file tree
Hide file tree
Showing 76 changed files with 2,346 additions and 877 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/spellcheck.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Runs the repository's custom spell checker (check_spelling.py) on every pull request.
name: Spell Checking

on:
  pull_request:
    branches:
      - "**"

jobs:
  spellcheck:
    name: Check Spelling with custom Python script
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML keeps the version as a string; an unquoted value such
        # as 3.10 would be parsed as the float 3.1.
        python-version: ["3.8"]
    steps:
      # Step 1: Checkout the code
      - name: Checkout code
        uses: actions/checkout@v4

      # Step 2: Set up Python
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # Step 3: Install dependencies
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y aspell aspell-en # Install aspell and the English dictionary
          python -m pip install --upgrade pip
          pip install spacy markdown-it-py pyspellchecker
          python -m spacy download en_core_web_sm # Install spaCy language model
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      # Step 4: Run the custom Python spell-checking script
      # (the script exits non-zero when it finds spelling errors, which fails the job)
      - name: Run spell-check script
        run: |
          python check_spelling.py || exit 1
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Welcome to the code repository for Fetch.ai's documentation at [fetch.ai/docs](f
### Install dependencies

```bash
pnpm install
pnpm install test
```

### Run Development Server
Expand Down
153 changes: 153 additions & 0 deletions check_spelling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import os
import re
import spacy
from markdown_it import MarkdownIt
from spellchecker import SpellChecker
import sys

# Load the spaCy model and spell checker.
# NOTE(review): `nlp` is not referenced anywhere in the visible part of this
# file; it is kept because other code may rely on the name -- confirm.
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

# Load the custom dictionary: one allowed word per line, compared in lower case.
# The encoding is explicit so non-ASCII entries load the same way on every
# platform, and blank lines are skipped so they do not add an empty entry.
custom_dict_path = 'dictionaries/custom_dict.txt'
with open(custom_dict_path, 'r', encoding='utf-8') as f:
    custom_words = {line.strip().lower() for line in f if line.strip()}

# Regex patterns used to strip non-prose MDX constructs before spell checking.

# Named ES imports of the form: import { A, B } from "module";
# (re.MULTILINE has no effect here: the pattern contains no ^ or $ anchors.)
import_pattern = re.compile(r'import\s*{\s*([\s\S]*?)\s*}\s*from\s*["\']([^"\']+)["\'];', re.MULTILINE)
# Either an opening tag, the shortest possible content, and the next closing
# tag (tag names are not required to match), or a self-closing tag.
# (re.DOTALL has no effect here: the pattern contains no "." metacharacter.)
jsx_like_tags_pattern = re.compile(r'<[^>]*>[\s\S]*?<\/[^>]*>|<[^>]*?/>', re.DOTALL)
# Fragments of the form: path: "/some/route"
path_pattern = re.compile(r'path:\s*"/[^"]*"')
# Self-closing <GuideBox ... /> components, matched case-insensitively.
guidebox_pattern = re.compile(r'<GuideBox[\s\S]*?/>', re.IGNORECASE)
# Unanchored run of six or three hex digits.
# NOTE(review): not referenced in the visible part of this file -- confirm it is still needed.
hex_colours = re.compile(r'([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})')

# Corrected word pattern to allow apostrophes in valid words:
# a run of word characters optionally followed by one apostrophe-joined suffix (e.g. "don't").
word_pattern = re.compile(r"\b\w+(?:'\w+)?\b")

# Pattern to exclude words containing escape sequences (\n, \u, etc.):
# a backslash, then "n" or "u", then hex digits -- or a bare "u" followed by four hex digits.
escape_sequence_pattern = re.compile(r'\\[nu][0-9a-fA-F]+|u[0-9a-fA-F]{4}')

# Function to extract text while ignoring specified components and handling code blocks differently
def extract_text_from_mdx(file_path):
    """Split an MDX file into spell-checkable prose and fenced code blocks.

    Import statements, ``path: "..."`` fragments, ``<GuideBox ... />``
    components and JSX-like tags are removed first; the remainder is parsed
    as Markdown.

    Args:
        file_path: Path of the ``.mdx`` file to read (decoded as UTF-8).

    Returns:
        A ``(text, code_blocks)`` tuple: ``text`` is the content of every
        Markdown text token joined with newlines, and ``code_blocks`` is a
        list holding the body of each fenced code block, in document order.
        Inline code spans are left out of both.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Remove import statements
    content = import_pattern.sub('', content)

    # Remove paths and GuideBox components
    content = path_pattern.sub('', content)
    content = guidebox_pattern.sub('', content)

    # Remove JSX components and JSX-like tags
    content = jsx_like_tags_pattern.sub('', content)

    # Parse what is left into markdown-it's token stream
    md = MarkdownIt()
    tokens = md.parse(content)

    text = []
    code_blocks = []

    def traverse(token):
        # A fenced block is emitted as ONE self-contained 'fence' token whose
        # content is the whole code body; there is no open/close pair.
        # Treating each fence as a toggle would silently skip all prose
        # between the first and second fence and drop every other code block.
        if token.type == 'fence':
            code_blocks.append(token.content)
            return
        # Inline code is neither prose nor a reported code block.
        if token.type == 'code_inline':
            return
        if token.type == 'text':
            text.append(token.content)

        # Inline containers carry their text tokens as children.
        for child in token.children or []:
            traverse(child)

    for token in tokens:
        traverse(token)

    return '\n'.join(text), code_blocks

# Function to check for spelling errors
def check_spelling(text, is_code_block=False):
    """Return the words in ``text`` that the spell checker does not know.

    Words are found with ``word_pattern``; any word containing an underscore
    is split into its parts. A word is skipped (never reported) when it:
    is empty, is listed in ``custom_words`` (case-insensitive), contains an
    apostrophe, matches ``escape_sequence_pattern``, starts with a lowercase
    "n" followed by at least one more word character, or looks like a CSS
    value, a hex colour, an Ethereum address, or a 40-character hex hash.

    Args:
        text: The text to check.
        is_code_block: When true, the result is wrapped as
            ``{'warnings': <unknown words>}`` instead of being returned bare.

    Returns:
        The result of ``spell.unknown`` on the lower-cased remaining words,
        or a dict with that result under ``'warnings'`` for code blocks.
    """
    # Shapes that are matched from the start of a word and never spell checked.
    # NOTE(review): the "n" prefix rule presumably targets leftovers of "\n"
    # escapes, but it also skips every ordinary word starting with "n" -- confirm.
    skipped_shapes = (
        re.compile(r'\bn\w+'),                                              # "n-prefixed" words
        re.compile(r'^\d+(px|%|em|rem|vh|vw|pt|cm|mm|in|s|ms|deg)?$'),      # CSS values
        re.compile(r'^(#?[A-Fa-f0-9]{3}|#?[A-Fa-f0-9]{6})$'),               # Hex colors
        re.compile(r'^0x[a-fA-F0-9]{40}$'),                                 # Ethereum addresses
        re.compile(r'^[a-f0-9]{40}$'),                                      # Hash-like strings
    )

    def should_check(token):
        # Empty pieces appear when a word starts or ends with an underscore.
        if not token.strip():
            return False
        if token.lower() in custom_words:
            return False
        if "'" in token:
            return False
        if escape_sequence_pattern.search(token):
            return False
        return not any(shape.match(token) for shape in skipped_shapes)

    # Collect word tokens, breaking snake_case identifiers into their parts.
    tokens = []
    for found in word_pattern.findall(text):
        tokens.extend(re.split(r'[_\s]+', found) if '_' in found else [found])

    candidates = [token.lower() for token in tokens if should_check(token)]
    misspelled = spell.unknown(candidates)

    # Code-block results are wrapped so callers can treat them as warnings.
    if is_code_block:
        return {'warnings': misspelled}
    return misspelled

# Function to check all .mdx files in a directory
def check_directory(directory):
    """Spell check every ``.mdx`` file under ``directory`` and print a report.

    Misspellings in prose are printed as errors; misspellings inside fenced
    code blocks are printed as warnings and do not affect the result.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        True if at least one file has a prose spelling error, otherwise False.
    """
    has_errors = False

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.mdx'):
                continue

            file_path = os.path.join(root, file)
            print(f'========== Checking file: {file_path} ==========')

            # Extract text and code blocks from the MDX file
            text, code_blocks = extract_text_from_mdx(file_path)

            # Check for spelling errors in text
            errors = check_spelling(text)
            if errors:
                print(f'Spelling errors in {file_path}:')
                # Sorted so the report is stable from run to run.
                for error in sorted(errors):
                    print(f' - {error}')
                has_errors = True

            # Check for spelling errors in code blocks (warnings).
            # Accumulate across ALL code blocks; assigning inside the loop
            # would keep only the last block's warnings.
            warnings = set()
            for code_block in code_blocks:
                result = check_spelling(code_block, is_code_block=True)
                warnings.update(result.get('warnings', []))

            if warnings:
                print(f'Warnings (spelling errors in code block) in {file_path}:')
                for warning in sorted(warnings):
                    print(f' - {warning}')

    return has_errors

# Directory to check
directory_path = 'pages'
has_errors = check_directory(directory_path)

# Exit with status 1 when spelling errors were found, 0 otherwise,
# so the calling CI step fails on errors.
sys.exit(int(has_errors))
1 change: 1 addition & 0 deletions dictionaries/code_warnings_known.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
10px
Loading

0 comments on commit 076ec5d

Please sign in to comment.