feat(docs): add CI Spell checks (#949)

Co-authored-by: Joshua Croft <[email protected]>
fetchai · Sep 23, 2024 · 076ec5d · 076ec5d
1 parent d8cc924
commit 076ec5d
Show file tree

Hide file tree

Showing 76 changed files with 2,346 additions and 877 deletions.
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
@@ -0,0 +1,39 @@
+name: Spell Checking
+
+on:
+  pull_request:
+    branches:
+      - "**"
+
+jobs:
+  spellcheck:
+    name: Check Spelling with custom Python script
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.8]
+    steps:
+      # Step 1: Checkout the code
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      # Step 2: Set up Python
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # Step 3: Install dependencies
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y aspell aspell-en  # Install aspell and the English dictionary
+          python -m pip install --upgrade pip
+          pip install spacy markdown-it-py pyspellchecker
+          python -m spacy download en_core_web_sm  # Install spaCy language model
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+      # Step 4: Run the custom Python spell-checking script
+      - name: Run spell-check script
+        run: |
+          python check_spelling.py || exit 1
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@ Welcome to the code repository for Fetch.ai's documentation at [fetch.ai/docs](f
 ### Install dependencies
 
 ```bash
-pnpm install
+pnpm install test
 ```
 
 ### Run Development Server

diff --git a/check_spelling.py b/check_spelling.py
@@ -0,0 +1,153 @@
+import os
+import re
+import spacy
+from markdown_it import MarkdownIt
+from spellchecker import SpellChecker
+import sys
+
+# Load the spaCy model and spell checker
+nlp = spacy.load("en_core_web_sm")
+spell = SpellChecker()
+
+# Load custom dictionary
+custom_dict_path = 'dictionaries/custom_dict.txt'
+with open(custom_dict_path, 'r') as f:
+    custom_words = set(line.strip().lower() for line in f)
+
+# Regex patterns
+import_pattern = re.compile(r'import\s*{\s*([\s\S]*?)\s*}\s*from\s*["\']([^"\']+)["\'];', re.MULTILINE)
+jsx_like_tags_pattern = re.compile(r'<[^>]*>[\s\S]*?<\/[^>]*>|<[^>]*?/>', re.DOTALL)
+path_pattern = re.compile(r'path:\s*"/[^"]*"')
+guidebox_pattern = re.compile(r'<GuideBox[\s\S]*?/>', re.IGNORECASE)
+hex_colours = re.compile(r'([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})')
+
+# Corrected word pattern to allow apostrophes in valid words
+word_pattern = re.compile(r"\b\w+(?:'\w+)?\b")
+
+# Pattern to exclude words containing escape sequences (\n, \u, etc.)
+escape_sequence_pattern = re.compile(r'\\[nu][0-9a-fA-F]+|u[0-9a-fA-F]{4}')
+
+# Function to extract text while ignoring specified components and handling code blocks differently
+def extract_text_from_mdx(file_path):
+    with open(file_path, 'r') as file:
+        content = file.read()
+
+    # Remove import statements
+    content = import_pattern.sub('', content)
+
+    # Remove paths and GuideBox components
+    content = path_pattern.sub('', content)
+    content = guidebox_pattern.sub('', content)
+
+    # Remove JSX components and JSX-like tags
+    content = jsx_like_tags_pattern.sub('', content)
+
+    # Initialize the Markdown parser
+    md = MarkdownIt()
+    parsed = md.parse(content)
+
+    # Extract text while separating code blocks for warnings
+    text = []
+    code_blocks = []
+    in_code_block = False
+
+    def traverse(node):
+        nonlocal in_code_block
+        if node.type == 'fence':
+            if not in_code_block:
+                code_blocks.append(node.content)  # Capture code block content
+            in_code_block = not in_code_block
+        elif node.type == 'code_inline' and not in_code_block:
+            return
+        elif node.type == 'text' and not in_code_block:
+            text.append(node.content)
+
+        for child in node.children or []:
+            traverse(child)
+
+    for node in parsed:
+        traverse(node)
+
+    return '\n'.join(text), code_blocks
+
+# Function to check for spelling errors
+def check_spelling(text, is_code_block=False):
+    def split_underscore_words(word):
+        return re.split(r'[_\s]+', word)
+
+    # Use the updated word pattern to find words
+    words = word_pattern.findall(text)
+    processed_words = []
+    for word in words:
+        if '_' in word:
+            processed_words.extend(split_underscore_words(word))
+        else:
+            processed_words.append(word)
+
+    # Patterns to exclude
+    n_prefix_pattern = re.compile(r'\bn\w+')
+    css_value_pattern = re.compile(r'^\d+(px|%|em|rem|vh|vw|pt|cm|mm|in|s|ms|deg)?$')  # CSS values
+    hex_color_pattern = re.compile(r'^(#?[A-Fa-f0-9]{3}|#?[A-Fa-f0-9]{6})$')  # Hex colors
+    eth_address_pattern = re.compile(r'^0x[a-fA-F0-9]{40}$')  # Ethereum addresses
+    hash_pattern = re.compile(r'^[a-f0-9]{40}$')  # Hash-like strings (40 hex characters)
+
+    # Filter out custom words, valid words with apostrophes,
+    # words matching escape sequences, "n-prefixed" words, CSS values, hex colors, ETH addresses, and hash strings
+    reduced_words = [
+        i.lower() for i in processed_words
+        if (
+            i.lower() not in custom_words
+            and not escape_sequence_pattern.search(i)
+            and "'" not in i  # Exclude words with apostrophes for misspelling check
+            and not n_prefix_pattern.match(i)  # Exclude "n-prefixed" words
+            and not css_value_pattern.match(i)  # Exclude CSS values
+            and not hex_color_pattern.match(i)  # Exclude hex colors
+            and not eth_address_pattern.match(i)  # Exclude Ethereum addresses
+            and not hash_pattern.match(i)  # Exclude hash-like strings
+            and i.strip()  # Exclude empty strings
+        )
+    ]
+    misspelled = spell.unknown(reduced_words)
+
+    # Return misspelled words with a flag indicating if they came from code
+    return misspelled if not is_code_block else {'warnings': misspelled}
+
+# Function to check all .mdx files in a directory
+def check_directory(directory):
+    has_errors = False
+
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.mdx'):
+                file_path = os.path.join(root, file)
+                print(f'========== Checking file: {file_path} ==========')
+
+                # Extract text and code blocks from the MDX file
+                text, code_blocks = extract_text_from_mdx(file_path)
+
+                # Check for spelling errors in text
+                errors = check_spelling(text)
+                if errors:
+                    print(f'Spelling errors in {file_path}:')
+                    for error in errors:
+                        print(f'  - {error}')
+                    has_errors = True
+
+                # Check for spelling errors in code blocks (warnings)
+                warnings = []
+                for code_block in code_blocks:
+                    warnings = check_spelling(code_block, is_code_block=True).get('warnings', [])
+
+                if warnings:
+                    print(f'Warnings (spelling errors in code block) in {file_path}:')
+                    for warning in warnings:
+                        print(f'  - {warning}')
+
+    return has_errors
+
+# Directory to check
+directory_path = 'pages'
+has_errors = check_directory(directory_path)
+
+# Return False if errors were found
+sys.exit(1 if has_errors else 0)
diff --git a/dictionaries/code_warnings_known.txt b/dictionaries/code_warnings_known.txt
@@ -0,0 +1 @@
+10px