# Converts every .md file in the tasks folder to JSON (with error checking and
# consistent formatting) and writes the result to the tasks-json folder.


def process_tasks(tasks_folder, json_folder):
    """Convert each out-of-date markdown task in *tasks_folder* to a JSON file.

    A task ``name.md`` is converted to ``name.json`` inside *json_folder* when
    that JSON file does not exist yet or is older than the markdown source.
    Failures are reported on stdout and do not stop the remaining files from
    being processed.

    Args:
        tasks_folder: Directory that is scanned (non-recursively) for ``.md`` files.
        json_folder: Directory that receives the ``.json`` files; created if missing.
    """
    # Create the JSON folder if it doesn't exist.
    os.makedirs(json_folder, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order and the log output deterministic.
    md_names = sorted(name for name in os.listdir(tasks_folder) if name.endswith('.md'))

    for md_name in md_names:
        md_path = os.path.join(tasks_folder, md_name)
        # splitext swaps only the trailing extension; str.replace('.md', ...)
        # would also rewrite a '.md' that occurs earlier in the file name.
        json_name = os.path.splitext(md_name)[0] + '.json'
        json_path = os.path.join(json_folder, json_name)

        # Skip when the JSON output exists and is at least as new as its source.
        if os.path.exists(json_path) and os.path.getmtime(md_path) <= os.path.getmtime(json_path):
            print(f"Skipping {md_name} - JSON file already up to date")
            continue

        print(f"Processing {md_name}...")
        try:
            # The file handles get their own names so that md_name/json_name
            # still hold the file names when the messages below are printed.
            with open(md_path, 'r', encoding='utf-8') as md_handle:
                markdown_content = md_handle.read()

            json_content = parse_markdown_to_json(markdown_content)

            # Serialise before opening the target so a serialisation failure
            # cannot leave a truncated file that looks up to date next run.
            payload = json.dumps(json_content, indent=2, ensure_ascii=False)
            with open(json_path, 'w', encoding='utf-8', newline='\n') as json_handle:
                json_handle.write(payload)
                json_handle.write('\n')  # Add a final newline

            print(f"Successfully converted and saved {json_name}")
        except MarkdownParsingError as e:
            # The markdown did not follow the expected task layout.
            print(f"Error processing {md_name}: {e}")
        except Exception as e:
            # Batch boundary: report anything unexpected and move on to the next file.
            print(f"Unexpected error processing {md_name}: {e}")


if __name__ == "__main__":
    # Define folder paths
    tasks_folder = "tasks"
    json_folder = "tasks-json"

    # Run the main processing function
    process_tasks(tasks_folder, json_folder)
"tasks" + json_folder = "tasks-json" + + # Run the main processing function + process_tasks(tasks_folder, json_folder) diff --git a/pyparser.py b/pyparser.py new file mode 100644 index 0000000..770c6f7 --- /dev/null +++ b/pyparser.py @@ -0,0 +1,83 @@ +import os +import json +import re + +class MarkdownParsingError(Exception): + pass + +def parse_markdown_to_json(markdown_text): + task = { + "name": "", + "description": "", + "modality": "", + "diagram": None, + "citations": None, + "examples": [], + "tags": [] + } + + # Helper function to extract content between headers + def extract_section(start_pattern, end_pattern=None): + if end_pattern: + match = re.search(f"{start_pattern}(.*?){end_pattern}", markdown_text, re.DOTALL) + else: + match = re.search(f"{start_pattern}(.*?)$", markdown_text, re.DOTALL) + if match: + return match.group(1).strip() + return None + + # Extract name (required) + name_match = re.search(r'^#\s+(.+)$', markdown_text, re.MULTILINE) + if not name_match: + raise MarkdownParsingError("Task name not found. 
Expected '# Task Name' at the start.") + task["name"] = name_match.group(1).strip() + + # Extract description (required) + description = extract_section(r'## Description:', r'##') + if not description: + raise MarkdownParsingError("Description section not found or empty.") + task["description"] = description + + # Extract modality (required) + modality = extract_section(r'## Modality:', r'##') + if not modality: + raise MarkdownParsingError("Modality section not found or empty.") + task["modality"] = modality + + # Extract diagram (optional) + diagram = extract_section(r'## Diagram \(Optional\):', r'##') + task["diagram"] = diagram + + # Extract citations (optional) + citations = extract_section(r'## Citations \(Optional\):', r'##') + if citations: + task["citations"] = [citation.strip()[2:] for citation in citations.split('\n') if citation.strip().startswith('-')] + + # Extract examples (required) + examples_text = extract_section(r'## Examples:', r'## Tags:') + if not examples_text: + raise MarkdownParsingError("Examples section not found or empty.") + + examples = re.split(r'###\s+Example\s+\d+:', examples_text) + if len(examples) < 2: # First split is empty, so we need at least 2 elements + raise MarkdownParsingError("No examples found. Expected at least one '### Example X:' section.") + + for example in examples[1:]: # Skip the first split as it's empty + input_match = re.search(r'Input:\s*```(.+?)```\s*Output:', example, re.DOTALL) + output_match = re.search(r'Output:\s*```(.+?)```\s*$', example, re.DOTALL) + if not input_match or not output_match: + raise MarkdownParsingError(f"Invalid example format. 
Expected 'Input:' and 'Output:' sections with code blocks.") + task["examples"].append([{ + "input": input_match.group(1).strip(), + "output": output_match.group(1).strip() + }]) + + # Extract tags (required) + tags_text = extract_section(r'## Tags:') + if not tags_text: + raise MarkdownParsingError("Tags section not found or empty.") + task["tags"] = [tag.strip()[2:] for tag in tags_text.split('\n') if tag.strip().startswith('-')] + if not task["tags"]: + raise MarkdownParsingError("No tags found. Expected at least one tag starting with '-'.") + + return task