-
Notifications
You must be signed in to change notification settings - Fork 4
/
readmesfix.py
executable file
·156 lines (127 loc) · 5.99 KB
/
readmesfix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
import argparse
import csv
import fileinput
import glob
import json
import os
import re
import subprocess
import sys
import tempfile
import textwrap
import traceback
import git
import requests
import tqdm
import util
# Name of the environment variable that must hold the GitHub access token.
ENV_VAR_NAME = 'GITHUB_ACCESS_TOKEN'

# Refuse to start when the token has not been configured.
if os.environ.get(ENV_VAR_NAME) is None:
    sys.exit(f"Error: you need to setup {ENV_VAR_NAME} env var to run this script.")

# Credentials handed to the GitHub API requests made in this file.
AUTH_PARAMS = dict(access_token=os.environ[ENV_VAR_NAME])

# A line that starts with '#' characters directly followed by a character that
# is neither whitespace nor '#'.  Groups: (1) leading hashes, (2) first
# character of the title, (3) rest of the title, (4) optional closing hashes.
HEADING_WITHOUT_SPACE_RE = re.compile(r'^(#+)([^\s#])(.*?)(#+)?$')

# Lines that begin with a code fence made of back ticks / tildes.
CODE_BLOCK_FENCE_BACK_TICKS_RE = re.compile(r'^```')
CODE_BLOCK_FENCE_TILDES_RE = re.compile(r'^~~~')

# Parser state shared by the regex callbacks below: whether the line being
# processed is inside a fenced code block, and which fence character
# ('`' or '~') opened the most recent block (None before any fence is seen).
inside_code_block = False
last_valid_fence = None
def detect_code_block_back_ticks_fence(match):
    """Update the code-block state for a line starting with a back-tick fence.

    Meant to be used as a ``re.sub`` callback.  A fence seen outside a code
    block opens one; a fence seen inside a block closes it only when the block
    was opened with back ticks as well.  The matched text is returned
    unchanged, so the substitution never alters the line.
    """
    global inside_code_block, last_valid_fence
    if not inside_code_block:
        inside_code_block, last_valid_fence = True, '`'
    elif last_valid_fence == '`':
        inside_code_block = False
    return match.group(0)
def detect_code_block_tildes_fence(match):
    """Update the code-block state for a line starting with a tilde fence.

    Meant to be used as a ``re.sub`` callback.  A fence seen outside a code
    block opens one; a fence seen inside a block closes it only when the block
    was opened with tildes as well.  The matched text is returned unchanged,
    so the substitution never alters the line.
    """
    global inside_code_block, last_valid_fence
    if not inside_code_block:
        inside_code_block, last_valid_fence = True, '~'
    elif last_valid_fence == '~':
        inside_code_block = False
    return match.group(0)
def heading_fix(match):
    """Return the matched heading with a space inserted after its hashes.

    Meant to be used as a ``re.sub`` callback for a pattern with four groups:
    leading hashes, first title character, rest of the title and optional
    closing hashes.  When closing hashes are present a space is put in front
    of them too.  While the module-level ``inside_code_block`` flag is set,
    the matched text is returned untouched.
    """
    if inside_code_block:
        return match.group(0)

    hashes, first_char, rest, closing_hashes = match.group(1, 2, 3, 4)
    fixed = f'{hashes} {first_char}{rest}'
    if closing_hashes:
        fixed = f'{fixed} {closing_hashes}'
    return fixed
def crlf_paths(paths):
    """Return the subset of ``paths`` that the ``file`` utility reports as CRLF.

    Args:
        paths: Iterable of file paths to inspect.

    Returns:
        A set with every path whose ``file`` description mentions ``CRLF``.

    Raises:
        subprocess.CalledProcessError: If ``file`` exits with a non-zero status.
    """
    paths_with_crlf = set()
    for path in paths:
        # `-b` keeps the file name out of the output, so a path that merely
        # contains "CRLF" is not mistaken for a file with CRLF line endings;
        # `--` stops a path that starts with '-' from being read as an option.
        completed = subprocess.run(['file', '-b', '--', path], check=True, stdout=subprocess.PIPE)
        description = completed.stdout.decode('utf-8', errors='replace')
        if 'CRLF' in description:
            paths_with_crlf.add(path)
    return paths_with_crlf
def insensitive_glob(pattern, *, recursive=False):
    """Glob for ``pattern`` while ignoring the case of its letters.

    Every alphabetic character of the pattern is replaced by a character
    class holding its lower- and upper-case forms; the resulting pattern is
    passed to ``glob.glob``.

    From: http://stackoverflow.com/a/10886685/1165181
    """
    pieces = []
    for symbol in pattern:
        if symbol.isalpha():
            pieces.append(f'[{symbol.lower()}{symbol.upper()}]')
        else:
            pieces.append(symbol)
    return glob.glob(''.join(pieces), recursive=recursive)
def create_pr(repo_name, base_branch, branch_name):
    """Open a pull request on GitHub proposing the heading fixes.

    A failure (any status other than 201) is reported with ``print``; nothing
    is raised for it and nothing is returned.

    Args:
        repo_name: Repository that receives the pull request, as it appears
            in GitHub API URLs (``owner/name``).
        base_branch: Branch of ``repo_name`` the changes should be merged into.
        branch_name: Head of the pull request, sent as the ``head`` field.
    """
    params = {
        'title': "Fix broken headings in Markdown files",
        'head': branch_name,
        'base': base_branch,
        'body': textwrap.dedent("""\
            GitHub changed the way Markdown headings are parsed, so this change fixes it.
            See [bryant1410/readmesfix](https://github.com/bryant1410/readmesfix) for more information.
            Tackles bryant1410/readmesfix#1
            """),
    }

    # GitHub no longer accepts the token as an `access_token` query parameter;
    # it has to be sent in the Authorization header instead.
    headers = {'Authorization': f"token {AUTH_PARAMS['access_token']}"}

    pull_request_endpoint = f"https://api.github.com/repos/{repo_name}/pulls"
    response = requests.post(pull_request_endpoint, json=params, headers=headers)

    if response.status_code != 201:
        # The body is only needed for the error report; fall back to the raw
        # text so a non-JSON error page cannot hide the actual failure.
        try:
            response_dict = response.json()
        except ValueError:
            response_dict = response.text
        print(f"There was an error creating the PR of {repo_name}: {response_dict}")
def _find_markdown_paths():
    """Return the set of Markdown files below the current working directory."""
    # In glob syntax `?` means "exactly one character", so `.mkd` and `.mkdn`
    # each need their own pattern.
    patterns = ('**/*.md', '**/*.mkd', '**/*.mkdn', '**/*.mdown', '**/*.markdown')
    return {
        path
        for pattern in patterns
        for path in insensitive_glob(pattern, recursive=True)
        if os.path.isfile(path)
    }


def _fix_headings_in_place(markdown_paths):
    """Rewrite headings lacking a space after the hashes in the given files."""
    global inside_code_block, last_valid_fence

    paths_with_crlf = crlf_paths(markdown_paths)
    with fileinput.input(sorted(markdown_paths), inplace=True) as markdown_file:
        use_crlf = False
        for line in markdown_file:
            if markdown_file.isfirstline():
                # New file: forget any code block left open by the previous one.
                inside_code_block = False
                last_valid_fence = None
                use_crlf = markdown_file.filename() in paths_with_crlf

            # These substitutions are run only for their side effect of
            # tracking whether the current line is inside a fenced code block.
            CODE_BLOCK_FENCE_BACK_TICKS_RE.sub(detect_code_block_back_ticks_fence, line)
            CODE_BLOCK_FENCE_TILDES_RE.sub(detect_code_block_tildes_fence, line)

            fixed_line = HEADING_WITHOUT_SPACE_RE.sub(heading_fix, line)

            # Lines are read with '\n' endings; restore CRLF only after the
            # heading fix so a '\r' never sits between closing hashes and the
            # end-of-line anchor of the heading pattern.
            if use_crlf and fixed_line.endswith('\n'):
                fixed_line = fixed_line[:-1] + '\r\n'

            # With inplace=True, standard output is written back to the file.
            print(fixed_line, end='')


def _fork_and_open_pr(repo, repo_name):
    """Fork ``repo_name``, push the local commit to the fork and open a PR."""
    # GitHub no longer accepts the token as an `access_token` query parameter;
    # it has to be sent in the Authorization header instead.
    headers = {'Authorization': f"token {AUTH_PARAMS['access_token']}"}
    response = requests.post(f'https://api.github.com/repos/{repo_name}/forks', headers=headers)
    response_dict = json.loads(response.text)

    if response.status_code == 202:
        repo.create_remote('origin', response_dict['ssh_url']).push()
        create_pr(repo_name, response_dict["default_branch"],
                  f'{response_dict["owner"]["login"]}:{response_dict["default_branch"]}')
    else:
        print(f"There was an error forking {repo_name}: {response_dict}")


def main(dataset_path):
    """Fix Markdown headings in every repository of a dataset and open PRs.

    Each repository named in ``dataset_path`` is shallow-cloned into a
    temporary directory and its Markdown files are rewritten in place so that
    headings get a space after the leading ``#`` characters.  Lines inside
    fenced code blocks are left alone and CRLF line endings are kept.  When
    that changed anything, the result is committed, pushed to a fork and
    proposed as a pull request.

    An exception raised while processing one repository is printed and the
    run continues with the next one.

    Args:
        dataset_path: Path to a file readable by ``csv.reader`` holding one
            repository name (``owner/name``) per row.
    """
    with open(dataset_path, newline='') as file:
        # First pass only counts the lines so the progress bar has a total.
        number_of_lines = sum(1 for _ in file)
        file.seek(0)

        for row in tqdm.tqdm(csv.reader(file), total=number_of_lines):
            if not row:  # A blank line in the dataset must not abort the run.
                continue
            (repo_name,) = row

            with tempfile.TemporaryDirectory() as temp_dir, util.pushd(temp_dir):
                # noinspection PyBroadException
                try:
                    repo = git.Repo.clone_from(f'git@github.com:{repo_name}.git', '.', depth=1,
                                               origin='upstream')

                    markdown_paths = _find_markdown_paths()
                    if not markdown_paths:
                        # fileinput falls back to reading standard input when
                        # given no files, which would block forever.
                        continue

                    _fix_headings_in_place(markdown_paths)

                    if repo.index.diff(None):
                        repo.git.add('.')
                        repo.git.commit(m="Fix broken Markdown headings")
                        _fork_and_open_pr(repo, repo_name)
                except Exception:
                    print(traceback.format_exc())
if __name__ == '__main__':
    # Command-line entry point: the only option is the dataset to process.
    cli = argparse.ArgumentParser()
    cli.add_argument('--dataset', default='top_broken.tsv')
    main(cli.parse_args().dataset)