main.py
import re
from urllib.parse import quote as url_quote
from getpass import getpass as get_pass
from collections import namedtuple
from time import sleep
from os.path import exists

import requests
from ipymarkup import show_span_box_markup as show_markup

# Note: url_quote, get_pass, sleep and show_markup are not referenced in this
# module itself; they appear to be imported for callers that import it.

# Accept header asking the GitHub search API to include text_matches
# (the matched text fragments) in each search result item.
TEXT_MATCH = {
    'Accept': 'application/vnd.github.v3.text-match+json'
}

# Default path for dumping the collected file URLs.
URLS = 'urls.txt'

# One code-search result: repository owner, repository name, file path
# and the list of matched text fragments.
SerpRecord = namedtuple(
    'SerpRecord',
    ['user', 'repo', 'path', 'matches']
)


def format_q_params(params):
    # Expand (qualifier, values) pairs into GitHub search qualifiers
    # such as 'extension:py' or '-user:example-user'.
    for key, values in params:
        for value in values:
            yield '{key}:{value}'.format(
                key=key,
                value=value
            )


def gh_q(text, extensions=(), no_orgs=(), no_users=()):
    # Build a code-search query: free text plus '+'-joined qualifiers that
    # restrict the file extension and exclude organisations/users.
    params = [
        ('extension', extensions),
        ('-org', no_orgs),
        ('-user', no_users)
    ]
    return '{text}+{params}'.format(
        text=text,
        params='+'.join(format_q_params(params))
    )
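
# For example (illustrative values):
#   gh_q('requests', extensions=['py'], no_users=['example-user'])
#   -> 'requests+extension:py+-user:example-user'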


def format_params(params):
    # Render URL query parameters as 'key=value' pairs, sorted by key
    # so the resulting URL is deterministic.
    for key, value in sorted(params.items()):
        yield '{key}={value}'.format(
            key=key,
            value=value
        )


def gh_url(*path, **params):
    # Assemble an api.github.com URL from path segments and query parameters.
    return 'https://api.github.com/{path}?{params}'.format(
        path='/'.join(path),
        params='&'.join(format_params(params))
    )


def gh_search_code_url(q, sort='indexed', page=1):
    # URL for the code search endpoint, sorted by recency of indexing.
    return gh_url('search', 'code', q=q, sort=sort, page=page)
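
# For example, reusing the query format shown above:
#   gh_search_code_url('requests+extension:py')
#   -> 'https://api.github.com/search/code?page=1&q=requests+extension:py&sort=indexed'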


def call_gh(url, auth, headers=None):
    # Authenticated GET against the GitHub API; returns the raw response
    # (for status and rate-limit headers) together with the parsed JSON body.
    response = requests.get(
        url,
        auth=auth,
        headers=headers
    )
    return response, response.json()


def parse_serp_total(data):
    return data['total_count']


def get_pages(total, step=30):
    # Yield 1-based page numbers covering `total` results at `step`
    # results per page (30 is the API default page size).
    for index, _ in enumerate(range(0, total, step)):
        yield index + 1


def is_broken(data):
    # A response without 'items' indicates an error (e.g. rate limiting).
    return 'items' not in data


def parse_serp(data):
    # Yield a SerpRecord per search hit; 'text_matches' is only present
    # when the request was made with the TEXT_MATCH Accept header.
    for item in data['items']:
        path = item['path']
        repo = item['repository']
        user = repo['owner']
        matches = [_['fragment'] for _ in item['text_matches']]
        yield SerpRecord(
            user['login'],
            repo['name'],
            path,
            matches
        )


def get_serp_record_url(record):
    # Link to the matched file on github.com (assumes the default branch is 'master').
    return 'https://github.com/{user}/{repo}/tree/master/{path}'.format(
        user=record.user,
        repo=record.repo,
        path=record.path
    )


def get_spans(text, pattern):
    # Yield (start, end) offsets of case-insensitive matches of `pattern` in `text`.
    for match in re.finditer(pattern, text, re.I):
        yield match.start(), match.end()
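
# Illustrative use together with the show_markup import above (assuming it
# accepts a text plus an iterable of (start, stop) spans, as in ipymarkup):
#   spans = get_spans(fragment, 'import requests')
#   show_markup(fragment, spans)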


def dump_lines(lines, path):
    with open(path, 'w') as file:
        for line in lines:
            file.write(line + '\n')


def load_lines(path):
    # Yield lines from `path`; yields nothing if the file does not exist.
    if not exists(path):
        return
    with open(path) as file:
        for line in file:
            yield line.rstrip('\n')
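

# --- Usage sketch (not part of the original module) --------------------------
# A minimal, illustrative driver showing how the helpers above fit together:
# build a query, page through the search results, and dump the matched file
# URLs. The query text and the way credentials are collected are assumptions,
# not taken from this repository; the GitHub search API requires
# authentication and rate-limits rapid requests, hence the sleeps.
if __name__ == '__main__':
    auth = (input('GitHub user: '), get_pass('GitHub token: '))
    q = gh_q('show_span_box_markup', extensions=['py'])

    # First request just to learn how many results there are.
    _, data = call_gh(gh_search_code_url(q), auth, headers=TEXT_MATCH)
    records = []
    if not is_broken(data):
        total = parse_serp_total(data)
        for page in get_pages(total):
            _, data = call_gh(
                gh_search_code_url(q, page=page),
                auth,
                headers=TEXT_MATCH
            )
            if is_broken(data):
                sleep(60)  # back off, then skip this page
                continue
            records.extend(parse_serp(data))
            sleep(2)

    dump_lines([get_serp_record_url(_) for _ in records], URLS)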