-
Notifications
You must be signed in to change notification settings - Fork 6
/
search_readme.py
63 lines (44 loc) · 1.62 KB
/
search_readme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
# Name of the environment variable that must hold the GitHub API token.
shell_token = "GITHUB_ORTHOGRAPHIC_TOKEN"

# Verify that the token is set, failing with an explanatory message instead
# of a bare KeyError that only shows the variable name.
if shell_token not in os.environ:
    raise KeyError(
        "Environment variable %s must be set to a GitHub API token"
        % shell_token
    )
GITHUB_TOKEN = os.environ[shell_token]

# Use the parsed version of the word list; the unparsed list lives at
# "wordlists/wikipedia_list.txt".
f_wordlist = "wordlists/parsed_wikipedia_list.txt"
import requests, os, json, codecs, time, pprint
from datetime import datetime
# GitHub's repository search API endpoint.
url = "https://api.github.com/search/repositories"

# Build the list of search terms: each line of the word list contributes the
# text that precedes its first '-'.  Entries are stripped so that a line with
# no '-' does not keep its trailing newline, and blank entries are dropped so
# they never become an empty file name or an empty search query.
with open(f_wordlist) as FIN:
    stripped_entries = (line.split('-')[0].strip() for line in FIN)
    WORDS = [entry for entry in stripped_entries if entry]
# Search GitHub once per word and cache the raw JSON response on disk, one
# file per word, so that an interrupted run can be resumed without repeating
# requests that already succeeded.

# Directory that holds the cached responses; create it on first use.
search_dir = "search_data"
if not os.path.isdir(search_dir):
    os.makedirs(search_dir)

# Send the token in the Authorization header.  GitHub no longer accepts the
# "access_token" query parameter, and a header keeps the secret out of URLs.
auth_headers = {"Authorization": "token " + GITHUB_TOKEN}

for word in WORDS:
    f_word = os.path.join(search_dir, word)
    if os.path.exists(f_word):
        print("{} already exists skipping".format(word))
        continue

    params = [
        ("q", word + '+in:readme+stars:>1+file:'),
        ("sort", "stars"),
        ("order", "desc"),
        ("per_page", 100),
    ]
    # The query string is assembled by hand and passed as a string so the
    # literal '+' separators inside "q" are sent as written.
    payload_str = "&".join("%s=%s" % (k, v) for k, v in params)
    r = requests.get(url, params=payload_str, headers=auth_headers)

    if r.status_code == 200:
        js = r.json()
        with codecs.open(f_word, 'w', 'utf-8') as FOUT:
            FOUT.write(json.dumps(js, indent=2))
        print("{} {}".format(word, js["total_count"]))
    else:
        # Do not cache error responses: a cached file makes later runs skip
        # the word, so a transient failure would never be retried.
        print("{} request failed with HTTP status {}; not saved".format(
            word, r.status_code))

    # Respect the API rate limit advertised in the response headers.
    remaining = r.headers.get("x-ratelimit-remaining")
    reset_epoch = r.headers.get("x-ratelimit-reset")
    if remaining is None or reset_epoch is None:
        # No rate-limit information on this response; nothing to wait for.
        continue

    if int(remaining) < 2:
        # "x-ratelimit-reset" is in UTC epoch seconds, as is time.time();
        # wait two extra seconds past the reset as a safety margin.
        delta_seconds = int(reset_epoch) - time.time() + 2
        # NOTE(review): a reset time already in the past stops the whole run
        # (kept from the original) -- confirm this is the intended behavior.
        if delta_seconds < 0:
            break
        print("Sleeping {:0.3f} seconds for API limit.".format(delta_seconds))
        time.sleep(delta_seconds)