#!/usr/bin/env python3

# Note: this script is deprecated and no longer runs in production!  Unsure
# when it was originally taken out of production but confirmed in #support
# chat room today.  Leaving here for history / code re-use. -mroth 8/19/2015.
import bisect
import copy
import http.client
import json
import re
import socket
import time
import urllib.error
import urllib.request

import iso8601

import util
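# (iso8601 is the third-party ISO-8601 date parser from PyPI; util is this
# repo's local helper module, providing retry, relative_path, probability,
# send_to_slack, and thousand_commas.)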


# Non-exercise keys in the report dictionary.
SPECIAL_VALUES = ["elapsed_time", "max_id", "last_time", "time_this_period"]
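# Every other key in the report dict is an exercise name mapping to
# {"num_errors": <running total>, "this_period": <count this run>} plus a
# transient "href" list of issue URLs that is stripped before saving.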

# Prephantom hash, shared by all users who have yet to do anything.
# We don't want to filter reports from prephantoms, since they might be
# different people.
PREPHANTOM_HASH = 1840534623

# Regex to use for getting the user hash from the github report.
# Used in association with rate-limiting of bug reports.
USER_HASH_REGEX = re.compile(r"User hash: (\d+)")
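# e.g. a report body containing "User hash: 1840534623" yields the capture
# "1840534623" (a string, hence the str() comparison below).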

# Frequency (seconds) with which one user can file bug reports
# without some being ignored.
WAIT_PERIOD = 2 * 60
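# e.g. two reports from the same user hash filed 30 seconds apart count
# as a single report; the later-processed one is assumed to be bogus.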


def get_errors(old_reports):
    stats = {}
    issues = []
    # The issue number at which we should stop looking for more;
    # -1 indicates we shouldn't do more than 1 page.
    last_issue = old_reports.get("max_id", -1)
    # Track the number of the first issue we find this time.
    first_issue = [last_issue]  # Lets us modify this from within get_issues.
    urlfetch_errors = (socket.error, urllib.error.HTTPError,
                       http.client.HTTPException)

    def get_issues(page):
        url = ("https://api.github.com/repos/Khan/khan-exercises/issues"
               "?page=%d&per_page=100" % page)
        issue_data = util.retry(
            lambda: urllib.request.urlopen(url, timeout=60),
            'fetching khan-exercises issues',
            lambda exc: isinstance(exc, urlfetch_errors))
        # This flag is False if we should continue to the next page of
        # issues and True if we should stop looking at more pages.
        done = False
        for issue in json.loads(issue_data.read()):
            if issue["user"]["login"] == "KhanBugz":
                if last_issue == -1:
                    # If we have no data so far, only go one page.
                    done = True
                if issue["number"] > last_issue:
                    first_issue[0] = max(first_issue[0], issue["number"])
                    issues.append(issue)
                else:
                    # If we've come to an issue we already saw,
                    # don't continue to further pages or issues.
                    done = True
                    break
        # Follow GitHub's pagination via the Link header.  Default to ""
        # so a final page with no Link header doesn't raise a TypeError.
        links = re.findall(r'<(.*?)>; rel="(.*?)"',
                           issue_data.info().get("Link", ""))
        if links and links[0][1] == "next" and not done:
            get_issues(page + 1)

    get_issues(1)
    first_issue = first_issue[0]

    for issue in issues:
        regex_matches = re.findall(
            r'Khan:master/exercises/(.+?)\.html', issue["body"])
        if len(regex_matches) == 0:
            print(issue)
            continue
        user_hash = re.search(USER_HASH_REGEX, issue["body"])
        try:
            user_hash = user_hash.group(1)
        except AttributeError:
            user_hash = ""
        # We can't distinguish prephantom users from each other, nor can we
        # distinguish users with no hash.  Put them all in the same,
        # non-rate-limited bucket.  (The regex captures a string, so compare
        # against the stringified hash.)
        if user_hash == str(PREPHANTOM_HASH):
            user_hash = ""
        created_at = iso8601.parse_date(issue["created_at"])
        exercise = regex_matches[0]
        if exercise not in stats:
            stats[exercise] = {}
            stats[exercise]["href"] = [issue["html_url"]]
            stats[exercise]["users"] = {user_hash: [created_at]}
        else:
            old_times = stats[exercise]["users"].get(user_hash, [])
            # Rate-limit the number of bugs we count -- if someone submits
            # two bugs in a very short timeframe, only count 1 -- the rest
            # are probably bogus.
            if (not user_hash or not old_times or
                    abs(created_at - old_times[-1]).total_seconds()
                    > WAIT_PERIOD):
                # We keep this list sorted so that we can more quickly
                # look at the frequency with which a user submits messages.
                bisect.insort(old_times, created_at)
                stats[exercise]["href"].append(issue["html_url"])
            else:
                print("Ignoring %s because user %s has posted too frequently"
                      % (issue["html_url"], user_hash))
            stats[exercise]["users"][user_hash] = old_times

    for ex in old_reports:
        if ex not in SPECIAL_VALUES:
            old_reports[ex]["this_period"] = 0
    for ex in stats:
        if ex not in old_reports:
            old_reports[ex] = {"num_errors": 0,
                               "this_period": 0}
        users = stats[ex]["users"]
        issue_count = sum(len(users[u]) for u in users)
        old_reports[ex]["num_errors"] += issue_count
        old_reports[ex]["this_period"] = issue_count
        old_reports[ex]["href"] = stats[ex]["href"]

    cur_time = time.time()
    this_period = cur_time - old_reports["last_time"]
    old_reports["max_id"] = first_issue
    old_reports["elapsed_time"] += this_period
    old_reports["last_time"] = cur_time
    old_reports["time_this_period"] = this_period
    return old_reports


def generate_slack_links(links):
    """Given a list of links, generate a string that can be inserted into
    a Slack message with them."""
    return ", ".join("<%s|%d>" % (el, idx + 1) for idx, el in enumerate(links))


def main():
    try:
        exercise_file = open(util.relative_path("exercise_reports"), 'r+')
        ex_reports = json.loads(exercise_file.read())
    except IOError:
        # First run: no saved report file yet, so seed an empty one.
        exercise_file = open(util.relative_path("exercise_reports"), 'w')
        ex_reports = {"elapsed_time": 1,  # Filler value
                      "max_id": -1,
                      "last_time": 0}

    new_reports = get_errors(copy.deepcopy(ex_reports))
    period_len = new_reports["time_this_period"]
    for ex in new_reports:
        if ex in SPECIAL_VALUES:
            continue
        if ex in ex_reports and ex_reports[ex]["num_errors"] > 0:
            errors_this_period = new_reports[ex]["this_period"]
            mean, probability = util.probability(ex_reports[ex]["num_errors"],
                                                 ex_reports["elapsed_time"],
                                                 errors_this_period,
                                                 period_len)
print("%s] TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f"
% (time.strftime("%Y-%m-%d %H:%M:%S %Z"),
ex_reports[ex]["num_errors"], ex_reports["elapsed_time"],
ex_reports["last_time"],
errors_this_period, period_len,
mean, probability))
if (probability > 0.997 and errors_this_period > 1):
util.send_to_slack(
"*Elevated exercise bug report rate in exercise `%s`\n"
"Reports: %s. We saw %s in the last %s minutes,"
" while the mean indicates we should see around %s."
" *Probability that this is abnormally elevated: %.4f.*"
% (ex,
generate_slack_links(new_reports[ex]["href"]),
util.thousand_commas(errors_this_period),
util.thousand_commas(int(period_len / 60)),
util.thousand_commas(round(mean, 2)),
probability),
channel="#support")
if "href" in new_reports[ex]:
del new_reports[ex]["href"] # don't need to keep the links around
del new_reports["time_this_period"]
# Overwrite with new contents
exercise_file.seek(0)
exercise_file.truncate()
exercise_file.write(json.dumps(new_reports))
exercise_file.close()
if __name__ == "__main__":
main()