#!/usr/bin/env python
# -*- coding: utf8 -*-
"""
This module provides interfaces to the "unofficial GitHub API",
i.e. data available in the user interface but not in the official API.
This includes:
- user contribution timeline (all repositories contributed to,
organizations joined publicly, created repos, reported issues, etc.).
There is no official API for this, and public datasets like GHTorrent
do not report some of these events.
- user contribution stats (just the number of contributions per year).
You can get the same information from GHTorrent,
but this method takes only one HTTP request and is thus much faster.
- weekly contributor stats for a project
(number of Lines Of Code contributed per week by top 100 contributors
since the beginning of the project).
LOC information is not available via the API, and similar stats for commits
take multiple requests via the official API.
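A minimal usage sketch (requires network access; 'user2589' is the example
login already used in the doctests below):
>>> scraper = Scraper()
>>> daily = scraper.user_daily_contrib_num('user2589', 2018)  # doctest: +SKIP
>>> timeline = list(scraper.full_user_activity_timeline('user2589'))  # doctest: +SKIP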
.. autoclass:: Scraper
:members: full_user_activity_timeline, project_contributor_stats,
user_daily_contrib_num, links_to_recent_user_activity
"""
from __future__ import print_function
import argparse
from collections import defaultdict
import datetime
from functools import wraps
import logging
import re
import threading
import time
import warnings
from xml.etree import ElementTree
from bs4 import BeautifulSoup
import feedparser
import pandas as pd
import requests
import six # Queue
__version__ = '0.1.0'
__author__ = "Marat (@cmu.edu)"
__license__ = "GPL v3"
BASE_URL = 'https://github.com'
HEADERS = { # browser headers for non-API URLs
'X-Requested-With': 'XMLHttpRequest',
'Accept-Encoding': "gzip,deflate,br",
'Accept': "*/*",
'Origin': BASE_URL,
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) "
"Gecko/20100101 Firefox/60.0",
"Host": 'github.com',
"Referer": BASE_URL,
"DNT": "1",
"Accept-Language": 'en-US,en;q=0.5',
"Connection": "keep-alive",
"Cache-Control": 'max-age=0',
}
class GitHubScrapingError(requests.HTTPError):
pass
def normalize_text(string):
# type: (six.string_types) -> six.string_types
""" Normalize spaces and newlines
>>> normalize_text("\\nHello world \\t\\n!")
'Hello world !'
"""
return " ".join(string.split())
def _int(value):
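""" Convert GitHub's human-formatted counters, e.g. "1,234" or "1.7k", to int. """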
if isinstance(value, six.string_types):
value = value.replace(",", "")
if value.endswith('k'):
# in the detailed list, large numbers are reduced
# to something like "1.7k"
value = float(value[:-1]) * 1000
return int(value)
def extract_repo(link):
# type: (six.string_types) -> six.string_types
""" Extract repository slug from a GitHub link
>>> extract_repo("/org/repo/blabla?something=foo")
'org/repo'
>>> extract_repo("org/repo")
'org/repo'
"""
return "/".join(link.strip("/").split("/", 2)[:2])
def _parse_timeline_update_record(record_div):
# type: (BeautifulSoup) -> dict
"""
Args:
record_div(BeautifulSoup): a BS4 HTML element object,
representing one chunk of GitHub user activity.
Returns:
Dict[str, Dict[str, int]]: {
repository1: {
'commits': ...,
'issues': ...,
'pull_requests': ...,
'reviews': ...,
'created_repository': {0|1},
}
}
"""
# Note: GitHub lists only first 25 repos for each activity
# data[repo][activity] = <number>
record_data = defaultdict(lambda: defaultdict(int))
# get record title:
if record_div.button:
# created commits, repositories, issues,
# reviewed pull requests
title = normalize_text(record_div.button.text)
if re.match(
r'Reviewed \d[\d,]* pull requests? in \d+ repositor(y|ies)', title):
for repo_div in record_div.find_all(
'div', class_='profile-rollup-summarized'):
repo_div_button = repo_div.button
if not repo_div_button:
# "N repositories not shown"
continue
repo_span, count_span = repo_div_button.find_all('span')
repo = repo_span.text.strip()
count = _int(count_span.text.split()[0])
record_data[repo]['reviews'] += count
elif re.match(r'Opened \d[\d,]* (?:other )?issues? in \d+ repositor(y|ies)',
title):
for repo_div in record_div.find_all(
'div', class_='profile-rollup-summarized'):
repo_div_button = repo_div.button
if not repo_div_button:
# "N repositories not shown"
continue
repo = repo_div_button.div.span.text.strip()
count = 0
count_span = repo_div.button.find_all(
'span', recursive=False)[0]
for span in count_span.find_all('span'):
count += _int(span.text)
record_data[repo]['issues'] += count
elif re.match(r'Created \d[\d,]*\+? (?:other )?repositor(y|ies)', title):
# e.g. Created 100+ repositories
for link in record_div.find_all(
'a', attrs={'data-hovercard-type': "repository"}):
record_data[link.text]['created_repository'] = 1
elif re.match(r'Opened \d[\d,]* (?:other )?pull requests? '
r'in \d+ repositor(y|ies)', title):
for repo_div in record_div.find_all(
'div', class_='profile-rollup-summarized'):
repo_div_button = repo_div.button
if not repo_div_button:
# "N repositories not shown"
continue
repo = repo_div_button.div.span.text.strip()
count = 0
count_span = repo_div.button.find_all('span', recursive=False)[
0]
for span in count_span.find_all('span'):
count += _int(span.text)
record_data[repo]['pull_requests'] += count
elif re.match(r'Created \d[\d,]* commits? in \d+ repositor(y|ies)', title):
for repo_li in record_div.ul.find_all('li', recursive=False):
li_div = repo_li.div
if not li_div:
continue # "N repositories not shown"
repo_link = li_div.find_all('a', recursive=False)[1]
repo = extract_repo(repo_link["href"])
count = _int(repo_link.text.strip().split(" ")[0])
record_data[repo]['commits'] += count
else:
raise ValueError("Unexpected title: %s\n%s"
"" % (title, str(record_div)))
elif record_div.h4:
title = normalize_text(record_div.h4.text)
repo = record_div.h4.a and record_div.h4.a.text
if title.startswith("Created an issue in"):
record_data[repo]['issues'] += 1
elif title.startswith("Joined the"):
record_data[record_div.a['href'].strip('/')]['joined_org'] = 1
elif title.startswith("Created a pull request in"):
# first PR in a given month
record_data[repo]['pull_requests'] += 1
elif title == "Joined GitHub":
pass
elif title.startswith("Opened their first issue on GitHub in"):
record_data[repo]['issues'] += 1
elif title.startswith("Opened their first pull request on GitHub in"):
record_data[repo]['pull_requests'] += 1
elif title.startswith("Created their first repository"):
links = record_div.find_all(
'a', attrs={'data-hovercard-type': "repository"})
if not links: # private repository
repo = ''
else:
repo = extract_repo(links[0].get('href'))
record_data[repo]['created_repository'] = 1
else:
raise ValueError("Unexpected title: " + title)
elif len(record_div.span) == 3:
# private activity
title = normalize_text(record_div.find_all('span')[1].text)
if title.endswith(' in private repositories'):
record_data[None]['private_contrib'] += _int(title.split(" ", 1)[0])
else:
raise ValueError("Unexpected title: " + title)
else:
raise ValueError("Unexpected activity:" + str(record_div))
# convert defaultdict to dict
return {rep: dict(activities) for rep, activities in record_data.items()}
def _parse_timeline_update(bs4_tree):
# type: (BeautifulSoup) -> tuple
""" Parse a chunk of activity acquired via Ajax, usually one month.
Yields:
Tuple[str, Dict[str, int]]:
(month, {output of _parse_timeline_update_record()})
<div class="contribution-activity-listing"> # month div
<div class="profile-timeline discussion-timeline"> # one extra wrapper
<h3> # month title
<div class="profile-rollup-wrapper"> # record divs
...
Terminology:
timeline consists of updates
updates contain one or more months. Only one month is non-empty
month consists of records - a single chunk of reported activity
record might contain information about several repositories,
e.g. Created N commits in M repositories
"""
# sometimes next chunk includes several months.
# In these cases, all except one are empty;
# often empty "months" represent ranges, e.g. April 2018 - December 2018
# to handle such cases, month is lazily evaluated
for month_div in bs4_tree.find_all("div", class_="profile-timeline"):
record_month = None
month_data = {}
for record_div in month_div.find_all("div", class_="profile-rollup-wrapper"):
try:
parsed_record = _parse_timeline_update_record(record_div)
except Exception:
logging.error("Failed to parse record. Please contact the "
"maintainer and send the following HTML, along "
"with the user profile you're scraping:")
logging.error(record_div.prettify())
raise
if not parsed_record: # ignore empty months
continue
for record_repo, record_activity in parsed_record.items():
if record_repo not in month_data:
month_data[record_repo] = {}
# we might have several activities in the same record repository
# in a given month, e.g. issues, PRs and commits
month_data[record_repo].update(record_activity)
record_month = record_month or pd.to_datetime(
month_div.h3.text.strip()).strftime('%Y-%m')
if month_data:
yield record_month, month_data
def _extract_activity_feed_links(text):
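""" Yield (date, link) 2-tuples for commit/issue/tree links found in one
Atom feed entry body (an HTML fragment). """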
tree = BeautifulSoup(text, 'html.parser')
date = None
for span in tree.find_all('span'):
if 'f6' not in span.get('class', []):
continue
try:
date = pd.to_datetime(span.text.strip()).strftime("%Y-%m-%d")
except ValueError:
continue
break
links = []
for link in tree.find_all('a'):
href = link.get('href', '')
chunks = href.split("/")
# hrefs start with "/" so chunks[0] is an empty string
# this is why 'commit/issue/tree' is chunks[3], not [2]
if len(chunks) < 5 or \
chunks[3] not in ('commit', 'issue', 'tree'):
continue
if href not in links:
links.append(href)
yield (date, href)
def guard(func):
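""" Serialize calls to the decorated function with a shared lock,
so at most one call runs at a time across threads. """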
# TODO: once released in stutils, reuse from there
semaphore = threading.Lock()
@wraps(func)
def wrapper(*args, **kwargs):
semaphore.acquire()
try:
return func(*args, **kwargs)
finally:
semaphore.release()
return wrapper
class Scraper(object):
""" A class to access "unofficial GitHub API"
.. note::
This "unofficial API" is rate limited, just as the official one.
The rate limit is 40 requests in 80 seconds, and some calls take
multiple requests. So, for example, parsing a user activity timeline
typically takes couple minutes.
Use this "API" with caution as it might be extremely slow.
"""
_instance = None # singleton instance
cookies = None # cookies for non-API URLs
# limit is imposed if over 40 requests are made in 80 seconds
# thus, keeping track of issued requests
queue = None
# after many experiments, 40/121 looks to be the fastest option
queue_max_size = 40
queue_time_length = 121
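# with the defaults above, sustained throughput should be capped at roughly
# queue_max_size / queue_time_length = 40 / 121 ≈ 0.33 requests per second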
retries_on_timeout = 5
def __new__(cls, *args, **kwargs): # Singleton
if not isinstance(cls._instance, cls):
cls._instance = super(Scraper, cls).__new__(cls, *args, **kwargs)
return cls._instance
def __init__(self):
self.session = requests.Session()
self.queue = six.moves.queue.Queue(maxsize=self.queue_max_size)
@guard
def _request(self, url, params=None, headers=None):
headers = headers or HEADERS
if not url.startswith(BASE_URL):
url = BASE_URL + url
while True:
if self.queue.full():
sleep_interval = self.queue.get() - time.time() + self.queue_time_length
if sleep_interval > 0:
logging.info("Hibernating for %.2f seconds to maintain "
"GitHub XHR rate limit..", sleep_interval)
time.sleep(sleep_interval)
self.queue.put(time.time())
# handle network errors and GitHub downtimes
# also, internal errors, like joshaber March 2015
r = None
for _ in range(self.retries_on_timeout):
try:
r = self.session.get(url, headers=headers, params=params)
except requests.exceptions.RequestException:
time.sleep(1)
continue
if r.status_code < 500:
break
else:
r = None
if r is None:
raise GitHubScrapingError(
"GitHub is not responding to requests. Try again later.")
if r.status_code == 429:
logging.info("Hit GitHub XHR rate limit, retry in 10 seconds..")
time.sleep(10)
continue
break
r.raise_for_status()
return r
def project_contributor_stats(self, repo_slug):
# type: (str) -> list
"""Get top 100 contributors weekly commit stats over the project history
Args:
repo_slug (str): <owner_login>/<repo_name>
Returns:
list: A list of top 100 contributors in the repo, with their logins,
total number of commits and weekly contribution counts as number
of lines added, changed or deleted. Note that weeks start on
Sunday and are represented by a Unix timestamp.
>>> Scraper().project_contributor_stats('pandas-dev/pandas') # doctest: +SKIP
[{u'author': {u'avatar': u'https://avatars0.githubusercontent.com/...',
u'hovercard_url': u'/hovercards?user_id=1435085',
u'id': 1435085,
u'login': u'blbradley',
u'path': u'/blbradley'},
u'total': 8,
u'weeks': [{u'a': 0, u'c': 0, u'd': 0, u'w': 1249171200},
{u'a': 0, u'c': 0, u'd': 0, u'w': 1249776000},
{u'a': 0, u'c': 0, u'd': 0, u'w': 1250380800},
...
}]
"""
for i in range(self.retries_on_timeout):
try:
res = self._request(
"/%s/graphs/contributors-data" % repo_slug).json()
except ValueError:
# sometimes GitHub just returns an empty page
# without throwing a timeout
time.sleep(1)
continue
else:
return res
raise GitHubScrapingError(
"GitHub returns empty responses. Try again later.")
def user_daily_contrib_num(self, user, year):
# type: (str, int) -> dict
""" Get number of daily contributions of a GitHub user in a given year.
This corresponds to the white-and-green contribution grid on the profile page.
Args:
user (str): The GitHub login of the user to get stats for.
year (int): Year of contributions to get
Returns:
dict: A dictionary with keys being %Y-%m-%d formatted dates, and
values being the number of contributions. This method does not
differentiate types of contributions, i.e. it is a sum
of commits, issues, submitted and reviewed pull requests, etc.
>>> Scraper().user_daily_contrib_num('user2589', 2018)  # doctest: +SKIP
{'2018-01-01': 0,
'2018-01-02': 15,
...
'2018-12-31': 0}
"""
url = "/users/%s/contributions?from=%d-12-01&to=%d-12-31&full_graph=1" \
% (user, year, year)
year = str(year)
start_token = '<svg'
stop_token = '/svg>'
response_text = self._request(url).text
# cut out first <svg> element,
# since HTML outside of it is sometimes malformed
response_text = start_token + response_text.split(
start_token, 1)[-1].split(stop_token, 1)[0] + stop_token
tree = ElementTree.fromstring(response_text)
return {rect.attrib['data-date']: _int(rect.attrib.get('data-count'))
for rect in tree.iter('rect')
if rect.attrib.get('class') == 'day'
and rect.attrib.get('data-date', '').startswith(year)}
def links_to_recent_user_activity(self, user):
""" Get user events as a 2-tuple generator: (date, link).
Events include: commits, issues and refs creation (tags/branches).
Internally, this method is using Atom feed.
The result includes up to a couple of months of activity;
sometimes it also misses up to one month of recent events.
.. note::
This method is known to return incomplete data.
Proceed with caution.
Args:
user (str): The GitHub login of the user.
Yields:
Tuple[str, str]: (<%Y-%m-%d date>, link to the activity)
It seems like this feed only includes tags and commits.
>>> list(Scraper().links_to_recent_user_activity('user2589')) # doctest: +SKIP
[('2018-12-01', '/user2589/Q/tree/master'),
('2018-12-01',
'/user2589/Q/commit/9184f20f939a70e3930ef762cc83906220433fc8'),
('2018-11-20', '/user2589/TAC_Github/tree/master'),
...]
"""
warnings.warn(
"This method is know to return incomplete data."
"Proceed with caution.", DeprecationWarning)
page = None
while True:
request = self._request('/%s' % user, params={'page': page},
headers={'Accept': 'application/atom+xml'})
page = 1 if page is None else page + 1
activity_log = feedparser.parse(request.text).entries
if not activity_log:
return
for record in activity_log:
for chunk in record['content']:
for date, link in _extract_activity_feed_links(
chunk['value'].encode('utf8')):
yield date, link
def full_user_activity_timeline(self, user, start=None, to=None):
# type: (str, str, str) -> Generator[Tuple[str, Dict]]
""" Get a list of public user contributions, by month by repository.
.. note:: The user timeline sometimes does not include all contributions.
E.g., this issue is not reflected in the reporter's timeline:
https://github.com/GoogleCloudPlatform/webapp2/issues/104
Args:
user (str): GitHub login of the user to get activity for.
start (str): date to start with, e.g. '2017-01' or '2017-01-01'.
`datetime` objects should also work.
to (str): upper bound of date ranges to parse, same as `start`.
**Note**: the day is 1 by default, i.e. '2017-01'
will be interpreted as the **1st** of January 2017.
Yields:
Dict[str, int]:
A generator of activity dictionaries.
Each dict has fields `month`, a `%Y-%m` formatted month, and
`repo`, a repository slug. Other fields indicate number of
contributions of a given type:
- `commits`: number of commits.
- `issues`: number of reported issues.
- `reviews`: number of reviewed pull requests.
GitHub counts any commented pull request as reviewed,
also ignoring any code comments.
- `pull_requests`: number of pull requests submitted.
- `created_repository`: can be only 1.
- `joined_org`: can be only 1.
The repository slug in this case is the GitHub org name.
- `private_contrib`: all contributions in private repositories combined,
if the user enabled anonymous reporting of private activity.
The repository slug in this case is ``None``.
The output of this method is suitable for a pd.DataFrame constructor:
>>> pd.DataFrame(
... Scraper().full_user_activity_timeline('user2589'))
commits ... reviews
...
111 NaN ... NaN
112 NaN ... NaN
113 1.0 ... NaN
<BLANKLINE>
[114 rows x 9 columns]
It is even better to index on month+repo and replace NaNs:
>>> pd.DataFrame(
... Scraper().full_user_activity_timeline('user2589')
... ).set_index(['month', 'repo']).fillna(0).astype(int)
commits ... reviews
month repo ...
...
2012-05 user2589/minicms 11 ... 0
2011-09 alsoicode/django-admin-sortable 0 ... 0
2011-08 user2589/django-rosetta 0 ... 0
mbi/django-rosetta 0 ... 0
2005-03 user2589/schooligan 1 ... 0
<BLANKLINE>
[114 rows x 7 columns]
"""
if start:
if not isinstance(start, datetime.datetime):
start = pd.to_datetime(start)
start = start.strftime('%Y-%m')
if to:
if not isinstance(to, datetime.datetime): # str or unicode
to = pd.to_datetime(to)
now = to.strftime('%Y-%m-%d')
else:
now = datetime.datetime.now().strftime('%Y-%m-%d')
url = '/%s?tab=overview&include_header=no&utf8=✓&from=%s&to=%s' % (
user, now[:8] + '01', now)
while True:
soup = BeautifulSoup(self._request(url).text, 'html.parser')
for month_div in soup.find_all('div', class_='contribution-activity-listing'):
for month, data in _parse_timeline_update(month_div):
if start and month < start:
return
for repo, activity in data.items():
activity['repo'] = repo
activity['month'] = month
yield activity
form = soup.form
if not form:
break
url = form.attrs['data-url']
if not form.button:
break
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Get a user contribution timeline")
parser.add_argument('user', type=str,
help='GitHub login of the user to parse')
parser.add_argument('--from', type=str, nargs='?',
help='Lower end of the date range, default: no limit')
parser.add_argument('--to', type=str, nargs='?',
help='Upper end of the date range, default: now')
parser.add_argument('-o', '--output', default="-",
type=argparse.FileType('w'),
help='Output filename, "-" or skip for stdout')
parser.add_argument('-v', '--verbose', action='store_true',
help="Log progress to stderr")
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s %(message)s',
level=logging.INFO if args.verbose else logging.WARNING)
COLUMNS = ('commits', 'issues', 'pull_requests', 'reviews',
'private_contrib', 'created_repository', 'joined_org')
df = pd.DataFrame(Scraper().full_user_activity_timeline(
args.user, start=getattr(args, 'from'), to=args.to))
df = df.set_index(['month', 'repo']).fillna(0).astype(int)
df.to_csv(args.output)