diff --git a/fixtures/record/record_test.csv b/fixtures/record/record_test.csv index 2dbc7a7..beebf8a 100644 --- a/fixtures/record/record_test.csv +++ b/fixtures/record/record_test.csv @@ -15,3 +15,4 @@ "joined_organization.html","{""CMUSTRUDEL"": {""joined_org"": 1}}" "created_pull_request.html","{""spacetelescope/asdf"": {""pull_requests"": 1}}" "first_issue.html","{""mbi/django-rosetta"": {""issues"": 1}}" +"stress_test.html","{""AdoptOpenJDK/openjdk-build"": {""issues"": 3}, ""jimleitch01/packet-openstack-demos"": {""issues"": 4}, ""NixOS/nixpkgs"": {""issues"": 2}, ""dereckson/documentation"": {""issues"": 4}, ""hyperic/sigar"": {""issues"": 1}, ""vielmetti/mastodon-terraform"": {""issues"": 4}, ""vielmetti/sisyphus"": {""issues"": 2}, ""pnathan/ec2bringup"": {""issues"": 2}, ""node-red/node-red-docker"": {""issues"": 1}, ""linuxserver/docker-nginx-arm64"": {""issues"": 1}, ""tianon/jenkins-groovy"": {""issues"": 3}, ""minio/sha256-simd"": {""issues"": 1}, ""errordeveloper/kxd"": {""issues"": 3}, ""deoxxa/don"": {""issues"": 5}, ""golang/go"": {""issues"": 1}, ""upcoming/upcoming-www"": {""issues"": 1}, ""hishamhm/lua-mastodon"": {""issues"": 4}, ""glynnbird/toot"": {""issues"": 2}, ""vielmetti/mastodon-ham-radio"": {""issues"": 5}, ""WorksOnArm/worksonarm-news"": {""issues"": 5}, ""tootsuite/mastodon"": {""issues"": 4}, ""kubernetes/kubernetes-anywhere"": {""issues"": 1}, ""knu/ruby-unf_ext"": {""issues"": 1}, ""bcicen/ctop"": {""issues"": 2}, ""mwotton/aa-coworking"": {""issues"": 1}}" diff --git a/fixtures/record/stress_test.html b/fixtures/record/stress_test.html new file mode 100644 index 0000000..7e17f04 --- /dev/null +++ b/fixtures/record/stress_test.html @@ -0,0 +1,1624 @@ +
+ + + + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ 12 + repositories not shown +
+
+
diff --git a/stgithub.py b/stgithub.py index 47a34ec..324a4f6 100755 --- a/stgithub.py +++ b/stgithub.py @@ -80,6 +80,12 @@ def normalize_text(string): return " ".join(string.split()) +def _int(value): + if isinstance(value, six.string_types): + value = value.replace(",", "") + return int(value) + + def extract_repo(link): # type: (six.string_types) -> six.string_types """ Extract repository slug from a GitHub link @@ -120,52 +126,65 @@ def _parse_timeline_update_record(record_div): # reviewed pull requests title = normalize_text(record_div.button.text) if re.match( - r'Reviewed \d+ pull requests? in \d+ repositor(y|ies)', title): + r'Reviewed \d[\d,]* pull requests? in \d+ repositor(y|ies)', title): for repo_div in record_div.find_all( 'div', class_='profile-rollup-summarized'): - repo_span, count_span = repo_div.button.find_all('span') - repo = repo_span.text - count = int(count_span.text.split()[0]) + repo_div_button = repo_div.button + if not repo_div_button: + # "N repositories not shown" + continue + repo_span, count_span = repo_div_button.find_all('span') + repo = repo_span.text.strip() + count = _int(count_span.text.split()[0]) record_data[repo]['reviews'] += count - elif re.match(r'Opened \d+ (?:other )?issues? in \d+ repositor(y|ies)', + elif re.match(r'Opened \d[\d,]* (?:other )?issues? in \d+ repositor(y|ies)', title): for repo_div in record_div.find_all( 'div', class_='profile-rollup-summarized'): - repo = repo_div.button.div.span.text + repo_div_button = repo_div.button + if not repo_div_button: + # "N repositories not shown" + continue + repo = repo_div_button.div.span.text.strip() count = 0 count_span = repo_div.button.find_all( 'span', recursive=False)[0] for span in count_span.find_all('span'): - count += int(span.text) + count += _int(span.text) record_data[repo]['issues'] += count - elif re.match(r'Created \d+ (?:other )?repositor(y|ies)', title): + elif re.match(r'Created \d[\d,]*\+? (?:other )?repositor(y|ies)', title): + # e.g. Created 100+ repositories for link in record_div.find_all( 'a', attrs={'data-hovercard-type': "repository"}): record_data[link.text]['created_repository'] = 1 - elif re.match(r'Opened \d+ (?:other )?pull requests? ' + elif re.match(r'Opened \d[\d,]* (?:other )?pull requests? ' r'in \d+ repositor(y|ies)', title): for repo_div in record_div.find_all( 'div', class_='profile-rollup-summarized'): - repo = repo_div.button.div.span.text + repo_div_button = repo_div.button + if not repo_div_button: + # "N repositories not shown" + continue + repo = repo_div_button.div.span.text.strip() count = 0 count_span = repo_div.button.find_all('span', recursive=False)[ 0] for span in count_span.find_all('span'): - count += int(span.text) + count += _int(span.text) record_data[repo]['pull_requests'] += count - elif re.match(r'Created \d+ commits? in \d+ repositor(y|ies)', title): + elif re.match(r'Created \d[\d,]* commits? in \d+ repositor(y|ies)', title): for repo_li in record_div.ul.find_all('li', recursive=False): li_div = repo_li.div if not li_div: continue # "N repositories not shown" repo_link = li_div.find_all('a', recursive=False)[1] repo = extract_repo(repo_link["href"]) - count = int(repo_link.text.strip().split(" ")[0]) + count = _int(repo_link.text.strip().split(" ")[0]) record_data[repo]['commits'] += count else: @@ -203,7 +222,7 @@ def _parse_timeline_update_record(record_div): # private activity title = normalize_text(record_div.find_all('span')[1].text) if title.endswith(' in private repositories'): - record_data[None]['private_contrib'] += int(title.split(" ", 1)[0]) + record_data[None]['private_contrib'] += _int(title.split(" ", 1)[0]) else: raise ValueError("Unexpected title: " + title) else: @@ -242,7 +261,14 @@ def _parse_timeline_update(bs4_tree): record_month = None month_data = {} for record_div in month_div.find_all("div", class_="profile-rollup-wrapper"): - parsed_record = _parse_timeline_update_record(record_div) + try: + parsed_record = _parse_timeline_update_record(record_div) + except: + logging.error("Failed to parse record. Please contact the " + "maintainer and send the following HTML, along " + "with the user profile you're scraping:") + logging.error(record_div.prettify()) + raise if not parsed_record: # ignore empty months continue for record_repo, record_activity in parsed_record.items(): @@ -440,9 +466,16 @@ def user_daily_contrib_num(self, user, year): url = "/users/%s/contributions?from=%d-12-01&to=%d-12-31&full_graph=1" \ % (user, year, year) year = str(year) - tree = ElementTree.fromstring(self._request(url).text) - - return {rect.attrib['data-date']: int(rect.attrib.get('data-count')) + start_token = ' element, + # since HTML outside of it is sometimes malformed + response_text = start_token + response_text.split( + start_token, 1)[-1].split(stop_token, 1)[0] + stop_token + tree = ElementTree.fromstring(response_text) + + return {rect.attrib['data-date']: _int(rect.attrib.get('data-count')) for rect in tree.iter('rect') if rect.attrib.get('class') == 'day' and rect.attrib.get('data-date', '').startswith(year)} @@ -498,6 +531,11 @@ def full_user_activity_timeline(self, user, start=None, to=None): # type: (str, str, str) -> Generator[Tuple[str, Dict]] """ Get a list of public user contributions, by month by repository. + .. note: User timeline sometimes does not include all contributions. + E.g., this issue is not reflected in the reporter timeline: + https://github.com/GoogleCloudPlatform/webapp2/issues/104 + Maybe, it + Args: user (str): GitHub login of the user to get activity for. start (str): date to start with, e.g. '2017-01' or '2017-01-01'.