From f0971f59587926dea63cab0b92f75d5d71b92d8b Mon Sep 17 00:00:00 2001
From: user2589
Date: Mon, 6 Jul 2020 15:29:29 -0400
Subject: [PATCH 01/10] feat: Github API repo_contributors

---
 stscraper/github.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/stscraper/github.py b/stscraper/github.py
index 9dd84d3..c06f91a 100644
--- a/stscraper/github.py
+++ b/stscraper/github.py
@@ -194,6 +194,21 @@ def repo_labels(self, repo_slug):
         return tuple(label['name'] for label in self.request(
             'repos/%s/labels' % repo_slug, paginate=True))
 
+    def repo_contributors(self, repo_slug):
+        """
+        https://developer.github.com/v3/repos/statistics/#get-all-contributor-commit-activity
+
+        Suggested use:
+
+
+
+        """
+        url = 'repos/%s/stats/contributors' % repo_slug
+        for contributor_stats in next(self.request(url)):
+            record = {w['w']: w['c'] for w in contributor_stats['weeks']}
+            record['user'] = json_path(contributor_stats, 'author', 'login')
+            yield record
+
     @api('repos/%s/pulls/%d/commits', paginate=True, state='all')
     def pull_request_commits(self, repo, pr_id):
         # https://developer.github.com/v3/issues/comments/#list-comments-on-an-issue

From 8a332f07e2c137a4fb7af5f535b3cb8de7f5d627 Mon Sep 17 00:00:00 2001
From: user2589
Date: Mon, 6 Jul 2020 15:30:26 -0400
Subject: [PATCH 02/10] feat: Github API v4 (Graphql) pagination support

---
 stscraper/base.py   | 147 +++++++++++++++++++++++++---------------
 stscraper/github.py | 162 +++++++++++++++++++++++---------------------
 2 files changed, 177 insertions(+), 132 deletions(-)

diff --git a/stscraper/base.py b/stscraper/base.py
index c19e202..702b84a 100644
--- a/stscraper/base.py
+++ b/stscraper/base.py
@@ -173,18 +173,21 @@ def init_pagination():
         return {'page': 1, 'per_page': 100}
 
     @staticmethod
-    def extract_result(response, paginate):
+    def extract_result(response):
        """ Parse results from the response.
        For most APIs, it is just parsing JSON
        """
        return response.json()
 
-    def request(self, url, method='get', data=None, paginate=False, **params):
-        """ Generic, API version agnostic request method """
-        timeout_counter = 0
-        if paginate:
-            params.update(self.init_pagination())
+    def iterate_tokens(self, url=""):
+        """Infinite generator of tokens, taking care of their availability
+        Args:
+            url (str): request URL. In some API classes there are multiple rate
+                limits handled separately, e.g. GitHub general vs search API.
+        Generates:
+            (APIToken): a token object
+        """
         while True:
             # problem with iterating them in the same order
             # (eg, sorted by expiration): in multithreaded case,
             for token in random.sample(self.tokens, len(self.tokens)):
                 if not token.ready(url):
                     continue
-
-                try:
-                    r = token(url, method=method, data=data, **params)
-                except TokenNotReady:
-                    continue
-                except requests.exceptions.RequestException:
-                    # starting early November, GitHub fails to establish
-                    # a connection once in a while (bad status line).
-                    # To account for more general issues like this,
-                    # TimeoutException was replaced with RequestException
-                    timeout_counter += 1
-                    if timeout_counter > self.retries_on_timeout:
-                        raise
-                    continue  # i.e. try again
-
-                if r.status_code in self.status_not_found:  # API v3 only
-                    raise RepoDoesNotExist(
-                        "%s API returned status %s" % (
-                            self.__class__.__name__, r.status_code))
-                elif r.status_code in self.status_empty:
-                    yield {}
-                    return
-                elif r.status_code in self.status_internal_error:
-                    timeout_counter += 1
-                    if timeout_counter > self.retries_on_timeout:
-                        raise requests.exceptions.Timeout("VCS is down")
-                    time.sleep(2**timeout_counter)
-                    continue  # i.e. try again
-                elif r.status_code in self.status_too_many_requests:
-                    timeout_counter += 1
-                    if timeout_counter > self.retries_on_timeout:
-                        raise requests.exceptions.Timeout(
-                            "Too many requests from the same IP. "
-                            "Are you abusing the API?")
-                    time.sleep(2**timeout_counter)
-                    continue
-
-                r.raise_for_status()
-                res = self.extract_result(r, paginate)
-                if paginate:
-                    for item in res:
-                        yield item
-                    if not res or not self.has_next_page(r):
-                        return
-                    else:
-                        params["page"] += 1
-                        continue
-                else:
-                    yield res
-                    return
+                yield token
 
             next_res = min(token.when(url) for token in self.tokens)
             sleep = next_res and int(next_res - time.time()) + 1
@@ -253,6 +207,87 @@
             time.sleep(sleep)
             self.logger.info(".. resumed")
 
+    def request(self, url, method='get', data=None, paginate=False, **params):
+        """ Make an API request, taking care of pagination
+
+        Args:
+            url (str): request URL
+            method (str): HTTP method type
+            data (str): API request payload (for POST requests)
+            paginate (bool): flag to take care of pagination
+
+        Generates:
+            object: parsed object, API-specific
+        """
+        if paginate:
+            params.update(self.init_pagination())
+
+        while True:
+            r = self._request(url, method, data, **params)
+            if r.status_code in self.status_empty:
+                return
+
+            res = self.extract_result(r)
+            if paginate:
+                for item in res:
+                    yield item
+                if not res or not self.has_next_page(r):
+                    return
+                else:
+                    params["page"] += 1
+                    continue
+            else:
+                yield res
+                return
+
+    def _request(self, url, method='get', data=None, **params):
+        """ Make a single API request using one of the available tokens
+
+        Args:
+            url (str): request URL
+            method (str): HTTP method type
+            data (str): API request payload (for POST requests)
+
+        Returns:
+            requests.Response: raw HTTP response
+        """
+        timeout_counter = 0
+        for token in self.iterate_tokens(url):
+            try:
+                r = token(url, method=method, data=data, **params)
+            except TokenNotReady:
+                continue
+            except requests.exceptions.RequestException:
+                # starting early November, GitHub fails to establish
+                # a connection once in a while (bad status line).
+                # To account for more general issues like this,
+                # TimeoutException was replaced with RequestException
+                timeout_counter += 1
+                if timeout_counter > self.retries_on_timeout:
+                    raise
+                continue  # i.e. try again
+
+            if r.status_code in self.status_not_found:  # API v3 only
+                raise RepoDoesNotExist(
+                    "%s API returned status %s at %s" % (
+                        self.__class__.__name__, r.status_code, url))
+            elif r.status_code in self.status_internal_error:
+                timeout_counter += 1
+                if timeout_counter > self.retries_on_timeout:
+                    raise requests.exceptions.Timeout("VCS is down")
+                time.sleep(2**timeout_counter)
+                continue  # i.e. try again
+            elif r.status_code in self.status_too_many_requests:
+                timeout_counter += 1
+                if timeout_counter > self.retries_on_timeout:
+                    raise requests.exceptions.Timeout(
+                        "Too many requests from the same IP. "
" + "Are you abusing the API?") + time.sleep(1 << (timeout_counter+1)) + continue + + r.raise_for_status() + return r + def all_users(self): # type: () -> Iterable[dict] """ """ diff --git a/stscraper/github.py b/stscraper/github.py index c06f91a..484510b 100644 --- a/stscraper/github.py +++ b/stscraper/github.py @@ -1,9 +1,11 @@ +from __future__ import absolute_import from __future__ import print_function import datetime import json import os +from typing import Iterable import warnings from .base import * @@ -217,12 +219,12 @@ def pull_request_commits(self, repo, pr_id): @api('repos/%s/issues/%s/comments', paginate=True, state='all') def issue_comments(self, repo, issue_id): """ Return comments on an issue or a pull request - Note that for pull requests this method will return only general - comments to the pull request, but not review comments related to - some code. Use review_comments() to get those instead + Note that for pull requests this method will return only general + comments to the pull request, but not review comments related to + some code. Use review_comments() to get those instead - :param repo: str 'owner/repo' - :param issue_id: int, either an issue or a Pull Request id + :param repo: str 'owner/repo' + :param issue_id: int, either an issue or a Pull Request id """ # https://developer.github.com/v3/issues/comments/#list-comments-on-an-issue return repo, issue_id @@ -305,83 +307,91 @@ def canonical_url(repo_slug): class GitHubAPIv4(GitHubAPI): """ An example class using GraphQL API """ - def v4(self, query, **params): - payload = json.dumps({"query": query, "variables": params}) - return self.request("graphql", 'post', data=payload) + def v4(self, query, object_path=("data",), **params): + """ Make an API v4 request, taking care of pagination + + Args: + query (str): GraphQL query. If the API request is multipage, it is + expected that the cursor variable name is "cursor". + object_path (Tuple[str]): json path to objects to iterate, excluding + leading "data" part, and the trailing "nodes" when applicable. 
+                If omitted, will return full "data" content
+                Example: ("repository", "issues")
+
+        Generates:
+            object: parsed object, query-specific
+        """
+
+        while True:
+            payload = json.dumps({"query": query, "variables": params})
+
+            r = self._request("graphql", 'post', data=payload)
+            if r.status_code in self.status_empty:
+                return
+
+            res = self.extract_result(r)
+            if "data" not in res:
+                raise VCSError("API didn't return any data:\n" +
+                               json.dumps(res, indent=4))
+
+            objects = json_path(res["data"], *object_path)
+            if objects is None:
+                raise VCSError("Invalid object path '%s' in:\n %s" %
+                               (object_path, json.dumps(res)))
+            if "nodes" not in objects:
+                yield objects
+                return
+            for obj in objects["nodes"]:
+                yield obj
+            # the result is single page, or there are no more pages
+            if not json_path(objects, "pageInfo", "hasNextPage"):
+                return
+            params["cursor"] = json_path(objects, "pageInfo", "endCursor")
 
     def repo_issues(self, repo_slug, cursor=None):
         # type: (str, str) -> Iterator[dict]
         owner, repo = repo_slug.split("/")
-        query = """query ($owner: String!, $repo: String!, $cursor: String) {
-        repository(name: $repo, owner: $owner) {
-            hasIssuesEnabled
-            issues (first: 100, after: $cursor,
-                    orderBy: {field:CREATED_AT, direction: ASC}) {
-                nodes {author {login}, closed, createdAt,
-                       updatedAt, number, title}
-                pageInfo {endCursor, hasNextPage}
-        }}}"""
+        return self.v4("""
+            query ($owner: String!, $repo: String!, $cursor: String) {
+              repository(name: $repo, owner: $owner) {
+                hasIssuesEnabled
+                issues (first: 100, after: $cursor,
+                        orderBy: {field:CREATED_AT, direction: ASC}) {
+                  nodes {author {login}, closed, createdAt,
+                         updatedAt, number, title}
+                  pageInfo {endCursor, hasNextPage}
+              }}
+            }""", ("repository", "issues"), owner=owner, repo=repo)
+
+    def user_followers(self, user):
+        # type: (str) -> Iterator[dict]
+        return self.v4("""
+            query ($user: String!, $cursor: String) {
+              user(login: $user) {
+                followers(first:100, after:$cursor) {
+                  nodes { login }
+                  pageInfo{endCursor, hasNextPage}
+            }}}""", ("user", "followers"), user=user)
 
-        while True:
-            data = self.v4(query, owner=owner, repo=repo, cursor=cursor
-                           )['data']['repository']
-            if not data:  # repository is empty, deleted or moved
-                break
-
-            for issue in data["issues"]:
-                yield {
-                    'author': issue['author']['login'],
-                    'closed': issue['closed'],
-                    'created_at': issue['createdAt'],
-                    'updated_at': issue['updatedAt'],
-                    'closed_at': None,
-                    'number': issue['number'],
-                    'title': issue['title']
-                }
-
-            cursor = data["issues"]["pageInfo"]["endCursor"]
-
-            if not data["issues"]["pageInfo"]["hasNextPage"]:
-                break
-
-    def repo_commits(self, repo_slug, cursor=None):
-        # type: (str, str) -> Iterator[dict]
-        """As of June 2017 GraphQL API does not allow to get commit parents
-        Until this issue is fixed this method is only left for a reference
-        Please use commits() instead"""
+    def repo_commits(self, repo_slug):
+        # type: (str) -> Iterator[dict]
        owner, repo = repo_slug.split("/")
-        query = """query ($owner: String!, $repo: String!, $cursor: String) {
-        repository(name: $repo, owner: $owner) {
-            ref(qualifiedName: "master") {
-                target { ... on Commit {
-                    history (first: 100, after: $cursor) {
-                        nodes {sha:oid, author {name, email, user{login}}
-                               message, committedDate}
-                        pageInfo {endCursor, hasNextPage}
-        }}}}}}"""
-
-        while True:
-            data = self.v4(query, owner=owner, repo=repo, cursor=cursor
-                           )['data']['repository']
-            if not data:
-                break
-
-            for commit in data["ref"]["target"]["history"]["nodes"]:
-                yield {
-                    'sha': commit['sha'],
-                    'author': commit['author']['user']['login'],
-                    'author_name': commit['author']['name'],
-                    'author_email': commit['author']['email'],
-                    'authored_date': None,
-                    'message': commit['message'],
-                    'committed_date': commit['committedDate'],
-                    'parents': None,
-                    'verified': None
-                }
-
-            cursor = data["ref"]["target"]["history"]["pageInfo"]["endCursor"]
-            if not data["ref"]["target"]["history"]["pageInfo"]["hasNextPage"]:
-                break
+        return self.v4("""
+            query ($owner: String!, $repo: String!, $cursor: String) {
+              repository(name: $repo, owner: $owner) {
+                defaultBranchRef{ target {
+                  # object(expression: "HEAD") {
+                  ... on Commit {
+                    history (first: 100, after: $cursor) {
+                      nodes {sha:oid, author {name, email, user{login}}
+                             message, committedDate
+                             # normally there is only 1 parent; max observed is 3
+                             parents (first:100) {
+                               nodes {sha:oid}}
+                      }
+                      pageInfo {endCursor, hasNextPage}
+            }}}}}}""", ("repository", "defaultBranchRef", "target", "history"),
+            owner=owner, repo=repo)
 
 
 def get_limits(tokens=None):

From b0dc657304d829f2691e55a1de2c3cb67e6d78d6 Mon Sep 17 00:00:00 2001
From: user2589
Date: Fri, 10 Jul 2020 15:40:52 -0400
Subject: [PATCH 03/10] feat: user_info support for github graphql API

---
 stscraper/github.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/stscraper/github.py b/stscraper/github.py
index 484510b..8fd0782 100644
--- a/stscraper/github.py
+++ b/stscraper/github.py
@@ -45,6 +45,10 @@ def user(self):
                 self._user = r.json().get('login', '')
         return self._user
 
+    @property
+    def is_valid(self):
+        return self.user is not None
+
     def check_limits(self):
         # regular limits will be updated automatically upon request
         # we only need to take care about search limit
@@ -96,7 +100,6 @@ class GitHubAPI(VCSAPI):
     limits after every request. Actual work is done by outside classes, such
     as _IssueIterator and _CommitIterator
     """
-    tokens = None
     token_class = GitHubAPIToken
     base_url = 'https://github.com'
     status_too_many_requests = (403,)
@@ -307,7 +310,7 @@ class GitHubAPIv4(GitHubAPI):
 
 class GitHubAPIv4(GitHubAPI):
     """ An example class using GraphQL API """
-    def v4(self, query, object_path=("data",), **params):
+    def v4(self, query, object_path=(), **params):
         """ Make an API v4 request, taking care of pagination
 
         Args:
@@ -373,6 +376,19 @@ def user_followers(self, user):
                 pageInfo{endCursor, hasNextPage}
             }}}""", ("user", "followers"), user=user)
 
+    def user_info(self, user):
+        # type: (str) -> Iterator[dict]
+        return next(self.v4("""
+            query ($user: String!) {
+              user(login:$user) {
+                login, name, avatarUrl, websiteUrl
+                company, bio, location, name, twitterUsername, isHireable
+                # email  # email requires extra scopes from the API key
+                createdAt, updatedAt
+                followers{totalCount}
+                following {totalCount}
+            }}""", ("user",), user=user))
+
     def repo_commits(self, repo_slug):
         # type: (str) -> Iterator[dict]
         owner, repo = repo_slug.split("/")

From 438f5aed9e2a58df16cf1b70f1e9545d40d05b4a Mon Sep 17 00:00:00 2001
From: user2589
Date: Fri, 10 Jul 2020 15:41:37 -0400
Subject: [PATCH 04/10] feat: token update support

---
 stscraper/base.py | 236 +++++++++++++++++++++++++---------------------
 1 file changed, 128 insertions(+), 108 deletions(-)

diff --git a/stscraper/base.py b/stscraper/base.py
index 702b84a..4045947 100644
--- a/stscraper/base.py
+++ b/stscraper/base.py
@@ -1,4 +1,6 @@
+from __future__ import absolute_import
+
 import requests
 
 from datetime import datetime
@@ -7,7 +9,7 @@
 import re
 import six
 import time
-from typing import Iterable, Iterator, Optional
+from typing import Iterable, Iterator, Optional, Tuple, Union
 
 from functools import wraps
@@ -134,11 +136,127 @@ def caller(*args):
     return wrapper
 
 
+class APIToken(object):
+    """ An abstract container for an API token
+    """
+    # API endpoint
+    api_url = None  # type: str
+
+    token = None  # type: str
+    # number of seconds before throwing IOError
+    timeout = None  # type: int
+    # request headers to use
+    _headers = {}  # type: dict
+    # supported API classes (e.g. core, search etc)
+    api_classes = ('core',)  # type: Tuple
+    # rate limits for API classes
+    limits = None  # type: dict
+    session = None  # type: requests.Session
+
+    def __init__(self, token=None, timeout=None):
+        self.token = token
+        self.timeout = timeout
+        self.limits = {api_class: {
+            'limit': None,
+            'remaining': None,
+            'reset_time': None
+        } for api_class in self.api_classes}
+        self.session = requests.Session()
+
+    @property
+    def is_valid(self):
+        raise NotImplementedError
+
+    @property
+    def user(self):
+        """ Get user info of the token owner """
+        raise NotImplementedError
+
+    def _update_limits(self, response, url):
+        raise NotImplementedError
+
+    def check_limits(self):
+        """ Get information about remaining limits on the token.
+
+        Usually this information present in response headers and updated
+        automatically (see _update_limits()). This method is intended to
+        FORCE to renew this info.
+
+        Some APIs have multiple classes of limits, so it should return a list
+        of dictionaries
+        { <api_class>: {
+                'remaining': remaining number of requests until reset,
+                'limit': overall limit,
+                'reset_time': unix_timestamp
+            },
+            ...
+        }
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def api_class(url):
+        # type: (str) -> str
+        return 'core'
+
+    def when(self, url):
+        # type: (str) -> int
+        """Check when the specified URL become accessible without blocking
+
+        Returns: unix timestamp
+        """
+        raise NotImplementedError
+
+    def ready(self, url):
+        """ Check if this url can be called without blocking """
+        t = self.when(url)
+        return not t or t <= time.time()
+
+    def __call__(self, url, method='get', data=None, **params):
+        """ Make an API request """
+        # TODO: use coroutines, perhaps Tornado (as PY2/3 compatible)
+
+        if not self.ready(url):
+            raise TokenNotReady
+
+        r = self.session.request(
+            method, self.api_url + url, params=params, data=data,
+            headers=self._headers, timeout=self.timeout)
+
+        self._update_limits(r, url)
+
+        return r
+
+    def __str__(self):
+        return self.token
+
+
+class DummyAPIToken(APIToken):
+    """ A dummy token class that does nothing
+    APIs that don't have limits should use tokens subclassed from this one
+    """
+
+    is_valid = True
+    user = 'Anonymous'
+
+    def check_limits(self):
+        return self.limits
+
+    def ready(self, url):
+        return True
+
+    def when(self, url):
+        return None
+
+    def _update_limits(self, response, url):
+        pass
+
+
 class VCSAPI(object):
     _instance = None  # instance of API() for Singleton pattern implementation
 
-    tokens = None
-    token_class = None
+    tokens = ()  # type: Tuple[APIToken]
+    token_class = DummyAPIToken  # type: type
 
     status_too_many_requests = ()
     status_not_found = (404, 451)
@@ -149,16 +267,19 @@ def __new__(cls, *args, **kwargs):  # Singleton
         if not isinstance(cls._instance, cls):
             cls._instance = super(VCSAPI, cls).__new__(cls)
-            cls._instance.__init__(*args, **kwargs)
+
+        cls._instance.__init__(*args, **kwargs)
         return cls._instance
 
     def __init__(self, tokens=None, timeout=30):
-        # type: (Optional[Iterable], int) -> None
+        # type: (Optional[Union[Iterable,str]], int) -> None
+        old_tokens = {str(token) for token in self.tokens}
         if tokens:
             if isinstance(tokens, six.string_types):
                 tokens = tokens.split(",")
-            self.tokens = tuple(
-                self.token_class(t, timeout=timeout) for t in set(tokens))
+            new_tokens_instances = [self.token_class(t, timeout=timeout)
+                                    for t in set(tokens) - old_tokens]
+            self.tokens += tuple(t for t in new_tokens_instances if t.is_valid)
         self.logger = logging.getLogger('scraper.' + self.__class__.__name__)
 
     def has_next_page(self, response):
@@ -368,104 +489,3 @@ def canonical_url(repo_slug):
         # type: (str) -> str
         """ """
         raise NotImplementedError
-
-
-class APIToken(object):
-    """ An abstract container for an API token
-    """
-    api_url = None  # API endpoint
-
-    token = None  # str token
-    timeout = None  # number of seconds before throwing IOError
-    _headers = {}  # request headers to use
-    api_classes = ('core',)  # supported API classes (e.g. core, search etc)
-    limits = None  # rate limits for API classes
-    session = None
-
-    def __init__(self, token=None, timeout=None):
-        self.token = token
-        self.timeout = timeout
-        self.limits = {api_class: {
-            'limit': None,
-            'remaining': None,
-            'reset_time': None
-        } for api_class in self.api_classes}
-        self.session = requests.Session()
-
-    @property
-    def user(self):
-        """ Get user info of the token owner """
-        raise NotImplementedError
-
-    def _update_limits(self, response, url):
-        raise NotImplementedError
-
-    def check_limits(self):
-        """ Get information about remaining limits on the token.
-
-        Usually this information present in response headers and updated
-        automatically (see _update_limits()). This method is intended to
-        FORCE to renew this info.
-
-        Some APIs have multiple classes of limits, so it should return a list
-        of dictionaries
-        { <api_class>: {
-                'remaining': remaining number of requests until reset,
-                'limit': overall limit,
-                'reset_time': unix_timestamp
-            },
-            ...
-        }
-        """
-        raise NotImplementedError
-
-    @staticmethod
-    def api_class(url):
-        # type: (str) -> str
-        return 'core'
-
-    def when(self, url):
-        # type: (str) -> int
-        """Check when the specified URL become accessible without blocking
-
-        Returns: unix timestamp
-        """
-        raise NotImplementedError
-
-    def ready(self, url):
-        """ Check if this url can be called without blocking """
-        t = self.when(url)
-        return not t or t <= time.time()
-
-    def __call__(self, url, method='get', data=None, **params):
-        """ Make an API request """
-        # TODO: use coroutines, perhaps Tornado (as PY2/3 compatible)
-
-        if not self.ready(url):
-            raise TokenNotReady
-
-        r = self.session.request(
-            method, self.api_url + url, params=params, data=data,
-            headers=self._headers, timeout=self.timeout)
-
-        self._update_limits(r, url)
-
-        return r
-
-
-class DummyAPIToken(APIToken):
-    """ A dummy token class that does nothing """
-
-    user = 'Anonymous'
-
-    def check_limits(self):
-        return self.limits
-
-    def ready(self, url):
-        return True
-
-    def when(self, url):
-        return None
-
-    def _update_limits(self, response, url):
-        pass

From 634aec258dc21483322677331ba0ddb794738dc2 Mon Sep 17 00:00:00 2001
From: user2589
Date: Fri, 10 Jul 2020 15:55:20 -0400
Subject: [PATCH 05/10] chore: add tests for Github API v4

---
 stscraper/generic.py |  2 +-
 test.py              | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/stscraper/generic.py b/stscraper/generic.py
index 8b2f36e..a0e3e63 100644
--- a/stscraper/generic.py
+++ b/stscraper/generic.py
@@ -10,7 +10,7 @@
 """
 
 from .base import *
-from .github import GitHubAPI
+from .github import GitHubAPI, GitHubAPIv4
 from .gitlab import GitLabAPI
 from .bitbucket import BitbucketAPI
 
diff --git a/test.py b/test.py
index bc2524b..acd93bf 100755
--- a/test.py
+++ b/test.py
@@ -6,6 +6,16 @@
 import stscraper
 
 
+class TestBase(unittest.TestCase):
+
+    def test_add_keys(self):
+        api = stscraper.VCSAPI('key1,key2,key1')
+        self.assertEqual(len(api.tokens), 2)
+        api2 = stscraper.VCSAPI('key3,key1,key4')
+        self.assertTrue(api2 is api)
+        self.assertEqual(len(api.tokens), 4)
+
+
 class TestGitHub(unittest.TestCase):
 
     def setUp(self):
@@ -237,6 +247,27 @@ def test_project_exists(self):
         self.assertFalse(self.api.project_exists('user2589/nonexistent'))
 
 
+class TestGitHubv4(unittest.TestCase):
+
+    def setUp(self):
+        self.api = stscraper.GitHubAPIv4()
+        self.repo_address = 'pandas-dev/pandas'
+
+    def test_user_info(self):
+        # Docs: https://developer.github.com/v3/users/#response
+        user_info = self.api.user_info('user2589')
+        self.assertIsInstance(user_info, dict)
+        for prop in ('login', 'name', 'avatarUrl', 'websiteUrl', 'company',
+                     'bio', 'location', 'twitterUsername',
+                     'isHireable', 'createdAt', 'updatedAt',
+                     'followers', 'following'):
+            self.assertIn(prop, user_info)
+
+    def test_pagination(self):
+        commits = list(self.api.repo_commits('benjaminp/six'))
+        self.assertGreater(len(commits), 463)
+
+

From 7c5711069b6034b0857b1129d21caf55a455a12b Mon Sep 17 00:00:00 2001
From: user2589
Date: Fri, 24 Jul 2020 23:01:58 -0400
Subject: [PATCH 06/10] refactor!: remove unused interfaces

---
 docs/bitbucket.rst        |   0
 docs/github.rst           |   0
 docs/gitlab.rst           |   0
 scripts/collect_emails.py |  78 ------
 stscraper/__init__.py     |   3 +-
 stscraper/bitbucket.py    | 114 ----------
 stscraper/deprecated.py   | 141 ------------
 stscraper/generic.py      | 181 ---------------
 stscraper/gitlab.py       | 202 -----------------
 stscraper/stats.py        | 455 --------------------------------------
 test.py                   | 192 ----------------
 11 files changed, 1 insertion(+), 1365 deletions(-)
 delete mode 100644 docs/bitbucket.rst
 delete mode 100644 docs/github.rst
 delete mode 100644 docs/gitlab.rst
 delete mode 100644 scripts/collect_emails.py
 delete mode 100644 stscraper/bitbucket.py
 delete mode 100644 stscraper/deprecated.py
 delete mode 100644 stscraper/generic.py
 delete mode 100644 stscraper/gitlab.py
 delete mode 100644 stscraper/stats.py

diff --git a/docs/bitbucket.rst b/docs/bitbucket.rst
deleted file mode 100644
index e69de29..0000000
diff --git a/docs/github.rst b/docs/github.rst
deleted file mode 100644
index e69de29..0000000
diff --git a/docs/gitlab.rst b/docs/gitlab.rst
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/collect_emails.py b/scripts/collect_emails.py
deleted file mode 100644
index 442b453..0000000
--- a/scripts/collect_emails.py
+++ /dev/null
@@ -1,78 +0,0 @@
-
-from __future__ import print_function, unicode_literals
-
-import os
-import logging
-import argparse
-import csv
-import hashlib
-
-import pandas as pd
-from django.core.management.base import BaseCommand
-
-from common import decorators
-from common import email_utils as email
-from scraper import scraper as scraper
-
-logging.basicConfig()
-logger = logging.getLogger('ghd')
-
-
-class Command(BaseCommand):
-    requires_system_checks = False
-    help = "Create mapping of GitHub users to their emails for mathching " \
-           "StackOverflow records. The result is store in cache folder.\n\n" \
-           "This data is generated from commits records, so it is recommnded " \
-           "to run ./manage.py scraper_build_cache first."
-
-    def add_arguments(self, parser):
-        parser.add_argument('ecosystem', type=str,
-                            help='Ecosystem to process, {pypi|npm}')
-        parser.add_argument('-o', '--output', default="",
-                            help='Output file. Will be extended if already '
-                                 'exists')
-
-    def handle(self, *args, **options):
-        loglevel = 40 - 10*options['verbosity']
-        logger.setLevel(20 if loglevel == 30 else loglevel)
-
-        reader = csv.DictReader(options['input'])
-
-        output = options['output']
-        if not output:
-            output = os.path.join(
-                decorators.get_cache_path('scraper'), "user.emails.csv")
-        if os.path.isfile(output):
-            users = pd.read_csv(output, index_col=0)
-        else:
-            users = pd.DataFrame(columns=['uname', 'email_md5'])
-            users.index.name = 'email'
-
-        for package in reader:
-            logger.info("Processing %s %s", package['name'],
-                        package['github_url'])
-            if not package['github_url']:
-                continue
-
-            commits = scraper._commits(package['github_url'])
-            commits = commits.loc[pd.notnull(commits['author_email']) & \
-                                  pd.notnull(commits['author'])]
-            for _, commit in commits.iterrows():
-                if not commit['author'] or not commit['author_email']:
-                    continue
-                try:
-                    email_addr = email.clean(commit['author_email'])
-                except ValueError:  # invalid email
-                    continue
-
-                if email_addr in users.index:
-                    continue
-
-                md5 = hashlib.md5()
-                md5.update(email_addr)
-                users.loc[email_addr] = {
-                    'uname': commit['author'],
-                    'email_md5': md5.hexdigest()
-                }
-
-        users.to_csv(output)
diff --git a/stscraper/__init__.py b/stscraper/__init__.py
index adb5614..abb5206 100644
--- a/stscraper/__init__.py
+++ b/stscraper/__init__.py
@@ -1,6 +1,5 @@
-from .base import *
-from .generic import *
+from .github import *
 
 __version__ = '0.4.0'
 __author__ = "Marat (@cmu.edu)"
diff --git a/stscraper/bitbucket.py b/stscraper/bitbucket.py
deleted file mode 100644
index 1d30ed5..0000000
--- a/stscraper/bitbucket.py
+++ /dev/null
@@ -1,114 +0,0 @@
-
-from .base import *
-
-
-class BitbucketAPIToken(DummyAPIToken):
-    """ A dummy - Bitbucket isn't using any tokens
-    https://confluence.atlassian.com/bitbucket/rate-limits-668173227.html
-    """
-    api_url = "https://api.bitbucket.org/2.0/"
-
-
-class BitbucketAPI(VCSAPI):
-    token_class = BitbucketAPIToken
-
-    status_not_found = (404, 422, 451)
-
-    def __init__(self, tokens=None, timeout=30):
-        super(BitbucketAPI, self).__init__([None], timeout)
-
-    def has_next_page(self, response):
-        return 'next' in response.json()
-
-    @staticmethod
-    def init_pagination():
-        return {'page': 1, 'pagelen': 100}
-
-    @staticmethod
-    def extract_result(response, paginate):
-        res = response.json()
-        if 'error' in res:
-            raise VCSError(json_path(res, 'error', 'message'))
-        if paginate:
-            return res['values']
-        return res
-
-    def all_users(self):
-        # type: () -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    def all_repos(self):
-        # type: () -> Iterable[dict]
-        """ """
-        return self.request('repositories', paginate=True)
-
-    def repo_issues(self, repo_name):
-        # type: (str) -> Iterable[dict]
-        """ """
-        return self.request(
-            'repositories/%s/issues' % repo_name, paginate=True)
-
-    def repo_commits(self, repo_name):
-        # type: (str) -> Iterable[dict]
-        """ """
-        return self.request(
-            'repositories/%s/commits' % repo_name, paginate=True)
-
-    def repo_pulls(self, repo_name):
-        # type: (str) -> Iterable[dict]
-        """ """
-        return self.request('repositories/%s/pullrequests' % repo_name)
-
-    def pull_request_commits(self, repo, pr_id):
-        # type: (str, int) -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    def issue_comments(self, repo, issue_id):
-        # type: (str, int) -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    def review_comments(self, repo, pr_id):
-        # type: (str, int) -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    def user_info(self, user):
-        # type: (str) -> dict
-        """ """
-        raise NotImplementedError
-
-    def user_repos(self, user):
-        # type: (str) -> dict
-        """Get list of user repositories"""
-        return self.request('repositories/' + user)
-
-    def user_orgs(self, user):
-        # type: (str) -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    def org_members(self, org):
-        # type: (str) -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    def org_repos(self, org):
-        # type: (str) -> Iterable[dict]
-        """ """
-        raise NotImplementedError
-
-    @staticmethod
-    def project_exists(repo_name):
-        # type: (str) -> bool
-        """ """
-        return bool(requests.head(BitbucketAPIToken.api_url + repo_name))
-
-    @staticmethod
-    def canonical_url(project_url):
-        # type: (str) -> str
-        """ """
-        raise NotImplementedError
diff --git a/stscraper/deprecated.py b/stscraper/deprecated.py
deleted file mode 100644
index 45fde4d..0000000
--- a/stscraper/deprecated.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/usr/bin/env python
-"""
-Deprecated tools, mostly serving the purpose of examples
-"""
-
-
-import datetime
-import re
-
-
-def timestamp2str(timestamp):
-    return datetime2str(datetime.datetime.fromtimestamp(timestamp))
-
-
-def datetime2str(dt, fmt="%Y-%m-%d %H:%M"):
-    return dt.strftime(fmt)
-
-
-def utf8fy(string):
-    try:
-        return string.encode('utf8')
-    except UnicodeDecodeError:
-        return '*Garbled*'
-
-
-def commits_gitpython(repo_path, ref='master', short_message=False):
-    """ Parse commits from a cloned git repository using gitphython
-    This is a rather slow method since gitpython simply parses cli output of
-    native git client
-    """
-    import git
-
-    try:
-        repo = git.Repo(repo_path)
-    except git.InvalidGitRepositoryError:
-        raise ValueError("Not a git repository: %s" % repo_path)
-
-    for commit in repo.iter_commits(ref, max_count=-1):
-        # WTF? example:
-        # https://github.com/openssl/openssl/commit/c753e71e0a0aea2c540dab96fb02c9c62c6ba7a2
-        hasauthor = hasattr(commit, 'author') or None
-        hasdate = hasattr(commit, 'committed_date') or None
-
-        message = commit.message.strip()
-        if short_message:
-            message = message.split("\n", 1)[0].strip()
-
-        yield {
-            'sha': commit.hexsha,
-            'author_name': hasauthor and utf8fy(commit.author.name),
-            'author_email': hasauthor and utf8fy(commit.author.email),
-            'authored_date': hasauthor and timestamp2str(commit.authored_date),
-            'committer_name': utf8fy(commit.committer.name),
-            'committer_email': utf8fy(commit.committer.email),
-            'committed_date': hasdate and timestamp2str(commit.committed_date),
-            'message': utf8fy(message),
-            'parents': commit.parents
-        }
-
-
-def get_repo_name(repo_url):
-    assert(repo_url.endswith(".git"))
-    chunks = [c for c in re.split("[:/]", repo_url[:-4]) if c]
-    org = "" if len(chunks) < 2 else chunks[-2]
-    repo = chunks[-1]
-    return org, repo
-
-
-def commits_pygit2(repo_url, remove=True):
-    """ Iterate commits using Python libgit2 binding.
-    Unlike GitPython, it can clone repository for you and works in the same
-    memory space so it is much faster. It is kind of heavy, but can be handy if
-    you need to work with repository/commits content (e.g. code analysis)
-
-    :param repo_url Git repository URL (not GitHub URL!).
-        Example: git://github.com/user/repo.git
-    """
-    import os
-    import tempfile
-    import shutil
-
-    import pygit2
-    org, repo_name = get_repo_name(repo_url)
-    folder = tempfile.mkdtemp(prefix='_'.join(('ghd', org, repo_name, '')))
-    repo = pygit2.clone_repository(repo_url, folder, bare=True)
-
-    try:
-        for commit in repo.walk(repo.head.target):
-            # http://www.pygit2.org/objects.html#commits
-            yield {
-                'sha': commit.oid,
-                'author_name': commit.author.name,
-                'author_email': commit.author.email,
-                'committer_name': commit.committer.name,
-                'committer_email': commit.committer.email,
-                'message': commit.message.strip(),
-                'parent_ids': "\n".join(str(pid) for pid in commit.parent_ids),
-                'time': commit.commit_time,
-            }
-    finally:
-        if remove:
-            os.chdir('/tmp')
-            shutil.rmtree(folder)
-
-
-def issues_PyGithub(github_token, repo_name):
-    """ Iterate issues of a GitHub repository using GitHub API v3
-
-    The library used in this method, PyGithub tries to extensively resolve
-    attributes which leads to a number of excessive API calls and computation
-    overhead. This implementation tries to avoid this, and was replaced by
-    local implementation to have uniform interface and get rid of dependency
-    """
-    # this is not the same module included with scraper.
-    # to install, `pip install PyGithub`
-    import github
-
-    g = github.Github(github_token)
-    repo = g.get_repo(repo_name)
-    try:
-        id = repo.id
-    except github.GithubException:
-        raise ValueError("Repository %s does not exist" % repo_name)
-
-    issues = repo.get_issues(state='all')
-
-    # Response example:
-    # https://api.github.com/repos/pandas-dev/pandas/issues?page=62
-    for issue in issues:
-        raw = issue._rawData  # to prevent resolving usernames into objects
-        yield {
-            'id': int(raw['id']),
-            'title': raw['title'],
-            'user': raw['user']['login'],
-            'labels': ",".join(l['name'] for l in raw['labels']),
-            'state': raw['state'],
-            'created_at': raw['created_at'],
-            'updated_at': raw['updated_at'],
-            'closed_at': raw['closed_at'],
-            'body': raw['body']
-        }
diff --git a/stscraper/generic.py b/stscraper/generic.py
deleted file mode 100644
index a0e3e63..0000000
--- a/stscraper/generic.py
+++ /dev/null
@@ -1,181 +0,0 @@
-
-"""
-Standard interface to all supported code hosting platforms.
-
-Two important distinctions comparing to
-1. URLs must include the code hosting platform itself, i.e. instead of
-   `cmustrudel/strudel.scraper` one should use
-   `github.com/cmustrudel/strudel.scraper`.
-2. Returned objects are simplified to a common subset of fields
-"""
-
-from .base import *
-from .github import GitHubAPI, GitHubAPIv4
-from .gitlab import GitLabAPI
-from .bitbucket import BitbucketAPI
-
-PROVIDERS = {
-    "github.com": GitHubAPI,
-    # https://developer.atlassian.com/bitbucket/api/2/reference/resource/
-    "bitbucket.org": BitbucketAPI,
-    # https://docs.gitlab.com/ee/api/
-    "gitlab.org": GitLabAPI,
-    # https://anypoint.mulesoft.com/apiplatform/sourceforge/
-    "sourceforge.net": None,
-}
-
-
-def get_provider(url):
-    # type: (str) -> (str, str)
-    """ Separate provided URL into provider and project ID
-    :param url: url matching URL_PATTERN
-    :return: (provider_cls, project_id)
-
-    >>> prov, proj_id = get_provider("github.com/abc/def")
-    >>> isinstance(prov, github.GitHubAPI)
-    True
-    >>> proj_id
-    'abc/def'
-    >>> prov, proj_id = get_provider("someothersource.com/abc/def")
-    """
-    provider_name, project_url = parse_url(url)
-    provider_cls = PROVIDERS.get(provider_name)
-    if provider_cls is None:
-        raise NotImplementedError(
-            "Provider %s is not supported (yet?)" % provider_name)
-    return provider_cls, project_url
-
-
-MAPPINGS = {
-    'repo_commits': {
-        'fields': (
-            'sha', 'author', 'author_email', 'author_name', 'authored_at',
-            'committer', 'committer_email', 'committed_at', 'comment_count',
-            'message', 'verified'),
-        'github.com': {
-            'sha': 'sha',
-            'author': 'author__login',
-            'author_email': 'commit__author__email',
-            'author_name': 'commit__author__name',
-            'authored_at': 'commit__author__date',
-            'committer': 'commit__committer__login',
-            'committer_email': 'commit__committer__email',
-            'committed_at': 'commit__committer__date',
-            'comment_count': 'commit__comment_count',
-            'message': 'commit__message',
-            'verified': 'commit__verification__verified',
-            'parents': 'parents__,sha'
-        },
-    },
-    'repo_issues': {
-        'fields': (
-            'number', 'user', 'role', 'title', 'body', 'assignee', 'id',
-            'state', 'created_at', 'updated_at', 'closed_at', 'reactions'),
-        'github.com': {
-            'number': 'number',
-            'user': 'user__login',
-            'role': 'author_association',
-            'title': 'title',
-            'body': 'body',
-            'assignee': 'assignee',
-            'id': 'id',
-            'state': 'state',
-            'created_at': 'created_at',
-            'updated_at': 'updated_at',
-            'closed_at': 'closed_at',
-            'reactions': 'reactions__total_count',
-            'pull_request_url': 'pull_request__url',
-            'labels': 'labels__,name',
-        },
-    },
-    'repo_pulls': {
-        'fields': (
-            'number', 'title', 'body', 'state', 'user', 'head',
-            'head_branch', 'base', 'base_branch', 'created_at',
-            'updated_at', 'closed_at', 'merged_at', 'role'),
-        'github.com': {
-            'number': 'number',
-            'title': 'title',
-            'body': 'body',
-            'state': 'state',
-            'user': 'user__login',
-            'head': 'head__repo__full_name',
-            'head_branch': 'head__ref',
-            'base': 'base__repo__full_name',
-            'base_branch': 'base__ref',
-            'created_at': 'created_at',
-            'updated_at': 'updated_at',
-            'closed_at': 'closed_at',
-            'merged_at': 'merged_at',
-            'role': 'author_association',
-            'labels': 'labels__,name',
-        },
-    },
-    'review_comments': {
-        'fields': (  # 'pr_no',
-            'id', 'user', 'created_at', 'updated_at',
-            'body', 'path', 'position', 'role'),
-        'github.com': {
-            # TODO: 'pr_no': 'pr_no',  # from call params
-            'id': 'id',
-            'body': 'body',
-            'user': 'user__login',
-            'role': 'author_association',
-            'created_at': 'created_at',
-            'updated_at': 'updated_at',
-            'path': 'path',
-            'position': 'original_position',
-        },
-    },
-    'issue_comments': {
-        'fields': (  # 'issue_no',
-            'id', 'user', 'created_at', 'updated_at',
-            'body', 'role', 'reactions'),
-        'github.com': {
-            'id': 'id',
-            'body': 'body',
-            'user': 'user__login',
-            'role': 'author_association',
-            'created_at': 'created_at',
-            'updated_at': 'updated_at',
-            'reactions': 'reactions__total_count',
-            # TODO: 'issue_no': int(comment['issue_url'].rsplit("/", 1)[-1]),
-        }
-    },
-}
-
-
-class GenericScraper(object):
-    """ Get a small but consistent subset of fields across all VCS providers
-    This interface supports the same API as all other VCS providers,
-    with one addition: you need to append repository URL
-    in front of all other params. For example,
-
-    >>> GitHubAPI().repo_commits("user/repo")
-
-    is equivalent to:
-
-    >>> GenericScraper().repo_commits("https://github.com/user", "user/repo")
-    """
-    def __getattribute__(self, attr):
-        if not hasattr(VCSAPI, attr):
-            raise AttributeError("'Scraper' has not attribute '%s'" % attr)
-        if attr not in MAPPINGS:
-            raise NotImplementedError(
-                "Generic API '%s' has not been implemented yet" % attr)
-        mappings = MAPPINGS[attr]
-
-        def wrapper(url, *args):
-            provider_name, _ = parse_url(url)
-            if provider_name not in mappings:
-                raise NotImplementedError(
-                    "Generic API '%s' has not been implemented for '%s' yet"
-                    "" % (attr, provider_name))
-            mapping = mappings[provider_name]
-            provider_cls, _ = get_provider(url)
-            provider = provider_cls()
-
-            for item in getattr(provider, attr)(*args):
-                yield json_map(mapping, item)
-
-        return wrapper
diff --git a/stscraper/gitlab.py b/stscraper/gitlab.py
deleted file mode 100644
index ebe5649..0000000
--- a/stscraper/gitlab.py
+++ /dev/null
@@ -1,202 +0,0 @@
-import warnings
-
-from .base import *
-import stutils
-
-
-def str_urlencode(string):
-    # TODO: a real encoder
-    return string.replace("/", "%2f")
-
-
-class GitLabAPIToken(APIToken):
-    api_url = "https://gitlab.com/api/v4/"
-
-    _user = None  # cache user
-    _headers = {}
-
-    def __init__(self, token=None, timeout=None):
-        super(GitLabAPIToken, self).__init__(token, timeout)
-        if token is not None:
-            self.token = token
-            self._headers["Private-Token"] = token
-
-    @property
-    def user(self):
-        if self._user is None:
-            try:
-                r = self('user')
-            except TokenNotReady:
-                pass
-            else:
-                self._user = r.json().get('username', '')
-        return self._user
-
-    def check_limits(self):
-        # regular limits will be updaated automatically upon request
-        # we only need to take care about search limit
-        try:
-            stats = self('').json()['resources']
-        except TokenNotReady:
-            stats = {}
-
-        for cls in self.api_classes:
-            self.limits[cls] = json_map({
-                'remaining': 'remaining',
-                'reset': 'reset',
-                'limit': 'limit',
-            }, stats.get(cls, {}))
-
-        return self.limits
-
-    def when(self, url):
-        key = self.api_class(url)
-        if self.limits[key]['remaining'] != 0:
-            return 0
-        return self.limits[key]['reset']
-
-    def _update_limits(self, response, url):
-        if 'RateLimit-Remaining' in response.headers:
-            remaining = int(response.headers['RateLimit-Remaining'])
-            self.limits[self.api_class(url)] = {
-                'remaining': remaining,
-                'reset': int(response.headers['RateLimit-Reset']),
-                'limit': int(response.headers['RateLimit-Limit'])
-            }
-
-            if response.status_code == 429 and remaining == 0:
-                raise TokenNotReady
-
-
-class GitLabAPI(VCSAPI):
-    """ This is a convenience class to pool GitHub API keys and update their
-    limits after every request. Actual work is done by outside classes, such
-    as _IssueIterator and _CommitIterator
-    """
-    token_class = GitLabAPIToken
-
-    status_not_found = (404, 422, 451)
-
-    def __init__(self, tokens=None, timeout=30):
-        if not tokens:
-            stconfig_tokens = stutils.get_config("GITLAB_API_TOKENS")
-            if stconfig_tokens:
-                tokens = [token.strip()
-                          for token in stconfig_tokens.split(",")
-                          if len(token.strip()) == 20]
-
-        if not tokens:
-            tokens = [None]
-            warnings.warn("No tokens provided. GitLab API will be limited to "
-                          "600 requests per minute", Warning)
-        super(GitLabAPI, self).__init__(tokens, timeout)
-
-    def has_next_page(self, response):
-        page = response.headers.get('X-Page')
-        total_pages = response.headers.get('X-Total-Pages', 0)
-        return page is not None and int(page) < int(total_pages)
-
-    @api('users', paginate=True)
-    def all_users(self):
-        # https://docs.gitlab.com/ee/api/users.html#list-users
-        return ()
-
-    @api('projects', paginate=True)
-    def all_repos(self):
-        # https://docs.gitlab.com/ee/api/projects.html#list-all-projects
-        return ()
-
-    @api('projects/%s/issues', paginate=True)
-    def repo_issues(self, repo_name):
-        # https://docs.gitlab.com/ee/api/issues.html#list-project-issues
-        return str_urlencode(repo_name)
-
-    @api('projects/%s/repository/commits', paginate=True)
-    def repo_commits(self, repo_name):
-        # https://docs.gitlab.com/ee/api/commits.html#list-repository-commits
-        return str_urlencode(repo_name)
-
-    @api('projects/%s/merge_requests', paginate=True)
-    def repo_pulls(self, repo_name):
-        # https://docs.gitlab.com/ee/api/merge_requests.html
-        return str_urlencode(repo_name)
-
-    def repo_topics(self, repo_name):
-        return next(self.request('projects/%s' % str_urlencode(repo_name))
-                    ).get('tag_list', [])
-
-    @api('projects/%s/merge_requests/%s/commits', paginate=True)
-    def pull_request_commits(self, repo, pr_iid):
-        # https://docs.gitlab.com/ee/api/merge_requests.html#get-single-mr-commits
-        return str_urlencode(repo), pr_iid
-
-    @api('projects/%s/issues/%s/notes', paginate=True)
-    def issue_comments(self, repo, issue_iid):
-        # https://docs.gitlab.com/ee/api/notes.html#list-project-issue-notes
-        return str_urlencode(repo), issue_iid
-
-    @api('projects/%s/merge_requests/%s/notes', paginate=True)
-    def review_comments(self, repo, pr_iid):
-        # https://docs.gitlab.com/ee/api/notes.html#list-all-merge-request-notes
-        return str_urlencode(repo), pr_iid
-
-    @api('users/%s')
-    def user_info(self, user):
-        # https://docs.gitlab.com/ce/api/users.html#single-user
-        try:
-            return next(self.request('users', username=user))[0]['id']
-        except (StopIteration, IndexError):
-            raise KeyError("User does not exist")
-
-    @api('users/%s/projects', paginate=True)
-    def user_repos(self, user):
-        # https://docs.gitlab.com/ee/api/projects.html#list-user-projects
-        return user
-
-    @api('users/%s/events', paginate=True)
-    def user_events(self, user):
-        # https://docs.gitlab.com/ee/api/events.html#get-user-contribution-events
-        return user
-
-    def user_orgs(self, user):
-        # not available in GitLab API v4
-        raise NotImplementedError
-
-    @api('/groups/%s/members/all', paginate=True)
-    def org_members(self, org):
-        return str_urlencode(org)
-
-    @api('/groups/%s/projects', paginate=True)
-    def org_repos(self, org):
-        # TODO: recursive groups
-        return str_urlencode(org)
-
-    @staticmethod
-    def project_exists(repo_name):
-        # type: (str) -> bool
-        """
-        Unlike GitHub, GitLab will return 302 to login page
-        for non-existing projects
-        """
-        return requests.head("https://gitlab.com/" + repo_name
-                             ).status_code < 300
-
-    @staticmethod
-    def canonical_url(project_url):
-        # type: (str) -> str
-        """
-        Case insensitive
-        Path can contain only letters, digits, '_', '-' and '.'.
-        Cannot start with '-', end in '.git' or end in '.atom'
-
-        Implementation is copied from Github API
-        """
-        url = project_url.lower()
-        for chunk in ("http://", "https://", "gitlab.com"):
-            if url.startswith(chunk):
-                url = url[len(chunk):]
-        if url.endswith("/"):
-            url = url[:-1]
-        while url.endswith(".git"):
-            url = url[:-4]
-        return "gitlab.com/" + url
diff --git a/stscraper/stats.py b/stscraper/stats.py
deleted file mode 100644
index c0fb62f..0000000
--- a/stscraper/stats.py
+++ /dev/null
@@ -1,455 +0,0 @@
-
-from __future__ import print_function
-
-import numpy as np
-import pandas as pd
-
-from stutils import decorators
-from stutils import email_utils as email
-from . import *
-
-""" First contrib date without MIN_DATE restriction:
-> fcd = utils.first_contrib_dates("pypi").dropna()
-> df = pd.DataFrame(fcd.rename("fcd"))
-> df["url"] = utils.package_urls("pypi")
-> df = df.dropna(axis=1).sort_values("fcd")
-> df.groupby(df["fcd"].str[:4]).count()
-
-> data = df.iloc[:400]
-> def second_month(row):
->     cs = scraper_utils.commit_stats(row["url"])
->     return cs[cs>0].index[1]
-> data["second_month"] = data.apply(second_month, axis=1)
-> data.groupby(data["second_month"].str[:4]).count()
-
-1970: 3, 1973: 1, 1974: 3, 1997+: 2, 2, 2, 9, 14, 29, 50, 45, 99, 118, ...
-looking at their second month of contributions, it is:
-nothing before 1997, 1997+: 2, 0, 1, 9, 12, 18, 50, 47, 77, 113,
-
-
-So, 1997 looks like a reasonable lower bound.
-Only 7 projects (1 commit each) have such commits, so they are safe to ignore
-"""
-
-MIN_DATE = "1997"
-# username to be used all unidentified users
-DEFAULT_USERNAME = "-"
-
-fs_cache = decorators.typed_fs_cache('scraper')
-
-logger = logging.getLogger("ghd.scraper")
-
-
-def gini(x):
-    """ Gini index of a given iterable
-    simplified version from https://github.com/oliviaguest/gini
-
-    >>> round(gini([1]*99 + [10**6]), 2)
-    0.99
-    >>> round(gini([1]*100), 2)
-    0.0
-    >>> round(gini(range(100)), 2)
-    0.34
-    """
-    n = len(x) * 1.0
-    return np.sort(x).dot(2 * np.arange(n) - n + 1) / (n * np.sum(x))
-
-
-def quantile(data, column, q):
-    # type: (pd.DataFrame, str, float) -> pd.DataFrame
-    """ Returns number of users responsible for a specific
-
-    :param data: an input pd.Dataframe, e.g. commit_user_stats.reset_index()
-        note that without index reset commit_user_stats is a Series
-    :param column: a column to aggregate on, e.g. username
-    :param q: quantile, e.g. 0.9
-    :return: pd.Dataframe aggregated on the specified column
-
-    >>> df = pd.DataFrame({'foo': 1, 'bar': [1,1,1,1,1,1,1,1,1,1]})
-    >>> quantile(df, 'foo', 0.5).loc[1, 'bar']
-    5
-    >>> quantile(df, 'foo', 0.9).loc[1, 'bar']
-    9
-    """
-    # assert column in df.columns  # - doesn't have to be, e.g. multilevel index
-
-    # how it works: sort descending, run cumulative sum and compare to sum
-    # number of records under q*sum is exactly what we're looking for
-    return data.groupby(column).aggregate(
-        lambda x: sum(x.sort_values(ascending=False).cumsum() / x.sum() <= q))
-
-
-def user_stats(stats, date_field, aggregated_field):
-    # type: (pd.DataFrame, str, str) -> pd.Series
-    """Helper function for internal use only
-    Aggregates specified stats dataframe by month/users
-    """
-    if stats.empty:
-        # a dirty hack to allow further aggregation
-        return pd.DataFrame(
-            columns=[date_field, 'author', aggregated_field]).set_index(
-            [date_field, "author"])[aggregated_field]
-    return stats['author'].groupby(
-        [stats[date_field].str[:7], stats['author']]).count().rename(
-        aggregated_field).astype(np.int)
-
-
-def zeropad(df, fill_value=0):
-    """Ensure monthly index on the passed df, fill in gaps with zeroes
-
-    >>> df = pd.DataFrame([1,1,1], index=["2017-01", "2016-12", "2017-09"])
-    >>> zp = zeropad(df)
-    >>> zp.index.min()
-    '2016-12'
-    >>> zp.index.max() >= "2017-12"
-    True
-    >>> 13 <= len(zp) <= 50
-    True
-    """
-    start = df.index.min()
-    if pd.isnull(start):
-        idx = []
-    else:
-        idx = [d.strftime("%Y-%m")
-               for d in pd.date_range(start, 'now', freq="M")]
-    return df.reindex(idx, fill_value=fill_value)
-
-
-@fs_cache('raw')
-def commits(repo_url):
-    # type: (str) -> pd.DataFrame
-    """
-    convert old cache files:
-    find -type f -name '*.csv' -exec rename 's/(?<=\/)commits\./_commits./' {} +
-
-    >>> cs = commits("github.com/benjaminp/six")
-    >>> isinstance(cs, pd.DataFrame)
-    True
-    >>> 450 < len(cs) < 2000  # 454 as of Jan 2018
-    True
-    >>> len(commits("github.com/user2589/nothingtoseehere"))
-    Traceback (most recent call last):
-        ...
-    RepoDoesNotExist: GH API returned status 404
-    """
-    provider, project_url = get_provider(repo_url)
-    return pd.DataFrame(
-        provider.repo_commits(project_url),
-        columns=['sha', 'author', 'author_name', 'author_email',
-                 'authored_date', 'committed_date', 'parents']
-    ).set_index('sha', drop=True)
-
-
-# @fs_cache('aggregate', 2)
-def commit_user_stats(repo_name):
-    # type: (str) -> pd.Series
-    """
-    :param repo_name: str, repo name (e.g. github.com/pandas-dev/pandas
-    :return a dataframe indexed on (month, username) with a commits column
-
-    # This repo contains one commit out of order 2005 while repo started in 2016
-    >>> cus = commit_user_stats("github.com/django/django")
-    >>> isinstance(cus, pd.Series)
-    True
-    >>> 4100 < len(cus) < 8000  # 4155 unique month/user combinations / Jan 18
-    True
-    >>> 13 < cus["2017-12"]["sir-sigurd"] < 100  # 22 as of Jan 2018
-    True
-    >>> "2005" < cus.reset_index()["authored_date"].min() < "2009"
-    True
-    >>> "2017" < cus.reset_index()["authored_date"].max() < "2022"
-    True
-    >>> len(cus.reset_index().columns)
-    3
-    >>> 1 <= len(commit_user_stats("github.com/user2589/schooligan")) < 10  # 1
-    True
-    """
-    stats = commits(repo_name)
-    # check for null and empty string is required because of file caching.
-    # commits scraped immediately will have empty string, but after save/load
-    # it will be converted to NaN by pandas
-    min_date = stats.loc[stats["parents"].isnull()
-                         | (~stats["parents"].astype(bool)),
-                         "authored_date"].min()
-    stats = stats[stats["authored_date"] >= min_date]
-    stats['author'] = stats['author'].fillna(DEFAULT_USERNAME)
-    return user_stats(stats, "authored_date", "commits")
-
-
-# @fs_cache('aggregate')
-def commit_stats(repo_name):
-    # type: (str) -> pd.Series
-    """Commits aggregated by month
-
-    >>> cs = commit_stats("github.com/django/django")
-    >>> isinstance(cs, pd.Series)
-    True
-    >>> 140 < len(cs) < 240
-    True
-    >>> 100 < cs["2017-12"] < 200
-    True
-    """
-    return zeropad(commit_user_stats(repo_name).groupby('authored_date').sum())
-
-
-# @fs_cache('aggregate')
-def commit_users(repo_name):
-    # type: (str) -> pd.Series
-    """Number of contributors by month
-
-    >>> cu = commit_users("github.com/django/django")
-    >>> isinstance(cu, pd.Series)
-    True
-    >>> 140 < len(cu) < 240
-    True
-    >>> 30 < cu["2017-12"] < 100  # 32
-    True
-    """
-    return commit_user_stats(repo_name).groupby(
-        'authored_date').count().rename("users")
-
-
-# @fs_cache('aggregate')
-def commit_gini(repo_name):
-    # type: (str) -> pd.Series
-    """
-    >>> g = commit_gini("github.com/django/django")
-    >>> isinstance(g, pd.Series)
-    True
-    >>> 150 < len(g) < 240
-    True
-    >>> all(0 <= i <= 1 for i in g)
-    True
-    """
-    return commit_user_stats(repo_name).groupby(
-        "authored_date").aggregate(gini).rename("gini")
-
-
-def contributions_quantile(repo_name, q):
-    # type: (str, float) -> pd.Series
-    """
-    >>> q50 = contributions_quantile("github.com/django/django", 0.5)
-    >>> isinstance(q50, pd.Series)
-    True
-    >>> 140 < len(q50) < 240
-    True
-    >>> all(q50 >= 0)
-    True
-    >>> 0 < q50["2017-12"] < 10  # 2
-    True
-    """
-    return quantile(commit_user_stats(repo_name).reset_index(),
-                    "authored_date", q)["commits"].rename("q%g" % (q*100))
-
-
-@fs_cache('raw')
-def issues(repo_url):
-    # type: (str) -> pd.DataFrame
-    """ Get a dataframe with issues
-
-    >>> iss = issues("github.com/benjaminp/six")
-    >>> isinstance(iss, pd.DataFrame)
-    True
-    >>> 180 < len(iss) < 500  # 191 as of Jan 2018
-    True
-    >>> len(issues("github.com/user2589/minicms"))
-    0
-    """
-    provider, project_url = get_provider(repo_url)
-    return pd.DataFrame(
-        provider.repo_issues(project_url),
-        columns=['number', 'author', 'closed', 'created_at', 'updated_at',
-                 'closed_at']).set_index('number', drop=True)
-
-
-# @fs_cache('aggregate')
-def non_dev_issues(repo_name):
-    # type: (str) -> pd.DataFrame
-    """Same as new_issues with subtracted issues authored by contributors
-
-    >>> ndi = non_dev_issues("github.com/benjaminp/six")
-    >>> isinstance(ndi, pd.DataFrame)
-    True
-    >>> 20 < len(ndi) < len(issues("github.com/benjaminp/six"))  # 23 as of 2018
-    True
-    """
-    cs = commits(repo_name)[['authored_date', 'author']]
-    fc = cs.loc[pd.notnull(cs['author'])].groupby(
-        'author').min()['authored_date']
-
-    i = issues(repo_name)[['created_at', 'author']].sort_values('created_at')
-    i['fc'] = i['author'].map(fc)
-    return i.loc[~(i['fc'] < i['created_at']), ['author', 'created_at']]
-
-
-# @fs_cache('aggregate', 2)
-def issue_user_stats(repo_name):
-    # type: (str) -> pd.Series
-    """
-    >>> ius = issue_user_stats("github.com/pandas-dev/pandas")
-    >>> isinstance(ius, pd.Series)
-    True
-    >>> 6000 < len(ius) < 10000  # 6261
-    True
-    >>> 12 < ius["2017-12"]["toobaz"] < 24  # 13
-    True
-    >>> (ius > 0).all()
-    True
-    """
-    return user_stats(issues(repo_name), "created_at", "new_issues")
"new_issues") - - -# @fs_cache('aggregate', 2) -def non_dev_issue_user_stats(repo_name): - return user_stats(non_dev_issues(repo_name), "created_at", "new_issues") - - -# @fs_cache('aggregate') -def new_issues(repo_name): - # type: (str) -> pd.Series - """ New issues aggregated by month - - >>> iss = new_issues("github.com/pandas-dev/pandas") - >>> isinstance(iss, pd.Series) - True - >>> 78 < len(iss) < 100 # 88 - True - >>> 200 < iss["2017-12"] < 300 # 211 - True - """ - return issue_user_stats(repo_name).groupby('created_at').sum() - - -# @fs_cache('aggregate') -def non_dev_issue_stats(repo_name): - # type: (str) -> pd.Series - """Same as new_issues, not counting issues submitted by developers - >>> ndi = non_dev_issue_stats("github.com/pandas-dev/pandas") - >>> isinstance(ndi, pd.Series) - True - >>> 78 < len(ndi) < 180 - True - >>> (new_issues("github.com/pandas-dev/pandas") >= ndi).all() - True - """ - i = non_dev_issues(repo_name) - return i.groupby(i['created_at'].str[:7]).count()['created_at'].rename( - "non_dev_issues") - - -# @fs_cache('aggregate') -def submitters(repo_name): - # type: (str) -> pd.Series - """Number of submitters aggregated by month - - >>> ss = submitters("github.com/pandas-dev/pandas") - >>> isinstance(ss, pd.Series) - True - >>> 78 < len(ss) < 180 - True - >>> all(ss >= 0) - True - >>> (new_issues("github.com/pandas-dev/pandas") >= ss).all() - True - """ - return issue_user_stats(repo_name).groupby( - 'created_at').count().rename("submitters") - - -# @fs_cache('aggregate') -def non_dev_submitters(repo_name): - # type: (str) -> pd.Series - """New issues aggregated by month - >>> nds = non_dev_submitters("github.com/pandas-dev/pandas") - >>> isinstance(nds, pd.Series) - True - >>> 80 < len(nds) < 180 - True - >>> (nds >= 0).all() - True - >>> (non_dev_issue_stats("github.com/pandas-dev/pandas") >= nds).all() - True - """ - return non_dev_issue_user_stats(repo_name).groupby( - 'created_at').count().rename("non_dev_submitters") - - -@fs_cache('aggregate') -def closed_issues(repo_name): - # type: (str) -> pd.Series - """New issues aggregated by month - - >>> ci = closed_issues("github.com/pandas-dev/pandas") - >>> isinstance(ci, pd.Series) - True - >>> 80 < len(ci) < 150 - True - >>> 170 < ci["2017-12"] < 1000 # 179 - True - >>> (ci >= 0).all() - True - """ - df = issues(repo_name) - closed = df.loc[df['closed'], 'closed_at'].astype(object) - return closed.groupby(closed.str[:7]).count() - - -@fs_cache('aggregate') -def open_issues(repo_name): - # type: (str) -> pd.Series - """Open issues aggregated by month - - >>> oi = open_issues("github.com/pandas-dev/pandas") - >>> isinstance(oi, pd.Series) - True - >>> 80 < len(oi) < 150 - True - >>> (oi.dropna() >= 0).all() - True - """ - submitted = new_issues(repo_name).cumsum() - closed = closed_issues(repo_name).cumsum() - res = submitted - closed - return res.rename("open_issues") - - -# @fs_cache('aggregate') -def commercial_involvement(url): - # type: (str) -> pd.Series - """ - >>> ci = commercial_involvement("github.com/pandas-dev/pandas") - >>> isinstance(ci, pd.Series) - True - >>> 100 < len(ci) < 150 - True - >>> (0 <= ci).all() - True - >>> (1 >= ci).all() - True - """ - cs = commits(url)[['authored_date', 'author_email']] - cs["commercial"] = email.is_commercial_bulk(cs["author_email"]) - stats = cs.groupby(cs['authored_date'].str[:7]).agg( - {'authored_date': 'count', 'commercial': 'sum'} - ).rename(columns={'authored_date': "commits"}) - return (stats["commercial"] / stats["commits"]).rename("commercial") - - 
-# @fs_cache('aggregate') -def university_involvement(url): - # type: (str) -> pd.Series - """ - >>> ui = university_involvement("github.com/pandas-dev/pandas") - >>> isinstance(ui, pd.Series) - True - >>> 100 < len(ui) < 150 - True - >>> (0 <= ui).all() - True - >>> (1 >= ui).all() - True - """ - cs = commits(url)[['authored_date', 'author_email']] - cs["university"] = email.is_university_bulk(cs["author_email"]) - stats = cs.groupby(cs['authored_date'].str[:7]).agg( - {'authored_date': 'count', 'university': 'sum'} - ).rename(columns={'authored_date': "commits"}) - return (stats["university"] / stats["commits"]).rename("university") diff --git a/test.py b/test.py index acd93bf..6656b1a 100755 --- a/test.py +++ b/test.py @@ -268,197 +268,5 @@ def test_pagination(self): self.assertGreater(len(commits), 463) -# class TestGitLab(unittest.TestCase): -# -# def setUp(self): -# self.api = stscraper.GitLabAPI() -# self.repo_address = 'gitlab-org/gitlab-ce' -# -# def _test_user(self, user, simple=True): -# self.assertIsInstance(user, dict) -# for prop in ('id', 'username', 'name', 'state', ): -# self.assertIn(prop, user, -# "User object is expected to have '%s' property," -# " but it doesn't" % prop) -# if simple: -# return -# for prop in ('avatar_url', 'created_at', 'bio', 'location', 'skype', -# 'linkedin', 'twitter', 'website_url', 'organization'): -# self.assertIn(prop, user, -# "User object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_commits(self, commit): -# self.assertIsInstance(commit, dict) -# for prop in ('id', 'short_id', 'title', 'author_name', 'author_email', -# 'authored_date', 'committer_name', 'committer_email', -# 'committed_date', 'created_at', 'message', 'parent_ids'): -# self.assertIn(prop, commit, -# "Commit object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_issue(self, issue): -# self.assertIsInstance(issue, dict) -# for prop in ('id', 'iid', 'project_id', 'title', 'description', 'state', -# 'created_at', 'updated_at', # 'closed_by', 'closed_at', -# 'author', 'labels', 'upvotes', # 'assignees', 'assignee', -# 'downvotes', 'discussion_locked'): -# self.assertIn(prop, issue, -# "Issue object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_issue_comments(self, comment): -# self.assertIsInstance(comment, dict) -# for prop in ('id', 'body', 'attachment', 'author', 'created_at', -# 'updated_at', 'system', 'noteable_id', 'noteable_type', -# 'noteable_iid'): -# self.assertIn(prop, comment, -# "Issue comment is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_repo(self, repo): -# self.assertIsInstance(repo, dict) -# for prop in ('id', 'description', 'default_branch', 'tag_list', 'name', -# 'path', 'path_with_namespace', 'forks_count', 'star_count', -# 'created_at', 'last_activity_at', 'issues_enabled', -# 'merge_method', 'creator_id', 'import_status', 'archived', -# 'wiki_enabled', 'snippets_enabled', 'open_issues_count', -# 'merge_requests_enabled', -# 'namespace', 'container_registry_enabled', 'public_jobs'): -# self.assertIn(prop, repo, -# "Repository object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def test_all_users(self): -# users = self.api.all_users() -# self.assertIsInstance(users, Generator) -# user = next(users) -# self._test_user(user) -# -# def test_all_repos(self): -# repos = self.api.all_repos() -# self.assertIsInstance(repos, Generator) -# repo = next(repos) -# self._test_repo(repo) -# -# def 
test_repo_issues(self): -# issues = self.api.repo_issues(self.repo_address) -# self.assertIsInstance(issues, Generator) -# issue = next(issues) -# self._test_issue(issue) -# -# def test_repo_commits(self): -# commits = self.api.repo_commits(self.repo_address) -# self.assertIsInstance(commits, Generator) -# commit = next(commits) -# self._test_commits(commit) -# -# def test_repo_pulls(self): -# pulls = self.api.repo_pulls(self.repo_address) -# self.assertIsInstance(pulls, Generator) -# pr = next(pulls) -# self._test_issue(pr) -# for prop in ('target_branch', 'source_branch', 'source_project_id', -# 'target_project_id', 'work_in_progress', 'merge_status', -# 'merge_commit_sha', 'sha', 'user_notes_count', 'squash', -# 'time_stats', 'approvals_before_merge'): -# self.assertIn(prop, pr, -# "Merge request is expected to have '%s' property, " -# "but it doesn't" % prop) -# -# def test_repo_topics(self): -# topics = self.api.repo_topics(self.repo_address) -# self.assertIsInstance(topics, list) -# -# def test_pull_request_commits(self): -# # https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/21628 -# commits = self.api.pull_request_commits(self.repo_address, 21628) -# self.assertIsInstance(commits, Generator) -# commit = next(commits) -# self._test_commits(commit) -# -# def test_issue_comments(self): -# # https://gitlab.com/gitlab-org/gitlab-ce/issues/2978 -# comments = self.api.issue_comments(self.repo_address, 2978) -# self.assertIsInstance(comments, Generator) -# comment = next(comments) -# self._test_issue_comments(comment) -# -# def test_review_comments(self): -# # https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/21038 -# comments = self.api.review_comments(self.repo_address, 21038) -# self.assertIsInstance(comments, Generator) -# comment = next(comments) -# self._test_issue_comments(comment) -# -# def test_user_info(self): -# user = self.api.user_info('user2589') -# self._test_user(user, simple=False) -# -# def test_user_repos(self): -# """Get list of user repositories""" -# repos = self.api.user_repos('user2589') -# self.assertIsInstance(repos, Generator) -# repo = next(repos) -# self._test_repo(repo) -# -# def test_user_orgs(self): -# # not available in GitLab API v4 -# with self.assertRaises(NotImplementedError): -# self.api.user_orgs('user2589') -# -# def test_org_members(self): -# members = self.api.org_members('Inkscape') -# self.assertIsInstance(members, Generator) -# user = next(members) -# self._test_user(user) -# -# def test_org_repos(self): -# repos = self.api.org_repos('gitlab-org') -# self.assertIsInstance(repos, Generator) -# repo = next(repos) -# self._test_repo(repo) -# -# def test_pagination(self): -# # 193 commits as of Aug 2018 -# commits = list(self.api.repo_commits('user2589/ghd')) -# self.assertGreater(len(commits), 190) -# -# def test_project_exists(self): -# self.assertTrue(self.api.project_exists(self.repo_address)) -# self.assertFalse(self.api.project_exists('user2589/nonexistent')) - - -class TestBitBucket(unittest.TestCase): - - def setUp(self): - self.api = stscraper.BitbucketAPI() - self.repo_address = 'zzzeek/sqlalchemy' - - -class TestGeneric(unittest.TestCase): - - def setUp(self): - self.scraper = stscraper.GenericScraper() - self.full_url = 'https://github.com/cmustrudel/strudel.scraper' - self.repo_slug = 'cmustrudel/strudel.scraper' - - def test_commits(self): - fields = stscraper.MAPPINGS['repo_commits']['fields'] - count = 0 - for commit in self.scraper.repo_commits(self.full_url, self.repo_slug): - self.assertTrue(all(field in commit for field 
in fields), - "Some commits are missing expected fields") - count += 1 - self.assertTrue( - count, "Zero commits returned by GenericScraper.repo_commits") - - -class TestStats(unittest.TestCase): - - def test_(self): - pass - - if __name__ == "__main__": unittest.main() From 455b2b685e16ab28aa46566521b1572f972704fb Mon Sep 17 00:00:00 2001 From: user2589 Date: Fri, 24 Jul 2020 23:55:56 -0400 Subject: [PATCH 07/10] refactor!: remove legacy interfaces --- stscraper/base.py | 14 ++++---------- stscraper/github.py | 45 ++++----------------------------------------- 2 files changed, 8 insertions(+), 51 deletions(-) diff --git a/stscraper/base.py b/stscraper/base.py index 4045947..2b1251a 100644 --- a/stscraper/base.py +++ b/stscraper/base.py @@ -228,7 +228,7 @@ def __call__(self, url, method='get', data=None, **params): return r def __str__(self): - return self.token + return self.token or "" class DummyAPIToken(APIToken): @@ -279,10 +279,10 @@ def __init__(self, tokens=None, timeout=30): tokens = tokens.split(",") new_tokens_instances = [self.token_class(t, timeout=timeout) for t in set(tokens) - old_tokens] - self.tokens += tuple(t for t in new_tokens_instances if t.is_valid) + self.tokens = tuple(t for t in new_tokens_instances if t.is_valid) self.logger = logging.getLogger('scraper.' + self.__class__.__name__) - def has_next_page(self, response): + def _has_next_page(self, response): """ Check if there is a next page to a paginated response """ raise NotImplementedError @@ -352,7 +352,7 @@ def request(self, url, method='get', data=None, paginate=False, **params): if paginate: for item in res: yield item - if not res or not self.has_next_page(r): + if not res or not self._has_next_page(r): return else: params["page"] += 1 @@ -483,9 +483,3 @@ def project_exists(repo_slug): # type: (str) -> bool """ """ raise NotImplementedError - - @staticmethod - def canonical_url(repo_slug): - # type: (str) -> str - """ """ - raise NotImplementedError diff --git a/stscraper/github.py b/stscraper/github.py index 8fd0782..7560952 100644 --- a/stscraper/github.py +++ b/stscraper/github.py @@ -129,7 +129,7 @@ def __init__(self, tokens=None, timeout=30): super(GitHubAPI, self).__init__(tokens, timeout) - def has_next_page(self, response): + def _has_next_page(self, response): for rel in response.headers.get("Link", "").split(","): if rel.rsplit(";", 1)[-1].strip() == 'rel="next"': return True @@ -280,33 +280,6 @@ def project_exists(repo_slug): except requests.RequestException: time.sleep(2**i) - @staticmethod - def canonical_url(repo_slug): - # type: (str) -> str - """ Normalize URL - - remove trailing .git (IMPORTANT) - - lowercase (API is case insensitive, so lowercase to deduplicate) - - prepend "github.com" - - :param: repo_slug: str, user_name/repo_name - :return: github.com/user_name/repo_name with both names normalized - - >>> GitHubAPI.canonical_url("pandas-DEV/pandas") - 'github.com/pandas-dev/pandas' - >>> GitHubAPI.canonical_url("http://github.com/django/django.git") - 'github.com/django/django' - >>> GitHubAPI.canonical_url("https://github.com/A/B/") - 'github.com/a/b' - """ - url = repo_slug.split("//")[-1].lower() - for prefix in ("github.com",): - if url.startswith(prefix): - url = url[len(prefix):] - for suffix in ("/", ".git"): - if url.endswith(suffix): - url = url[:-len(suffix)] - return "github.com/" + url - class GitHubAPIv4(GitHubAPI): """ An example class using GraphQL API """ @@ -442,11 +415,6 @@ def get_limits(tokens=None): def print_limits(argv=None): """Check remaining limits of 
registered GitHub API keys""" - # import argparse - # parser = argparse.ArgumentParser( - # description="Check remaining limits of registered GitHub API keys") - # # two lines above are just to print help, so ignoring the output - # _ = parser.parse_args() columns = ("user", "core_limit", "core_remaining", "core_renews_in", "search_limit", "search_remaining", "search_renews_in", @@ -458,11 +426,6 @@ def print_limits(argv=None): for values in stats) for column in columns} - def gen(): - yield "" # prepend empty line - yield " ".join(c.ljust(lens[c] + 1, " ") for c in columns) - for values in stats: - yield " ".join( - str(values[c]).ljust(lens[c] + 1, " ") for c in columns) - - return "\n".join(gen()) + print('\n', ' '.join(c.ljust(lens[c] + 1, " ") for c in columns)) + for values in stats: + print(*(str(values[c]).ljust(lens[c] + 1, " ") for c in columns)) From 292cf80d0ee996be5bc53c4fe0d80a0dcaaf2f49 Mon Sep 17 00:00:00 2001 From: user2589 Date: Fri, 24 Jul 2020 23:56:36 -0400 Subject: [PATCH 08/10] chore: change check_gh_limits to use entry_points instead of scripts --- scripts/check_gh_limits.py | 26 -------------------------- setup.py | 4 +++- 2 files changed, 3 insertions(+), 27 deletions(-) delete mode 100755 scripts/check_gh_limits.py diff --git a/scripts/check_gh_limits.py b/scripts/check_gh_limits.py deleted file mode 100755 index 0cdbc7a..0000000 --- a/scripts/check_gh_limits.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function - -import argparse - -import stscraper as scraper - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Check remaining limits of registered GitHub API keys") - args = parser.parse_args() - - columns = ("user", "core_limit", "core_remaining", "core_renews_in", - "search_limit", "search_remaining", "search_renews_in", "key") - - stats = list(scraper.github.get_limits()) - - lens = {column: max(max(len(str(values[column])), len(column)) - for values in stats) - for column in columns} - - print(" ".join(c.ljust(lens[c] + 1, " ")for c in columns)) - for values in stats: - print(" ".join(str(values[c]).ljust(lens[c] + 1, " ") for c in columns)) diff --git a/setup.py b/setup.py index 624d5a2..0cd2c70 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,9 @@ ], platforms=["Linux", "Solaris", "Mac OS-X", "Unix", "Windows"], python_requires='>2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4', - scripts=[os.path.join('scripts', 'check_gh_limits.py')], + entry_points={ + 'console_scripts': ["check_gh_limits = stscraper.github:print_limits"] + }, packages=[package], url='https://github.com/cmustrudel/strudel.scraper', install_requires=requirements, From bc24d8dc050b36d15be3735ef66b0f791cba159a Mon Sep 17 00:00:00 2001 From: user2589 Date: Sat, 25 Jul 2020 01:11:43 -0400 Subject: [PATCH 09/10] docs: fix the documentation --- docs/index.rst | 65 +++++++++++++++- stscraper/base.py | 4 +- stscraper/github.py | 181 ++++++++++++++++++++++++++++++-------------- 3 files changed, 188 insertions(+), 62 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index e2f9590..8cf1bb0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,66 @@ Reference .. toctree:: :maxdepth: 2 +.. py:module:: stscraper + +`stscraper` is a Python interface for GitHub API + +Key features: + +- utilize multiple API keys to speed up scraping +- transparently handle pagination and minor network errors + +Installation +------------ + +.. 
code-block:: bash
+
+    pip install --user --upgrade strudel.scraper
+
+
+Usage
+-----
+
+The main way to use this module is through :py:class:`GitHubAPI` objects.
+
+.. code-block:: python
+
+    import stscraper as scraper
+    import pandas as pd
+
+    gh_api = scraper.GitHubAPI("token1,token2,...")
+
+    # repo_issues is a generator that can be used
+    # to instantiate a pandas dataframe
+    issues = pd.DataFrame(gh_api.repo_issues('cmustrudel/strudel.scraper'))
+
+Tokens can be provided either at class instantiation or through an environment
+variable:
+
+.. code-block:: bash
+
+    # somewhere in ~/.bashrc
+    export GITHUB_API_TOKENS='comma-separated list of tokens'
+
+.. code-block:: python
+
+    # later, in some Python file:
+    gh_api = scraper.GitHubAPI()  # tokens from the environment var will be used
+
+If no keys were passed at class instantiation and the `GITHUB_API_TOKENS`
+environment variable is not defined, `stscraper` will also check the
+`GITHUB_TOKEN` environment variable. That variable is created by the GitHub
+Actions runner and is also used by the `hub <https://hub.github.com/>`_ utility.
+
+REST (v3) API
+-------------
+.. autoclass:: GitHubAPI
+    :members:
+    :exclude-members:
+
+GraphQL (v4) API
+----------------
+
+.. autoclass:: GitHubAPIv4
+    :members:
-
-:doc:`github`
-:doc:`gitlab`
-:doc:`BitBucket`
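
The docs added above advertise transparent pagination; concretely, every scraper method returns a generator, so API pages are fetched lazily as items are consumed. A minimal sketch of what that means in practice (not part of the patch; it assumes valid tokens are exported in `GITHUB_API_TOKENS`):

```python
import itertools

import stscraper as scraper

gh_api = scraper.GitHubAPI()  # reads tokens from GITHUB_API_TOKENS

# Pagination requests 100 items per page under the hood; islice stops
# consuming the generator after ten commits, so only the first page
# is actually fetched from the API.
first_ten = list(itertools.islice(
    gh_api.repo_commits('cmustrudel/strudel.scraper'), 10))
```
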
diff --git a/stscraper/base.py b/stscraper/base.py
index 2b1251a..c89e28b 100644
--- a/stscraper/base.py
+++ b/stscraper/base.py
@@ -279,7 +279,7 @@ def __init__(self, tokens=None, timeout=30):
             tokens = tokens.split(",")
         new_tokens_instances = [self.token_class(t, timeout=timeout)
                                 for t in set(tokens) - old_tokens]
-        self.tokens = tuple(t for t in new_tokens_instances if t.is_valid)
+        self.tokens += tuple(t for t in new_tokens_instances if t.is_valid)
         self.logger = logging.getLogger('scraper.' + self.__class__.__name__)

     def _has_next_page(self, response):
@@ -317,7 +317,7 @@ def iterate_tokens(self, url=""):
             for token in random.sample(self.tokens, len(self.tokens)):
                 if not token.ready(url):
                     continue
-            yield token
+                yield token

             next_res = min(token.when(url) for token in self.tokens)
             sleep = next_res and int(next_res - time.time()) + 1
diff --git a/stscraper/github.py b/stscraper/github.py
index 7560952..233297b 100644
--- a/stscraper/github.py
+++ b/stscraper/github.py
@@ -5,7 +5,6 @@
 import datetime
 import json
 import os
-from typing import Iterable
 import warnings

 from .base import *
@@ -13,7 +12,7 @@

 class GitHubAPIToken(APIToken):
-    api_url = "https://api.github.com/"
+    api_url = 'https://api.github.com/'
     api_classes = ('core', 'search')

     _user = None  # cache user
@@ -27,12 +26,12 @@ def __init__(self, token=None, timeout=None):
         # squirrel-girl-preview: issue reactions
         # starfox-preview: issue events
         self._headers = {
-            "Accept": "application/vnd.github.mercy-preview+json,"
-                      "application/vnd.github.squirrel-girl-preview,"
-                      "application/vnd.github.starfox-preview+json"}
+            'Accept': 'application/vnd.github.mercy-preview+json,'
+                      'application/vnd.github.squirrel-girl-preview,'
+                      'application/vnd.github.starfox-preview+json'}

         if token is not None:
             self.token = token
-            self._headers["Authorization"] = "token " + token
+            self._headers['Authorization'] = 'token ' + token

     @property
     def user(self):
@@ -96,7 +95,7 @@ def _update_limits(self, response, url):

 class GitHubAPI(VCSAPI):
-    """ This is a convenience class to pool GitHub API keys and update their
+    """ This is a convenience class to pool GitHub v3 API keys and update their
     limits after every request.
     Actual work is done by outside classes, such as
     _IssueIterator and _CommitIterator
     """
@@ -108,15 +107,16 @@ def __init__(self, tokens=None, timeout=30):
         # Where to look for tokens:
         # strudel config variables
         if not tokens:
-            stconfig_tokens = stutils.get_config("GITHUB_API_TOKENS")
+            stconfig_tokens = stutils.get_config('GITHUB_API_TOKENS')
            if stconfig_tokens:
                 tokens = [token.strip()
                           for token in stconfig_tokens.split(",")
                           if len(token.strip()) == 40]

         # hub configuration: https://hub.github.com/hub.1.html
+        # also used by GitHub Actions
         if not tokens:
-            token = stutils.get_config("GITHUB_TOKEN")
+            token = stutils.get_config('GITHUB_TOKEN')
             if not token and os.path.isfile("~/.config/hub"):
                 token = open("~/.config/hub", 'r').read(64)
             if token and len(token.strip()) == 40:
@@ -140,30 +140,31 @@ def _has_next_page(self, response):
     # ===================================
     @api('users', paginate=True)
     def all_users(self):
+        """Get all GitHub users"""
         # https://developer.github.com/v3/users/#get-all-users
         return ()

     @api('repositories', paginate=True)
     def all_repos(self):
+        """Get all GitHub repositories"""
         # https://developer.github.com/v3/repos/#list-all-public-repositories
         return ()

     @api('repos/%s')
     def repo_info(self, repo_slug):
-        # type: (Union[str, unicode]) -> Iterator[dict]
+        """Get repository info"""
         # https://developer.github.com/v3/repos/#get
         return repo_slug

     @api_filter(lambda issue: 'pull_request' not in issue)
     @api('repos/%s/issues', paginate=True, state='all')
     def repo_issues(self, repo_slug):
-        # type: (Union[str, unicode]) -> Iterator[dict]
+        """Get repository issues (not including pull requests)"""
         # https://developer.github.com/v3/issues/#list-issues-for-a-repository
         return repo_slug

     @api('repos/%s/issues/comments', paginate=True)
     def repo_issue_comments(self, repo_slug):
-        # type: (Union[str, unicode]) -> Iterator[dict]
         """ Get all comments in all issues and pull requests,
         both open and closed.
         """
@@ -172,7 +173,6 @@ def repo_issue_comments(self, repo_slug):

     @api('repos/%s/issues/events', paginate=True)
     def repo_issue_events(self, repo_slug):
-        # type: (Union[str, unicode]) -> Iterator[dict]
         """ Get all events in all issues and pull requests,
         both open and closed.
         """
@@ -181,34 +181,56 @@ def repo_issue_events(self, repo_slug):

     @api('repos/%s/commits', paginate=True)
     def repo_commits(self, repo_slug):
-        # type: (Union[str, unicode]) -> Iterator[dict]
+        """Get all repository commits.
+        Note that the GitHub API might ignore some merge commits."""
         # https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository
         return repo_slug

     @api('repos/%s/pulls', paginate=True, state='all')
     def repo_pulls(self, repo_slug):
-        # type: (Union[str, unicode]) -> Iterator[dict]
+        """Get all repository pull requests.
+        Unlike the issues API, this method will return information specific to
+        pull requests, like head SHAs and branch names."""
         # https://developer.github.com/v3/pulls/#list-pull-requests
         return repo_slug

     def repo_topics(self, repo_slug):
+        """Get a tuple of repository topics.
+        Topics are "keywords" assigned by the repository owner.
+
+        >>> GitHubAPI().repo_topics('pandas-dev/pandas')
+        ('data-analysis', 'pandas', 'flexible', 'alignment', 'python')
+        """
         return tuple(
             next(self.request('repos/%s/topics' % repo_slug)).get('names'))

     def repo_labels(self, repo_slug):
+        """Get a tuple of repository labels.
+ Labels are issue tags used by maintainers + + >>> GitHubAPI().repo_labels('pandas-dev/pandas')[:5] + ('2/3 Compat', '32bit', 'API - Consistency', 'API Design', 'Admin') + """ return tuple(label['name'] for label in self.request('repos/%s/labels' % repo_slug, paginate=True)) def repo_contributors(self, repo_slug): - """ - https://developer.github.com/v3/repos/statistics/#get-all-contributor-commit-activity + """Get a timeline of up to 100 top project contributors Suggested use: - - + >>> import pandas as pd + >>> df = pd.DataFrame( + ... GitHubAPI().repo_contributors(repo_slug)).set_index('user') + >>> df.columns = pd.to_datetime(df.columns, unit='s') + >>> df + 2018-08-19 2018-08-26 ... 2020-07-12 2020-07-19 + user ... + user2589 3 0 ... 0 0 + ... """ - url = 'repos/pandas-dev/pandas/stats/contributors' % repo_slug + # https://developer.github.com/v3/repos/statistics/#get-all-contributor-commit-activity + url = 'repos/%s/stats/contributors' % repo_slug for contributor_stats in next(self.request(url)): record = {w['w']: w['c'] for w in contributor_stats['weeks']} record['user'] = json_path(contributor_stats, 'author', 'login') @@ -216,32 +238,33 @@ def repo_contributors(self, repo_slug): @api('repos/%s/pulls/%d/commits', paginate=True, state='all') def pull_request_commits(self, repo, pr_id): + """Get commits in a pull request. + `pr_id` is the visible pull request number, not internal GitHub id. + """ # https://developer.github.com/v3/issues/comments/#list-comments-on-an-issue return repo, pr_id @api('repos/%s/issues/%s/comments', paginate=True, state='all') def issue_comments(self, repo, issue_id): - """ Return comments on an issue or a pull request - Note that for pull requests this method will return only general - comments to the pull request, but not review comments related to - some code. Use review_comments() to get those instead - - :param repo: str 'owner/repo' - :param issue_id: int, either an issue or a Pull Request id + """ Get comments on an issue or a pull request. + Note that for pull requests this method will return only general + comments to the pull request, but not review comments related to some + code. Use review_comments() to get those instead. """ # https://developer.github.com/v3/issues/comments/#list-comments-on-an-issue return repo, issue_id @api('repos/%s/pulls/%s/comments', paginate=True, state='all') def review_comments(self, repo, pr_id): - """ Pull request comments attached to some code - See also issue_comments() + """ Get pull request comments related to some code. + This will not return general comments, see `issue_comments()` """ # https://developer.github.com/v3/pulls/comments/ return repo, pr_id @api('users/%s') def user_info(self, username): + """Get user info - name, location, blog etc.""" # Docs: https://developer.github.com/v3/users/#response return username @@ -253,20 +276,29 @@ def user_repos(self, username): @api('users/%s/orgs', paginate=True) def user_orgs(self, username): + """Get user organization membership. + Usually includes only public memberships, but for yourself you get + non-public as well.""" # https://developer.github.com/v3/orgs/#list-user-organizations return username @api('orgs/%s/members', paginate=True) def org_members(self, org): + """Get public organization members. + Note that if you are a member of the organization you'll get everybody. 
+        """
         # https://developer.github.com/v3/orgs/members/#members-list
         return org

     @api('orgs/%s/repos', paginate=True)
     def org_repos(self, org):
+        """Get organization repositories"""
         return org

     @api('repos/%s/issues/%d/events', paginate=True)
     def issue_events(self, repo, issue_no):
+        """Get issue events.
+        This includes state changes, references, labels etc. """
         return repo, issue_no

     # ===================================
@@ -274,6 +306,9 @@
     # ===================================
     @staticmethod
     def project_exists(repo_slug):
+        """Check if the project exists.
+        This is a slightly cheaper alternative to getting repository info.
+        """
         for i in range(5):
             try:
                 return bool(requests.head("https://github.com/" + repo_slug))
@@ -282,7 +317,12 @@

 class GitHubAPIv4(GitHubAPI):
-    """ An example class using GraphQL API """
+    """ An interface to the GitHub v4 GraphQL API.
+
+    Due to the nature of the GraphQL API, this class does not provide a
+    specific set of methods. Instead, you're expected to write your own
+    queries, and this class will help you with pagination and network
+    timeouts.
+    """

     def v4(self, query, object_path=(), **params):
         """ Make an API v4 request, taking care of pagination
@@ -292,41 +332,72 @@ def v4(self, query, object_path=(), **params):
             object_path (Tuple[str]): json path to objects to iterate,
                 excluding leading "data" part, and the trailing "nodes"
                 when applicable. If omitted, will return full "data" content
-                Example: "repository__issues"
+                Example: ("repository", "issues")
+            **params: dictionary of query variables.

-        Generates:
+        Yields:
             object: parsed object, query-specific
+
+        This method always returns an iterator, so normally you just throw it
+        straight into a loop:
+
+        >>> followers = GitHubAPIv4().v4('''
+        ...     query ($user: String!, $cursor: String) {
+        ...         user(login: $user) {
+        ...             followers(first:100, after:$cursor) {
+        ...                 nodes { login }
+        ...                 pageInfo{endCursor, hasNextPage}
+        ...     }}}''', ("user", "followers"), user=user)
+        >>> for follower in followers:
+        ...     pass
+
+        The method will look for the `pageInfo` object in the object path
+        and handle pagination transparently.
+
+        However, the method will also return an iterator if the query is
+        expected to return a single result. In this case, you need to
+        explicitly get the first record, e.g. by calling `next()` on the
+        result:
+
+        >>> user_info = next(self.v4('''
+        ...     query ($user: String!) {
+        ...         user(login:$user) {
+        ...             login, name, avatarUrl, websiteUrl
+        ...             company, bio, location, name, twitterUsername, isHireable
+        ...             createdAt, updatedAt
+        ...             followers{totalCount}
+        ...             following {totalCount}
+        ... 
}}''', ('user',), user=user)) + """ while True: - payload = json.dumps({"query": query, "variables": params}) + payload = json.dumps({'query': query, 'variables': params}) - r = self._request("graphql", 'post', data=payload) + r = self._request('graphql', 'post', data=payload) if r.status_code in self.status_empty: return res = self.extract_result(r) - if "data" not in res: - raise VCSError("API didn't return any data:\n" + + if 'data' not in res: + raise VCSError('API didn\'t return any data:\n' + json.dumps(res, indent=4)) - objects = json_path(res["data"], *object_path) + objects = json_path(res['data'], *object_path) if objects is None: - raise VCSError("Invalid object path '%s' in:\n %s" % + raise VCSError('Invalid object path "%s" in:\n %s' % (object_path, json.dumps(res))) - if "nodes" not in objects: + if 'nodes' not in objects: yield objects return - for obj in objects["nodes"]: + for obj in objects['nodes']: yield obj # the result is single page, or there are no more pages - if not json_path(objects, "pageInfo", "hasNextPage"): + if not json_path(objects, 'pageInfo', 'hasNextPage'): return - params["cursor"] = json_path(objects, "pageInfo", "endCursor") + params['cursor'] = json_path(objects, 'pageInfo', 'endCursor') def repo_issues(self, repo_slug, cursor=None): - # type: (str, str) -> Iterator[dict] - owner, repo = repo_slug.split("/") + owner, repo = repo_slug.split('/') return self.v4(""" query ($owner: String!, $repo: String!, $cursor: String) { repository(name: $repo, owner: $owner) { @@ -337,20 +408,18 @@ def repo_issues(self, repo_slug, cursor=None): updatedAt, number, title} pageInfo {endCursor, hasNextPage} }} - }""", ("repository", "issues"), owner=owner, repo=repo) + }""", ('repository', 'issues'), owner=owner, repo=repo) def user_followers(self, user): - # type: (str) -> Iterator[dict] return self.v4(""" query ($user: String!, $cursor: String) { user(login: $user) { followers(first:100, after:$cursor) { nodes { login } pageInfo{endCursor, hasNextPage} - }}}""", ("user", "followers"), user=user) + }}}""", ('user', 'followers'), user=user) def user_info(self, user): - # type: (str) -> Iterator[dict] return next(self.v4(""" query ($user: String!) 
{
             user(login:$user) {
@@ -360,10 +429,9 @@
             createdAt, updatedAt
             followers{totalCount}
             following {totalCount}
-            }}""", ("user",), user=user))
+            }}""", ('user',), user=user))

     def repo_commits(self, repo_slug):
-        # type: (str) -> Iterator[dict]
         owner, repo = repo_slug.split("/")
         return self.v4("""
         query ($owner: String!, $repo: String!, $cursor: String) {
@@ -379,7 +447,7 @@
                     nodes {sha:oid}}
                 }
                 pageInfo {endCursor, hasNextPage}
-        }}}}}}""", ("repository", "defaultBranchRef", "target", "history"),
+        }}}}}}""", ('repository', 'defaultBranchRef', 'target', 'history'),
                    owner=owner, repo=repo)


@@ -394,18 +462,17 @@ def get_limits(tokens=None):
     for i, token in enumerate(api.tokens):
         # if limit is exhausted there is no way to get username
-        user = token.user or "<unknown %d>" % i
+        user = token.user or '<unknown %d>' % i
         values = {'user': user, 'key': token.token}

         token.check_limits()
         for api_class in token.limits:
-            # geez, this code smells
             next_update = token.limits[api_class]['reset']
             if next_update is None:
                 renew = 'never'
             else:
                 tdiff = datetime.fromtimestamp(next_update) - now
-                renew = "%dm%ds" % divmod(tdiff.seconds, 60)
+                renew = '%dm%ds' % divmod(tdiff.seconds, 60)
             values[api_class + '_renews_in'] = renew
             values[api_class + '_limit'] = token.limits[api_class]['limit']
             values[api_class + '_remaining'] = token.limits[api_class]['remaining']
@@ -416,9 +483,9 @@ def print_limits(argv=None):
     """Check remaining limits of registered GitHub API keys"""
-    columns = ("user", "core_limit", "core_remaining", "core_renews_in",
-               "search_limit", "search_remaining", "search_renews_in",
-               "key")
+    columns = ('user', 'core_limit', 'core_remaining', 'core_renews_in',
+               'search_limit', 'search_remaining', 'search_renews_in',
+               'key')

     stats = list(get_limits())

From c3299f738f352fcda4567c59ae0c7ad1759172a2 Mon Sep 17 00:00:00 2001
From: user2589
Date: Sat, 25 Jul 2020 01:27:09 -0400
Subject: [PATCH 10/10] ci: migrate release automation and unit tests from
 travis to github actions

---
 .github/workflows/release.yml |  39 ++++++++++++
 .github/workflows/test.yml    |  38 +++++++++++
 .travis.yml                   |  51 ---------------
 README.md                     | 115 +---------------------------------
 stscraper/github.py           |   2 +-
 5 files changed, 79 insertions(+), 166 deletions(-)
 create mode 100644 .github/workflows/release.yml
 create mode 100644 .github/workflows/test.yml
 delete mode 100644 .travis.yml

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..2e8d740
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,39 @@
+name: Semantic Release
+
+on:
+  push:
+    branches: [ master ]
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+
+    - name: Python Semantic Release
+      uses: relekang/python-semantic-release@v7.2.1
+      with:
+        pypi_token: ${{ secrets.PYPI_TOKEN }}
+
+  pages:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        ref: gh-pages
+        path: docs/build/html
+    - name: GitHub pages
+      run: |
+        python -m pip install --upgrade pip
+        pip install sphinx sphinx-autobuild
+        sphinx-build -M html "docs" "docs/build"
+        git config user.name github-actions
+        git config user.email github-actions@github.com
+        cd docs/build/html
+        git add . 
+ git commit -m "github pages" + git push \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..5802f18 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,38 @@ +name: Run unit tests on every push + +on: push + +jobs: + test: + name: Python ${{ matrix.python-version }} tests + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [2.7, 3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies (Python ${{ matrix.python-version }}) + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Run tests on Python ${{ matrix.python-version }} + env: + GITHUB_API_TOKENS: ${{ secrets.GH_API_TOKENS }} + run: make test diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1829053..0000000 --- a/.travis.yml +++ /dev/null @@ -1,51 +0,0 @@ -sudo: required -language: python -python: - - 2.7 - - 3.6 - -cache: - - pip - - packages - -install: - - make install - - pip install requests typing - -script: - - make test - -# jobs instead of deploy to deploy only once (for Python3 build) -jobs: - fast_finish: true - include: - - stage: upload to PYPI, build docs and create a release - # python-semantic-release fails with Travis Python3.5 - python: 3.6 - install: make install_dev - script: make html - - deploy: - - provider: script - skip_cleanup: true - on: - branch: master - script: make publish - - - provider: releases - skip-cleanup: true - api_key: $GH_TOKEN - on: - tags: true - file: dist/* - - - provider: pages - skip-cleanup: true - github-token: $GH_TOKEN - keep-history: true - on: - branch: master - local-dir: docs/build/html - -after_failure: - - pip freeze \ No newline at end of file diff --git a/README.md b/README.md index d16750d..95493d3 100644 --- a/README.md +++ b/README.md @@ -1,115 +1,2 @@ -# Python interface for code hosting platforms API -It is intended to facilitate research of Open Source projects. -At this point, it is basically functional but is missing: - -- tests -- documentation -- good architecture - -Feel free to contribute any of those. - -### Installation - -```bash -pip install --user --upgrade strudel.scraper -``` - - -### Usage - -```python -import stscraper as scraper -import pandas as pd - -gh_api = scraper.GitHubAPI() -# so far only GiHub, Bitbucket and Gitlab are supported -# bb_api = scraper.BitbucketAPI() -# gl_api = scraper.GitLabAPI() - -# repo_issues is a generator that can be used -# to instantiate a pandas dataframe -issues = pd.DataFrame(gh_api.repo_issues('cmustrudel/strudel.scraper')) -``` - - - -### Settings - -GitHub and GitLab APIs limit request rate for unauthenticated requests -(although GitLab limit is much more generous). -There are several ways to set your API keys, listed below in order of priority. - -**Important note:** API objects are reused in subsequent calls. -The same keys used to instantiate the first API object will be used by -ALL other instances. 
- -#### Class instantiation: - -```python -import stscraper - -gh_api = stscraper.GitHubAPI(tokens="comman-separated list of tokens") -``` - -#### At runtime: - -```python -import stscraper -import stutils - -# IMPORTANT: do this before creation of the first API object! -stutils.CONFIG['GITHUB_API_TOKENS'] = 'comma-separated list of tokens' -stutils.CONFIG['GITLAB_API_TOKENS'] = 'comma-separated list of tokens' - -# any api instance created after this, will use the provided tokens -gh_api = stscraper.GitHubAPI() -``` - -#### settings file: - -``` -project root - \ - |- my_module - | \- my_file.py - |- settings.py -``` - -```python -# settings.py - -GITHUB_API_TOKENS = 'comma-separated list of tokens' -GITLAB_API_TOKENS = 'comma-separated list of tokens' -``` - -```python -# my_file.py -import stscraper - -# keys from settings.py will be reused automatically -gh_api = stscraper.GitHubAPI() -``` - -#### Environment variable: - - -```bash -# somewhere in ~/.bashrc -export GITHUB_API_TOKENS='comma-separated list of tokens' -export GITLAB_API_TOKENS='comma-separated list of tokens' -``` - -```python -# somewhere in the code -import stscraper - -# keys from environment variables will be reused automatically -gh_api = stscraper.GitHubAPI() -``` - - -#### Hub config: - -If you have [hub](https://github.com/github/hub) installed and everything else -fails, its configuration will be reused for GitHub API. \ No newline at end of file +Please see https://cmustrudel.github.io/strudel.scraper/ for documentation. \ No newline at end of file diff --git a/stscraper/github.py b/stscraper/github.py index 233297b..ea70690 100644 --- a/stscraper/github.py +++ b/stscraper/github.py @@ -359,7 +359,7 @@ def v4(self, query, object_path=(), **params): get the first record, e.g. by calling `next()` on the result: >>> user_info = next(self.v4(''' - ... query ($user: String!) { + ... query ($user: String!) { ... user(login:$user) { ... login, name, avatarUrl, websiteUrl ... company, bio, location, name, twitterUsername, isHireable