diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..2e8d740 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,39 @@ +name: Semantic Release + +on: + push: + branches: [ master ] + +jobs: + release: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Python Semantic Release + uses: relekang/python-semantic-release@v7.2.1 + with: + pypi_token: ${{ secrets.PYPI_TOKEN }} + + pages: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + with: + ref: gh-pages + path: docs/build/html + - name: GitHub pages + run: | + python -m pip install --upgrade pip + pip install sphinx sphinx-autobuild + sphinx-build -M html "docs" "docs/build" + git config user.name github-actions + git config user.email github-actions@github.com + cd docs/build/html + git add . + git commit -m "github pages" + git push \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..5802f18 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,38 @@ +name: Run unit tests on every push + +on: push + +jobs: + test: + name: Python ${{ matrix.python-version }} tests + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [2.7, 3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies (Python ${{ matrix.python-version }}) + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Run tests on Python ${{ matrix.python-version }} + env: + GITHUB_API_TOKENS: ${{ 
secrets.GH_API_TOKENS }} + run: make test diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1829053..0000000 --- a/.travis.yml +++ /dev/null @@ -1,51 +0,0 @@ -sudo: required -language: python -python: - - 2.7 - - 3.6 - -cache: - - pip - - packages - -install: - - make install - - pip install requests typing - -script: - - make test - -# jobs instead of deploy to deploy only once (for Python3 build) -jobs: - fast_finish: true - include: - - stage: upload to PYPI, build docs and create a release - # python-semantic-release fails with Travis Python3.5 - python: 3.6 - install: make install_dev - script: make html - - deploy: - - provider: script - skip_cleanup: true - on: - branch: master - script: make publish - - - provider: releases - skip-cleanup: true - api_key: $GH_TOKEN - on: - tags: true - file: dist/* - - - provider: pages - skip-cleanup: true - github-token: $GH_TOKEN - keep-history: true - on: - branch: master - local-dir: docs/build/html - -after_failure: - - pip freeze \ No newline at end of file diff --git a/README.md b/README.md index d16750d..95493d3 100644 --- a/README.md +++ b/README.md @@ -1,115 +1,2 @@ -# Python interface for code hosting platforms API -It is intended to facilitate research of Open Source projects. -At this point, it is basically functional but is missing: - -- tests -- documentation -- good architecture - -Feel free to contribute any of those. 
- -### Installation - -```bash -pip install --user --upgrade strudel.scraper -``` - - -### Usage - -```python -import stscraper as scraper -import pandas as pd - -gh_api = scraper.GitHubAPI() -# so far only GiHub, Bitbucket and Gitlab are supported -# bb_api = scraper.BitbucketAPI() -# gl_api = scraper.GitLabAPI() - -# repo_issues is a generator that can be used -# to instantiate a pandas dataframe -issues = pd.DataFrame(gh_api.repo_issues('cmustrudel/strudel.scraper')) -``` - - - -### Settings - -GitHub and GitLab APIs limit request rate for unauthenticated requests -(although GitLab limit is much more generous). -There are several ways to set your API keys, listed below in order of priority. - -**Important note:** API objects are reused in subsequent calls. -The same keys used to instantiate the first API object will be used by -ALL other instances. - -#### Class instantiation: - -```python -import stscraper - -gh_api = stscraper.GitHubAPI(tokens="comman-separated list of tokens") -``` - -#### At runtime: - -```python -import stscraper -import stutils - -# IMPORTANT: do this before creation of the first API object! 
-stutils.CONFIG['GITHUB_API_TOKENS'] = 'comma-separated list of tokens' -stutils.CONFIG['GITLAB_API_TOKENS'] = 'comma-separated list of tokens' - -# any api instance created after this, will use the provided tokens -gh_api = stscraper.GitHubAPI() -``` - -#### settings file: - -``` -project root - \ - |- my_module - | \- my_file.py - |- settings.py -``` - -```python -# settings.py - -GITHUB_API_TOKENS = 'comma-separated list of tokens' -GITLAB_API_TOKENS = 'comma-separated list of tokens' -``` - -```python -# my_file.py -import stscraper - -# keys from settings.py will be reused automatically -gh_api = stscraper.GitHubAPI() -``` - -#### Environment variable: - - -```bash -# somewhere in ~/.bashrc -export GITHUB_API_TOKENS='comma-separated list of tokens' -export GITLAB_API_TOKENS='comma-separated list of tokens' -``` - -```python -# somewhere in the code -import stscraper - -# keys from environment variables will be reused automatically -gh_api = stscraper.GitHubAPI() -``` - - -#### Hub config: - -If you have [hub](https://github.com/github/hub) installed and everything else -fails, its configuration will be reused for GitHub API. \ No newline at end of file +Please see https://cmustrudel.github.io/strudel.scraper/ for documentation. \ No newline at end of file diff --git a/docs/bitbucket.rst b/docs/bitbucket.rst deleted file mode 100644 index e69de29..0000000 diff --git a/docs/github.rst b/docs/github.rst deleted file mode 100644 index e69de29..0000000 diff --git a/docs/gitlab.rst b/docs/gitlab.rst deleted file mode 100644 index e69de29..0000000 diff --git a/docs/index.rst b/docs/index.rst index e2f9590..8cf1bb0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,66 @@ Reference .. toctree:: :maxdepth: 2 +.. py:module:: stscraper + +`stscraper` is a Python interface for GitHub API + +Key features: + +- utilize multiple API keys to speed up scraping +- transparently handle pagination and minor network errors + +Installation +------------ + +.. 
code-block:: bash + + pip install --user --upgrade strudel.scraper + + +Usage +----- + +The main way to use this module is through :py:class:`GitHubAPI` objects. + +.. code-block:: + + import stscraper as scraper + import pandas as pd + + gh_api = scraper.GitHubAPI("token1,token2,...") + + # repo_issues is a generator that can be used + # to instantiate a pandas dataframe + issues = pd.DataFrame(gh_api.repo_issues('cmustrudel/strudel.scraper')) + +Tokens can be provided either at class instantiation or through an environment +variable: + +.. code-block:: bash + + # somewhere in ~/.bashrc + export GITHUB_API_TOKENS='comma-separated list of tokens' + +.. code-block:: + + # later, in some Python file: + gh_api = scraper.GitHubAPI() # tokens from the environment var will be used + +If no keys were passed at class instantiation and `GITHUB_API_TOKENS` +environment variable is not defined, `stscraper` will also check `GITHUB_TOKEN` +environment variable. This variable is created by GitHub actions runner and also +used by `hub <https://github.com/github/hub>`_ utility. + +REST (v3) API +------------- +.. autoclass:: GitHubAPI + :members: + :exclude-members: + +GraphQL (v4) API +---------------- + +.. 
autoclass:: GitHubAPIv4 + :members: -:doc:`github` -:doc:`gitlab` -:doc:`BitBucket` diff --git a/scripts/check_gh_limits.py b/scripts/check_gh_limits.py deleted file mode 100755 index 0cdbc7a..0000000 --- a/scripts/check_gh_limits.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function - -import argparse - -import stscraper as scraper - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Check remaining limits of registered GitHub API keys") - args = parser.parse_args() - - columns = ("user", "core_limit", "core_remaining", "core_renews_in", - "search_limit", "search_remaining", "search_renews_in", "key") - - stats = list(scraper.github.get_limits()) - - lens = {column: max(max(len(str(values[column])), len(column)) - for values in stats) - for column in columns} - - print(" ".join(c.ljust(lens[c] + 1, " ")for c in columns)) - for values in stats: - print(" ".join(str(values[c]).ljust(lens[c] + 1, " ") for c in columns)) diff --git a/scripts/collect_emails.py b/scripts/collect_emails.py deleted file mode 100644 index 442b453..0000000 --- a/scripts/collect_emails.py +++ /dev/null @@ -1,78 +0,0 @@ - -from __future__ import print_function, unicode_literals - -import os -import logging -import argparse -import csv -import hashlib - -import pandas as pd -from django.core.management.base import BaseCommand - -from common import decorators -from common import email_utils as email -from scraper import scraper as scraper - -logging.basicConfig() -logger = logging.getLogger('ghd') - - -class Command(BaseCommand): - requires_system_checks = False - help = "Create mapping of GitHub users to their emails for mathching " \ - "StackOverflow records. The result is store in cache folder.\n\n" \ - "This data is generated from commits records, so it is recommnded " \ - "to run ./manage.py scraper_build_cache first." 
- - def add_arguments(self, parser): - parser.add_argument('ecosystem', type=str, - help='Ecosystem to process, {pypi|npm}') - parser.add_argument('-o', '--output', default="", - help='Output file. Will be extended if already ' - 'exists') - - def handle(self, *args, **options): - loglevel = 40 - 10*options['verbosity'] - logger.setLevel(20 if loglevel == 30 else loglevel) - - reader = csv.DictReader(options['input']) - - output = options['output'] - if not output: - output = os.path.join( - decorators.get_cache_path('scraper'), "user.emails.csv") - if os.path.isfile(output): - users = pd.read_csv(output, index_col=0) - else: - users = pd.DataFrame(columns=['uname', 'email_md5']) - users.index.name = 'email' - - for package in reader: - logger.info("Processing %s %s", package['name'], - package['github_url']) - if not package['github_url']: - continue - - commits = scraper._commits(package['github_url']) - commits = commits.loc[pd.notnull(commits['author_email']) & \ - pd.notnull(commits['author'])] - for _, commit in commits.iterrows(): - if not commit['author'] or not commit['author_email']: - continue - try: - email_addr = email.clean(commit['author_email']) - except ValueError: # invalid email - continue - - if email_addr in users.index: - continue - - md5 = hashlib.md5() - md5.update(email_addr) - users.loc[email_addr] = { - 'uname': commit['author'], - 'email_md5': md5.hexdigest() - } - - users.to_csv(output) diff --git a/setup.py b/setup.py index 624d5a2..0cd2c70 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,9 @@ ], platforms=["Linux", "Solaris", "Mac OS-X", "Unix", "Windows"], python_requires='>2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4', - scripts=[os.path.join('scripts', 'check_gh_limits.py')], + entry_points={ + 'console_scripts': ["check_gh_limits = stscraper.github:print_limits"] + }, packages=[package], url='https://github.com/cmustrudel/strudel.scraper', install_requires=requirements, diff --git a/stscraper/__init__.py b/stscraper/__init__.py 
index adb5614..abb5206 100644 --- a/stscraper/__init__.py +++ b/stscraper/__init__.py @@ -1,6 +1,5 @@ -from .base import * -from .generic import * +from .github import * __version__ = '0.4.0' __author__ = "Marat (@cmu.edu)" diff --git a/stscraper/base.py b/stscraper/base.py index c19e202..c89e28b 100644 --- a/stscraper/base.py +++ b/stscraper/base.py @@ -1,4 +1,6 @@ +from __future__ import absolute_import + import requests from datetime import datetime @@ -7,7 +9,7 @@ import re import six import time -from typing import Iterable, Iterator, Optional +from typing import Iterable, Iterator, Optional, Tuple, Union from functools import wraps @@ -134,11 +136,127 @@ def caller(*args): return wrapper +class APIToken(object): + """ An abstract container for an API token + """ + # API endpoint + api_url = None # type: str + + token = None # type: str + # number of seconds before throwing IOError + timeout = None # type: int + # request headers to use + _headers = {} # type: dict + # supported API classes (e.g. core, search etc) + api_classes = ('core',) # type: Tuple + # rate limits for API classes + limits = None # type: dict + session = None # type: requests.Session + + def __init__(self, token=None, timeout=None): + self.token = token + self.timeout = timeout + self.limits = {api_class: { + 'limit': None, + 'remaining': None, + 'reset_time': None + } for api_class in self.api_classes} + self.session = requests.Session() + + @property + def is_valid(self): + raise NotImplementedError + + @property + def user(self): + """ Get user info of the token owner """ + raise NotImplementedError + + def _update_limits(self, response, url): + raise NotImplementedError + + def check_limits(self): + """ Get information about remaining limits on the token. + + Usually this information present in response headers and updated + automatically (see _update_limits()). This method is intended to + FORCE to renew this info. 
+ + Some APIs have multiple classes of limits, so it should return a dict + of dictionaries, keyed by API class + { api_class: { + 'remaining': remaining number of requests until reset, + 'limit': overall limit, + 'reset_time': unix_timestamp + }, + ... + } + """ + raise NotImplementedError + + @staticmethod + def api_class(url): + # type: (str) -> str + return 'core' + + def when(self, url): + # type: (str) -> int + """Check when the specified URL becomes accessible without blocking + + Returns: unix timestamp + """ + raise NotImplementedError + + def ready(self, url): + """ Check if this url can be called without blocking """ + t = self.when(url) + return not t or t <= time.time() + + def __call__(self, url, method='get', data=None, **params): + """ Make an API request """ + # TODO: use coroutines, perhaps Tornado (as PY2/3 compatible) + + if not self.ready(url): + raise TokenNotReady + + r = self.session.request( + method, self.api_url + url, params=params, data=data, + headers=self._headers, timeout=self.timeout) + + self._update_limits(r, url) + + return r + + def __str__(self): + return self.token or "" + + +class DummyAPIToken(APIToken): + """ A dummy token class that does nothing + APIs that don't have limits should use tokens subclassed from this one + """ + + is_valid = True + user = 'Anonymous' + + def check_limits(self): + return self.limits + + def ready(self, url): + return True + + def when(self, url): + return None + + def _update_limits(self, response, url): + pass + + +class VCSAPI(object): _instance = None # instance of API() for Singleton pattern implementation - tokens = None - token_class = None + tokens = () # type: Tuple[APIToken] + token_class = DummyAPIToken # type: type status_too_many_requests = () status_not_found = (404, 451) @@ -149,19 +267,22 @@ class VCSAPI(object): def __new__(cls, *args, **kwargs): # Singleton if not isinstance(cls._instance, cls): cls._instance = super(VCSAPI, cls).__new__(cls) - cls._instance.__init__(*args, **kwargs) + + 
cls._instance.__init__(*args, **kwargs) return cls._instance def __init__(self, tokens=None, timeout=30): - # type: (Optional[Iterable], int) -> None + # type: (Optional[Union[Iterable,str]], int) -> None + old_tokens = {str(token) for token in self.tokens} if tokens: if isinstance(tokens, six.string_types): tokens = tokens.split(",") - self.tokens = tuple( - self.token_class(t, timeout=timeout) for t in set(tokens)) + new_tokens_instances = [self.token_class(t, timeout=timeout) + for t in set(tokens) - old_tokens] + self.tokens += tuple(t for t in new_tokens_instances if t.is_valid) self.logger = logging.getLogger('scraper.' + self.__class__.__name__) - def has_next_page(self, response): + def _has_next_page(self, response): """ Check if there is a next page to a paginated response """ raise NotImplementedError @@ -173,18 +294,21 @@ def init_pagination(): return {'page': 1, 'per_page': 100} @staticmethod - def extract_result(response, paginate): + def extract_result(response): """ Parse results from the response. For most APIs, it is just parsing JSON """ return response.json() - def request(self, url, method='get', data=None, paginate=False, **params): - """ Generic, API version agnostic request method """ - timeout_counter = 0 - if paginate: - params.update(self.init_pagination()) + def iterate_tokens(self, url=""): + """Infinite generator of tokens, taking care of their availability + Args: + url (str): request URL. In some API classes there are multiple rate + limits handled separately, e.g. GitHub general vs search API. 
+ Generates: + (APIToken): a token object + """ while True: # problem with iterating them in the same order # (eg, sorted by expiration): in multithreaded case, @@ -193,56 +317,7 @@ def request(self, url, method='get', data=None, paginate=False, **params): for token in random.sample(self.tokens, len(self.tokens)): if not token.ready(url): continue - - try: - r = token(url, method=method, data=data, **params) - except TokenNotReady: - continue - except requests.exceptions.RequestException: - # starting early November, GitHub fails to establish - # a connection once in a while (bad status line). - # To account for more general issues like this, - # TimeoutException was replaced with RequestException - timeout_counter += 1 - if timeout_counter > self.retries_on_timeout: - raise - continue # i.e. try again - - if r.status_code in self.status_not_found: # API v3 only - raise RepoDoesNotExist( - "%s API returned status %s" % ( - self.__class__.__name__, r.status_code)) - elif r.status_code in self.status_empty: - yield {} - return - elif r.status_code in self.status_internal_error: - timeout_counter += 1 - if timeout_counter > self.retries_on_timeout: - raise requests.exceptions.Timeout("VCS is down") - time.sleep(2**timeout_counter) - continue # i.e. try again - elif r.status_code in self.status_too_many_requests: - timeout_counter += 1 - if timeout_counter > self.retries_on_timeout: - raise requests.exceptions.Timeout( - "Too many requests from the same IP. 
" - "Are you abusing the API?") - time.sleep(2**timeout_counter) - continue - - r.raise_for_status() - res = self.extract_result(r, paginate) - if paginate: - for item in res: - yield item - if not res or not self.has_next_page(r): - return - else: - params["page"] += 1 - continue - else: - yield res - return + yield token next_res = min(token.when(url) for token in self.tokens) sleep = next_res and int(next_res - time.time()) + 1 @@ -253,6 +328,87 @@ def request(self, url, method='get', data=None, paginate=False, **params): time.sleep(sleep) self.logger.info(".. resumed") + def request(self, url, method='get', data=None, paginate=False, **params): + """ Make an API request, taking care of pagination + + Args: + url (str): request URL + method (str): HTTP method type + data (str): API request payload (for POST requests) + paginate (bool): flag to take care of pagination + + Generates: + object: parsed object, API-specific + """ + if paginate: + params.update(self.init_pagination()) + + while True: + r = self._request(url, method, data, **params) + if r.status_code in self.status_empty: + return + + res = self.extract_result(r) + if paginate: + for item in res: + yield item + if not res or not self._has_next_page(r): + return + else: + params["page"] += 1 + continue + else: + yield res + return + + def _request(self, url, method='get', data=None, **params): + """ Make + Args: + url (str): request URL + method (str): HTTP method type + data (str): API request payload (for POST requests) + + Return: + requests.Response: raw HTTP response + """ + timeout_counter = 0 + for token in self.iterate_tokens(url): + try: + r = token(url, method=method, data=data, **params) + except TokenNotReady: + continue + except requests.exceptions.RequestException: + # starting early November, GitHub fails to establish + # a connection once in a while (bad status line). 
+ # To account for more general issues like this, + # TimeoutException was replaced with RequestException + timeout_counter += 1 + if timeout_counter > self.retries_on_timeout: + raise + continue # i.e. try again + + if r.status_code in self.status_not_found: # API v3 only + raise RepoDoesNotExist( + "%s API returned status %s at %s" % ( + self.__class__.__name__, r.status_code, url)) + elif r.status_code in self.status_internal_error: + timeout_counter += 1 + if timeout_counter > self.retries_on_timeout: + raise requests.exceptions.Timeout("VCS is down") + time.sleep(2**timeout_counter) + continue # i.e. try again + elif r.status_code in self.status_too_many_requests: + timeout_counter += 1 + if timeout_counter > self.retries_on_timeout: + raise requests.exceptions.Timeout( + "Too many requests from the same IP. " + "Are you abusing the API?") + time.sleep(1 << (timeout_counter+1)) + continue + + r.raise_for_status() + return r + def all_users(self): # type: () -> Iterable[dict] """ """ @@ -327,110 +483,3 @@ def project_exists(repo_slug): # type: (str) -> bool """ """ raise NotImplementedError - - @staticmethod - def canonical_url(repo_slug): - # type: (str) -> str - """ """ - raise NotImplementedError - - -class APIToken(object): - """ An abstract container for an API token - """ - api_url = None # API endpoint - - token = None # str token - timeout = None # number of seconds before throwing IOError - _headers = {} # request headers to use - api_classes = ('core',) # supported API classes (e.g. 
core, search etc) - limits = None # rate limits for API classes - session = None - - def __init__(self, token=None, timeout=None): - self.token = token - self.timeout = timeout - self.limits = {api_class: { - 'limit': None, - 'remaining': None, - 'reset_time': None - } for api_class in self.api_classes} - self.session = requests.Session() - - @property - def user(self): - """ Get user info of the token owner """ - raise NotImplementedError - - def _update_limits(self, response, url): - raise NotImplementedError - - def check_limits(self): - """ Get information about remaining limits on the token. - - Usually this information present in response headers and updated - automatically (see _update_limits()). This method is intended to - FORCE to renew this info. - - Some APIs have multiple classes of limits, so it should return a list - of dictionaries - { : { - 'remaining': remaining number of requests until reset, - 'limit': overall limit, - 'reset_time': unix_timestamp - }, - ... - } - """ - raise NotImplementedError - - @staticmethod - def api_class(url): - # type: (str) -> str - return 'core' - - def when(self, url): - # type: (str) -> int - """Check when the specified URL become accessible without blocking - - Returns: unix timestamp - """ - raise NotImplementedError - - def ready(self, url): - """ Check if this url can be called without blocking """ - t = self.when(url) - return not t or t <= time.time() - - def __call__(self, url, method='get', data=None, **params): - """ Make an API request """ - # TODO: use coroutines, perhaps Tornado (as PY2/3 compatible) - - if not self.ready(url): - raise TokenNotReady - - r = self.session.request( - method, self.api_url + url, params=params, data=data, - headers=self._headers, timeout=self.timeout) - - self._update_limits(r, url) - - return r - - -class DummyAPIToken(APIToken): - """ A dummy token class that does nothing """ - - user = 'Anonymous' - - def check_limits(self): - return self.limits - - def ready(self, url): - 
return True - - def when(self, url): - return None - - def _update_limits(self, response, url): - pass diff --git a/stscraper/bitbucket.py b/stscraper/bitbucket.py deleted file mode 100644 index 1d30ed5..0000000 --- a/stscraper/bitbucket.py +++ /dev/null @@ -1,114 +0,0 @@ - -from .base import * - - -class BitbucketAPIToken(DummyAPIToken): - """ A dummy - Bitbucket isn't using any tokens - https://confluence.atlassian.com/bitbucket/rate-limits-668173227.html - """ - api_url = "https://api.bitbucket.org/2.0/" - - -class BitbucketAPI(VCSAPI): - token_class = BitbucketAPIToken - - status_not_found = (404, 422, 451) - - def __init__(self, tokens=None, timeout=30): - super(BitbucketAPI, self).__init__([None], timeout) - - def has_next_page(self, response): - return 'next' in response.json() - - @staticmethod - def init_pagination(): - return {'page': 1, 'pagelen': 100} - - @staticmethod - def extract_result(response, paginate): - res = response.json() - if 'error' in res: - raise VCSError(json_path(res, 'error', 'message')) - if paginate: - return res['values'] - return res - - def all_users(self): - # type: () -> Iterable[dict] - """ """ - raise NotImplementedError - - def all_repos(self): - # type: () -> Iterable[dict] - """ """ - return self.request('repositories', paginate=True) - - def repo_issues(self, repo_name): - # type: (str) -> Iterable[dict] - """ """ - return self.request( - 'repositories/%s/issues' % repo_name, paginate=True) - - def repo_commits(self, repo_name): - # type: (str) -> Iterable[dict] - """ """ - return self.request( - 'repositories/%s/commits' % repo_name, paginate=True) - - def repo_pulls(self, repo_name): - # type: (str) -> Iterable[dict] - """ """ - return self.request('repositories/%s/pullrequests' % repo_name) - - def pull_request_commits(self, repo, pr_id): - # type: (str, int) -> Iterable[dict] - """ """ - raise NotImplementedError - - def issue_comments(self, repo, issue_id): - # type: (str, int) -> Iterable[dict] - """ """ - raise 
NotImplementedError - - def review_comments(self, repo, pr_id): - # type: (str, int) -> Iterable[dict] - """ """ - raise NotImplementedError - - def user_info(self, user): - # type: (str) -> dict - """ """ - raise NotImplementedError - - def user_repos(self, user): - # type: (str) -> dict - """Get list of user repositories""" - return self.request('repositories/' + user) - - def user_orgs(self, user): - # type: (str) -> Iterable[dict] - """ """ - raise NotImplementedError - - def org_members(self, org): - # type: (str) -> Iterable[dict] - """ """ - raise NotImplementedError - - def org_repos(self, org): - # type: (str) -> Iterable[dict] - """ """ - raise NotImplementedError - - @staticmethod - def project_exists(repo_name): - # type: (str) -> bool - """ """ - return bool(requests.head(BitbucketAPIToken.api_url + repo_name)) - - @staticmethod - def canonical_url(project_url): - # type: (str) -> str - """ """ - raise NotImplementedError diff --git a/stscraper/deprecated.py b/stscraper/deprecated.py deleted file mode 100644 index 45fde4d..0000000 --- a/stscraper/deprecated.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python -""" -Deprecated tools, mostly serving the purpose of examples -""" - - -import datetime -import re - - -def timestamp2str(timestamp): - return datetime2str(datetime.datetime.fromtimestamp(timestamp)) - - -def datetime2str(dt, fmt="%Y-%m-%d %H:%M"): - return dt.strftime(fmt) - - -def utf8fy(string): - try: - return string.encode('utf8') - except UnicodeDecodeError: - return '*Garbled*' - - -def commits_gitpython(repo_path, ref='master', short_message=False): - """ Parse commits from a cloned git repository using gitphython - This is a rather slow method since gitpython simply parses cli output of - native git client - """ - import git - - try: - repo = git.Repo(repo_path) - except git.InvalidGitRepositoryError: - raise ValueError("Not a git repository: %s" % repo_path) - - for commit in repo.iter_commits(ref, max_count=-1): - # WTF? 
example: - # https://github.com/openssl/openssl/commit/c753e71e0a0aea2c540dab96fb02c9c62c6ba7a2 - hasauthor = hasattr(commit, 'author') or None - hasdate = hasattr(commit, 'committed_date') or None - - message = commit.message.strip() - if short_message: - message = message.split("\n", 1)[0].strip() - - yield { - 'sha': commit.hexsha, - 'author_name': hasauthor and utf8fy(commit.author.name), - 'author_email': hasauthor and utf8fy(commit.author.email), - 'authored_date': hasauthor and timestamp2str(commit.authored_date), - 'committer_name': utf8fy(commit.committer.name), - 'committer_email': utf8fy(commit.committer.email), - 'committed_date': hasdate and timestamp2str(commit.committed_date), - 'message': utf8fy(message), - 'parents': commit.parents - } - - -def get_repo_name(repo_url): - assert(repo_url.endswith(".git")) - chunks = [c for c in re.split("[:/]", repo_url[:-4]) if c] - org = "" if len(chunks) < 2 else chunks[-2] - repo = chunks[-1] - return org, repo - - -def commits_pygit2(repo_url, remove=True): - """ Iterate commits using Python libgit2 binding. - Unlike GitPython, it can clone repository for you and works in the same - memory space so it is much faster. It is kind of heavy, but can be handy if - you need to work with repository/commits content (e.g. code analysis) - - :param repo_url Git repository URL (not GitHub URL!). 
- Example: git://github.com/user/repo.git - """ - import os - import tempfile - import shutil - - import pygit2 - org, repo_name = get_repo_name(repo_url) - folder = tempfile.mkdtemp(prefix='_'.join(('ghd', org, repo_name, ''))) - repo = pygit2.clone_repository(repo_url, folder, bare=True) - - try: - for commit in repo.walk(repo.head.target): - # http://www.pygit2.org/objects.html#commits - yield { - 'sha': commit.oid, - 'author_name': commit.author.name, - 'author_email': commit.author.email, - 'committer_name': commit.committer.name, - 'committer_email': commit.committer.email, - 'message': commit.message.strip(), - 'parent_ids': "\n".join(str(pid) for pid in commit.parent_ids), - 'time': commit.commit_time, - } - finally: - if remove: - os.chdir('/tmp') - shutil.rmtree(folder) - - -def issues_PyGithub(github_token, repo_name): - """ Iterate issues of a GitHub repository using GitHub API v3 - - The library used in this method, PyGithub tries to extensively resolve - attributes which leads to a number of excessive API calls and computation - overhead. This implementation tries to avoid this, and was replaced by - local implementation to have uniform interface and get rid of dependency - """ - # this is not the same module included with scraper. 
- # to install, `pip install PyGithub` - import github - - g = github.Github(github_token) - repo = g.get_repo(repo_name) - try: - id = repo.id - except github.GithubException: - raise ValueError("Repository %s does not exist" % repo_name) - - issues = repo.get_issues(state='all') - - # Response example: - # https://api.github.com/repos/pandas-dev/pandas/issues?page=62 - for issue in issues: - raw = issue._rawData # to prevent resolving usernames into objects - yield { - 'id': int(raw['id']), - 'title': raw['title'], - 'user': raw['user']['login'], - 'labels': ",".join(l['name'] for l in raw['labels']), - 'state': raw['state'], - 'created_at': raw['created_at'], - 'updated_at': raw['updated_at'], - 'closed_at': raw['closed_at'], - 'body': raw['body'] - } diff --git a/stscraper/generic.py b/stscraper/generic.py deleted file mode 100644 index 8b2f36e..0000000 --- a/stscraper/generic.py +++ /dev/null @@ -1,181 +0,0 @@ - -""" -Standard interface to all supported code hosting platforms. - -Two important distinctions comparing to -1. URLs must include the code hosting platform itself, i.e. instead of - `cmustrudel/strudel.scraper` one should use - `github.com/cmustrudel/strudel.scraper`. -2. 
Returned objects are simplified to a common subset of fields -""" - -from .base import * -from .github import GitHubAPI -from .gitlab import GitLabAPI -from .bitbucket import BitbucketAPI - -PROVIDERS = { - "github.com": GitHubAPI, - # https://developer.atlassian.com/bitbucket/api/2/reference/resource/ - "bitbucket.org": BitbucketAPI, - # https://docs.gitlab.com/ee/api/ - "gitlab.org": GitLabAPI, - # https://anypoint.mulesoft.com/apiplatform/sourceforge/ - "sourceforge.net": None, -} - - -def get_provider(url): - # type: (str) -> (str, str) - """ Separate provided URL into provider and project ID - :param url: url matching URL_PATTERN - :return: (provider_cls, project_id) - - >>> prov, proj_id = get_provider("github.com/abc/def") - >>> isinstance(prov, github.GitHubAPI) - True - >>> proj_id - 'abc/def' - >>> prov, proj_id = get_provider("someothersource.com/abc/def") - """ - provider_name, project_url = parse_url(url) - provider_cls = PROVIDERS.get(provider_name) - if provider_cls is None: - raise NotImplementedError( - "Provider %s is not supported (yet?)" % provider_name) - return provider_cls, project_url - - -MAPPINGS = { - 'repo_commits': { - 'fields': ( - 'sha', 'author', 'author_email', 'author_name', 'authored_at', - 'committer', 'committer_email', 'committed_at', 'comment_count', - 'message', 'verified'), - 'github.com': { - 'sha': 'sha', - 'author': 'author__login', - 'author_email': 'commit__author__email', - 'author_name': 'commit__author__name', - 'authored_at': 'commit__author__date', - 'committer': 'commit__committer__login', - 'committer_email': 'commit__committer__email', - 'committed_at': 'commit__committer__date', - 'comment_count': 'commit__comment_count', - 'message': 'commit__message', - 'verified': 'commit__verification__verified', - 'parents': 'parents__,sha' - }, - }, - 'repo_issues': { - 'fields': ( - 'number', 'user', 'role', 'title', 'body', 'assignee', 'id', - 'state', 'created_at', 'updated_at', 'closed_at', 'reactions'), - 
'github.com': { - 'number': 'number', - 'user': 'user__login', - 'role': 'author_association', - 'title': 'title', - 'body': 'body', - 'assignee': 'assignee', - 'id': 'id', - 'state': 'state', - 'created_at': 'created_at', - 'updated_at': 'updated_at', - 'closed_at': 'closed_at', - 'reactions': 'reactions__total_count', - 'pull_request_url': 'pull_request__url', - 'labels': 'labels__,name', - }, - }, - 'repo_pulls': { - 'fields': ( - 'number', 'title', 'body', 'state', 'user', 'head', - 'head_branch', 'base', 'base_branch', 'created_at', - 'updated_at', 'closed_at', 'merged_at', 'role'), - 'github.com': { - 'number': 'number', - 'title': 'title', - 'body': 'body', - 'state': 'state', - 'user': 'user__login', - 'head': 'head__repo__full_name', - 'head_branch': 'head__ref', - 'base': 'base__repo__full_name', - 'base_branch': 'base__ref', - 'created_at': 'created_at', - 'updated_at': 'updated_at', - 'closed_at': 'closed_at', - 'merged_at': 'merged_at', - 'role': 'author_association', - 'labels': 'labels__,name', - }, - }, - 'review_comments': { - 'fields': ( # 'pr_no', - 'id', 'user', 'created_at', 'updated_at', - 'body', 'path', 'position', 'role'), - 'github.com': { - # TODO: 'pr_no': 'pr_no', # from call params - 'id': 'id', - 'body': 'body', - 'user': 'user__login', - 'role': 'author_association', - 'created_at': 'created_at', - 'updated_at': 'updated_at', - 'path': 'path', - 'position': 'original_position', - }, - }, - 'issue_comments': { - 'fields': ( # 'issue_no', - 'id', 'user', 'created_at', 'updated_at', - 'body', 'role', 'reactions'), - 'github.com': { - 'id': 'id', - 'body': 'body', - 'user': 'user__login', - 'role': 'author_association', - 'created_at': 'created_at', - 'updated_at': 'updated_at', - 'reactions': 'reactions__total_count', - # TODO: 'issue_no': int(comment['issue_url'].rsplit("/", 1)[-1]), - } - }, -} - - -class GenericScraper(object): - """ Get a small but consistent subset of fields across all VCS providers - This interface supports the 
same API as all other VCS providers, - with one addition: you need to append repository URL - in front of all other params. For example, - - >>> GitHubAPI().repo_commits("user/repo") - - is equivalent to: - - >>> GenericScraper().repo_commits("https://github.com/user", "user/repo") - """ - def __getattribute__(self, attr): - if not hasattr(VCSAPI, attr): - raise AttributeError("'Scraper' has not attribute '%s'" % attr) - if attr not in MAPPINGS: - raise NotImplementedError( - "Generic API '%s' has not been implemented yet" % attr) - mappings = MAPPINGS[attr] - - def wrapper(url, *args): - provider_name, _ = parse_url(url) - if provider_name not in mappings: - raise NotImplementedError( - "Generic API '%s' has not been implemented for '%s' yet" - "" % (attr, provider_name)) - mapping = mappings[provider_name] - provider_cls, _ = get_provider(url) - provider = provider_cls() - - for item in getattr(provider, attr)(*args): - yield json_map(mapping, item) - - return wrapper diff --git a/stscraper/github.py b/stscraper/github.py index 9dd84d3..ea70690 100644 --- a/stscraper/github.py +++ b/stscraper/github.py @@ -1,4 +1,5 @@ +from __future__ import absolute_import from __future__ import print_function import datetime @@ -11,7 +12,7 @@ class GitHubAPIToken(APIToken): - api_url = "https://api.github.com/" + api_url = 'https://api.github.com/' api_classes = ('core', 'search') _user = None # cache user @@ -25,12 +26,12 @@ def __init__(self, token=None, timeout=None): # squirrel-girl-preview: issue reactions # starfox-preview: issue events self._headers = { - "Accept": "application/vnd.github.mercy-preview+json," - "application/vnd.github.squirrel-girl-preview," - "application/vnd.github.starfox-preview+json"} + 'Accept': 'application/vnd.github.mercy-preview+json,' + 'application/vnd.github.squirrel-girl-preview,' + 'application/vnd.github.starfox-preview+json'} if token is not None: self.token = token - self._headers["Authorization"] = "token " + token + 
self._headers['Authorization'] = 'token ' + token @property def user(self): @@ -43,6 +44,10 @@ def user(self): self._user = r.json().get('login', '') return self._user + @property + def is_valid(self): + return self.user is not None + def check_limits(self): # regular limits will be updated automatically upon request # we only need to take care about search limit @@ -90,11 +95,10 @@ def _update_limits(self, response, url): class GitHubAPI(VCSAPI): - """ This is a convenience class to pool GitHub API keys and update their + """ This is a convenience class to pool GitHub v3 API keys and update their limits after every request. Actual work is done by outside classes, such as _IssueIterator and _CommitIterator """ - tokens = None token_class = GitHubAPIToken base_url = 'https://github.com' status_too_many_requests = (403,) @@ -103,15 +107,16 @@ def __init__(self, tokens=None, timeout=30): # Where to look for tokens: # strudel config variables if not tokens: - stconfig_tokens = stutils.get_config("GITHUB_API_TOKENS") + stconfig_tokens = stutils.get_config('GITHUB_API_TOKENS') if stconfig_tokens: tokens = [token.strip() for token in stconfig_tokens.split(",") if len(token.strip()) == 40] # hub configuration: https://hub.github.com/hub.1.html + # also, used by github actions if not tokens: - token = stutils.get_config("GITHUB_TOKEN") + token = stutils.get_config('GITHUB_TOKEN') if not token and os.path.isfile("~/.config/hub"): token = open("~/.config/hub", 'r').read(64) if token and len(token.strip()) == 40: @@ -124,7 +129,7 @@ def __init__(self, tokens=None, timeout=30): super(GitHubAPI, self).__init__(tokens, timeout) - def has_next_page(self, response): + def _has_next_page(self, response): for rel in response.headers.get("Link", "").split(","): if rel.rsplit(";", 1)[-1].strip() == 'rel="next"': return True @@ -135,30 +140,31 @@ def has_next_page(self, response): # =================================== @api('users', paginate=True) def all_users(self): + """Get all GitHub 
users""" # https://developer.github.com/v3/users/#get-all-users return () @api('repositories', paginate=True) def all_repos(self): + """Get all GitHub repositories""" # https://developer.github.com/v3/repos/#list-all-public-repositories return () @api('repos/%s') def repo_info(self, repo_slug): - # type: (Union[str, unicode]) -> Iterator[dict] + """Get repository info""" # https://developer.github.com/v3/repos/#get return repo_slug @api_filter(lambda issue: 'pull_request' not in issue) @api('repos/%s/issues', paginate=True, state='all') def repo_issues(self, repo_slug): - # type: (Union[str, unicode]) -> Iterator[dict] + """Get repository issues (not including pull requests)""" # https://developer.github.com/v3/issues/#list-issues-for-a-repository return repo_slug @api('repos/%s/issues/comments', paginate=True) def repo_issue_comments(self, repo_slug): - # type: (Union[str, unicode]) -> Iterator[dict] """ Get all comments in all issues and pull requests, both open and closed. """ @@ -167,7 +173,6 @@ def repo_issue_comments(self, repo_slug): @api('repos/%s/issues/events', paginate=True) def repo_issue_events(self, repo_slug): - # type: (Union[str, unicode]) -> Iterator[dict] """ Get all events in all issues and pull requests, both open and closed. """ @@ -176,52 +181,90 @@ def repo_issue_events(self, repo_slug): @api('repos/%s/commits', paginate=True) def repo_commits(self, repo_slug): - # type: (Union[str, unicode]) -> Iterator[dict] + """Get all repository commits. + Note that GitHub API might ignore some merge commits""" # https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository return repo_slug @api('repos/%s/pulls', paginate=True, state='all') def repo_pulls(self, repo_slug): - # type: (Union[str, unicode]) -> Iterator[dict] + """Get all repository pull requests. 
+ Unlike the issues API, this method will return information specific for + pull requests, like head SHAs and branch names.""" # https://developer.github.com/v3/pulls/#list-pull-requests return repo_slug def repo_topics(self, repo_slug): + """Get a tuple of repository topics. + Topics are "keywords" assigned by repository owner. + + >>> GitHubAPI().repo_topics('pandas-dev/pandas') + ('data-analysis', 'pandas', 'flexible', 'alignment', 'python') + """ return tuple( next(self.request('repos/%s/topics' % repo_slug)).get('names')) def repo_labels(self, repo_slug): + """Get a tuple of repository labels. + Labels are issue tags used by maintainers + + >>> GitHubAPI().repo_labels('pandas-dev/pandas')[:5] + ('2/3 Compat', '32bit', 'API - Consistency', 'API Design', 'Admin') + """ return tuple(label['name'] for label in self.request('repos/%s/labels' % repo_slug, paginate=True)) + def repo_contributors(self, repo_slug): + """Get a timeline of up to 100 top project contributors + + Suggested use: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... GitHubAPI().repo_contributors(repo_slug)).set_index('user') + >>> df.columns = pd.to_datetime(df.columns, unit='s') + >>> df + 2018-08-19 2018-08-26 ... 2020-07-12 2020-07-19 + user ... + user2589 3 0 ... 0 0 + ... + """ + # https://developer.github.com/v3/repos/statistics/#get-all-contributor-commit-activity + url = 'repos/%s/stats/contributors' % repo_slug + for contributor_stats in next(self.request(url)): + record = {w['w']: w['c'] for w in contributor_stats['weeks']} + record['user'] = json_path(contributor_stats, 'author', 'login') + yield record + @api('repos/%s/pulls/%d/commits', paginate=True, state='all') def pull_request_commits(self, repo, pr_id): + """Get commits in a pull request. + `pr_id` is the visible pull request number, not internal GitHub id. 
+ """ # https://developer.github.com/v3/issues/comments/#list-comments-on-an-issue return repo, pr_id @api('repos/%s/issues/%s/comments', paginate=True, state='all') def issue_comments(self, repo, issue_id): - """ Return comments on an issue or a pull request + """ Get comments on an issue or a pull request. Note that for pull requests this method will return only general - comments to the pull request, but not review comments related to - some code. Use review_comments() to get those instead - - :param repo: str 'owner/repo' - :param issue_id: int, either an issue or a Pull Request id + comments to the pull request, but not review comments related to some + code. Use review_comments() to get those instead. """ # https://developer.github.com/v3/issues/comments/#list-comments-on-an-issue return repo, issue_id @api('repos/%s/pulls/%s/comments', paginate=True, state='all') def review_comments(self, repo, pr_id): - """ Pull request comments attached to some code - See also issue_comments() + """ Get pull request comments related to some code. + This will not return general comments, see `issue_comments()` """ # https://developer.github.com/v3/pulls/comments/ return repo, pr_id @api('users/%s') def user_info(self, username): + """Get user info - name, location, blog etc.""" # Docs: https://developer.github.com/v3/users/#response return username @@ -233,20 +276,29 @@ def user_repos(self, username): @api('users/%s/orgs', paginate=True) def user_orgs(self, username): + """Get user organization membership. + Usually includes only public memberships, but for yourself you get + non-public as well.""" # https://developer.github.com/v3/orgs/#list-user-organizations return username @api('orgs/%s/members', paginate=True) def org_members(self, org): + """Get public organization members. + Note that if you are a member of the organization you'll get everybody. 
+ """ # https://developer.github.com/v3/orgs/members/#members-list return org @api('orgs/%s/repos', paginate=True) def org_repos(self, org): + """Get organization repositories""" return org @api('repos/%s/issues/%d/events', paginate=True) def issue_events(self, repo, issue_no): + """Get issue events. + This includes state changes, references, labels etc. """ return repo, issue_no # =================================== @@ -254,119 +306,149 @@ def issue_events(self, repo, issue_no): # =================================== @staticmethod def project_exists(repo_slug): + """Check if the project exists. + This is a slightly cheaper alternative to getting repository info. + """ for i in range(5): try: return bool(requests.head("https://github.com/" + repo_slug)) except requests.RequestException: time.sleep(2**i) - @staticmethod - def canonical_url(repo_slug): - # type: (str) -> str - """ Normalize URL - - remove trailing .git (IMPORTANT) - - lowercase (API is case insensitive, so lowercase to deduplicate) - - prepend "github.com" - - :param: repo_slug: str, user_name/repo_name - :return: github.com/user_name/repo_name with both names normalized - - >>> GitHubAPI.canonical_url("pandas-DEV/pandas") - 'github.com/pandas-dev/pandas' - >>> GitHubAPI.canonical_url("http://github.com/django/django.git") - 'github.com/django/django' - >>> GitHubAPI.canonical_url("https://github.com/A/B/") - 'github.com/a/b' - """ - url = repo_slug.split("//")[-1].lower() - for prefix in ("github.com",): - if url.startswith(prefix): - url = url[len(prefix):] - for suffix in ("/", ".git"): - if url.endswith(suffix): - url = url[:-len(suffix)] - return "github.com/" + url - class GitHubAPIv4(GitHubAPI): - """ An example class using GraphQL API """ - def v4(self, query, **params): - payload = json.dumps({"query": query, "variables": params}) - return self.request("graphql", 'post', data=payload) + """ An interface to GitHub v4 GraphQL API. 
- def repo_issues(self, repo_slug, cursor=None): - # type: (str, str) -> Iterator[dict] - owner, repo = repo_slug.split("/") - query = """query ($owner: String!, $repo: String!, $cursor: String) { - repository(name: $repo, owner: $owner) { - hasIssuesEnabled - issues (first: 100, after: $cursor, - orderBy: {field:CREATED_AT, direction: ASC}) { - nodes {author {login}, closed, createdAt, - updatedAt, number, title} - pageInfo {endCursor, hasNextPage} - }}}""" + Due to the nature of graphql API, this class does not provide a specific + set of methods. Instead, you're expected to write your own queries and this + class will help you with pagination and network timeouts. + """ + def v4(self, query, object_path=(), **params): + """ Make an API v4 request, taking care of pagination + + Args: + query (str): GraphQL query. If the API request is multipage, it is + expected that the cursor variable name is "cursor". + object_path (Tuple[str]): json path to objects to iterate, excluding + leading "data" part, and the trailing "nodes" when applicable. + If omitted, will return full "data" content + Example: ("repository", "issues") + **params: dictionary of query variables. + + Yields: + object: parsed object, query-specific + + This method always returns an iterator, so normally you just throw it + straint into a loop: + + >>> followers = GitHubAPIv4().v4(''' + ... query ($user: String!, $cursor: String) { + ... user(login: $user) { + ... followers(first:100, after:$cursor) { + ... nodes { login } + ... pageInfo{endCursor, hasNextPage} + ... }}}''', ("user", "followers"), user=user) + >>> for follower in followers: + ... pass + + The method will look for `pageInfo` object in the object path and handle + pagination transparently. + + However, the method will also return an iterator if the query is + expected to return a single result. In this case, you need to explicitly + get the first record, e.g. by calling `next()` on the result: + + >>> user_info = next(self.v4(''' + ... 
query ($user: String!) { + ... user(login:$user) { + ... login, name, avatarUrl, websiteUrl + ... company, bio, location, name, twitterUsername, isHireable + ... createdAt, updatedAt + ... followers{totalCount} + ... following {totalCount} + ... }}''', ('user',), user=user)) - while True: - data = self.v4(query, owner=owner, repo=repo, cursor=cursor - )['data']['repository'] - if not data: # repository is empty, deleted or moved - break - - for issue in data["issues"]: - yield { - 'author': issue['author']['login'], - 'closed': issue['closed'], - 'created_at': issue['createdAt'], - 'updated_at': issue['updatedAt'], - 'closed_at': None, - 'number': issue['number'], - 'title': issue['title'] - } - - cursor = data["issues"]["pageInfo"]["endCursor"] - - if not data["issues"]["pageInfo"]["hasNextPage"]: - break - - def repo_commits(self, repo_slug, cursor=None): - # type: (str, str) -> Iterator[dict] - """As of June 2017 GraphQL API does not allow to get commit parents - Until this issue is fixed this method is only left for a reference - Please use commits() instead""" - owner, repo = repo_slug.split("/") - query = """query ($owner: String!, $repo: String!, $cursor: String) { - repository(name: $repo, owner: $owner) { - ref(qualifiedName: "master") { - target { ... 
on Commit { - history (first: 100, after: $cursor) { - nodes {sha:oid, author {name, email, user{login}} - message, committedDate} - pageInfo {endCursor, hasNextPage} - }}}}}}""" + """ while True: - data = self.v4(query, owner=owner, repo=repo, cursor=cursor - )['data']['repository'] - if not data: - break - - for commit in data["ref"]["target"]["history"]["nodes"]: - yield { - 'sha': commit['sha'], - 'author': commit['author']['user']['login'], - 'author_name': commit['author']['name'], - 'author_email': commit['author']['email'], - 'authored_date': None, - 'message': commit['message'], - 'committed_date': commit['committedDate'], - 'parents': None, - 'verified': None - } - - cursor = data["ref"]["target"]["history"]["pageInfo"]["endCursor"] - if not data["ref"]["target"]["history"]["pageInfo"]["hasNextPage"]: - break + payload = json.dumps({'query': query, 'variables': params}) + + r = self._request('graphql', 'post', data=payload) + if r.status_code in self.status_empty: + return + + res = self.extract_result(r) + if 'data' not in res: + raise VCSError('API didn\'t return any data:\n' + + json.dumps(res, indent=4)) + + objects = json_path(res['data'], *object_path) + if objects is None: + raise VCSError('Invalid object path "%s" in:\n %s' % + (object_path, json.dumps(res))) + if 'nodes' not in objects: + yield objects + return + for obj in objects['nodes']: + yield obj + # the result is single page, or there are no more pages + if not json_path(objects, 'pageInfo', 'hasNextPage'): + return + params['cursor'] = json_path(objects, 'pageInfo', 'endCursor') + + def repo_issues(self, repo_slug, cursor=None): + owner, repo = repo_slug.split('/') + return self.v4(""" + query ($owner: String!, $repo: String!, $cursor: String) { + repository(name: $repo, owner: $owner) { + hasIssuesEnabled + issues (first: 100, after: $cursor, + orderBy: {field:CREATED_AT, direction: ASC}) { + nodes {author {login}, closed, createdAt, + updatedAt, number, title} + pageInfo {endCursor, 
hasNextPage} + }} + }""", ('repository', 'issues'), owner=owner, repo=repo) + + def user_followers(self, user): + return self.v4(""" + query ($user: String!, $cursor: String) { + user(login: $user) { + followers(first:100, after:$cursor) { + nodes { login } + pageInfo{endCursor, hasNextPage} + }}}""", ('user', 'followers'), user=user) + + def user_info(self, user): + return next(self.v4(""" + query ($user: String!) { + user(login:$user) { + login, name, avatarUrl, websiteUrl + company, bio, location, name, twitterUsername, isHireable + # email # email requires extra scopes from the API key + createdAt, updatedAt + followers{totalCount} + following {totalCount} + }}""", ('user',), user=user)) + + def repo_commits(self, repo_slug): + owner, repo = repo_slug.split("/") + return self.v4(""" + query ($owner: String!, $repo: String!, $cursor: String) { + repository(name: $repo, owner: $owner) { + defaultBranchRef{ target { + # object(expression: "HEAD") { + ... on Commit { + history (first: 100, after: $cursor) { + nodes {sha:oid, author {name, email, user{login}} + message, committedDate + # normally there is only 1 parent; max observed is 3 + parents (first:100) { + nodes {sha:oid}} + } + pageInfo {endCursor, hasNextPage} + }}}}}}""", ('repository', 'defaultBranchRef', 'target', 'history'), + owner=owner, repo=repo) def get_limits(tokens=None): @@ -380,18 +462,17 @@ def get_limits(tokens=None): for i, token in enumerate(api.tokens): # if limit is exhausted there is no way to get username - user = token.user or "" % i + user = token.user or '' % i values = {'user': user, 'key': token.token} token.check_limits() for api_class in token.limits: - # geez, this code smells next_update = token.limits[api_class]['reset'] if next_update is None: renew = 'never' else: tdiff = datetime.fromtimestamp(next_update) - now - renew = "%dm%ds" % divmod(tdiff.seconds, 60) + renew = '%dm%ds' % divmod(tdiff.seconds, 60) values[api_class + '_renews_in'] = renew values[api_class + '_limit'] 
= token.limits[api_class]['limit'] values[api_class + '_remaining'] = token.limits[api_class]['remaining'] @@ -401,15 +482,10 @@ def get_limits(tokens=None): def print_limits(argv=None): """Check remaining limits of registered GitHub API keys""" - # import argparse - # parser = argparse.ArgumentParser( - # description="Check remaining limits of registered GitHub API keys") - # # two lines above are just to print help, so ignoring the output - # _ = parser.parse_args() - columns = ("user", "core_limit", "core_remaining", "core_renews_in", - "search_limit", "search_remaining", "search_renews_in", - "key") + columns = ('user', 'core_limit', 'core_remaining', 'core_renews_in', + 'search_limit', 'search_remaining', 'search_renews_in', + 'key') stats = list(get_limits()) @@ -417,11 +493,6 @@ def print_limits(argv=None): for values in stats) for column in columns} - def gen(): - yield "" # prepend empty line - yield " ".join(c.ljust(lens[c] + 1, " ") for c in columns) - for values in stats: - yield " ".join( - str(values[c]).ljust(lens[c] + 1, " ") for c in columns) - - return "\n".join(gen()) + print('\n', ' '.join(c.ljust(lens[c] + 1, " ") for c in columns)) + for values in stats: + print(*(str(values[c]).ljust(lens[c] + 1, " ") for c in columns)) diff --git a/stscraper/gitlab.py b/stscraper/gitlab.py deleted file mode 100644 index ebe5649..0000000 --- a/stscraper/gitlab.py +++ /dev/null @@ -1,202 +0,0 @@ -import warnings - -from .base import * -import stutils - - -def str_urlencode(string): - # TODO: a real encoder - return string.replace("/", "%2f") - - -class GitLabAPIToken(APIToken): - api_url = "https://gitlab.com/api/v4/" - - _user = None # cache user - _headers = {} - - def __init__(self, token=None, timeout=None): - super(GitLabAPIToken, self).__init__(token, timeout) - if token is not None: - self.token = token - self._headers["Private-Token"] = token - - @property - def user(self): - if self._user is None: - try: - r = self('user') - except TokenNotReady: - 
pass - else: - self._user = r.json().get('username', '') - return self._user - - def check_limits(self): - # regular limits will be updaated automatically upon request - # we only need to take care about search limit - try: - stats = self('').json()['resources'] - except TokenNotReady: - stats = {} - - for cls in self.api_classes: - self.limits[cls] = json_map({ - 'remaining': 'remaining', - 'reset': 'reset', - 'limit': 'limit', - }, stats.get(cls, {})) - - return self.limits - - def when(self, url): - key = self.api_class(url) - if self.limits[key]['remaining'] != 0: - return 0 - return self.limits[key]['reset'] - - def _update_limits(self, response, url): - if 'RateLimit-Remaining' in response.headers: - remaining = int(response.headers['RateLimit-Remaining']) - self.limits[self.api_class(url)] = { - 'remaining': remaining, - 'reset': int(response.headers['RateLimit-Reset']), - 'limit': int(response.headers['RateLimit-Limit']) - } - - if response.status_code == 429 and remaining == 0: - raise TokenNotReady - - -class GitLabAPI(VCSAPI): - """ This is a convenience class to pool GitHub API keys and update their - limits after every request. Actual work is done by outside classes, such - as _IssueIterator and _CommitIterator - """ - token_class = GitLabAPIToken - - status_not_found = (404, 422, 451) - - def __init__(self, tokens=None, timeout=30): - if not tokens: - stconfig_tokens = stutils.get_config("GITLAB_API_TOKENS") - if stconfig_tokens: - tokens = [token.strip() - for token in stconfig_tokens.split(",") - if len(token.strip()) == 20] - - if not tokens: - tokens = [None] - warnings.warn("No tokens provided. 
GitLab API will be limited to " - "600 requests per minute", Warning) - super(GitLabAPI, self).__init__(tokens, timeout) - - def has_next_page(self, response): - page = response.headers.get('X-Page') - total_pages = response.headers.get('X-Total-Pages', 0) - return page is not None and int(page) < int(total_pages) - - @api('users', paginate=True) - def all_users(self): - # https://docs.gitlab.com/ee/api/users.html#list-users - return () - - @api('projects', paginate=True) - def all_repos(self): - # https://docs.gitlab.com/ee/api/projects.html#list-all-projects - return () - - @api('projects/%s/issues', paginate=True) - def repo_issues(self, repo_name): - # https://docs.gitlab.com/ee/api/issues.html#list-project-issues - return str_urlencode(repo_name) - - @api('projects/%s/repository/commits', paginate=True) - def repo_commits(self, repo_name): - # https://docs.gitlab.com/ee/api/commits.html#list-repository-commits - return str_urlencode(repo_name) - - @api('projects/%s/merge_requests', paginate=True) - def repo_pulls(self, repo_name): - # https://docs.gitlab.com/ee/api/merge_requests.html - return str_urlencode(repo_name) - - def repo_topics(self, repo_name): - return next(self.request('projects/%s' % str_urlencode(repo_name)) - ).get('tag_list', []) - - @api('projects/%s/merge_requests/%s/commits', paginate=True) - def pull_request_commits(self, repo, pr_iid): - # https://docs.gitlab.com/ee/api/merge_requests.html#get-single-mr-commits - return str_urlencode(repo), pr_iid - - @api('projects/%s/issues/%s/notes', paginate=True) - def issue_comments(self, repo, issue_iid): - # https://docs.gitlab.com/ee/api/notes.html#list-project-issue-notes - return str_urlencode(repo), issue_iid - - @api('projects/%s/merge_requests/%s/notes', paginate=True) - def review_comments(self, repo, pr_iid): - # https://docs.gitlab.com/ee/api/notes.html#list-all-merge-request-notes - return str_urlencode(repo), pr_iid - - @api('users/%s') - def user_info(self, user): - # 
https://docs.gitlab.com/ce/api/users.html#single-user - try: - return next(self.request('users', username=user))[0]['id'] - except (StopIteration, IndexError): - raise KeyError("User does not exist") - - @api('users/%s/projects', paginate=True) - def user_repos(self, user): - # https://docs.gitlab.com/ee/api/projects.html#list-user-projects - return user - - @api('users/%s/events', paginate=True) - def user_events(self, user): - # https://docs.gitlab.com/ee/api/events.html#get-user-contribution-events - return user - - def user_orgs(self, user): - # not available in GitLab API v4 - raise NotImplementedError - - @api('/groups/%s/members/all', paginate=True) - def org_members(self, org): - return str_urlencode(org) - - @api('/groups/%s/projects', paginate=True) - def org_repos(self, org): - # TODO: recursive groups - return str_urlencode(org) - - @staticmethod - def project_exists(repo_name): - # type: (str) -> bool - """ - Unlike GitHub, GitLab will return 302 to login page - for non-existing projects - """ - return requests.head("https://gitlab.com/" + repo_name - ).status_code < 300 - - @staticmethod - def canonical_url(project_url): - # type: (str) -> str - """ - Case insensitive - Path can contain only letters, digits, '_', '-' and '.'. - Cannot start with '-', end in '.git' or end in '.atom' - - Implementation is copied from Github API - """ - url = project_url.lower() - for chunk in ("http://", "https://", "gitlab.com"): - if url.startswith(chunk): - url = url[len(chunk):] - if url.endswith("/"): - url = url[:-1] - while url.endswith(".git"): - url = url[:-4] - return "gitlab.com/" + url diff --git a/stscraper/stats.py b/stscraper/stats.py deleted file mode 100644 index c0fb62f..0000000 --- a/stscraper/stats.py +++ /dev/null @@ -1,455 +0,0 @@ - -from __future__ import print_function - -import numpy as np -import pandas as pd - -from stutils import decorators -from stutils import email_utils as email -from . 
import * - -""" First contrib date without MIN_DATE restriction: -> fcd = utils.first_contrib_dates("pypi").dropna() -> df = pd.DataFrame(fcd.rename("fcd")) -> df["url"] = utils.package_urls("pypi") -> df = df.dropna(axis=1).sort_values("fcd") -> df.groupby(df["fcd"].str[:4]).count() - -> data = df.iloc[:400] -> def second_month(row): -> cs = scraper_utils.commit_stats(row["url"]) -> return cs[cs>0].index[1] -> data["second_month"] = data.apply(second_month, axis=1) -> data.groupby(data["second_month"].str[:4]).count() - -1970: 3, 1973: 1, 1974: 3, 1997+: 2, 2, 2, 9, 14, 29, 50, 45, 99, 118, ... -looking at their second month of contributions, it is: -nothing before 1997, 1997+: 2, 0, 1, 9, 12, 18, 50, 47, 77, 113, - - -So, 1997 looks like a reasonable lower bound. -Only 7 projects (1 commit each) have such commits, so they are safe to ignore -""" - -MIN_DATE = "1997" -# username to be used all unidentified users -DEFAULT_USERNAME = "-" - -fs_cache = decorators.typed_fs_cache('scraper') - -logger = logging.getLogger("ghd.scraper") - - -def gini(x): - """ Gini index of a given iterable - simplified version from https://github.com/oliviaguest/gini - - >>> round(gini([1]*99 + [10**6]), 2) - 0.99 - >>> round(gini([1]*100), 2) - 0.0 - >>> round(gini(range(100)), 2) - 0.34 - """ - n = len(x) * 1.0 - return np.sort(x).dot(2 * np.arange(n) - n + 1) / (n * np.sum(x)) - - -def quantile(data, column, q): - # type: (pd.DataFrame, str, float) -> pd.DataFrame - """ Returns number of users responsible for a specific - - :param data: an input pd.Dataframe, e.g. commit_user_stats.reset_index() - note that without index reset commit_user_stats is a Series - :param column: a column to aggregate on, e.g. username - :param q: quantile, e.g. 
0.9 - :return: pd.Dataframe aggregated on the specified column - >>> df = pd.DataFrame({'foo': 1, 'bar': [1,1,1,1,1,1,1,1,1,1]}) - >>> quantile(df, 'foo', 0.5).loc[1, 'bar'] - 5 - >>> quantile(df, 'foo', 0.9).loc[1, 'bar'] - 9 - """ - # assert column in df.columns # - doesn't have to be, e.g. multilevel index - - # how it works: sort descending, run cumulative sum and compare to sum - # number of records under q*sum is exactly what we're looking for - return data.groupby(column).aggregate( - lambda x: sum(x.sort_values(ascending=False).cumsum() / x.sum() <= q)) - - -def user_stats(stats, date_field, aggregated_field): - # type: (pd.DataFrame, str, str) -> pd.Series - """Helper function for internal use only - Aggregates specified stats dataframe by month/users - """ - if stats.empty: - # a dirty hack to allow further aggregation - return pd.DataFrame( - columns=[date_field, 'author', aggregated_field]).set_index( - [date_field, "author"])[aggregated_field] - return stats['author'].groupby( - [stats[date_field].str[:7], stats['author']]).count().rename( - aggregated_field).astype(np.int) - - -def zeropad(df, fill_value=0): - """Ensure monthly index on the passed df, fill in gaps with zeroes - >>> df = pd.DataFrame([1,1,1], index=["2017-01", "2016-12", "2017-09"]) - >>> zp = zeropad(df) - >>> zp.index.min() - '2016-12' - >>> zp.index.max() >= "2017-12" - True - >>> 13 <= len(zp) <= 50 - True - """ - start = df.index.min() - if pd.isnull(start): - idx = [] - else: - idx = [d.strftime("%Y-%m") - for d in pd.date_range(start, 'now', freq="M")] - return df.reindex(idx, fill_value=fill_value) - - -@fs_cache('raw') -def commits(repo_url): - # type: (str) -> pd.DataFrame - """ - convert old cache files: - find -type f -name '*.csv' -exec rename 's/(?<=\/)commits\./_commits./' {} + - - >>> cs = commits("github.com/benjaminp/six") - >>> isinstance(cs, pd.DataFrame) - True - >>> 450 < len(cs) < 2000 # 454 as of Jan 2018 - True - >>> 
len(commits("github.com/user2589/nothingtoseehere")) - Traceback (most recent call last): - ... - RepoDoesNotExist: GH API returned status 404 - """ - provider, project_url = get_provider(repo_url) - return pd.DataFrame( - provider.repo_commits(project_url), - columns=['sha', 'author', 'author_name', 'author_email', - 'authored_date', 'committed_date', 'parents'] - ).set_index('sha', drop=True) - - -# @fs_cache('aggregate', 2) -def commit_user_stats(repo_name): - # type: (str) -> pd.Series - """ - :param repo_name: str, repo name (e.g. github.com/pandas-dev/pandas - :return a dataframe indexed on (month, username) with a commits column - - # This repo contains one commit out of order 2005 while repo started in 2016 - >>> cus = commit_user_stats("github.com/django/django") - >>> isinstance(cus, pd.Series) - True - >>> 4100 < len(cus) < 8000 # 4155 unique month/user combinations / Jan 18 - True - >>> 13 < cus["2017-12"]["sir-sigurd"] < 100 # 22 as of Jan 2018 - True - >>> "2005" < cus.reset_index()["authored_date"].min() < "2009" - True - >>> "2017" < cus.reset_index()["authored_date"].max() < "2022" - True - >>> len(cus.reset_index().columns) - 3 - >>> 1 <= len(commit_user_stats("github.com/user2589/schooligan")) < 10 # 1 - True - """ - stats = commits(repo_name) - # check for null and empty string is required because of file caching. 
- # commits scraped immediately will have empty string, but after save/load - # it will be converted to NaN by pandas - min_date = stats.loc[stats["parents"].isnull() - | (~stats["parents"].astype(bool)), - "authored_date"].min() - stats = stats[stats["authored_date"] >= min_date] - stats['author'] = stats['author'].fillna(DEFAULT_USERNAME) - return user_stats(stats, "authored_date", "commits") - - -# @fs_cache('aggregate') -def commit_stats(repo_name): - # type: (str) -> pd.Series - """Commits aggregated by month - - >>> cs = commit_stats("github.com/django/django") - >>> isinstance(cs, pd.Series) - True - >>> 140 < len(cs) < 240 - True - >>> 100 < cs["2017-12"] < 200 - True - """ - return zeropad(commit_user_stats(repo_name).groupby('authored_date').sum()) - - -# @fs_cache('aggregate') -def commit_users(repo_name): - # type: (str) -> pd.Series - """Number of contributors by month - - >>> cu = commit_users("github.com/django/django") - >>> isinstance(cu, pd.Series) - True - >>> 140 < len(cu) < 240 - True - >>> 30 < cu["2017-12"] < 100 # 32 - True - """ - return commit_user_stats(repo_name).groupby( - 'authored_date').count().rename("users") - - -# @fs_cache('aggregate') -def commit_gini(repo_name): - # type: (str) -> pd.Series - """ - >>> g = commit_gini("github.com/django/django") - >>> isinstance(g, pd.Series) - True - >>> 150 < len(g) < 240 - True - >>> all(0 <= i <= 1 for i in g) - True - """ - return commit_user_stats(repo_name).groupby( - "authored_date").aggregate(gini).rename("gini") - - -def contributions_quantile(repo_name, q): - # type: (str, float) -> pd.Series - """ - >>> q50 = contributions_quantile("github.com/django/django", 0.5) - >>> isinstance(q50, pd.Series) - True - >>> 140 < len(q50) < 240 - True - >>> all(q50 >= 0) - True - >>> 0 < q50["2017-12"] < 10 # 2 - True - """ - return quantile(commit_user_stats(repo_name).reset_index(), - "authored_date", q)["commits"].rename("q%g" % (q*100)) - - -@fs_cache('raw') -def issues(repo_url): - # type: 
(str) -> pd.DataFrame - """ Get a dataframe with issues - - >>> iss = issues("github.com/benjaminp/six") - >>> isinstance(iss, pd.DataFrame) - True - >>> 180 < len(iss) < 500 # 191 as of Jan 2018 - True - >>> len(issues("github.com/user2589/minicms")) - 0 - """ - provider, project_url = get_provider(repo_url) - return pd.DataFrame( - provider.repo_issues(project_url), - columns=['number', 'author', 'closed', 'created_at', 'updated_at', - 'closed_at']).set_index('number', drop=True) - - -# @fs_cache('aggregate') -def non_dev_issues(repo_name): - # type: (str) -> pd.DataFrame - """Same as new_issues with subtracted issues authored by contributors - - >>> ndi = non_dev_issues("github.com/benjaminp/six") - >>> isinstance(ndi, pd.DataFrame) - True - >>> 20 < len(ndi) < len(issues("github.com/benjaminp/six")) # 23 as of 2018 - True - """ - cs = commits(repo_name)[['authored_date', 'author']] - fc = cs.loc[pd.notnull(cs['author'])].groupby( - 'author').min()['authored_date'] - - i = issues(repo_name)[['created_at', 'author']].sort_values('created_at') - i['fc'] = i['author'].map(fc) - return i.loc[~(i['fc'] < i['created_at']), ['author', 'created_at']] - - -# @fs_cache('aggregate', 2) -def issue_user_stats(repo_name): - # type: (str) -> pd.Series - """ - >>> ius = issue_user_stats("github.com/pandas-dev/pandas") - >>> isinstance(ius, pd.Series) - True - >>> 6000 < len(ius) < 10000 # 6261 - True - >>> 12 < ius["2017-12"]["toobaz"] < 24 # 13 - True - >>> (ius > 0).all() - True - """ - return user_stats(issues(repo_name), "created_at", "new_issues") - - -# @fs_cache('aggregate', 2) -def non_dev_issue_user_stats(repo_name): - return user_stats(non_dev_issues(repo_name), "created_at", "new_issues") - - -# @fs_cache('aggregate') -def new_issues(repo_name): - # type: (str) -> pd.Series - """ New issues aggregated by month - - >>> iss = new_issues("github.com/pandas-dev/pandas") - >>> isinstance(iss, pd.Series) - True - >>> 78 < len(iss) < 100 # 88 - True - >>> 200 < 
iss["2017-12"] < 300 # 211 - True - """ - return issue_user_stats(repo_name).groupby('created_at').sum() - - -# @fs_cache('aggregate') -def non_dev_issue_stats(repo_name): - # type: (str) -> pd.Series - """Same as new_issues, not counting issues submitted by developers - >>> ndi = non_dev_issue_stats("github.com/pandas-dev/pandas") - >>> isinstance(ndi, pd.Series) - True - >>> 78 < len(ndi) < 180 - True - >>> (new_issues("github.com/pandas-dev/pandas") >= ndi).all() - True - """ - i = non_dev_issues(repo_name) - return i.groupby(i['created_at'].str[:7]).count()['created_at'].rename( - "non_dev_issues") - - -# @fs_cache('aggregate') -def submitters(repo_name): - # type: (str) -> pd.Series - """Number of submitters aggregated by month - - >>> ss = submitters("github.com/pandas-dev/pandas") - >>> isinstance(ss, pd.Series) - True - >>> 78 < len(ss) < 180 - True - >>> all(ss >= 0) - True - >>> (new_issues("github.com/pandas-dev/pandas") >= ss).all() - True - """ - return issue_user_stats(repo_name).groupby( - 'created_at').count().rename("submitters") - - -# @fs_cache('aggregate') -def non_dev_submitters(repo_name): - # type: (str) -> pd.Series - """New issues aggregated by month - >>> nds = non_dev_submitters("github.com/pandas-dev/pandas") - >>> isinstance(nds, pd.Series) - True - >>> 80 < len(nds) < 180 - True - >>> (nds >= 0).all() - True - >>> (non_dev_issue_stats("github.com/pandas-dev/pandas") >= nds).all() - True - """ - return non_dev_issue_user_stats(repo_name).groupby( - 'created_at').count().rename("non_dev_submitters") - - -@fs_cache('aggregate') -def closed_issues(repo_name): - # type: (str) -> pd.Series - """New issues aggregated by month - - >>> ci = closed_issues("github.com/pandas-dev/pandas") - >>> isinstance(ci, pd.Series) - True - >>> 80 < len(ci) < 150 - True - >>> 170 < ci["2017-12"] < 1000 # 179 - True - >>> (ci >= 0).all() - True - """ - df = issues(repo_name) - closed = df.loc[df['closed'], 'closed_at'].astype(object) - return 
closed.groupby(closed.str[:7]).count() - - -@fs_cache('aggregate') -def open_issues(repo_name): - # type: (str) -> pd.Series - """Open issues aggregated by month - - >>> oi = open_issues("github.com/pandas-dev/pandas") - >>> isinstance(oi, pd.Series) - True - >>> 80 < len(oi) < 150 - True - >>> (oi.dropna() >= 0).all() - True - """ - submitted = new_issues(repo_name).cumsum() - closed = closed_issues(repo_name).cumsum() - res = submitted - closed - return res.rename("open_issues") - - -# @fs_cache('aggregate') -def commercial_involvement(url): - # type: (str) -> pd.Series - """ - >>> ci = commercial_involvement("github.com/pandas-dev/pandas") - >>> isinstance(ci, pd.Series) - True - >>> 100 < len(ci) < 150 - True - >>> (0 <= ci).all() - True - >>> (1 >= ci).all() - True - """ - cs = commits(url)[['authored_date', 'author_email']] - cs["commercial"] = email.is_commercial_bulk(cs["author_email"]) - stats = cs.groupby(cs['authored_date'].str[:7]).agg( - {'authored_date': 'count', 'commercial': 'sum'} - ).rename(columns={'authored_date': "commits"}) - return (stats["commercial"] / stats["commits"]).rename("commercial") - - -# @fs_cache('aggregate') -def university_involvement(url): - # type: (str) -> pd.Series - """ - >>> ui = university_involvement("github.com/pandas-dev/pandas") - >>> isinstance(ui, pd.Series) - True - >>> 100 < len(ui) < 150 - True - >>> (0 <= ui).all() - True - >>> (1 >= ui).all() - True - """ - cs = commits(url)[['authored_date', 'author_email']] - cs["university"] = email.is_university_bulk(cs["author_email"]) - stats = cs.groupby(cs['authored_date'].str[:7]).agg( - {'authored_date': 'count', 'university': 'sum'} - ).rename(columns={'authored_date': "commits"}) - return (stats["university"] / stats["commits"]).rename("university") diff --git a/test.py b/test.py index bc2524b..6656b1a 100755 --- a/test.py +++ b/test.py @@ -6,6 +6,16 @@ import stscraper +class TestBase(unittest.TestCase): + + def test_add_keys(self): + api = 
stscraper.VCSAPI('key1,key2,key1') + self.assertEqual(len(api.tokens), 2) + api2 = stscraper.VCSAPI('key3,key1,key4') + self.assertTrue(api2 is api) + self.assertEqual(len(api.tokens), 4) + + class TestGitHub(unittest.TestCase): def setUp(self): @@ -237,196 +247,25 @@ def test_project_exists(self): self.assertFalse(self.api.project_exists('user2589/nonexistent')) -# class TestGitLab(unittest.TestCase): -# -# def setUp(self): -# self.api = stscraper.GitLabAPI() -# self.repo_address = 'gitlab-org/gitlab-ce' -# -# def _test_user(self, user, simple=True): -# self.assertIsInstance(user, dict) -# for prop in ('id', 'username', 'name', 'state', ): -# self.assertIn(prop, user, -# "User object is expected to have '%s' property," -# " but it doesn't" % prop) -# if simple: -# return -# for prop in ('avatar_url', 'created_at', 'bio', 'location', 'skype', -# 'linkedin', 'twitter', 'website_url', 'organization'): -# self.assertIn(prop, user, -# "User object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_commits(self, commit): -# self.assertIsInstance(commit, dict) -# for prop in ('id', 'short_id', 'title', 'author_name', 'author_email', -# 'authored_date', 'committer_name', 'committer_email', -# 'committed_date', 'created_at', 'message', 'parent_ids'): -# self.assertIn(prop, commit, -# "Commit object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_issue(self, issue): -# self.assertIsInstance(issue, dict) -# for prop in ('id', 'iid', 'project_id', 'title', 'description', 'state', -# 'created_at', 'updated_at', # 'closed_by', 'closed_at', -# 'author', 'labels', 'upvotes', # 'assignees', 'assignee', -# 'downvotes', 'discussion_locked'): -# self.assertIn(prop, issue, -# "Issue object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_issue_comments(self, comment): -# self.assertIsInstance(comment, dict) -# for prop in ('id', 'body', 'attachment', 'author', 'created_at', -# 'updated_at', 
'system', 'noteable_id', 'noteable_type', -# 'noteable_iid'): -# self.assertIn(prop, comment, -# "Issue comment is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def _test_repo(self, repo): -# self.assertIsInstance(repo, dict) -# for prop in ('id', 'description', 'default_branch', 'tag_list', 'name', -# 'path', 'path_with_namespace', 'forks_count', 'star_count', -# 'created_at', 'last_activity_at', 'issues_enabled', -# 'merge_method', 'creator_id', 'import_status', 'archived', -# 'wiki_enabled', 'snippets_enabled', 'open_issues_count', -# 'merge_requests_enabled', -# 'namespace', 'container_registry_enabled', 'public_jobs'): -# self.assertIn(prop, repo, -# "Repository object is expected to have '%s' property," -# " but it doesn't" % prop) -# -# def test_all_users(self): -# users = self.api.all_users() -# self.assertIsInstance(users, Generator) -# user = next(users) -# self._test_user(user) -# -# def test_all_repos(self): -# repos = self.api.all_repos() -# self.assertIsInstance(repos, Generator) -# repo = next(repos) -# self._test_repo(repo) -# -# def test_repo_issues(self): -# issues = self.api.repo_issues(self.repo_address) -# self.assertIsInstance(issues, Generator) -# issue = next(issues) -# self._test_issue(issue) -# -# def test_repo_commits(self): -# commits = self.api.repo_commits(self.repo_address) -# self.assertIsInstance(commits, Generator) -# commit = next(commits) -# self._test_commits(commit) -# -# def test_repo_pulls(self): -# pulls = self.api.repo_pulls(self.repo_address) -# self.assertIsInstance(pulls, Generator) -# pr = next(pulls) -# self._test_issue(pr) -# for prop in ('target_branch', 'source_branch', 'source_project_id', -# 'target_project_id', 'work_in_progress', 'merge_status', -# 'merge_commit_sha', 'sha', 'user_notes_count', 'squash', -# 'time_stats', 'approvals_before_merge'): -# self.assertIn(prop, pr, -# "Merge request is expected to have '%s' property, " -# "but it doesn't" % prop) -# -# def test_repo_topics(self): 
-# topics = self.api.repo_topics(self.repo_address) -# self.assertIsInstance(topics, list) -# -# def test_pull_request_commits(self): -# # https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/21628 -# commits = self.api.pull_request_commits(self.repo_address, 21628) -# self.assertIsInstance(commits, Generator) -# commit = next(commits) -# self._test_commits(commit) -# -# def test_issue_comments(self): -# # https://gitlab.com/gitlab-org/gitlab-ce/issues/2978 -# comments = self.api.issue_comments(self.repo_address, 2978) -# self.assertIsInstance(comments, Generator) -# comment = next(comments) -# self._test_issue_comments(comment) -# -# def test_review_comments(self): -# # https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/21038 -# comments = self.api.review_comments(self.repo_address, 21038) -# self.assertIsInstance(comments, Generator) -# comment = next(comments) -# self._test_issue_comments(comment) -# -# def test_user_info(self): -# user = self.api.user_info('user2589') -# self._test_user(user, simple=False) -# -# def test_user_repos(self): -# """Get list of user repositories""" -# repos = self.api.user_repos('user2589') -# self.assertIsInstance(repos, Generator) -# repo = next(repos) -# self._test_repo(repo) -# -# def test_user_orgs(self): -# # not available in GitLab API v4 -# with self.assertRaises(NotImplementedError): -# self.api.user_orgs('user2589') -# -# def test_org_members(self): -# members = self.api.org_members('Inkscape') -# self.assertIsInstance(members, Generator) -# user = next(members) -# self._test_user(user) -# -# def test_org_repos(self): -# repos = self.api.org_repos('gitlab-org') -# self.assertIsInstance(repos, Generator) -# repo = next(repos) -# self._test_repo(repo) -# -# def test_pagination(self): -# # 193 commits as of Aug 2018 -# commits = list(self.api.repo_commits('user2589/ghd')) -# self.assertGreater(len(commits), 190) -# -# def test_project_exists(self): -# self.assertTrue(self.api.project_exists(self.repo_address)) -# 
self.assertFalse(self.api.project_exists('user2589/nonexistent')) - - -class TestBitBucket(unittest.TestCase): +class TestGitHubv4(unittest.TestCase): def setUp(self): - self.api = stscraper.BitbucketAPI() - self.repo_address = 'zzzeek/sqlalchemy' - - -class TestGeneric(unittest.TestCase): - - def setUp(self): - self.scraper = stscraper.GenericScraper() - self.full_url = 'https://github.com/cmustrudel/strudel.scraper' - self.repo_slug = 'cmustrudel/strudel.scraper' - - def test_commits(self): - fields = stscraper.MAPPINGS['repo_commits']['fields'] - count = 0 - for commit in self.scraper.repo_commits(self.full_url, self.repo_slug): - self.assertTrue(all(field in commit for field in fields), - "Some commits are missing expected fields") - count += 1 - self.assertTrue( - count, "Zero commits returned by GenericScraper.repo_commits") - + self.api = stscraper.GitHubAPIv4() + self.repo_address = 'pandas-dev/pandas' -class TestStats(unittest.TestCase): + def test_user_info(self): + # Docs: https://developer.github.com/v3/users/#response + user_info = self.api.user_info('user2589') + self.assertIsInstance(user_info, dict) + for prop in ('login', 'name', 'avatarUrl', 'websiteUrl', 'company', + 'bio', 'location', 'twitterUsername', + 'isHireable', 'createdAt', 'updatedAt', + 'followers', 'following'): + self.assertIn(prop, user_info) - def test_(self): - pass + def test_pagination(self): + commits = list(self.api.repo_commits('benjaminp/six')) + self.assertGreater(len(commits), 463) if __name__ == "__main__":