feat: generic API interface
user2589 committed Feb 26, 2019
1 parent 4c9ffec commit ea13280
Showing 4 changed files with 198 additions and 39 deletions.
39 changes: 2 additions & 37 deletions stscraper/__init__.py
@@ -1,42 +1,7 @@

from .base import *
from .github import GitHubAPI
from .gitlab import GitLabAPI
from .bitbucket import BitbucketAPI
from .generic import *

# make sure to update setup.py
__version__ = '0.2.6'
__author__ = "Marat (@cmu.edu)"

PROVIDERS = {
"github.com": GitHubAPI,
# https://developer.atlassian.com/bitbucket/api/2/reference/resource/
"bitbucket.org": BitbucketAPI,
# https://docs.gitlab.com/ee/api/
"gitlab.org": GitLabAPI,
# https://anypoint.mulesoft.com/apiplatform/sourceforge/
"sourceforge.net": None,
}


def get_provider(url):
# type: (str) -> (str, str)
""" Separate provided URL into parovider and provider-specific project ID
:param url: url matching URL_PATTERN
:return: (provider, project_id)
>>> prov, proj_id = get_provider("github.com/abc/def")
>>> prov is GitHubAPI
True
>>> proj_id
'abc/def'
>>> prov, proj_id = get_provider("someothersource.com/abc/def")
"""
provider_name, project_url = parse_url(url)
provider = PROVIDERS.get(provider_name)
if provider is None:
raise NotImplementedError(
"Provider %s is not supported (yet?)" % provider_name)
return provider, project_url
__license__ = "GPL v3"
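
With the star import from `.generic` replacing the removed block, `PROVIDERS`, `get_provider`, and the new `GenericScraper` should still be reachable from the package root, so existing call sites keep working. A minimal sketch of that assumption (it relies on `generic.py` not defining a restrictive `__all__`):

import stscraper

# get_provider now lives in stscraper/generic.py but is re-exported through
# the package root via `from .generic import *` (assumption: nothing in
# generic.py filters it out).
provider_cls, project_id = stscraper.get_provider('github.com/abc/def')
assert provider_cls is stscraper.GitHubAPI
assert project_id == 'abc/def'
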
1 change: 1 addition & 0 deletions stscraper/deprecated.py
@@ -105,6 +105,7 @@ def commits_pygit2(repo_url, remove=True):

def issues_PyGithub(github_token, repo_name):
""" Iterate issues of a GitHub repository using GitHub API v3
The library used in this method, PyGithub, tries to extensively resolve
attributes, which leads to a number of excessive API calls and computational
overhead. This implementation tries to avoid that, and was replaced by
181 changes: 181 additions & 0 deletions stscraper/generic.py
@@ -0,0 +1,181 @@

from .base import *
from .github import GitHubAPI
from .gitlab import GitLabAPI
from .bitbucket import BitbucketAPI

"""
Standard interface to all supported code hosting platforms.
Two important distinctions compared to the provider-specific interfaces:
1. URLs must include the code hosting platform itself, i.e. instead of
`cmustrudel/strudel.scraper` one should use
`github.com/cmustrudel/strudel.scraper`.
2. Returned objects are simplified to a common subset of fields
"""

PROVIDERS = {
"github.com": GitHubAPI,
# https://developer.atlassian.com/bitbucket/api/2/reference/resource/
"bitbucket.org": BitbucketAPI,
# https://docs.gitlab.com/ee/api/
"gitlab.org": GitLabAPI,
# https://anypoint.mulesoft.com/apiplatform/sourceforge/
"sourceforge.net": None,
}


def get_provider(url):
# type: (str) -> (str, str)
""" Separate provided URL into provider and project ID
:param url: url matching URL_PATTERN
:return: (provider_cls, project_id)
>>> prov, proj_id = get_provider("github.com/abc/def")
>>> isinstance(prov, github.GitHubAPI)
True
>>> proj_id
'abc/def'
>>> prov, proj_id = get_provider("someothersource.com/abc/def")
"""
provider_name, project_url = parse_url(url)
provider_cls = PROVIDERS.get(provider_name)
if provider_cls is None:
raise NotImplementedError(
"Provider %s is not supported (yet?)" % provider_name)
return provider_cls, project_url


MAPPINGS = {
'repo_commits': {
'fields': (
'sha', 'author', 'author_email', 'author_name', 'authored_at',
'committer', 'committer_email', 'committed_at', 'comment_count',
'message', 'verified'),
'github.com': {
'sha': 'sha',
'author': 'author__login',
'author_email': 'commit__author__email',
'author_name': 'commit__author__name',
'authored_at': 'commit__author__date',
'committer': 'commit__committer__login',
'committer_email': 'commit__committer__email',
'committed_at': 'commit__committer__date',
'comment_count': 'commit__comment_count',
'message': 'commit__message',
'verified': 'commit__verification__verified',
'parents': 'parents__,sha'
},
},
'repo_issues': {
'fields': (
'number', 'user', 'role', 'title', 'body', 'assignee', 'id',
'state', 'created_at', 'updated_at', 'closed_at', 'reactions'),
'github.com': {
'number': 'number',
'user': 'user__login',
'role': 'author_association',
'title': 'title',
'body': 'body',
'assignee': 'assignee',
'id': 'id',
'state': 'state',
'created_at': 'created_at',
'updated_at': 'updated_at',
'closed_at': 'closed_at',
'reactions': 'reactions__total_count',
'pull_request_url': 'pull_request__url',
'labels': 'labels__,name',
},
},
'repo_pulls': {
'fields': (
'number', 'title', 'body', 'state', 'user', 'head',
'head_branch', 'base', 'base_branch', 'created_at',
'updated_at', 'closed_at', 'merged_at', 'role'),
'github.com': {
'number': 'number',
'title': 'title',
'body': 'body',
'state': 'state',
'user': 'user__login',
'head': 'head__repo__full_name',
'head_branch': 'head__ref',
'base': 'base__repo__full_name',
'base_branch': 'base__ref',
'created_at': 'created_at',
'updated_at': 'updated_at',
'closed_at': 'closed_at',
'merged_at': 'merged_at',
'role': 'author_association',
'labels': 'labels__,name',
},
},
'review_comments': {
'fields': ( # 'pr_no',
'id', 'user', 'created_at', 'updated_at',
'body', 'path', 'position', 'role'),
'github.com': {
# TODO: 'pr_no': 'pr_no', # from call params
'id': 'id',
'body': 'body',
'user': 'user__login',
'role': 'author_association',
'created_at': 'created_at',
'updated_at': 'updated_at',
'path': 'path',
'position': 'original_position',
},
},
'issue_comments': {
'fields': ( # 'issue_no',
'id', 'user', 'created_at', 'updated_at',
'body', 'role', 'reactions'),
'github.com': {
'id': 'id',
'body': 'body',
'user': 'user__login',
'role': 'author_association',
'created_at': 'created_at',
'updated_at': 'updated_at',
'reactions': 'reactions__total_count',
# TODO: 'issue_no': int(comment['issue_url'].rsplit("/", 1)[-1]),
}
},
}


class GenericScraper(object):
""" Get a small but consistent subset of fields across all VCS providers
This interface supports the same API as all other VCS providers,
with one addition: the repository URL must be passed
in front of all other params. For example,
>>> GitHubAPI().repo_commits("user/repo")
is equivalent to:
>>> GenericScraper().repo_commits("https://github.com/user/repo", "user/repo")
"""
def __getattribute__(self, attr):
if not hasattr(VCSAPI, attr):
raise AttributeError("'GenericScraper' has no attribute '%s'" % attr)
if attr not in MAPPINGS:
raise NotImplementedError(
"Generic API '%s' has not been implemented yet" % attr)
mappings = MAPPINGS[attr]

def wrapper(url, *args):
provider_name, _ = parse_url(url)
if provider_name not in mappings:
raise NotImplementedError(
"Generic API '%s' has not been implemented for '%s' yet"
"" % (attr, provider_name))
mapping = mappings[provider_name]
provider_cls, _ = get_provider(url)
provider = provider_cls()

for item in getattr(provider, attr)(*args):
yield json_map(mapping, item)

return wrapper
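
The mapping values in MAPPINGS use a double-underscore path syntax ('commit__author__email') plus a comma-prefixed segment for list fields ('parents__,sha', 'labels__,name'). The `json_map` helper applied in the wrapper comes from `stscraper.base` and is not shown in this diff; the following is only a sketch of how such paths are presumably resolved, not the actual implementation:

from stscraper.generic import MAPPINGS


def resolve_path(item, path):
    """Follow a 'commit__author__email'-style path through nested dicts.
    A segment starting with ',' (as in 'parents__,sha') is assumed to mean
    'take this key from every element of a list'."""
    value = item
    for segment in path.split('__'):
        if value is None:
            return None
        if segment.startswith(','):
            return [elem.get(segment[1:]) for elem in value]
        value = value.get(segment)
    return value


def sketch_json_map(mapping, item):
    """Project a raw API object onto the common field subset."""
    return {field: resolve_path(item, path) for field, path in mapping.items()}


raw_commit = {'sha': 'ea13280',
              'commit': {'author': {'email': 'dev@example.com'}}}
flat = sketch_json_map(MAPPINGS['repo_commits']['github.com'], raw_commit)
# flat['sha'] == 'ea13280', flat['author_email'] == 'dev@example.com';
# fields absent from the raw object simply come back as None.
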
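Based on the class docstring and the new test below, a typical call passes the full repository URL first and the provider-specific slug second; each yielded item is a plain dict restricted to the fields declared in MAPPINGS. A hypothetical usage sketch (assuming any credentials required by GitHubAPI are already configured):

from stscraper import GenericScraper, MAPPINGS

scraper = GenericScraper()
fields = MAPPINGS['repo_commits']['fields']

# The first argument only selects the provider; the remaining arguments are
# forwarded unchanged to the provider-specific method (GitHubAPI.repo_commits
# in this case), so the repository slug still has to be passed explicitly.
for commit in scraper.repo_commits(
        'https://github.com/cmustrudel/strudel.scraper',
        'cmustrudel/strudel.scraper'):
    assert all(field in commit for field in fields)
    print(commit['sha'], commit['author'], commit['authored_at'])
    break
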
16 changes: 14 additions & 2 deletions test.py
@@ -531,8 +531,20 @@ def setUp(self):

class TestGeneric(unittest.TestCase):

def test_(self):
pass
def setUp(self):
self.scraper = stscraper.GenericScraper()
self.full_url = 'https://github.com/cmustrudel/strudel.scraper'
self.repo_slug = 'cmustrudel/strudel.scraper'

def test_commits(self):
fields = stscraper.MAPPINGS['repo_commits']['fields']
count = 0
for commit in self.scraper.repo_commits(self.full_url, self.repo_slug):
self.assertTrue(all(field in commit for field in fields),
"Some commits are missing expected fields")
count += 1
self.assertTrue(
count, "Zero commits returned by GenericScraper.repo_commits")


class TestStats(unittest.TestCase):
