From bb92758cfc6c75316a8d7a5bdd48cd166d2e563e Mon Sep 17 00:00:00 2001 From: user2589 Date: Thu, 24 Sep 2020 22:57:15 -0400 Subject: [PATCH] feat: support pagination in GitHub graphql APIs returning objects in 'edges' --- stscraper/base.py | 21 ++++++++++++--------- stscraper/github.py | 26 +++++++++++++++++--------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/stscraper/base.py b/stscraper/base.py index c89e28b..bef6671 100644 --- a/stscraper/base.py +++ b/stscraper/base.py @@ -71,18 +71,18 @@ def parse_url(url): return None, None -def json_path(obj, *path): +def json_path(obj, path, raise_on_missing=False): """ Get a dict value by the specified path. >>> obj = {'author': {'name': 'John'}, 'committer': None, ... 'labels': [{'name': 'Bug'}, {'name': 'Good first issue'}]} - >>> json_path(obj, 'author', 'name') + >>> json_path(obj, ('author', 'name')) 'John' - >>> json_path(obj, 'committer', 'name') is None + >>> json_path(obj, ('committer', 'name')) is None True - >>> json_path(obj, 'committer') is None + >>> json_path(obj, ('committer',)) is None True - >>> json_path(obj, 'labels', ',name') + >>> json_path(obj, ('labels', ',name')) 'Bug,Good first issue' """ for chunk in path: @@ -90,9 +90,12 @@ def json_path(obj, *path): obj = ",".join(str(item.get(chunk[1:])) for item in obj) # supported only for the last chunk in the path, so break break - obj = obj.get(chunk) - if obj is None: - break + if chunk not in obj: + if raise_on_missing: + raise IndexError('Path does not exist') + else: + return None + obj = obj[chunk] return obj @@ -107,7 +110,7 @@ def json_map(mapping, obj): >>> json_map({"author_login": "author__name", 'foo': 'bar'}, obj) {'author_login': 'John', 'foo': None} """ - return {key: json_path(obj, *path.split("__")) + return {key: json_path(obj, path.split("__")) for key, path in mapping.items()} diff --git a/stscraper/github.py b/stscraper/github.py index bd0d86d..1739186 100644 --- a/stscraper/github.py +++ b/stscraper/github.py @@ -233,7 +233,7 @@ def repo_contributors(self, repo_slug): url = 'repos/%s/stats/contributors' % repo_slug for contributor_stats in next(self.request(url)): record = {w['w']: w['c'] for w in contributor_stats['weeks']} - record['user'] = json_path(contributor_stats, 'author', 'login') + record['user'] = json_path(contributor_stats, ('author', 'login')) yield record @api('repos/%s/pulls/%d/commits', paginate=True, state='all') @@ -382,20 +382,28 @@ def v4(self, query, object_path=(), **params): if 'data' not in res: raise VCSError('API didn\'t return any data:\n' + json.dumps(res, indent=4)) + data = res['data'] - objects = json_path(res['data'], *object_path) - if objects is None: + try: + objects = json_path(data, object_path, raise_on_missing=True) + except IndexError: raise VCSError('Invalid object path "%s" in:\n %s' % - (object_path, json.dumps(res))) - if 'nodes' not in objects: + (object_path, json.dumps(data))) + + has_next_page = json_path(objects, ('pageInfo', 'hasNextPage')) + if not json_path(objects, ('pageInfo', 'hasNextPage')): yield objects return - for obj in objects['nodes']: + # This is due to inconsistency in graphql API. + # In most cases, requests returning lists of objects put them in + # 'nodes', but in few legacy methods they use 'edges' + nodes = objects.get('nodes') or objects.get('edges') + if not nodes: + break + for obj in nodes: yield obj # the result is single page, or there are no more pages - if not json_path(objects, 'pageInfo', 'hasNextPage'): - return - params['cursor'] = json_path(objects, 'pageInfo', 'endCursor') + params['cursor'] = json_path(objects, ('pageInfo', 'endCursor')) def repo_issues(self, repo_slug, cursor=None): owner, repo = repo_slug.split('/')