feat: support pagination in GitHub graphql APIs returning objects in 'edges'
CMUSTRUDEL · Sep 25, 2020 · bb92758 · bb92758
1 parent c1120ca
commit bb92758
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 18 deletions.
diff --git a/stscraper/base.py b/stscraper/base.py
@@ -71,28 +71,40 @@ def parse_url(url):
     return None, None
 
 
-def json_path(obj, *path):
+def json_path(obj, path, raise_on_missing=False):
     """ Get a dict value by the specified path.
 
+    :param obj: dict (e.g. a parsed JSON object) to traverse
+    :param path: iterable of string keys; a last chunk starting with ','
+        joins the values of that key across a list of dicts
+    :param raise_on_missing: raise IndexError instead of returning None
+        when the path cannot be followed
+    :return: the value found at the path, or None if the path is missing
+
     >>> obj = {'author': {'name': 'John'}, 'committer': None,
     ...        'labels': [{'name': 'Bug'}, {'name': 'Good first issue'}]}
-    >>> json_path(obj, 'author', 'name')
+    >>> json_path(obj, ('author', 'name'))
     'John'
-    >>> json_path(obj, 'committer', 'name') is None
+    >>> json_path(obj, ('committer', 'name')) is None
     True
-    >>> json_path(obj, 'committer') is None
+    >>> json_path(obj, ('committer',)) is None
     True
-    >>> json_path(obj, 'labels', ',name')
+    >>> json_path(obj, ('labels', ',name'))
     'Bug,Good first issue'
     """
     for chunk in path:
         if chunk.startswith(","):
             obj = ",".join(str(item.get(chunk[1:])) for item in obj)
             # supported only for the last chunk in the path, so break
             break
-        obj = obj.get(chunk)
-        if obj is None:
-            break
+        # an intermediate value may be None (or another non-dict), in which
+        # case `chunk not in obj` would raise TypeError; treat it as missing
+        if not isinstance(obj, dict) or chunk not in obj:
+            if raise_on_missing:
+                raise IndexError('Path does not exist')
+            else:
+                return None
+        obj = obj[chunk]
     return obj
 
 
@@ -107,7 +110,7 @@ def json_map(mapping, obj):
     >>> json_map({"author_login": "author__name", 'foo': 'bar'}, obj)
     {'author_login': 'John', 'foo': None}
     """
-    return {key: json_path(obj, *path.split("__"))
+    return {key: json_path(obj, path.split("__"))
             for key, path in mapping.items()}
 
 

diff --git a/stscraper/github.py b/stscraper/github.py
@@ -233,7 +233,7 @@ def repo_contributors(self, repo_slug):
         url = 'repos/%s/stats/contributors' % repo_slug
         for contributor_stats in next(self.request(url)):
             record = {w['w']: w['c'] for w in contributor_stats['weeks']}
-            record['user'] = json_path(contributor_stats, 'author', 'login')
+            record['user'] = json_path(contributor_stats, ('author', 'login'))
             yield record
 
     @api('repos/%s/pulls/%d/commits', paginate=True, state='all')
@@ -382,20 +382,28 @@ def v4(self, query, object_path=(), **params):
             if 'data' not in res:
                 raise VCSError('API didn\'t return any data:\n' +
                                json.dumps(res, indent=4))
+            data = res['data']
 
-            objects = json_path(res['data'], *object_path)
-            if objects is None:
+            try:
+                objects = json_path(data, object_path, raise_on_missing=True)
+            except IndexError:
                 raise VCSError('Invalid object path "%s" in:\n %s' %
-                               (object_path, json.dumps(res)))
-            if 'nodes' not in objects:
+                               (object_path, json.dumps(data)))
+
+            has_next_page = json_path(objects, ('pageInfo', 'hasNextPage'))
+            if not json_path(objects, ('pageInfo', 'hasNextPage')):
                 yield objects
                 return
-            for obj in objects['nodes']:
+            # This is due to inconsistency in graphql API.
+            # In most cases, requests returning lists of objects put them in
+            # 'nodes', but in few legacy methods they use 'edges'
+            nodes = objects.get('nodes') or objects.get('edges')
+            if not nodes:
+                break
+            for obj in nodes:
                 yield obj
             # the result is single page, or there are no more pages
-            if not json_path(objects, 'pageInfo', 'hasNextPage'):
-                return
-            params['cursor'] = json_path(objects, 'pageInfo', 'endCursor')
+            params['cursor'] = json_path(objects, ('pageInfo', 'endCursor'))
 
     def repo_issues(self, repo_slug, cursor=None):
         owner, repo = repo_slug.split('/')