Skip to content

Commit

Permalink
Merge pull request #37 from ipinfo/uman/better-batch-ops
Browse files Browse the repository at this point in the history
Improved batch processing
  • Loading branch information
UmanShahzad authored Dec 21, 2020
2 parents 1b98867 + 6f36516 commit e229f52
Show file tree
Hide file tree
Showing 12 changed files with 517 additions and 272 deletions.
26 changes: 24 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,34 @@
# IPInfo Changelog

## 4.1.0

- The SDK version is available via `ipinfo.version` as `SDK_VERSION`.
- Most private functions on all handlers (i.e. those that start with `_`) are
now moved to `ipinfo.handler_utils`.
- All constants that existed on handlers (i.e. `REQUEST_TIMEOUT_DEFAULT`) are
now moved to `ipinfo.handler_utils`.
- Cache behavior for the synchronous handler is a bit different now; the item
actually cached is the item _after_ formatting is complete, rather than
before.
- Both the sync and async handlers have the following improvements:
- `timeout` can be specified as a keyword-arg to getDetails to optionally
override the client-level timeout.
- getBatchDetails now has no limit to the size of the `ip_addresses` input
list. It will chunk the list internally and make requests against the
batch endpoint in a way that doesn't exceed the API's own limits.
- getBatchDetails now accepts the new options `batch_size`,
  `timeout_per_batch`, `timeout_total`, and `raise_on_fail`. Please see the
  documentation for details on what each of these does.

## 4.0.0

#### Breaking Changes

- [PR #32](https://github.com/ipinfo/python/pull/32)
All EOL Python versions are no longer supported; currently, Python 3.6 or greater is now **required**.
An asynchronous handler is available from `getHandlerAsync` which returns an `AsyncHandler` which uses **aiohttp**.
All EOL Python versions are no longer supported; currently, Python 3.6 or
greater is now **required**.
An asynchronous handler is available from `getHandlerAsync` which returns an
`AsyncHandler` which uses **aiohttp**.

## 3.0.0

Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,10 @@ The file must be a `.json` file with the following structure:

### Batch Operations

Looking up a single IP at a time can be slow. It could be done concurrently from
the client side, but IPinfo supports a batch endpoint to allow you to group
together IPs and let us handle retrieving details for them in bulk for you.
Looking up a single IP at a time can be slow. It could be done concurrently
from the client side, but IPinfo supports a batch endpoint to allow you to
group together IPs and let us handle retrieving details for them in bulk for
you.

```python
>>> import ipinfo, pprint
Expand Down Expand Up @@ -256,6 +257,9 @@ together IPs and let us handle retrieving details for them in bulk for you.
'timezone': 'America/Los_Angeles'}}
```

The input size is not limited, as the interface will chunk operations for you
behind the scenes.

Please see [the official documentation](https://ipinfo.io/developers/batch) for
more information and limitations.

Expand Down
6 changes: 6 additions & 0 deletions ipinfo/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ class RequestQuotaExceededError(Exception):
"""Error indicating that users monthly request quota has been passed."""

pass


class TimeoutExceededError(Exception):
    """Raised when a configured timeout has been exceeded."""
245 changes: 143 additions & 102 deletions ipinfo/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,23 @@
import json
import os
import sys
import time

import requests

from .cache.default import DefaultCache
from .details import Details
from .exceptions import RequestQuotaExceededError
from .exceptions import RequestQuotaExceededError, TimeoutExceededError
from .handler_utils import (
API_URL,
COUNTRY_FILE_DEFAULT,
BATCH_MAX_SIZE,
CACHE_MAXSIZE,
CACHE_TTL,
REQUEST_TIMEOUT_DEFAULT,
BATCH_REQ_TIMEOUT_DEFAULT,
)
from . import handler_utils


class Handler:
Expand All @@ -20,12 +31,6 @@ class Handler:
Instantiates and maintains access to cache.
"""

API_URL = "https://ipinfo.io"
CACHE_MAXSIZE = 4096
CACHE_TTL = 60 * 60 * 24
COUNTRY_FILE_DEFAULT = "countries.json"
REQUEST_TIMEOUT_DEFAULT = 2

def __init__(self, access_token=None, **kwargs):
"""
Initialize the Handler object with country name list and the
Expand All @@ -34,39 +39,113 @@ def __init__(self, access_token=None, **kwargs):
self.access_token = access_token

# load countries file
self.countries = self._read_country_names(kwargs.get("countries_file"))
self.countries = handler_utils.read_country_names(
kwargs.get("countries_file")
)

# setup req opts
self.request_options = kwargs.get("request_options", {})
if "timeout" not in self.request_options:
self.request_options["timeout"] = self.REQUEST_TIMEOUT_DEFAULT
self.request_options["timeout"] = REQUEST_TIMEOUT_DEFAULT

# setup cache
if "cache" in kwargs:
self.cache = kwargs["cache"]
else:
cache_options = kwargs.get("cache_options", {})
if "maxsize" not in cache_options:
cache_options["maxsize"] = self.CACHE_MAXSIZE
cache_options["maxsize"] = CACHE_MAXSIZE
if "ttl" not in cache_options:
cache_options["ttl"] = self.CACHE_TTL
cache_options["ttl"] = CACHE_TTL
self.cache = DefaultCache(**cache_options)

def getDetails(self, ip_address=None):
"""Get details for specified IP address as a Details object."""
raw_details = self._requestDetails(ip_address)
self._format_details(raw_details)
return Details(raw_details)
def getDetails(self, ip_address=None, timeout=None):
"""
Get details for specified IP address as a Details object.
If `timeout` is not `None`, it will override the client-level timeout
just for this operation.
"""
# If the supplied IP address uses the objects defined in the built-in
# module ipaddress extract the appropriate string notation before
# formatting the URL.
if isinstance(ip_address, IPv4Address) or isinstance(
ip_address, IPv6Address
):
ip_address = ip_address.exploded

if ip_address in self.cache:
return Details(self.cache[ip_address])

# prepare req http opts
req_opts = {**self.request_options}
if timeout is not None:
req_opts["timeout"] = timeout

# not in cache; do http req
url = API_URL
if ip_address:
url += "/" + ip_address
headers = handler_utils.get_headers(self.access_token)
response = requests.get(url, headers=headers, **req_opts)
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
details = response.json()

# format & cache
handler_utils.format_details(details, self.countries)
self.cache[ip_address] = details

return Details(details)

def getBatchDetails(
self,
ip_addresses,
batch_size=None,
timeout_per_batch=BATCH_REQ_TIMEOUT_DEFAULT,
timeout_total=None,
raise_on_fail=True,
):
"""
Get details for a batch of IP addresses at once.
There is no specified limit to the number of IPs this function can
accept; it can handle as much as the user can fit in RAM (along with
all of the response data, which is at least a magnitude larger than the
input list).
The input list is broken up into batches to abide by API requirements.
The batch size can be adjusted with `batch_size` but is clipped to
`BATCH_MAX_SIZE`.
Defaults to `BATCH_MAX_SIZE`.
For each batch, `timeout_per_batch` indicates the maximum seconds to
spend waiting for the HTTP request to complete. If any batch fails with
this timeout, the whole operation fails.
Defaults to `BATCH_REQ_TIMEOUT_DEFAULT` seconds.
`timeout_total` is a seconds-denominated hard-timeout for the time
spent in HTTP operations; regardless of whether all batches have
succeeded so far, if `timeout_total` is reached, the whole operation
will fail by raising `TimeoutExceededError`.
Defaults to being turned off.
`raise_on_fail`, if turned off, will return any result retrieved so far
rather than raise an exception when errors occur, including timeout and
quota errors.
Defaults to on.
"""
if batch_size == None:
batch_size = BATCH_MAX_SIZE

def getBatchDetails(self, ip_addresses):
"""Get details for a batch of IP addresses at once."""
result = {}

# Pre-populate with anything we've got in the cache, and keep around
# pre-populate with anything we've got in the cache, and keep around
# the IPs not in the cache.
lookup_addresses = []
for ip_address in ip_addresses:
# If the supplied IP address uses the objects defined in the
# if the supplied IP address uses the objects defined in the
# built-in module ipaddress extract the appropriate string notation
# before formatting the URL.
if isinstance(ip_address, IPv4Address) or isinstance(
Expand All @@ -79,95 +158,57 @@ def getBatchDetails(self, ip_addresses):
else:
lookup_addresses.append(ip_address)

# Do the lookup
url = self.API_URL + "/batch"
headers = self._get_headers()
headers["content-type"] = "application/json"
response = requests.post(
url, json=lookup_addresses, headers=headers, **self.request_options
)
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
# all in cache - return early.
if len(lookup_addresses) == 0:
return result

# Fill up cache
json_response = response.json()
for ip_address, details in json_response.items():
self.cache[ip_address] = details
# do start timer if necessary
if timeout_total is not None:
start_time = time.time()

# Merge cached results with new lookup
result.update(json_response)
# prepare req http options
req_opts = {**self.request_options, "timeout": timeout_per_batch}

# Format every result
for detail in result.values():
if isinstance(detail, dict):
self._format_details(detail)

return result
# loop over batch chunks and do lookup for each.
url = API_URL + "/batch"
headers = handler_utils.get_headers(self.access_token)
headers["content-type"] = "application/json"
for i in range(0, len(lookup_addresses), batch_size):
# quit if total timeout is reached.
if (
timeout_total is not None
and time.time() - start_time > timeout_total
):
return handler_utils.return_or_fail(
raise_on_fail, TimeoutExceededError(), result
)

def _requestDetails(self, ip_address=None):
"""Get IP address data by sending request to IPinfo API."""
chunk = lookup_addresses[i : i + batch_size]

# If the supplied IP address uses the objects defined in the built-in
# module ipaddress extract the appropriate string notation before
# formatting the URL.
if isinstance(ip_address, IPv4Address) or isinstance(
ip_address, IPv6Address
):
ip_address = ip_address.exploded
# lookup
response = requests.post(
url, json=chunk, headers=headers, **req_opts
)

if ip_address not in self.cache:
url = self.API_URL
if ip_address:
url += "/" + ip_address
# fail on bad status codes
try:
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
except Exception as e:
return handler_utils.return_or_fail(raise_on_fail, e, result)

response = requests.get(
url, headers=self._get_headers(), **self.request_options
)
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
self.cache[ip_address] = response.json()

return self.cache[ip_address]

def _get_headers(self):
"""Built headers for request to IPinfo API."""
headers = {
"user-agent": "IPinfoClient/Python{version}/4.0.0".format(
version=sys.version_info[0]
),
"accept": "application/json",
}

if self.access_token:
headers["authorization"] = "Bearer {}".format(self.access_token)

return headers

def _format_details(self, details):
details["country_name"] = self.countries.get(details.get("country"))
details["latitude"], details["longitude"] = self._read_coords(
details.get("loc")
)
# fill cache
json_response = response.json()
for ip_address, details in json_response.items():
self.cache[ip_address] = details

def _read_coords(self, location):
lat, lon = None, None
coords = tuple(location.split(",")) if location else ""
if len(coords) == 2 and coords[0] and coords[1]:
lat, lon = coords[0], coords[1]
return lat, lon
# merge cached results with new lookup
result.update(json_response)

def _read_country_names(self, countries_file=None):
"""
Read list of countries from specified country file or
default file.
"""
if not countries_file:
countries_file = os.path.join(
os.path.dirname(__file__), self.COUNTRY_FILE_DEFAULT
)
with open(countries_file) as f:
countries_json = f.read()
# format all
for detail in result.values():
if isinstance(detail, dict):
handler_utils.format_details(detail, self.countries)

return json.loads(countries_json)
return result
Loading

0 comments on commit e229f52

Please sign in to comment.