Skip to content

Commit

Permalink
Merge pull request #37 from ipinfo/uman/better-batch-ops
Browse files Browse the repository at this point in the history
Improved batch processing
  • Loading branch information
UmanShahzad authored Dec 21, 2020
2 parents 1b98867 + 6f36516 commit e229f52
Show file tree
Hide file tree
Showing 12 changed files with 517 additions and 272 deletions.
26 changes: 24 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,34 @@
# IPInfo Changelog

## 4.1.0

- The SDK version is available via `ipinfo.version` as `SDK_VERSION`.
- Most private functions on all handlers (i.e. those that start with `_`) are
now moved to `ipinfo.handler_utils`.
- All constants that existed on handlers (i.e. `REQUEST_TIMEOUT_DEFAULT`) are
now moved to `ipinfo.handler_utils`.
- Cache behavior for the synchronous handler is a bit different now; the item
actually cached is the item _after_ formatting is complete, rather than
before.
- Both the sync and async handlers have the following improvements:
- `timeout` can be specified as a keyword-arg to getDetails to optionally
override the client-level timeout.
- getBatchDetails now has no limit to the size of the `ip_addresses` input
list. It will chunk the list internally and make requests against the
batch endpoint in a way that doesn't exceed the API's own limits.
- getBatchDetails now accepts the new options `batch_size`,
  `timeout_per_batch`, `timeout_total`, and `raise_on_fail`. Please see the
  documentation for details on what each of these does.

## 4.0.0

#### Breaking Changes

- [PR #32](https://github.com/ipinfo/python/pull/32)
All EOL Python versions are no longer supported; currently, Python 3.6 or greater is now **required**.
An asynchronous handler is available from `getHandlerAsync` which returns an `AsyncHandler` which uses **aiohttp**.
All EOL Python versions are no longer supported; currently, Python 3.6 or
greater is now **required**.
An asynchronous handler is available from `getHandlerAsync` which returns an
`AsyncHandler` which uses **aiohttp**.

## 3.0.0

Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,10 @@ The file must be a `.json` file with the following structure:

### Batch Operations

Looking up a single IP at a time can be slow. It could be done concurrently from
the client side, but IPinfo supports a batch endpoint to allow you to group
together IPs and let us handle retrieving details for them in bulk for you.
Looking up a single IP at a time can be slow. It could be done concurrently
from the client side, but IPinfo supports a batch endpoint to allow you to
group together IPs and let us handle retrieving details for them in bulk for
you.

```python
>>> import ipinfo, pprint
Expand Down Expand Up @@ -256,6 +257,9 @@ together IPs and let us handle retrieving details for them in bulk for you.
'timezone': 'America/Los_Angeles'}}
```

The input size is not limited, as the interface will chunk operations for you
behind the scenes.

Please see [the official documentation](https://ipinfo.io/developers/batch) for
more information and limitations.

Expand Down
6 changes: 6 additions & 0 deletions ipinfo/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ class RequestQuotaExceededError(Exception):
"""Error indicating that users monthly request quota has been passed."""

pass


class TimeoutExceededError(Exception):
    """Raised when a configured timeout has been exceeded."""
245 changes: 143 additions & 102 deletions ipinfo/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,23 @@
import json
import os
import sys
import time

import requests

from .cache.default import DefaultCache
from .details import Details
from .exceptions import RequestQuotaExceededError
from .exceptions import RequestQuotaExceededError, TimeoutExceededError
from .handler_utils import (
API_URL,
COUNTRY_FILE_DEFAULT,
BATCH_MAX_SIZE,
CACHE_MAXSIZE,
CACHE_TTL,
REQUEST_TIMEOUT_DEFAULT,
BATCH_REQ_TIMEOUT_DEFAULT,
)
from . import handler_utils


class Handler:
Expand All @@ -20,12 +31,6 @@ class Handler:
Instantiates and maintains access to cache.
"""

API_URL = "https://ipinfo.io"
CACHE_MAXSIZE = 4096
CACHE_TTL = 60 * 60 * 24
COUNTRY_FILE_DEFAULT = "countries.json"
REQUEST_TIMEOUT_DEFAULT = 2

def __init__(self, access_token=None, **kwargs):
"""
Initialize the Handler object with country name list and the
Expand All @@ -34,39 +39,113 @@ def __init__(self, access_token=None, **kwargs):
self.access_token = access_token

# load countries file
self.countries = self._read_country_names(kwargs.get("countries_file"))
self.countries = handler_utils.read_country_names(
kwargs.get("countries_file")
)

# setup req opts
self.request_options = kwargs.get("request_options", {})
if "timeout" not in self.request_options:
self.request_options["timeout"] = self.REQUEST_TIMEOUT_DEFAULT
self.request_options["timeout"] = REQUEST_TIMEOUT_DEFAULT

# setup cache
if "cache" in kwargs:
self.cache = kwargs["cache"]
else:
cache_options = kwargs.get("cache_options", {})
if "maxsize" not in cache_options:
cache_options["maxsize"] = self.CACHE_MAXSIZE
cache_options["maxsize"] = CACHE_MAXSIZE
if "ttl" not in cache_options:
cache_options["ttl"] = self.CACHE_TTL
cache_options["ttl"] = CACHE_TTL
self.cache = DefaultCache(**cache_options)

def getDetails(self, ip_address=None):
"""Get details for specified IP address as a Details object."""
raw_details = self._requestDetails(ip_address)
self._format_details(raw_details)
return Details(raw_details)
def getDetails(self, ip_address=None, timeout=None):
"""
Get details for specified IP address as a Details object.
If `timeout` is not `None`, it will override the client-level timeout
just for this operation.
"""
# If the supplied IP address uses the objects defined in the built-in
# module ipaddress extract the appropriate string notation before
# formatting the URL.
if isinstance(ip_address, IPv4Address) or isinstance(
ip_address, IPv6Address
):
ip_address = ip_address.exploded

if ip_address in self.cache:
return Details(self.cache[ip_address])

# prepare req http opts
req_opts = {**self.request_options}
if timeout is not None:
req_opts["timeout"] = timeout

# not in cache; do http req
url = API_URL
if ip_address:
url += "/" + ip_address
headers = handler_utils.get_headers(self.access_token)
response = requests.get(url, headers=headers, **req_opts)
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
details = response.json()

# format & cache
handler_utils.format_details(details, self.countries)
self.cache[ip_address] = details

return Details(details)

def getBatchDetails(
self,
ip_addresses,
batch_size=None,
timeout_per_batch=BATCH_REQ_TIMEOUT_DEFAULT,
timeout_total=None,
raise_on_fail=True,
):
"""
Get details for a batch of IP addresses at once.
There is no specified limit to the number of IPs this function can
accept; it can handle as much as the user can fit in RAM (along with
all of the response data, which is at least a magnitude larger than the
input list).
The input list is broken up into batches to abide by API requirements.
The batch size can be adjusted with `batch_size` but is clipped to
`BATCH_MAX_SIZE`.
Defaults to `BATCH_MAX_SIZE`.
For each batch, `timeout_per_batch` indicates the maximum seconds to
spend waiting for the HTTP request to complete. If any batch fails with
this timeout, the whole operation fails.
Defaults to `BATCH_REQ_TIMEOUT_DEFAULT` seconds.
`timeout_total` is a seconds-denominated hard-timeout for the time
spent in HTTP operations; regardless of whether all batches have
succeeded so far, if `timeout_total` is reached, the whole operation
will fail by raising `TimeoutExceededError`.
Defaults to being turned off.
`raise_on_fail`, if turned off, will return any result retrieved so far
rather than raise an exception when errors occur, including timeout and
quota errors.
Defaults to on.
"""
if batch_size == None:
batch_size = BATCH_MAX_SIZE

def getBatchDetails(self, ip_addresses):
"""Get details for a batch of IP addresses at once."""
result = {}

# Pre-populate with anything we've got in the cache, and keep around
# pre-populate with anything we've got in the cache, and keep around
# the IPs not in the cache.
lookup_addresses = []
for ip_address in ip_addresses:
# If the supplied IP address uses the objects defined in the
# if the supplied IP address uses the objects defined in the
# built-in module ipaddress extract the appropriate string notation
# before formatting the URL.
if isinstance(ip_address, IPv4Address) or isinstance(
Expand All @@ -79,95 +158,57 @@ def getBatchDetails(self, ip_addresses):
else:
lookup_addresses.append(ip_address)

# Do the lookup
url = self.API_URL + "/batch"
headers = self._get_headers()
headers["content-type"] = "application/json"
response = requests.post(
url, json=lookup_addresses, headers=headers, **self.request_options
)
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
# all in cache - return early.
if len(lookup_addresses) == 0:
return result

# Fill up cache
json_response = response.json()
for ip_address, details in json_response.items():
self.cache[ip_address] = details
# do start timer if necessary
if timeout_total is not None:
start_time = time.time()

# Merge cached results with new lookup
result.update(json_response)
# prepare req http options
req_opts = {**self.request_options, "timeout": timeout_per_batch}

# Format every result
for detail in result.values():
if isinstance(detail, dict):
self._format_details(detail)

return result
# loop over batch chunks and do lookup for each.
url = API_URL + "/batch"
headers = handler_utils.get_headers(self.access_token)
headers["content-type"] = "application/json"
for i in range(0, len(lookup_addresses), batch_size):
# quit if total timeout is reached.
if (
timeout_total is not None
and time.time() - start_time > timeout_total
):
return handler_utils.return_or_fail(
raise_on_fail, TimeoutExceededError(), result
)

def _requestDetails(self, ip_address=None):
"""Get IP address data by sending request to IPinfo API."""
chunk = lookup_addresses[i : i + batch_size]

# If the supplied IP address uses the objects defined in the built-in
# module ipaddress extract the appropriate string notation before
# formatting the URL.
if isinstance(ip_address, IPv4Address) or isinstance(
ip_address, IPv6Address
):
ip_address = ip_address.exploded
# lookup
response = requests.post(
url, json=chunk, headers=headers, **req_opts
)

if ip_address not in self.cache:
url = self.API_URL
if ip_address:
url += "/" + ip_address
# fail on bad status codes
try:
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
except Exception as e:
return handler_utils.return_or_fail(raise_on_fail, e, result)

response = requests.get(
url, headers=self._get_headers(), **self.request_options
)
if response.status_code == 429:
raise RequestQuotaExceededError()
response.raise_for_status()
self.cache[ip_address] = response.json()

return self.cache[ip_address]

def _get_headers(self):
"""Built headers for request to IPinfo API."""
headers = {
"user-agent": "IPinfoClient/Python{version}/4.0.0".format(
version=sys.version_info[0]
),
"accept": "application/json",
}

if self.access_token:
headers["authorization"] = "Bearer {}".format(self.access_token)

return headers

def _format_details(self, details):
details["country_name"] = self.countries.get(details.get("country"))
details["latitude"], details["longitude"] = self._read_coords(
details.get("loc")
)
# fill cache
json_response = response.json()
for ip_address, details in json_response.items():
self.cache[ip_address] = details

def _read_coords(self, location):
lat, lon = None, None
coords = tuple(location.split(",")) if location else ""
if len(coords) == 2 and coords[0] and coords[1]:
lat, lon = coords[0], coords[1]
return lat, lon
# merge cached results with new lookup
result.update(json_response)

def _read_country_names(self, countries_file=None):
"""
Read list of countries from specified country file or
default file.
"""
if not countries_file:
countries_file = os.path.join(
os.path.dirname(__file__), self.COUNTRY_FILE_DEFAULT
)
with open(countries_file) as f:
countries_json = f.read()
# format all
for detail in result.values():
if isinstance(detail, dict):
handler_utils.format_details(detail, self.countries)

return json.loads(countries_json)
return result
Loading

0 comments on commit e229f52

Please sign in to comment.