From 620719b7b3700f881674bff53b87330ebaa3bb99 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 18:54:35 +0100 Subject: [PATCH 01/40] First pass at better management of files on disk This includes files in the cache and source dirs. There are also a number of other changes in this commit, it got away from me a bit. Changes include: - cache management for processed images - cache management for the source images - the concept of a source file being in use so that it remains on disk throughout processing - processing is simplified to use a standard process pool in the hopes this reduces the number of zombie processes we keep getting - exception handling is revamped --- iiif/config.py | 8 +- iiif/exceptions.py | 78 +++- iiif/ops.py | 12 +- iiif/processing.py | 347 +++++------------ iiif/profiles/base.py | 27 +- iiif/profiles/disk.py | 56 +-- iiif/profiles/mss.py | 641 +++++++++++++++++++------------ iiif/routers/iiif.py | 26 +- iiif/routers/originals.py | 4 +- iiif/routers/simple.py | 4 +- iiif/state.py | 15 +- iiif/utils.py | 359 +++++++++++------ iiif/web.py | 44 ++- pytest.ini | 2 + setup.py | 9 +- tests/profiles/test_disk.py | 53 +-- tests/profiles/test_mss.py | 189 ++++++--- tests/profiles/test_mss_es.py | 94 +++++ tests/profiles/test_mss_store.py | 164 ++++++++ tests/test_exceptions.py | 17 +- tests/test_processing.py | 511 +----------------------- tests/test_utils.py | 326 +++++++--------- tests/utils.py | 6 + 23 files changed, 1458 insertions(+), 1534 deletions(-) create mode 100644 pytest.ini create mode 100644 tests/profiles/test_mss_es.py create mode 100644 tests/profiles/test_mss_store.py diff --git a/iiif/config.py b/iiif/config.py index 10b94a8..c0a8cd9 100644 --- a/iiif/config.py +++ b/iiif/config.py @@ -1,9 +1,6 @@ #!/usr/bin/env python3 # encoding: utf-8 -from copy import deepcopy -from typing import Callable - import os import yaml from pathlib import Path @@ -26,8 +23,9 @@ def __init__(self, **options): self.min_sizes_size = options.get('min_sizes_size', 200) # image processing settings - self.image_pool_size = options.get('image_pool_size', 2) - self.image_cache_size_per_process = options.get('image_cache_size_per_process', 5) + self.processing_pool_size = options.get('processing_pool_size', 2) + self.processed_cache_size = options.get('processed_cache_size', 1024 * 1024 * 256) + self.processed_cache_ttl = options.get('processed_cache_ttl', 12 * 60 * 60) # size definitions for the quick access endpoints self.thumbnail_width = options.get('thumbnail_width', 512) diff --git a/iiif/exceptions.py b/iiif/exceptions.py index 6630661..699bea2 100644 --- a/iiif/exceptions.py +++ b/iiif/exceptions.py @@ -1,20 +1,80 @@ #!/usr/bin/env python3 # encoding: utf-8 -from fastapi import HTTPException +import logging +from fastapi import Request +from starlette.responses import JSONResponse +from typing import Optional +# this is all assuming we're using uvicorn... 
+uvicorn_logger = logging.getLogger('uvicorn.error')
+# use this logger to dump stuff into the same channels as the uvicorn logs
+logger = uvicorn_logger.getChild('iiif')
 
-def profile_not_found() -> HTTPException:
-    return HTTPException(status_code=404, detail='Profile not recognised')
+class IIIFServerException(Exception):
 
-def image_not_found() -> HTTPException:
-    return HTTPException(status_code=404, detail='Image not found')
+    def __init__(self, public: str, status_code: int = 500, log: Optional[str] = None,
+                 level: int = logging.WARNING, cause: Optional[Exception] = None):
+        super().__init__(public)
+        self.status_code = status_code
+        self.public = public
+        self.level = level
+        self.cause = cause
+        # prefer an explicit log message, then fall back to the cause, then the public message
+        if log is not None:
+            self.log = log
+        elif cause is not None:
+            self.log = f'An error occurred: {cause}'
+        else:
+            self.log = self.public
 
-def too_many_images(max_files) -> HTTPException:
-    return HTTPException(status_code=400, detail=f'Too many images requested (max: {max_files})')
+class Timeout(IIIFServerException):
 
+    def __init__(self, *args, **kwargs):
+        super().__init__('A timeout occurred, please try again', 500, *args, **kwargs)
 
-def invalid_iiif_parameter(name: str, value: str) -> HTTPException:
-    return HTTPException(status_code=400, detail=f'{name} value "{value}" is invalid')
+
+class ProfileNotFound(IIIFServerException):
+
+    def __init__(self, name: str, *args, **kwargs):
+        super().__init__(f'Profile {name} not recognised', 404, *args, **kwargs)
+        self.name = name
+
+
+class ImageNotFound(IIIFServerException):
+
+    def __init__(self, profile: str, name: str, *args, **kwargs):
+        super().__init__(f'Image {name} not found in profile {profile}', 404, *args,
+                         **kwargs)
+        self.profile = profile
+        self.name = name
+
+
+class TooManyImages(IIIFServerException):
+
+    def __init__(self, max_files: int, *args, **kwargs):
+        super().__init__(f'Too many images requested (max: {max_files})', 400, *args, **kwargs)
+        self.max_files = max_files
+
+
+class InvalidIIIFParameter(IIIFServerException):
+
+    def __init__(self, name: str, value: str, *args, **kwargs):
+        super().__init__(f'Invalid IIIF option: {name} value "{value}" is invalid', 400, *args,
+                         **kwargs)
+        self.name = name
+        self.value = value
+
+
+async def handler(request: Request, exception: IIIFServerException) -> JSONResponse:
+    if exception.log:
+        logger.log(exception.level, exception.log)
+    return JSONResponse(
+        status_code=exception.status_code,
+        content={
+            "error": exception.public
+        },
+    )
diff --git a/iiif/ops.py b/iiif/ops.py
index 9fd1062..3c1b064 100644
--- a/iiif/ops.py
+++ b/iiif/ops.py
@@ -4,7 +4,7 @@
 
 from contextlib import suppress
 
-from iiif.exceptions import invalid_iiif_parameter
+from iiif.exceptions import InvalidIIIFParameter
 from iiif.profiles.base import ImageInfo
 
 # this server currently supports IIIF level1
@@ -81,7 +81,7 @@ def parse_region(region: str, info: ImageInfo) -> Region:
         return Region(x, y, w, h, full=(w == info.width and h == info.height))
 
     # if we get here, the region is no good :(
-    raise invalid_iiif_parameter('Region', region)
+    raise InvalidIIIFParameter('Region', region)
 
 
 @dataclass
@@ -147,7 +147,7 @@ def parse_size(size: str, region: Region) -> Size:
     if 0 < w <= region.w and 0 < h <= region.h:
         return Size(w, h, max=(w == region.w and h == region.h))
 
-    raise invalid_iiif_parameter('Size', size)
+    raise InvalidIIIFParameter('Size', size)
 
 
 @dataclass
@@ -189,7 +189,7 @@ def parse_rotation(rotation: str) -> Rotation:
     if angle in allowed_angles:
         return
Rotation(angle, mirror) - raise invalid_iiif_parameter('Rotation', rotation) + raise InvalidIIIFParameter('Rotation', rotation) class Quality(Enum): @@ -213,7 +213,7 @@ def parse_quality(quality: str) -> Quality: if option.value == quality: return option - raise invalid_iiif_parameter('Quality', quality) + raise InvalidIIIFParameter('Quality', quality) class Format(Enum): @@ -235,7 +235,7 @@ def parse_format(fmt: str) -> Format: if option.value == fmt: return option - raise invalid_iiif_parameter('Format', fmt) + raise InvalidIIIFParameter('Format', fmt) def parse_params(info: ImageInfo, region: str = 'full', size: str = 'max', rotation: str = '0', diff --git a/iiif/processing.py b/iiif/processing.py index 3d134a6..03b37eb 100644 --- a/iiif/processing.py +++ b/iiif/processing.py @@ -1,42 +1,18 @@ #!/usr/bin/env python3 # encoding: utf-8 +from concurrent.futures import ProcessPoolExecutor import asyncio -from asyncio import Future -from collections import defaultdict -from multiprocessing.context import Process -from pathlib import Path -from typing import Any, Optional - -import multiprocessing as mp -import random -from cachetools import LRUCache +import os +from dataclasses import dataclass from jpegtran import JPEGImage from jpegtran.lib import Transformation -from threading import Thread +from pathlib import Path +from typing import Optional, Tuple from iiif.ops import IIIFOps, Region, Size, Rotation, Quality, Format -from iiif.utils import to_pillow, to_jpegtran - - -class Task: - """ - Class representing an image processing task as defined by a IIIF based request. - """ - - def __init__(self, source_path: Path, output_path: Path, ops: IIIFOps): - """ - :param region: the IIIF region request parameter - :param size: the IIIF size request parameter - """ - self.source_path = source_path - self.output_path = output_path - self.ops = ops - - def __eq__(self, other): - if isinstance(other, Task): - return self.source_path == other.source_path and self.ops == other.ops - return NotImplemented +from iiif.profiles.base import ImageInfo, AbstractProfile +from iiif.utils import to_pillow, to_jpegtran, FetchCache, Fetchable def process_region(image: JPEGImage, region: Region) -> JPEGImage: @@ -141,256 +117,115 @@ def process_format(image: JPEGImage, fmt: Format, output_path: Path): to_pillow(image).save(f, format='png') -def process_image_requests(worker_id: Any, task_queue: mp.Queue, result_queue: mp.Queue, - cache_size: int): +def process_image_request(source_path: Path, output_path: Path, ops: IIIFOps): """ - Processes a given task queue, putting tasks on the given results queue once complete. This - function is blocking and should be run in a separate process. - - Due to the way JPEGImage handles file data we use the LRU cache to avoid rereading source files - if possible. When initialised, JPEGImage loads the entire source file into memory but is then - immutable when using the various operation functions (crop, downscale etc). This means it's most - efficient for us to load the file once and reuse the JPEGImage object over and over again, hence - the LRU image cache. 
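For reference, the caching idea this (now removed) docstring describes boils down to the following — a minimal sketch using cachetools, with illustrative names:

    from cachetools import LRUCache
    from jpegtran import JPEGImage

    image_cache = LRUCache(maxsize=5)

    def load_image(source_path):
        # JPEGImage reads the whole file into memory once; the object is then
        # effectively immutable, so it's safe to reuse across requests
        if source_path not in image_cache:
            image_cache[source_path] = JPEGImage(str(source_path))
        return image_cache[source_path]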
- - :param worker_id: the worker id associated with this process - :param task_queue: a multiprocessing Queue of Task objects - :param result_queue: a multiprocessing Queue to put the completed Task objects on - :param cache_size: the size to use for the LRU cache for loaded source images - """ - image_cache = LRUCache(cache_size) - - try: - # wait for tasks until we get a sentinel (in this case None) - for task in iter(task_queue.get, None): - try: - if task.source_path not in image_cache: - # the JPEGImage init function reads the entire source file into memory - image_cache[task.source_path] = JPEGImage(task.source_path) - - image = image_cache[task.source_path] - image = process_region(image, task.ops.region) - image = process_size(image, task.ops.size) - image = process_rotation(image, task.ops.rotation) - image = process_quality(image, task.ops.quality) - - # ensure the full cache path exists - task.output_path.parent.mkdir(parents=True, exist_ok=True) - - # write the processed image to disk - process_format(image, task.ops.format, task.output_path) - - # put our worker id, the task and None on the result queue to indicate to the main - # process that it's done and we encountered no exceptions - result_queue.put((worker_id, task, None)) - except Exception as e: - # if we get a keyboard interrupt we need to stop! - if isinstance(e, KeyboardInterrupt): - raise e - # put our worker id, the task and the exception on the result queue to indicate to - # the main process that it's done and we encountered an exception - result_queue.put((worker_id, task, e)) - except KeyboardInterrupt: - pass - - -class Worker: - """ - Class representing an image processing worker process. + Process a single image at the source path using the given ops and save it in the output path. + + :param source_path: the source image + :param output_path: the target location + :param ops: the IIIF operations to perform """ + image = JPEGImage(str(source_path)) - def __init__(self, worker_id: Any, result_queue: mp.Queue, cache_size: int): - """ - :param worker_id: the worker's id, handy for debugging and not really used otherwise - :param result_queue: the multiprocessing Queue that should be used by the worker to indicate - task completions - :param cache_size: the requested size of the worker's image cache - """ - self.worker_id = worker_id - self.name = str(worker_id) - - # create a multiprocessing Queue for just this worker's tasks - self.task_queue = mp.Queue() - # create the process - self.process = Process(target=process_image_requests, args=(worker_id, self.task_queue, - result_queue, cache_size)) - self.queue_size = 0 - # this LRU cache holds the source file paths that should be in the process's image cache at - # the time the last task on the task queue is processed and therefore allows us to use it as - # a heuristic when determining which worker to assign a task (we want to hit the image cache - # as much as possible!) - self.predicted_cache = LRUCache(cache_size) - self.process.start() - - def add(self, task: Task): - """ - Adds the given task to this worker's task queue. 
+ image = process_region(image, ops.region) + image = process_size(image, ops.size) + image = process_rotation(image, ops.rotation) + image = process_quality(image, ops.quality) - :param task: the Task object - """ - self.queue_size += 1 - self.predicted_cache[task.source_path] = True - # this will almost always be instantaneous but does have the chance to block up the entire - # asyncio thread - self.task_queue.put(task) + # ensure the full cache path exists + output_path.parent.mkdir(parents=True, exist_ok=True) - def done(self, task: Task): - """ - Call this to notify this worker that it completed the given task. + # write the processed image to disk + process_format(image, ops.format, output_path) - :param task: the task that was completed - """ - self.queue_size -= 1 - def stop(self): - """ - Requests that this worker stops. This is a blocking call and will wait until the worker has - completed all currently queued tasks. - """ - # send the sentinel - self.task_queue.put(None) - self.process.join() +@dataclass +class ImageProcessorTask(Fetchable): + """ + Class used to manage the ImageProcessor tasks. + """ + profile: AbstractProfile + info: ImageInfo + ops: IIIFOps + + @property + def public_name(self) -> str: + return self.info.identifier - def is_warm_for(self, task: Task) -> bool: + @property + def store_path(self) -> Path: + return self.profile.cache_path / self.info.name / self.ops.location + + @property + def size_hint(self) -> Optional[Tuple[int, int]]: """ - Determines whether the worker is warmed up for a given task. This just checks to see whether - the source image will be in the worker's LRU cache when it is processed if it is added to - the queue now. + This is used as a performance enhancement. By providing a hint at the size of the image we + need to serve up, we can (sometimes!) use a smaller source image thus reducing downloading, + storage, and processing time. - :param task: the task - :return: True if the source path is warm on this worker or False if not + :return: the hint size as a tuple or None if we can't hint at anything """ - return task.source_path in self.predicted_cache + # this is a performance enhancement. + if self.ops.region.full: + # the full image region is selected so we can take the hint from the size parameter + return self.ops.size.w, self.ops.size.h + else: + # a region has been specified, we'll have to use the whole thing + # TODO: can we do better than this? + return None -class ImageProcessingDispatcher: +class ImageProcessor(FetchCache): """ - Class controlling the image processing workers. + Class controlling IIIF image processing. 
""" - def __init__(self): - # keep a reference to the correct asyncio loop so that we can correctly call task completion - # callbacks from the result thread - self.loop = asyncio.get_event_loop() - # a dict of the Worker objects we're dispatching the requests to keyed by their worker ids - self.workers = {} - # a register of the processed image paths and tornado Event objects indicating whether they - # have been processed yet, we deliberately don't pre-populate this in case the cache - # directory is large and leave it to be lazily built as requests come in (see submit method) - self.output_paths = {} - # the multiprocessing result queue used by all workers to notify the main process that a - # task has been completed - self.result_queue = mp.Queue() - self.result_thread = Thread(target=self.result_listener) - self.result_thread.start() - - def result_listener(self): + def __init__(self, root: Path, ttl: float, max_size: float, max_workers: int = os.cpu_count()): """ - This function should be run in a separate thread to avoid blocking the asyncio loop. It - listens for results to be put on the result queue by workers and adds a callback into the - main ayncio loop to notify all waiting coroutines. - """ - for result in iter(self.result_queue.get, None): - self.loop.call_soon_threadsafe(self.finish_task, *result) - - def init_workers(self, worker_count: int, worker_cache_size: int): + :param root: the root path to cache the processed images in + :param ttl: how long processed images should exist on disk after they've been last used + :param max_size: maximum bytes to store in this cache + :param max_workers: maximum number of worker processes to use to work on processing images """ - Initialises the required number of workers. + super().__init__(root, ttl, max_size) + self.loop = asyncio.get_event_loop() + self.max_workers = max_workers + self.executor = ProcessPoolExecutor(max_workers=max_workers) - :param worker_count: the number of workers to create - :param worker_cache_size: the size of each worker's image cache + async def process(self, profile: AbstractProfile, info: ImageInfo, ops: IIIFOps) -> Path: """ - for i in range(worker_count): - self.workers[i] = Worker(i, self.result_queue, worker_cache_size) + Process an image according to the IIIF ops. - async def submit(self, task: Task): - """ - Submits the given task to be processed on one of our worker processes. If the task has - already been completed (this is determined by the existence of the task's output path) then - the task will not be reprocessed. Tornado Future objects are used to determine if a task has - been completed or not. If the task has already been completed, this function will return - immediately when awaited. If a task is requested again whilst it is already being processed, - the Future object created for the in progress task will be awaited on by this function for - the new processing request. This results in all tasks resolving at the same time upon the - first task's completion. 
- - :param task: the task object - """ - if task.output_path not in self.output_paths: - # we haven't processed this task before, create a Future and add it to the output_paths - processed_future = Future() - self.output_paths[task.output_path] = processed_future - if task.output_path.exists(): - # if the path exists the task was created before this server started up, set the - # result to indicate the task is complete - processed_future.set_result(None) - else: - # otherwise, choose a worker and add it to it - worker = self.choose_worker(task) - worker.add(task) - - await self.output_paths[task.output_path] - - def choose_worker(self, task: Task) -> Worker: - """ - Select a worker for the given task. Workers are chosen by giving them a score and then - randomly choosing the worker from the group with highest score. + Technically, the path returned could become invalid immediately as this is not a context + manager and therefore doesn't guarantee that another coroutine wouldn't delete it, but + realistically this would only happen if the amount of time it takes you to do whatever you + need to do with the file exceeds the ttl of the file. - Workers which will have the source image loaded into their image caches are prioritised as - are workers with a queue size shorter than the number of workers (for lack of a better value - to be less than). + If you need to ensure the file exists, use the `use` function as a context manager instead. - :param task: the task - :return: a Worker object + :param profile: the profile + :param info: the image info + :param ops: IIIF ops to perform + :return: the path of the processed image """ - buckets = defaultdict(list) - for worker in self.workers.values(): - # higher is better - score = 0 - - if worker.queue_size <= len(self.workers): - # you get a point if your queue is smaller than the current number of workers - score += 1 - if worker.queue_size == 0: - # and an extra point if you have no tasks on your queue - score += 1 + async with self.use(ImageProcessorTask(profile, info, ops)) as path: + return path - if worker.is_warm_for(task): - # you get a point if you are warmed up for the task - score += 1 - - # add the worker to the appropriate bucket - buckets[score].append(worker) - - # choose the bucket with the highest score and pick a worker at random from it - return random.choice(buckets[max(buckets.keys())]) - - def finish_task(self, worker_id: Any, task: Task, exception: Optional[Exception]): + async def _fetch(self, task: ImageProcessorTask): """ - Called by the result thread to signal that a task has finished processing. This function - simply retrieves the the Future object associated with the task's output path and sets its - result/exception. - - :param worker_id: the id of the worker that completed the task - :param task: the task object that is complete - :param exception: an exception that occurred during processing, if no exception was - encountered this will be None + Perform the actual processing to produce the derived image. + + :param task: the task information """ - self.workers[worker_id].done(task) - if exception is None: - self.output_paths[task.output_path].set_result(None) - else: - self.output_paths[task.output_path].set_exception(exception) + async with task.profile.use_source(task.info, task.size_hint) as source_path: + await self.loop.run_in_executor(self.executor, process_image_request, source_path, + task.store_path, task.ops) def stop(self): """ - Signals all worker processes to stop. 
This function will block until all workers have - finished processing their queues. + Shuts down the processing pool. This will block until they're all done. """ - for worker in self.workers.values(): - worker.stop() - self.result_queue.put(None) - self.result_thread.join() + self.executor.shutdown() def get_status(self) -> dict: """ @@ -399,8 +234,12 @@ def get_status(self) -> dict: :return: a dict of stats """ return { - 'results_queue_size': self.result_queue.qsize(), - 'worker_queue_sizes': { - worker.name: worker.queue_size for worker in self.workers.values() - } + 'requests': self.requests, + 'completed': self.completed, + 'errors': self.errors, + 'pool_size': self.max_workers, + 'cache_size': self.total_size, + 'percentage_used': self.pct, + 'in_use': len(self._in_use), + 'waiting_clean_up': len(self._cleaners), } diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index 1171746..b6823a1 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -1,12 +1,11 @@ -from pathlib import Path -from typing import Tuple, Optional, Any, Union, AsyncIterable - import abc -import logging -from cachetools import LRUCache +from cachetools import TTLCache +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Tuple, Optional, Any, AsyncIterable from iiif.config import Config -from iiif.utils import get_path_stats, generate_sizes, create_logger +from iiif.utils import get_path_stats, generate_sizes class ImageInfo: @@ -53,13 +52,12 @@ class AbstractProfile(abc.ABC): """ def __init__(self, name: str, config: Config, rights: str, info_json_cache_size: int = 1000, - log_level: Union[int, str] = logging.WARNING, cache_for: int = 0): + cache_for: float = 60): """ :param name: the name of the profile, should be unique across profiles :param config: the config object :param rights: the rights definition for all images handled by this profile :param info_json_cache_size: the size of the info.json cache - :param log_level: the log level to use for messages from this profile :param cache_for: how long in seconds a client should cache the results from this profile (both info.json and image data) """ @@ -72,12 +70,11 @@ def __init__(self, name: str, config: Config, rights: str, info_json_cache_size: self.rights = rights self.source_path.mkdir(exist_ok=True) self.cache_path.mkdir(exist_ok=True) - self.info_json_cache = LRUCache(info_json_cache_size) + self.info_json_cache = TTLCache(maxsize=info_json_cache_size, ttl=cache_for) self.cache_for = cache_for - self.logger = create_logger(name, log_level) @abc.abstractmethod - async def get_info(self, name: str) -> Optional[ImageInfo]: + async def get_info(self, name: str) -> ImageInfo: """ Returns an instance of ImageInfo or one of it's subclasses for the given name. Note that this function does not generate the info.json, see generate_info_json below for that! @@ -88,20 +85,20 @@ async def get_info(self, name: str) -> Optional[ImageInfo]: pass @abc.abstractmethod - async def fetch_source(self, info: ImageInfo, - size_hint: Optional[Tuple[int, int]] = None) -> Optional[Path]: + @asynccontextmanager + async def use_source(self, info: ImageInfo, size: Optional[Tuple[int, int]] = None) -> Path: """ Ensures the source file required for the image is available, using the optional size hint to choose the best file if multiple sizes are available. 
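Note that use_source is now an async context manager rather than a plain fetch call. The general shape of such a provider is (a generic sketch — acquire and release here are hypothetical helpers, not this codebase's API):

    from contextlib import asynccontextmanager

    @asynccontextmanager
    async def use_resource(name: str):
        resource = await acquire(name)  # hypothetical setup step
        try:
            # the caller holds the resource only while the async with block is open
            yield resource
        finally:
            await release(resource)  # hypothetical cleanup step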
:param info: the ImageInfo
-        :param size_hint: an 2-tuple of ints to hint at the target size required for subsequent ops
+        :param size: a 2-tuple of ints to hint at the target size required for subsequent ops
-        :return: the Path to the source file or None if one could not be found
+        :return: the Path to the source file
         """
         pass
 
     @abc.abstractmethod
-    async def resolve_filename(self, name: str) -> Optional[str]:
+    async def resolve_filename(self, name: str) -> str:
         """
         Given the name of an image produced by this profile, returns the filename that should be
         used for it. This is used when original images are downloaded to give them the right name.
diff --git a/iiif/profiles/disk.py b/iiif/profiles/disk.py
index 515c69d..a889a08 100644
--- a/iiif/profiles/disk.py
+++ b/iiif/profiles/disk.py
@@ -1,44 +1,55 @@
-from pathlib import Path
-from typing import Tuple, Optional
-
 import aiofiles
+from contextlib import asynccontextmanager
+from pathlib import Path
 
+from iiif.exceptions import ImageNotFound
 from iiif.profiles.base import AbstractProfile, ImageInfo
 from iiif.utils import get_size
 
 
+class MissingFile(ImageNotFound):
+
+    def __init__(self, profile: str, name: str, source: Path):
+        super().__init__(profile, name, log=f"Couldn't find the image file for {name} on disk at"
+                                            f" {source}")
+        self.source = source
+
+
 class OnDiskProfile(AbstractProfile):
     """
     A profile representing source files that are already on disk and don't need to be fetched from
     an external source.
     """
 
-    async def get_info(self, name: str) -> Optional[ImageInfo]:
+    async def get_info(self, name: str) -> ImageInfo:
         """
         Given an image name, returns an info object for it. If the image doesn't exist on disk then
-        None is returned.
+        an error is raised.
 
         :param name: the image name
-        :return: None if the image doesn't exist on disk, or an ImageInfo instance
+        :return: an ImageInfo instance
+        :raises: MissingFile if the file is missing
         """
         source = self._get_source(name)
         if not source.exists():
-            return None
+            raise MissingFile(self.name, name, source)
         else:
             return ImageInfo(self.name, name, *get_size(source))
 
-    async def fetch_source(self, info: ImageInfo,
-                           target_size: Optional[Tuple[int, int]] = None) -> Optional[Path]:
+    @asynccontextmanager
+    async def use_source(self, info: ImageInfo, *args, **kwargs) -> Path:
         """
-        Given an info object returns the path to the on disk image source. The target size is
-        ignored by this function because we only have the full size original and nothing else.
+        Given an info object, yields the path to the on disk image source. The target size is
+        ignored by this function because we only have the full size originals and nothing else.
 
         :param info: the image info object
-        :param target_size: a target size - this is ignored by this function
         :return: the path to the source image on disk
+        :raises: MissingFile if the file is missing
         """
-        source_path = self._get_source(info.name)
-        return source_path if source_path.exists() else None
+        source = self._get_source(info.name)
+        if not source.exists():
+            raise MissingFile(self.name, info.name, source)
+        yield source
 
     def _get_source(self, name: str) -> Path:
         """
@@ -49,16 +60,14 @@ def _get_source(self, name: str) -> Path:
         """
         return self.source_path / name
 
-    async def resolve_filename(self, name: str) -> Optional[str]:
+    async def resolve_filename(self, name: str) -> str:
         """
-        Given an image name, resolves the filename. Given that the name == the filename, this just
-        checks that the name source exists on disk and returns the name if it does.
+        Given an image name, resolves the filename. Since the name and the filename are one and the
+        same for this profile, the name is simply returned.
:param name: the image name :return: the source filename """ - source_path = self._get_source(name) - return source_path.name if source_path.exists() else None + return self._get_source(name).name async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True): """ @@ -70,10 +79,10 @@ async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors= :param raise_errors: whether to raise errors if they occur, or just stop (default: True) :return: yields chunks of bytes """ - source_path = self._get_source(name) - if source_path.exists(): + source = self._get_source(name) + if source.exists(): try: - async with aiofiles.open(file=str(source_path), mode='rb') as f: + async with aiofiles.open(file=str(source), mode='rb') as f: while True: chunk = await f.read(chunk_size) if not chunk: @@ -82,3 +91,6 @@ async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors= except Exception as e: if raise_errors: raise e + else: + if raise_errors: + raise MissingFile(self.name, name, source) diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index d63c14e..52fb98a 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -1,23 +1,57 @@ -import asyncio -from concurrent.futures.process import ProcessPoolExecutor -from pathlib import Path -from typing import Tuple, List, Optional -from urllib.parse import quote +from concurrent.futures import ProcessPoolExecutor, Executor -import aiocron as aiocron -import aiohttp +import asyncio import json +import logging import shutil import tempfile import time -from contextlib import AsyncExitStack +from cachetools import TTLCache +from contextlib import AsyncExitStack, asynccontextmanager +from dataclasses import dataclass from elasticsearch_dsl import Search from fastapi import HTTPException -from itertools import cycle, chain +from functools import partial +from itertools import cycle +from pathlib import Path +from typing import List +from typing import Tuple, Optional +from urllib.parse import quote from iiif.config import Config +from iiif.exceptions import Timeout, IIIFServerException, ImageNotFound from iiif.profiles.base import AbstractProfile, ImageInfo -from iiif.utils import OnceRunner, convert_image, get_size +from iiif.utils import Locker, convert_image, create_client_session, FetchCache, Fetchable +from iiif.utils import get_size + + +class MissingCollectionRecord(ImageNotFound): + + def __init__(self, profile: str, name: str, emu_irn: int): + super().__init__(profile, name, log=f"Couldn't find collection record associated with " + f"multimedia IRN {emu_irn} [guid: {name}]") + self.emu_irn = emu_irn + + +class MSSAccessDenied(ImageNotFound): + + def __init__(self, profile: str, name: str, emu_irn: int): + super().__init__(profile, name, log=f"MSS denied access to multimedia IRN {emu_irn} " + f"[guid: {name}]") + self.emu_irn = emu_irn + + +class MSSDocDuplicates(ImageNotFound): + + def __init__(self, profile: str, name: str, total: int): + super().__init__(profile, name, log=f"Found {total} MSS docs for the guid {name}") + self.total = total + + +class MSSDocNotFound(ImageNotFound): + + def __init__(self, profile: str, name: str): + super().__init__(profile, name, log=f"No MSS doc found for the guid {name}") class MSSImageInfo(ImageInfo): @@ -34,7 +68,7 @@ def __init__(self, profile_name: str, name: str, doc: dict): super().__init__(profile_name, name, doc.get('width', None), doc.get('height', None)) self.doc = doc self.emu_irn = doc['id'] - # the name of the original file as it appears 
on SCALE + # the name of the original file as it appears on the actual filesystem EMu is using self.original = doc['file'] # a list of the EMu generated derivatives of the original file. The list should already be # in the ascending (width, height) order because the import process sorts it @@ -58,196 +92,142 @@ def choose_file(self, target_size: Optional[Tuple[int, int]] = None) -> str: class MSSProfile(AbstractProfile): - """ - Profile for the MSS service which provides access to the images stored in EMu. - """ def __init__(self, name: str, config: Config, rights: str, es_hosts: List[str], - mss_url: str, - ic_fast_pool_size: int, - ic_slow_pool_size: int, collection_indices: List[str], - mss_index: str = 'mss', - es_limit: int = 100, - doc_cache_size: int = 1_000_000, - doc_exception_timeout: int = 0, - mss_limit: int = 20, - fetch_cache_size: int = 1_000_000, - fetch_exception_timeout: int = 0, - ic_quality: int = 85, - ic_subsampling: str = '4:2:0', - dm_limit: int = 4, + mss_url: str, mss_ssl: bool = True, - dm_ssl: bool = True, + mss_index: str = 'mss', + mss_limit: int = 10, + es_limit: int = 10, + info_cache_size: int = 100_000, + info_cache_ttl: float = 43_200, + info_lock_ttl: float = 60, + source_cache_size: int = 1024 * 1024 * 256, + source_cache_ttl: float = 12 * 60 * 60, + convert_fast_pool_size: int = 1, + convert_slow_pool_size: int = 1, + convert_quality: int = 85, + convert_subsampling: str = '4:2:0', + dams_limit: int = 4, + dams_ssl: bool = True, **kwargs ): """ :param name: the name of this profile - :param config: the config object + :param config: the Config object :param rights: the rights url for images served by this profile :param es_hosts: a list of elasticsearch hosts to use - :param mss_url: mss base url - :param ic_fast_pool_size: the size of the fast source image conversion pool (currently, - source images that are already jpegs are put in this pool) - :param ic_slow_pool_size: the size of the slow source image conversion pool (currently, - source images that are not jpegs (most likely tiffs) are put in - this pool) :param collection_indices: the indices to search to confirm the images can be accessed - :param mss_index: the name of the MSS index - :param es_limit: the number of elasticsearch requests that can be active simultaneously - :param doc_cache_size: the size of the cache for doc results - :param doc_exception_timeout: timeout for exceptions that occur during doc retrieval - :param mss_limit: the number of mss requests that can be active simultaneously - :param fetch_cache_size: the size of the cache for fetch results - :param fetch_exception_timeout: timeout for exceptions that occur during fetching - :param ic_quality: the jpeg quality to use when converting images - :param ic_subsampling: the jpeg subsampling to use when converting images - :param dm_limit: the number of dams requests that can be active simultaneously + :param mss_url: mss base url :param mss_ssl: boolean indicating whether ssl certificates should be checked when making requests to mss - :param dm_ssl: boolean indicating whether ssl certificates should be checked when making - requests to dams + :param mss_index: the MSS index name in elasticsearch + :param mss_limit: the maximum number of simultaneous connections that can be made to the MSS + :param es_limit: the maximum number of simultaneous connections that can be made to + elasticsearch + :param info_cache_size: max size of the caches related to the metadata of the image + :param info_cache_ttl: ttl of the data in the caches 
related to the metadata of the image + :param info_lock_ttl: how long to lock for when gathering the metadata about an image from + the various sources. Note that this needs to cover Elasticsearch and + MSS requests as well as potentially downloading the source image (in + order to find out the size) + :param source_cache_size: max size in bytes of the source cache on disk + :param source_cache_ttl: ttl of each source image in the cache + :param convert_fast_pool_size: how many processes to use for converting "fast" images + :param convert_slow_pool_size: how many processes to use for converting "slow" images + :param convert_quality: quality to use when converting a source to a jpeg + :param convert_subsampling: subsampling value to use when converting a source to a jpeg + :param dams_limit: the maximum number of simultaneous connections that can be made to the + old dams service (to retrieve files stored at the damsurlfile value) + :param dams_ssl: boolean indicating whether ssl certificates should be checked when making + requests to dams + :param kwargs: extra kwargs for the AbstractProfile base class __init__ """ super().__init__(name, config, rights, **kwargs) - self.loop = asyncio.get_event_loop() - # runners - self.doc_runner = OnceRunner('doc', doc_cache_size, doc_exception_timeout) - self.fetch_runner = OnceRunner('fetch', fetch_cache_size, fetch_exception_timeout) - # elasticsearch - self.es_hosts = cycle(es_hosts) - self.es_session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=es_limit)) - self.collection_indices = ','.join(collection_indices) - self.mss_index = mss_index - # MSS - self.mss_url = mss_url - self.mss_ssl = None if mss_ssl else False - self.mss_session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=mss_limit, - ssl=mss_ssl)) - # dams - self.dm_ssl = None if dm_ssl else False - self.dm_session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=dm_limit, - ssl=dm_ssl)) - # image conversion - self.ic_fast_pool = ProcessPoolExecutor(max_workers=ic_slow_pool_size) - self.ic_slow_pool = ProcessPoolExecutor(max_workers=ic_fast_pool_size) - self.ic_quality = ic_quality - self.ic_subsampling = ic_subsampling - # start a cron to make sure we check the access for each image we have cached every hour - self.clean_up_cron = aiocron.crontab('0 * * * *', func=self.clean_up) - - async def clean_up(self) -> int: - """ - Call this function to clean up cached data for images that shouldn't be accessible any more. - This could be called on the same basis as the data importer but instead we simplify and call - it way more, i.e. every hour (see the clean_up_cron attr defined above). - - Cached image data, source image data, image info metadata and cached info.json dicts are all - removed if an image is no longer accessible. - - :return: the number of images that had data removed - """ - removed = 0 - # use the cache dir, source dir and info.json cache as sources for the names to remove. 
We
-        # could also use the fetch runner but given that we're using the source dir already and
-        # that's where the fetch runner writes to it seems unnecessary
-        name_sources = chain(self.cache_path.iterdir(), self.source_path.iterdir(),
-                             self.info_json_cache.keys())
-        names = {path.name for path in name_sources}
-        for name in names:
-            doc = await self.get_mss_doc(name, refresh=True)
-            if doc is None:
-                shutil.rmtree(self.cache_path / name)
-                shutil.rmtree(self.source_path / name)
-                self.fetch_runner.expire_matching(lambda task_id: task_id.startswith(f'{name}::'))
-                self.info_json_cache.pop(name, None)
-                removed += 1
-
-        self.logger.warning(f'Clean up removed {removed} images that are no longer available')
-        return removed
-
-    async def get_info(self, name: str) -> Optional[MSSImageInfo]:
-        """
-        Given an image name (a GUID) returns a MSSImageInfo object or None if the image can't be
-        found/isn't allowed to be accessed. If the image doesn't have width and height stored in the
-        elasticsearch index for whatever reason then the image will be retrieved and the size
-        extracted.
-
-        :param name: the GUID of the image
-        :return: an MSSImageInfo instance or None
-        """
-        doc = await self.get_mss_doc(name)
-        if doc is None:
-            return None
+        self.info_cache = TTLCache(info_cache_size, info_cache_ttl)
+        self.get_info_locker = Locker(default_timeout=info_lock_ttl)
 
-        info = MSSImageInfo(self.name, name, doc)
+        self.get_mss_doc_locker = Locker(default_timeout=info_lock_ttl)
+        self.mss_doc_cache = TTLCache(maxsize=info_cache_size, ttl=info_cache_ttl)
 
-        if info.width is None or info.height is None:
-            source_path = await self.fetch_source(info)
-            info.width, info.height = get_size(source_path)
+        self.es_handler = MSSElasticsearchHandler(es_hosts, collection_indices, es_limit, mss_index)
+        self.store = MSSSourceStore(self.source_path, mss_url, source_cache_size,
+                                    source_cache_ttl, mss_limit, dams_limit, mss_ssl,
+                                    dams_ssl, convert_slow_pool_size, convert_fast_pool_size,
+                                    convert_quality, convert_subsampling)
 
-        return info
+    async def get_info(self, name: str) -> MSSImageInfo:
+        """
+        Given an image name (a GUID), returns a MSSImageInfo object or raises an error if the
+        image can't be found/isn't allowed to be accessed. If the image doesn't have width and
+        height stored in the elasticsearch index for whatever reason then the original source image
+        will be retrieved and the size extracted.
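The get_info implementation below guards a TTL cache with a per-name lock and re-checks the cache after acquiring it. In isolation, the pattern looks like this (a sketch — fetch_info is a hypothetical expensive lookup, and a plain asyncio.Lock stands in for the Locker used here):

    import asyncio
    from cachetools import TTLCache

    cache = TTLCache(maxsize=100_000, ttl=43_200)
    lock = asyncio.Lock()

    async def get_cached(name: str):
        # fast path: no locking needed if the value is already cached
        if name in cache:
            return cache[name]
        async with lock:
            # double check: another coroutine may have filled the cache while we waited
            if name in cache:
                return cache[name]
            value = await fetch_info(name)  # hypothetical expensive lookup
            cache[name] = value
            return value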
-    async def fetch_source(self, info: MSSImageInfo,
-                           target_size: Optional[Tuple[int, int]] = None) -> Path:
+        :param name: the GUID of the image
+        :return: an MSSImageInfo instance
+        """
+        if name in self.info_cache:
+            return self.info_cache[name]
+
+        try:
+            async with self.get_info_locker.acquire(name):
+                try:
+                    # double check that the cache hasn't been filled while we waited for the lock
+                    if name in self.info_cache:
+                        return self.info_cache[name]
+
+                    doc = await self.get_mss_doc(name)
+                    info = MSSImageInfo(self.name, name, doc)
+                    if info.width is None or info.height is None:
+                        async with self.use_source(info) as source_path:
+                            info.width, info.height = get_size(source_path)
+                    self.info_cache[name] = info
+                    return info
+                except IIIFServerException:
+                    raise
+                except Exception as cause:
+                    e = ImageNotFound(
+                        self.name, f'An error occurred while processing an info request for {name}',
+                        cause=cause, level=logging.ERROR
+                    )
+                    raise e from cause
+        except asyncio.TimeoutError as cause:
+            raise Timeout(cause=cause, log=f'Timeout while waiting for get_info lock on {name} in '
+                                           f'profile {self.name}')
+
+    @asynccontextmanager
+    async def use_source(self, info: MSSImageInfo, size: Optional[Tuple[int, int]] = None) -> Path:
         """
-        Given a MSSImageInfo object retrieve a source image that can be used to fulfill future image
-        data requests. If no target size is provided then the size of the image is used as the
+        Given an MSSImageInfo object, retrieve a source image and yield the path where it is stored.
+        This is an async context manager, and while the context exists the source image will
+        remain on disk. If no target size is provided then the size of the image is used as the
         target size, this could result in either the original image being retrieved or a derivative
         of the same size (sometimes, it seems, EMu generates a jpeg derivative of the same size as
         the original). If the target size is provided then the smallest image available in MSS that
         can fulfill the request will be used.
 
         :param info: an MSSImageInfo instance
-        :param target_size: a target size 2-tuple or None
+        :param size: a target size 2-tuple or None
         :return: the path to the source image on disk
         """
         # if the target size isn't provided, fill it in using the full size of the image. This
         # provides useful functionality as it allows choose_file to return a full size derivative
         # of the image instead of the original if one is available
-        if target_size is None:
-            target_size = info.size
-        file = info.choose_file(target_size)
-
-        source_path = self.source_path / info.name / file
-        if source_path.exists():
-            return source_path
-
-        task_id = f'{info.name}::{file}'
-        is_original = file == info.original
-
-        async def download():
-            with tempfile.NamedTemporaryFile() as f:
-                async for chunk in self._fetch_file(info.emu_irn, file, is_original):
-                    f.write(chunk)
-                    f.flush()
-
-                # to avoid the conversion pool sitting there just converting loads of giant tiffs
-                # and blocking anything else from going through, we have two pools with different
-                # priorities (this could have been implemented as a priority queue of some kind but
-                # this is easier and time is money, yo! Might as well let the OS do the scheduling).
-                if source_path.suffix.lower() in {'.jpeg', '.jpg'}:
-                    # jpegs are quick to convert
-                    pool = self.ic_fast_pool
-                else:
-                    # anything else is slower, tiffs for example
-                    pool = self.ic_slow_pool
-
-                # we've downloaded the file, convert it on a separate thread
-                await self.loop.run_in_executor(pool, convert_image, Path(f.name), source_path,
-                                                self.ic_quality, self.ic_subsampling)
-
-        # we only want to be downloading a source file once so use the runner
-        await self.fetch_runner.run(task_id, download)
-        return source_path
-
-    async def get_mss_doc(self, name: str, refresh: bool = False) -> Optional[dict]:
-        """
-        Retrieves an MSS doc and ensures it's should be accessible. For a doc to be returned instead
-        of None:
+        if size is None:
+            size = info.size
+        file = info.choose_file(size)
+        source = MSSSourceFile(str(info.emu_irn), file, info.original == file)
+        async with self.store.use(source) as path:
+            yield path
+
+    async def get_mss_doc(self, name: str) -> dict:
+        """
+        Retrieves an MSS doc and ensures it should be accessible. For a doc to be returned:
 
         - the GUID (i.e. the name) must be unique and exist in the mss elasticsearch index
 
         - the EMu IRN that the GUID maps to must be valid according the to the MSS (specifically
@@ -256,57 +236,58 @@ async def get_mss_doc(self, name: str, refresh: bool = False) -> Optional[dict]:
         artefacts indices as an associated media item
 
         :param name: the image name (a GUID)
-        :param refresh: whether to enforce a refresh of the doc from elasticsearch rather than using
-                        the cache
-        :return: the mss doc as a dict or None
+        :return: the mss doc as a dict
         """
+        if name in self.mss_doc_cache:
+            return self.mss_doc_cache[name]
+
+        try:
+            async with self.get_mss_doc_locker.acquire(name):
+                try:
+                    # double check that the cache hasn't been filled while we waited for the lock
+                    if name in self.mss_doc_cache:
+                        return self.mss_doc_cache[name]
+
+                    # first, get the document from the mss index
+                    total, doc = await self.es_handler.get_mss_doc(name)
+                    if total > 1:
+                        raise MSSDocDuplicates(self.name, name, total)
+                    elif total == 0:
+                        raise MSSDocNotFound(self.name, name)
+
+                    emu_irn = int(doc['id'])
+
+                    # TODO: I think we can get rid of this requirement now that we're using GUIDs
+                    #  instead of irns - you can't guess a GUID
+                    # check if there's an associated collection record
+                    if not await self.es_handler.has_collection_record(emu_irn):
+                        raise MissingCollectionRecord(self.name, name, emu_irn)
+
+                    # finally, check with mss that the irn is valid
+                    if not await self.store.check_access(emu_irn):
+                        raise MSSAccessDenied(self.name, name, emu_irn)
+
+                    self.mss_doc_cache[name] = doc
+                    return doc
+                except IIIFServerException:
+                    raise
+                except Exception as cause:
+                    e = ImageNotFound(self.name, name, cause=cause, level=logging.ERROR)
+                    raise e from cause
+        except asyncio.TimeoutError as cause:
+            raise Timeout(cause=cause, log=f'Timeout while waiting for get_mss_doc lock on {name} '
+                                           f'in profile {self.name}')
+
+    async def resolve_filename(self, name: str) -> str:
+        """
+        Given an image name (i.e. a GUID), return the original filename, raising an error if the
+        image can't be found.
- async def get_doc() -> Optional[dict]: - # first, check that we have a document in the mss index - search_url = f'{next(self.es_hosts)}/{self.mss_index}/_search' - search = Search().filter('term', **{'guid.keyword': name}) - async with self.es_session.post(search_url, json=search.to_dict()) as response: - text = await response.text(encoding='utf-8') - result = json.loads(text) - if result['hits']['total'] == 1: - doc = result['hits']['hits'][0]['_source'] - emu_irn = doc['id'] - else: - # TODO: might be nice to indicate if the GUID is not unique? - return None - - # next, check that the irn is associated with a record in the collection datasets - count_url = f'{next(self.es_hosts)}/{self.collection_indices}/_count' - search = Search() \ - .filter('term', **{'data.associatedMedia._id': emu_irn}) \ - .filter('term', **{'meta.versions': int(time.time() * 1000)}) - async with self.es_session.post(count_url, json=search.to_dict()) as response: - text = await response.text(encoding='utf-8') - if json.loads(text)['count'] == 0: - return None - - # finally, check with mss that the irn is valid - async with self.mss_session.get(f'{self.mss_url}/{emu_irn}') as response: - if not response.ok: - return None - - # if we get here then all 3 checks have passed - return doc - - if refresh: - self.doc_runner.expire(name) - return await self.doc_runner.run(name, get_doc) - - async def resolve_filename(self, name: str) -> Optional[str]: - """ - Given an image name (i.e. IRN), return the original filename or None if the image can't be - found. - - :param name: the image name (in this case the IRN) - :return: the filename or None + :param name: the image name (in this case the GUID) + :return: the filename """ doc = await self.get_mss_doc(name) - return doc['file'] if doc is not None else None + return doc['file'] async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True): """ @@ -318,37 +299,180 @@ async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors= :param raise_errors: whether to raise errors when they happen or simply stop, swallowing the error """ - doc = await self.get_mss_doc(name) - if doc is not None: - try: - async for chunk in self._fetch_file(doc['id'], doc['file'], True, chunk_size): - yield chunk - except Exception as e: - if raise_errors: - raise e + try: + doc = await self.get_mss_doc(name) + source = MSSSourceFile(doc['id'], doc['file'], True, chunk_size) + async for chunk in self.store.stream(source): + yield chunk + except Exception as e: + if raise_errors: + raise e + + async def close(self): + """ + Close down this profile. + """ + await self.es_handler.close() + await self.store.close() + + +class MSSElasticsearchHandler: + """ + Class that handles requests to Elasticsearch for the MSS profile. + """ - async def _fetch_file(self, emu_irn: str, file: str, is_original: bool, chunk_size: int = 4096): + def __init__(self, es_hosts: List[str], collection_indices: List[str], limit: int = 20, + mss_index: str = 'mss'): """ - Fetches a file from MSS or, if the file is the original and doesn't exist in MSS, the old - dams servers. Once a source for the requested file is located, the bytes are yielded in - chunks of chunk_size. 
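stream_original above yields raw byte chunks; a caller wanting the file on disk might drain it with aiofiles along these lines (a sketch assuming a profile instance):

    import aiofiles

    async def save_original(profile, name: str, destination: str):
        async with aiofiles.open(destination, mode='wb') as f:
            # write the streamed chunks straight to disk as they arrive
            async for chunk in profile.stream_original(name, chunk_size=4096):
                await f.write(chunk)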
+ :param es_hosts: a list of elasticsearch hosts to use + :param collection_indices: the indices to search to confirm the images can be accessed + :param limit: the maximum number of simultaneous connections that can be made to + elasticsearch + :param mss_index: the MSS index name in elasticsearch + """ + self.es_hosts = cycle(es_hosts) + self.es_session = create_client_session(limit, ssl=False) + self.collection_indices = ','.join(collection_indices) + self.mss_index = mss_index - :param emu_irn: the EMu IRN of the multimedia record for the file - :param file: the name of the file to retrieve - :param is_original: whether the file is an original image (if it is and the file doesn't - exist in MSS we'll try looking for a damsurl file) + async def get_mss_doc(self, guid: str) -> Tuple[int, Optional[dict]]: + """ + Given a GUID, return the number of MSS doc hits and the first hit from the MSS index. + + :param guid: the GUID of the image + :return: the total number of hits and the first hit's source + """ + search_url = f'{next(self.es_hosts)}/{self.mss_index}/_search' + search = Search().filter('term', **{'guid.keyword': guid}).extra(size=1) + async with self.es_session.post(search_url, json=search.to_dict()) as response: + text = await response.text(encoding='utf-8') + result = json.loads(text) + total = result['hits']['total'] + first_doc = next((doc['_source'] for doc in result['hits']['hits']), None) + return total, first_doc + + async def has_collection_record(self, emu_irn: int) -> bool: + """ + Check whether the given image IRN is associated with a collection record. + + :param emu_irn: the EMu IRN + :return: True if it is, False if not + """ + count_url = f'{next(self.es_hosts)}/{self.collection_indices}/_count' + search = Search() \ + .filter('term', **{'data.associatedMedia._id': emu_irn}) \ + .filter('term', **{'meta.versions': int(time.time() * 1000)}) + async with self.es_session.post(count_url, json=search.to_dict()) as response: + text = await response.text(encoding='utf-8') + return json.loads(text)['count'] > 0 + + async def close(self): + await self.es_session.close() + + +@dataclass +class MSSSourceFile(Fetchable): + """ + Fetchable subclass representing an image in the MSS that can be retrieved. + """ + emu_irn: int + file: str + is_original: bool + chunk_size: int = 4096 + + @property + def public_name(self) -> str: + return str(self.emu_irn) + + @property + def store_path(self) -> Path: + return Path(str(self.emu_irn), self.file) + + @property + def url(self) -> str: + return f'/nhmlive/{self.emu_irn}/{quote(self.file)}' + + @property + def dams_url(self) -> str: + return f'/nhmlive/{self.emu_irn}/damsurl' + + @staticmethod + def check_url(emu_irn: int) -> str: + return f'/nhmlive/{emu_irn}' + + +class MSSSourceStore(FetchCache): + """ + Class that controls fetching source files from the MSS and then storing them in a cache and + streaming them to users directly. 
+ """ + + def __init__(self, root: Path, mss_url: str, max_size: int, ttl: float, + mss_limit: int = 20, dams_limit: int = 5, mss_ssl: bool = True, + dams_ssl: bool = True, slow_pool_size: int = 1, fast_pool_size: int = 1, + quality: int = 85, subsampling: str = '4:2:0'): + """ + :param root: the path to store the data + :param mss_url: the MSS base URL + :param max_size: maximum size that the cache can grow to + :param ttl: the maximum time a source file can be unused before it is removed + :param mss_limit: the maximum number of simultaneous connections allowed to the MSS + :param dams_limit: the maximum number of simultaneous connections allowed to the dams + :param mss_ssl: whether to use SSL for MSS connections + :param dams_ssl: whether to use SSL for the dams connections + :param slow_pool_size: size of the "slow" conversion pool + :param fast_pool_size: size of the "fast" conversion pool + :param quality: jpeg quality of converted source files + :param subsampling: jpeg subsampling value of converted source files + """ + super().__init__(root, ttl, max_size) + self.mss_session = create_client_session(mss_limit, mss_ssl, mss_url) + self.dm_session = create_client_session(dams_limit, dams_ssl) + self._fast_pool = ProcessPoolExecutor(max_workers=slow_pool_size) + self._slow_pool = ProcessPoolExecutor(max_workers=fast_pool_size) + self._convert = partial(convert_image, quality=quality, subsampling=subsampling) + + async def _fetch(self, source: MSSSourceFile): + """ + Fetch a file from the MSS and store it in the cache. + + :param source: the source + """ + with tempfile.NamedTemporaryFile() as f: + image_path = Path(f.name) + async for chunk in self.stream(source): + f.write(chunk) + f.flush() + + # convert the image file, saving the data in a temp file but then moving it + # to the source_path after the conversion is complete + with tempfile.NamedTemporaryFile(delete=False) as g: + target_path = Path(g.name) + + pool = self._choose_convert_pool(source.file) + convert = partial(self._convert, image_path, target_path) + await asyncio.get_running_loop().run_in_executor(pool, convert) + + cache_path = self.root / source.store_path + cache_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(target_path, cache_path) + + async def stream(self, source: MSSSourceFile): + """ + Stream the given source file directly from the MSS. 
+ + :param source: the source + :return: yields chunks of bytes """ async with AsyncExitStack() as stack: - file_url = f'{self.mss_url}/{emu_irn}/{quote(file)}' - response = await stack.enter_async_context(self.mss_session.get(file_url)) + response = await stack.enter_async_context(self.mss_session.get(source.url)) if response.status == 401: raise HTTPException(status_code=401, detail=f'Access denied') - if response.status == 404 and is_original: + if response.status == 404 and source.is_original: # check for a damsurl file - damsurl_file = f'{self.mss_url}/{emu_irn}/damsurl' - response = await stack.enter_async_context(self.mss_session.get(damsurl_file)) + response = await stack.enter_async_context(self.mss_session.get(source.dams_url)) response.raise_for_status() # load the url in the response and fetch it @@ -357,29 +481,44 @@ async def _fetch_file(self, emu_irn: str, file: str, is_original: bool, chunk_si response.raise_for_status() - while chunk := await response.content.read(chunk_size): + while chunk := await response.content.read(source.chunk_size): yield chunk - async def close(self): + def _choose_convert_pool(self, file: str) -> Executor: """ - Close down this profile. + Pick the pool to use for converting the given file. JPEGs are converted in the fast pool + whereas everything else is done in the slow pool. + + To avoid the conversion pool sitting there just converting loads of giant tiffs and blocking + anything else from going through, we have two pools with different priorities (this could + have been implemented as a priority queue of some kind but this is easier and time is money, + yo! Might as well let the OS do the scheduling). + + :param file: the file name + :return: which pool to convert in """ - await self.es_session.close() - await self.mss_session.close() - await self.dm_session.close() - self.ic_fast_pool.shutdown() - self.ic_slow_pool.shutdown() + if Path(file).suffix.lower() in ('.jpeg', '.jpg'): + return self._fast_pool + else: + return self._slow_pool - async def get_status(self, full: bool = False) -> dict: + async def check_access(self, emu_irn: int) -> bool: """ - Returns some nice stats about what the runners are up to and such. + Check whether the EMu multimedia IRN is valid according to the APS (which is a part of the + MSS). - :return: a dict of stats + :param emu_irn: the EMu multimedia IRN + :return: True if it's ok, False if not """ - status = await super().get_status(full) - runners = (self.doc_runner, self.fetch_runner) - status['runners'] = { - runner.name: await runner.get_status() for runner in runners - } - # TODO: add pool stats - return status + async with self.mss_session.get(MSSSourceFile.check_url(emu_irn)) as response: + return response.ok + + async def close(self): + """ + Close down the store, this will simply close out our sessions and pools, all the files are + left on the disk. 
+ """ + await self.mss_session.close() + await self.dm_session.close() + self._fast_pool.shutdown() + self._slow_pool.shutdown() diff --git a/iiif/routers/iiif.py b/iiif/routers/iiif.py index 7601cf4..0b0c095 100644 --- a/iiif/routers/iiif.py +++ b/iiif/routers/iiif.py @@ -2,7 +2,6 @@ from starlette.responses import FileResponse, JSONResponse from iiif.ops import IIIF_LEVEL, parse_params -from iiif.processing import Task from iiif.state import state from iiif.utils import get_mimetype @@ -57,30 +56,9 @@ async def get_image_data(identifier: str, region: str, size: str, rotation: str, # parse the IIIF parts of the request to assert parameter correctness and define what how we're # going to manipulate the image when we process it ops = parse_params(info, region, size, rotation, quality, fmt) - output_path = state.config.cache_path / info.profile_name / info.name / ops.location - - # only do work if there is no cached version of the requested file - if not output_path.exists(): - # this is a performance enhancement. By providing a hint at the size of the image we need to - # serve up, we can (sometimes!) use a smaller source image thus reducing processing time - if ops.region.full: - # the full image region is selected so we can take the hint from the size parameter - size_hint = (ops.size.w, ops.size.h) - else: - # a region has been specified, we'll have to use the whole thing - # TODO: can we do better than this? - size_hint = None - - # ensure a source file is available to process, passing the hint - source_path = await profile.fetch_source(info, size_hint) - - task = Task(source_path, output_path, ops) - # submit the task to the dispatcher and wait for it to complete - await state.dispatcher.submit(task) - - # add a cache-control header and iiif header + path = await state.processor.process(profile, info, ops) headers = { 'cache-control': f'max-age={profile.cache_for}', 'link': f';rel="profile"' } - return FileResponse(output_path, media_type=get_mimetype(output_path), headers=headers) + return FileResponse(path, media_type=get_mimetype(path), headers=headers) diff --git a/iiif/routers/originals.py b/iiif/routers/originals.py index 8a9fd48..c5c6410 100644 --- a/iiif/routers/originals.py +++ b/iiif/routers/originals.py @@ -5,7 +5,7 @@ from starlette.responses import StreamingResponse from zipstream import AioZipStream -from iiif.exceptions import too_many_images +from iiif.exceptions import TooManyImages from iiif.profiles import AbstractProfile from iiif.state import state from iiif.utils import parse_identifier @@ -58,7 +58,7 @@ async def zip_originals(identifiers: str, stop_on_error: bool = True, """ profiles_and_names = list(parse_identifiers(identifiers)) if len(profiles_and_names) > state.config.download_max_files: - raise too_many_images(state.config.download_max_files) + raise TooManyImages(state.config.download_max_files) # aiozipstream can't handle async generators which is a shame :( files = [] diff --git a/iiif/routers/simple.py b/iiif/routers/simple.py index 3a346f1..b0848f4 100644 --- a/iiif/routers/simple.py +++ b/iiif/routers/simple.py @@ -1,7 +1,7 @@ from fastapi import APIRouter from starlette.responses import FileResponse, StreamingResponse -from iiif.exceptions import image_not_found +from iiif.exceptions import ImageNotFound from iiif.routers.iiif import get_image_data from iiif.state import state from iiif.utils import parse_identifier, get_mimetype @@ -71,7 +71,7 @@ async def original(identifier: str) -> StreamingResponse: profile = state.get_profile(profile_name) 
filename = await profile.resolve_filename(name) if filename is None: - raise image_not_found() + raise ImageNotFound(profile_name, name) response = StreamingResponse( profile.stream_original(name, chunk_size=state.config.download_chunk_size), media_type=get_mimetype(filename), diff --git a/iiif/state.py b/iiif/state.py index fa9f20b..ff36cd3 100644 --- a/iiif/state.py +++ b/iiif/state.py @@ -1,8 +1,8 @@ from typing import Tuple, Optional from iiif.config import load_config, Config -from iiif.exceptions import profile_not_found, image_not_found -from iiif.processing import ImageProcessingDispatcher +from iiif.exceptions import ProfileNotFound, ImageNotFound +from iiif.processing import ImageProcessor from iiif.profiles import load_profiles from iiif.profiles.base import AbstractProfile, ImageInfo from iiif.utils import parse_identifier @@ -16,10 +16,9 @@ class State: def __init__(self): self.config: Config = load_config() self.profiles = load_profiles(self.config) - # create the dispatcher which controls how image data requests are handled - self.dispatcher: ImageProcessingDispatcher = ImageProcessingDispatcher() - self.dispatcher.init_workers(self.config.image_pool_size, - self.config.image_cache_size_per_process) + # create the processor which actually does the IIIF image processing + self.processor = ImageProcessor(self.config.cache_path, self.config.processed_cache_ttl, + self.config.processed_cache_size) def get_profile(self, profile_name: Optional[str] = None) -> AbstractProfile: """ @@ -34,7 +33,7 @@ def get_profile(self, profile_name: Optional[str] = None) -> AbstractProfile: profile = self.profiles.get(profile_name, None) if profile is None: - raise profile_not_found() + raise ProfileNotFound(profile_name) return profile @staticmethod @@ -50,7 +49,7 @@ async def get_info(profile: AbstractProfile, name: str) -> ImageInfo: """ info = await profile.get_info(name) if info is None: - raise image_not_found() + raise ImageNotFound(profile.name, name) return info async def get_profile_and_info(self, identifier: str) -> Tuple[AbstractProfile, ImageInfo]: diff --git a/iiif/utils.py b/iiif/utils.py index ce0f5d6..ad96dc8 100644 --- a/iiif/utils.py +++ b/iiif/utils.py @@ -1,135 +1,75 @@ #!/usr/bin/env python3 # encoding: utf-8 +from collections import OrderedDict, Counter +import abc +import aiohttp import asyncio -from asyncio import Future -from pathlib import Path -from typing import Callable, Awaitable, Optional, Tuple, Union - import humanize import io import logging import mimetypes -import sys from PIL import Image -from cachetools import LRUCache -from contextlib import contextmanager +from contextlib import suppress, asynccontextmanager from functools import lru_cache from itertools import count from jpegtran import JPEGImage +from pathlib import Path +from typing import Optional, Tuple, Union, Any from wand.exceptions import MissingDelegateError from wand.image import Image as WandImage +from iiif.exceptions import IIIFServerException + mimetypes.init() +# this is all assuming we're using uvicorn... +uvicorn_logger = logging.getLogger('uvicorn.error') +# use this logger to dump stuff into the same channels as the uvicorn logs +logger = uvicorn_logger.getChild('iiif') + -class OnceRunner: +class Locker: """ - This class runs tasks and caches the results. It ensures that a task is only actually run the - first time it is submitted and any later calls use the cached result of the first run. + A normal async Lock surrounded by a timeout. 
""" - def __init__(self, name: str, size: int, exception_timeout: Optional[float] = 0): + def __init__(self, default_timeout: float = 0): """ - :param name: the name of the runner - :param size: the maximum size of the LRU cache used to store task results - :param exception_timeout: minimum time in seconds to wait before trying a task again if it - raises an exception. Pass 0 to never try the task again (default). + :param default_timeout: how long to timeout the locks for by default (defaults to 0 which + means don't timeout, just lock forever). """ - self.name = name - self.results = LRUCache(size) - self.exception_timeout = exception_timeout - self.waiting = 0 - self.working = 0 - self.loop = asyncio.get_event_loop() - - async def run(self, task_id: str, task: Callable[..., Awaitable], *args, **kwargs): - """ - Given a task and it's unique identifier, either run the task if it's the first time we've - seen it or return the cached result of a previous run. If the task passed is currently - running then the result is awaited and returned once the first instance of the task - completes. - - :param task_id: the task's id - :param task: the task itself, this should be a reference to an async function - :param args: the args to pass to the async function when running it - :param kwargs: the kwargs to pass to the async function when running it - :return: the task's return value - """ - if task_id in self.results: - with self.wait(): - # this will either resolve instantaneously because the future in the results cache - # is done or it will wait for the task that is populating the future to complete - return await self.results[task_id] - - with self.work(): - self.results[task_id] = Future() - try: - result = await task(*args, **kwargs) - self.results[task_id].set_result(result) - return result - except Exception as exception: - # set the exception on the future so that any future or concurrent requests for the - # same task get the same exception when they await it - self.results[task_id].set_exception(exception) - - if self.exception_timeout: - self.loop.call_later(self.exception_timeout, self.expire, task_id) - - # raise it for the current caller - raise exception + self._locks = {} + self.default_timeout = default_timeout - def expire(self, task_id: str) -> bool: - """ - Force the expiry of the given task by popping it from the cache. - - :param task_id: the id of the task to expire - :return: True if the task was evicted, False if it was already expired - """ - return self.results.pop(task_id, None) is not None - - def expire_matching(self, filter_function: Callable[[str], bool]) -> int: - """ - Expire the task results that match the given filter function. Returns the number of task - results that were expired. - - :param filter_function: a function that when passed a str task ID returns True if the task - result should be removed and False if not - :return: the number of tasks removed by the filter function - """ - return sum(map(self.expire, filter(filter_function, list(self.results.keys())))) - - async def get_status(self) -> dict: - """ - Returns a status dict about the state of the runner. 
+    @asynccontextmanager
+    async def acquire(self, key: Any, timeout: Optional[float] = None):
+        """
+        Acquire the lock associated with the given key and hold it for the duration of the
+        context. If the lock can't be acquired within the timeout, asyncio.TimeoutError is raised.
+
+        :param key: the key to lock on
+        :param timeout: how long to wait for the lock; if None, the default timeout is used, and a
+            timeout of 0 or less means wait forever
+        """
+        if timeout is None:
+            timeout = self.default_timeout
 
-        :return: a dict of details
-        """
-        return {
-            'size': len(self.results),
-            'waiting': self.waiting,
-            'working': self.working,
-        }
-
-    @contextmanager
-    def wait(self):
-        """
-        Convenience context manager that increases and then decreases the waiting count around
-        whatever code it wraps.
+        lock = self._locks.setdefault(key, asyncio.Lock())
+        if timeout > 0:
+            acquired = False
+            try:
+                acquired = await asyncio.wait_for(lock.acquire(), timeout=timeout)
+                yield
+            finally:
+                if acquired:
+                    lock.release()
+        else:
+            async with lock:
+                yield
+
+    def is_locked(self, key: Any) -> bool:
         """
-        self.waiting += 1
-        yield
-        self.waiting -= 1
+        Check if the given key is locked.
 
-    @contextmanager
-    def work(self):
+        :param key: the key
+        :return: True if the key is locked, False if not
         """
-        Convenience context manager that increases and then decreases the working count around
-        whatever code it wraps.
-        """
-        self.working += 1
-        yield
-        self.working -= 1
+        return key in self._locks and self._locks[key].locked()
 
 
 def get_path_stats(path: Path) -> dict:
@@ -225,23 +165,6 @@ def convert_image(image_path: Path, target_path: Path, quality: int = 80,
     image.save(target_path, format='jpeg', quality=quality, subsampling=subsampling)
 
 
-def create_logger(name: str, level: Union[int, str]) -> logging.Logger:
-    """
-    Creates a logger with the given name that outputs to stdout and returns it.
-
-    :param name: the logger name
-    :param level: the logger level
-    :return: the logger instance
-    """
-    handler = logging.StreamHandler(sys.stdout)
-    handler.setLevel(level)
-    formatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
-    handler.setFormatter(formatter)
-    logger = logging.getLogger(name)
-    logger.addHandler(handler)
-    return logger
-
-
 def parse_identifier(identifier: str) -> Tuple[Optional[str], str]:
     """
     Parse the identifier into a profile name and image name. The profile name may be omitted in
@@ -290,3 +213,197 @@ def get_mimetype(filename: Union[str, Path]) -> str:
     guess = mimetypes.guess_type(filename)[0]
     # use octet stream as the default
     return guess if guess is not None else 'application/octet-stream'
+
+
+def create_client_session(limit: int, ssl: bool = True,
+                          base_url: Optional[str] = None) -> aiohttp.ClientSession:
+    """
+    Convenience function to create a new aiohttp session object.
+
+    :param limit: the maximum number of simultaneous connections allowed
+    :param ssl: whether to verify SSL certs or not
+    :param base_url: the base URL for all requests made with this session
+    :return: a new aiohttp.ClientSession object
+    """
+    # to verify SSL certs the connector's ssl parameter must be None, whereas False turns
+    # verification off
+    ssl = None if ssl else False
+    return aiohttp.ClientSession(base_url=base_url,
+                                 connector=aiohttp.TCPConnector(limit=limit, ssl=ssl))
+
+
+class Fetchable(abc.ABC):
+    """
+    Abstract base class of something that can be fetched by the FetchCache.
+    """
+
+    @property
+    @abc.abstractmethod
+    def public_name(self) -> str:
+        """
+        A name to use in errors.
+
+        :return: a name that is acceptable for public users
+        """
+        ...
+
+    @property
+    @abc.abstractmethod
+    def store_path(self) -> Path:
+        """
+        Where this fetchable will be stored in the store. This should be relative to the store
+        root.
+
+        :return: the relative path where this fetchable should be stored
+        """
+        ...
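(For illustration only: a minimal Fetchable implementation just supplies these two properties.
The LocalSource class below is a hypothetical example, not part of this patch.)

    from pathlib import Path

    from iiif.utils import Fetchable

    class LocalSource(Fetchable):
        """A hypothetical source identified by a name and stored at <name>/<file>."""

        def __init__(self, name: str, file: str):
            self.name = name
            self.file = file

        @property
        def public_name(self) -> str:
            # the name users will see in error messages
            return self.name

        @property
        def store_path(self) -> Path:
            # where the file lives, relative to the owning cache's root directory
            return Path(self.name) / self.file

(A FetchCache subclass, defined next, then only needs to implement _fetch to write the bytes to
self.root / fetchable.store_path; callers access files via "async with cache.use(source) as path"
so a file can't be evicted while it's still being read.)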
+
+
+class FetchCache(abc.ABC):
+    """
+    A cache with TTL and LRU functionality which allows custom fetching of the data put in it.
+    """
+
+    def __init__(self, root: Path, ttl: float, max_size: float, clean_empty_dirs: bool = True,
+                 fetch_timeout: float = 120):
+        """
+        Note that this init will automatically call self.load() and therefore populate the cache.
+        This could take time if the cache is enormous.
+
+        :param root: the root under which all data will be stored
+        :param ttl: how long untouched files can stay in the cache before being removed
+        :param max_size: the maximum number of bytes that can be stored in the cache
+        :param clean_empty_dirs: whether to delete empty parent dirs when removing expired files
+        :param fetch_timeout: how long to wait for the _fetch function to complete
+        """
+        self.root = root
+        self.ttl = ttl
+        self.max_size = max_size
+        self.clean_empty_dirs = clean_empty_dirs
+        self.fetch_timeout = fetch_timeout
+        self.total_size = 0
+        self._in_use = Counter()
+        self._sizes = {}
+        self._locker = Locker()
+        self._cleaners = OrderedDict()
+        self.requests = 0
+        self.completed = 0
+        self.errors = 0
+        # let's see what currently exists
+        self.load()
+
+    def load(self):
+        """
+        Scan the root dir and add any found files into the cache.
+        """
+        for path in self.root.rglob('*'):
+            if path.is_file():
+                size = path.stat().st_size
+                self._sizes[path] = size
+                self._cleaners[path] = self._schedule_clean_up(path)
+                self.total_size += size
+        logger.info(f'Found {len(self._sizes)} files in {self.root} [{self.pct}]')
+
+    @property
+    def pct(self) -> str:
+        """
+        Returns the percentage usage of the cache to two decimal places. This is mainly a
+        convenience for logging.
+
+        :return: the percentage of the cache in use as a formatted string
+        """
+        return f'{self.total_size / self.max_size:.2%}'
+
+    def _clean_up(self, path: Path):
+        """
+        Callback scheduled to clean up paths when their TTL expires.
+
+        :param path: the path to clean up
+        """
+        with suppress(KeyError):
+            self._cleaners.pop(path)
+
+        if path not in self._in_use:
+            if path.exists():
+                with suppress(Exception):
+                    path.unlink()
+                    if self.clean_empty_dirs:
+                        for parent in path.parents:
+                            if parent != self.root and not any(parent.iterdir()):
+                                parent.rmdir()
+                            else:
+                                break
+            self.total_size -= self._sizes.pop(path, 0)
+            logger.info(f'Cleaned up {path} [{self.pct}]')
+
+    def _schedule_clean_up(self, path: Path) -> asyncio.TimerHandle:
+        """
+        Add a callback to remove the given path after the TTL.
+
+        :param path: the path to remove
+        :return: a TimerHandle object
+        """
+        return asyncio.get_running_loop().call_later(self.ttl, self._clean_up, path)
+
+    def __contains__(self, relative_path: Path) -> bool:
+        """
+        Checks whether the given path is in use or not. This only checks the in use dict, not the
+        cleaners dict.
+
+        :param relative_path: the path (relative to the root)
+        :return: True if the path is in use, False if not
+        """
+        return self.root / relative_path in self._in_use
+
+    @abc.abstractmethod
+    def _fetch(self, fetchable: Fetchable):
+        """
+        Abstract method which, when called, puts the requested fetchable file on disk.
+
+        :param fetchable: the fetchable
+        """
+        ...
+
+    @asynccontextmanager
+    async def use(self, fetchable: Fetchable) -> Path:
+        """
+        Async context manager which, when entered, ensures the given fetchable is on disk and
+        provides the path to it. Once the context manager exits, the path is volatile and will
+        expire according to the TTL (once it is no longer in use by any other coroutines).
+ + :param fetchable: the fetchable + :return: the full path to the file + """ + self.requests += 1 + path = self.root / fetchable.store_path + + if path not in self._in_use: + if path in self._cleaners: + self._cleaners.pop(path).cancel() + elif not path.exists(): + try: + async with self._locker.acquire(path, timeout=self.fetch_timeout): + await self._fetch(fetchable) + self._sizes[path] = path.stat().st_size + self.total_size += self._sizes[path] + + times = 0 + while self._cleaners and self.total_size > self.max_size and times < 10: + path_to_clean_up, handle = self._cleaners.popitem(last=False) + handle.cancel() + self._clean_up(path_to_clean_up) + times += 1 + except Exception as cause: + self.errors += 1 + e = IIIFServerException( + f'An error occurred while fetching {fetchable.public_name}', cause=cause + ) + raise e from cause + + self._in_use[path] += 1 + try: + yield path + finally: + self._in_use[path] -= 1 + + if self._in_use[path] == 0: + del self._in_use[path] + self._cleaners[path] = self._schedule_clean_up(path) + + self.completed += 1 diff --git a/iiif/web.py b/iiif/web.py index 9631d59..0ddc783 100644 --- a/iiif/web.py +++ b/iiif/web.py @@ -1,11 +1,15 @@ #!/usr/bin/env python3 # encoding: utf-8 +import aiohttp +import platform +import time from PIL import Image -from fastapi import FastAPI +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware -from starlette.responses import JSONResponse +from starlette.responses import JSONResponse, StreamingResponse +from iiif.exceptions import handler, IIIFServerException from iiif.routers import iiif, originals, simple from iiif.state import state @@ -23,6 +27,24 @@ ) +@app.middleware('http') +async def add_debug_headers(request: Request, call_next): + """ + Middleware which adds some debug headers :) + + :param request: the request + :param call_next: the next function in the chain + :return: the response + """ + start_time = time.monotonic() + response = await call_next(request) + # add the process time + response.headers['x-process-time'] = str(time.monotonic() - start_time) + # add which server processed the request + response.headers['x-served-by'] = platform.node() + return response + + @app.on_event('shutdown') async def on_shutdown(): """ @@ -31,7 +53,7 @@ async def on_shutdown(): """ for profile in state.profiles.values(): await profile.close() - state.dispatcher.stop() + state.processor.stop() @app.get('/status') @@ -47,7 +69,7 @@ async def status(full: bool = False) -> JSONResponse: body = { 'status': ':)', 'default_profile': state.config.default_profile_name, - 'processing': state.dispatcher.get_status(), + 'processing': state.processor.get_status(), 'profiles': { profile.name: await profile.get_status(full) for profile in state.profiles.values() @@ -56,6 +78,20 @@ async def status(full: bool = False) -> JSONResponse: return JSONResponse(body, headers={'cache-control': 'no-store'}) +@app.get('/favicon.ico') +async def favicon() -> StreamingResponse: + async def get(): + async with aiohttp.ClientSession() as session: + async with session.get('https://iiif.io/favicon.ico') as response: + while chunk := await response.content.read(4096): + yield chunk + + return StreamingResponse(get()) + + +app.add_exception_handler(IIIFServerException, handler) + + # order matters here btw! 
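# (Starlette matches routes in registration order, so the more specific routers must be
# registered before any router with catch-all identifier path parameters)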
app.include_router(originals.router) app.include_router(simple.router) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..2f4c80e --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +asyncio_mode = auto diff --git a/setup.py b/setup.py index c2c8228..7817a0a 100644 --- a/setup.py +++ b/setup.py @@ -11,11 +11,10 @@ jpegtran_url = 'git+https://github.com/jbaiter/jpegtran-cffi.git@70928eb#egg=jpegtran-cffi' install_dependencies = [ - 'aiocron~=1.4', 'aiofiles~=0.6.0', - 'aiohttp[speedups]~=3.7.4', + 'aiohttp[speedups]~=3.8.1', 'aiozipstream~=0.4', - 'cachetools~=4.2.4', + 'cachetools~=5.0.0', 'cffi~=1.14.5', 'elasticsearch-dsl>=6.0.0,<7.0.0', 'fastapi~=0.63.0', @@ -23,13 +22,15 @@ f'jpegtran-cffi @ {jpegtran_url}', 'pillow~=8.2.0', 'pyyaml~=5.4.1', - 'uvicorn[standard]~=0.13.4', + 'uvicorn[standard]~=0.17.6', 'wand~=0.6.7', + 'anyio~=3.5.0', ] test_dependencies = [ 'pytest', 'pytest-asyncio', 'pytest-cov', + 'aioresponses~=0.7.3', # needed by starlette's test client 'requests', ] diff --git a/tests/profiles/test_disk.py b/tests/profiles/test_disk.py index 2898cdf..701d4c1 100644 --- a/tests/profiles/test_disk.py +++ b/tests/profiles/test_disk.py @@ -1,3 +1,4 @@ +from fastapi import HTTPException from unittest.mock import patch, MagicMock import math @@ -15,24 +16,22 @@ def disk_profile(config): class TestOnDiskProfile: - @pytest.mark.asyncio async def test_get_info_no_file(self, disk_profile): - info = await disk_profile.get_info('image') - assert info is None + with pytest.raises(HTTPException) as exc_info: + await disk_profile.get_info('image') + assert exc_info.value.status_code == 404 - @pytest.mark.asyncio async def test_get_info_with_file(self, disk_profile, config): create_image(config, 100, 100, 'test', 'image') info = await disk_profile.get_info('image') assert info.size == (100, 100) - @pytest.mark.asyncio async def test_fetch_source_no_file(self, config, disk_profile): info = ImageInfo('test', 'image', 100, 100) - source_path = await disk_profile.fetch_source(info) - assert source_path is None + with pytest.raises(HTTPException) as exc_info: + await disk_profile.fetch_source(info) + assert exc_info.value.status_code == 404 - @pytest.mark.asyncio async def test_fetch_source_with_file(self, config, disk_profile): info = ImageInfo('test', 'image', 100, 100) create_image(config, 100, 100, 'test', 'image') @@ -45,26 +44,23 @@ async def test_fetch_source_with_file(self, config, disk_profile): source_path = await disk_profile.fetch_source(info, (50, 50)) assert source_path == disk_profile.source_path / 'image' - @pytest.mark.asyncio async def test_resolve_filename_no_file(self, disk_profile): - filename = await disk_profile.resolve_filename('image') - assert filename is None - - @pytest.mark.asyncio - async def test_resolve_filename_with_file(self, config, disk_profile): - create_image(config, 100, 100, 'test', 'image') filename = await disk_profile.resolve_filename('image') assert filename == 'image' - @pytest.mark.asyncio - async def test_stream_original_no_file(self, disk_profile): + async def test_stream_original_no_file_errors_off(self, disk_profile): count = 0 - async for _ in disk_profile.stream_original('image'): + async for _ in disk_profile.stream_original('image', raise_errors=False): count += 1 assert count == 0 - @pytest.mark.asyncio - async def test_stream_original_with_file(self, config, disk_profile): + async def test_stream_original_no_file_errors_on(self, disk_profile): + with pytest.raises(HTTPException) as exc_info: + async for _ in 
disk_profile.stream_original('image', raise_errors=True): + pass + assert exc_info.value.status_code == 404 + + async def test_stream_original(self, config, disk_profile): path = create_image(config, 10000, 10000, 'test', 'image') size = path.stat().st_size chunk_size = 1024 @@ -79,20 +75,3 @@ async def test_stream_original_with_file(self, config, disk_profile): assert count == expected_count with path.open('rb') as f: assert f.read() == data - - @pytest.mark.asyncio - async def test_stream_original_with_file_errors_on(self, config, disk_profile): - disk_profile._get_source = MagicMock(return_value=MagicMock( - exists=MagicMock(return_value=True), __str__=MagicMock(return_value='/dev/null/nope'))) - with pytest.raises(Exception): - async for _ in disk_profile.stream_original('image', raise_errors=True): - pass - - @pytest.mark.asyncio - async def test_stream_original_with_file_errors_off(self, config, disk_profile): - disk_profile._get_source = MagicMock(return_value=MagicMock( - exists=MagicMock(return_value=True), __str__=MagicMock(return_value='/dev/null/nope'))) - data = b'' - async for chunk in disk_profile.stream_original('image', raise_errors=False): - data += chunk - assert not data diff --git a/tests/profiles/test_mss.py b/tests/profiles/test_mss.py index 1ac2921..036621d 100644 --- a/tests/profiles/test_mss.py +++ b/tests/profiles/test_mss.py @@ -1,13 +1,16 @@ #!/usr/bin/env python3 # encoding: utf-8 - -from unittest.mock import AsyncMock, patch - -import json +import asyncio import pytest - -from iiif.profiles import MSSProfile -from iiif.profiles.mss import MSSImageInfo +from contextlib import asynccontextmanager +from fastapi import HTTPException +from pathlib import Path +from typing import Optional, Union +from unittest.mock import AsyncMock, patch, Mock, MagicMock + +from iiif.config import Config +from iiif.exceptions import ImageNotFound +from iiif.profiles.mss import MSSImageInfo, MSSProfile from tests.utils import create_image @@ -50,57 +53,123 @@ def test_mss_choose_file_with_derivatives(): assert info.choose_file((2000, 4000)) == 'original.tif' -def create_es_mss_doc(doc): - if doc is None: - es_doc = {'found': False} +def create_profile(config: Config, mss_doc: Union[Exception, dict], has_record: bool, + mss_valid: bool, image: Union[Exception, Path], **kwargs) -> MSSProfile: + mock_es_handler = MagicMock( + has_collection_record=AsyncMock(return_value=has_record), + close=AsyncMock() + ) + if isinstance(mss_doc, Exception): + mock_es_handler.configure_mock(get_mss_doc=AsyncMock(side_effect=mss_doc)) else: - es_doc = {'found': True, '_source': doc} - return json.dumps(es_doc) - - -@pytest.fixture -def mss_profile(config): - return MSSProfile('test', config, 'http://creativecommons.org/licenses/by/4.0/', [''], '', 1, - 1, ['collections']) - - -# TODO: write more and better mss profile tests - -class TestMSSProfileGetInfo: - - @pytest.mark.asyncio - async def test_allowed(self, mss_profile): - mss_doc = { - 'id': 1234, - 'file': 'beans.tiff', - 'width': 4000, - 'height': 1600, - } - mock_get_mss_doc = AsyncMock(return_value=mss_doc) - with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc): - info = await mss_profile.get_info('testname') - assert info is not None - assert info.name == 'testname' - assert info.size == (4000, 1600) - assert info.original == 'beans.tiff' - - @pytest.mark.asyncio - async def test_missing_collections_doc(self, mss_profile): - mock_get_mss_doc = AsyncMock(return_value=None) - with patch.object(mss_profile, 'get_mss_doc', 
mock_get_mss_doc): - info = await mss_profile.get_info('1234') - assert info is None - - @pytest.mark.asyncio - async def test_missing_size(self, config, mss_profile): - mss_doc = { - 'id': 1234, - 'file': 'beans.tiff', - } - mock_get_mss_doc = AsyncMock(return_value=mss_doc) - with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc): - source_path = create_image(config, 140, 504, 'mss', 'test') - mss_profile.fetch_source = AsyncMock(return_value=source_path) - info = await mss_profile.get_info('test') - assert info is not None - assert info.size == (140, 504) + mock_es_handler.configure_mock(get_mss_doc=AsyncMock(return_value=(1, mss_doc))) + + mock_store = MagicMock( + check_access=AsyncMock(return_value=mss_valid), + close=AsyncMock() + ) + + @asynccontextmanager + async def use(*a, **k): + if isinstance(image, Exception): + raise image + else: + yield image + + async def data_iter(*stream_args, **stream_kwargs): + if isinstance(image, Exception): + raise image + else: + for byte in image.read_bytes(): + yield bytes([byte]) + + mock_store.configure_mock(stream=data_iter, use=use) + + with patch('iiif.profiles.mss.MSSElasticsearchHandler', Mock(return_value=mock_es_handler)): + with patch('iiif.profiles.mss.MSSSourceStore', Mock(return_value=mock_store)): + return MSSProfile('test', config, 'some-rights-yo', Mock(), Mock(), Mock(), **kwargs) + + +def create_mss_doc(emu_irn: int, file: str, width: Optional[int] = None, + height: Optional[int] = None, *derivatives: dict) -> dict: + doc = { + 'id': emu_irn, + 'file': file, + } + if width: + doc['width'] = width + if height: + doc['height'] = height + if derivatives: + doc['derivatives'] = list(derivatives) + return doc + + +def create_derivative(width, height, file) -> dict: + return {'width': width, 'height': height, 'file': file} + + +class TestGetInfo: + + async def test_no_source_required(self, config): + mss_doc = create_mss_doc(7, 'image.jpg', 200, 300) + profile = create_profile(config, mss_doc, True, True, Exception('should not need this')) + info = await profile.get_info('the_name') + assert info.emu_irn == 7 + assert info.width == 200 + assert info.height == 300 + assert info.original == 'image.jpg' + assert info.derivatives == [] + + async def test_source_required(self, config): + mss_doc = create_mss_doc(7, 'image.jpg') + image_path = create_image(config, 200, 300) + profile = create_profile(config, mss_doc, True, True, image_path) + info = await profile.get_info('the_name') + assert info.emu_irn == 7 + assert info.width == 200 + assert info.height == 300 + assert info.original == 'image.jpg' + assert info.derivatives == [] + + async def test_doc_error(self, config): + image = create_image(config, 200, 300) + exception = Exception('narp') + profile = create_profile(config, exception, True, True, image) + with pytest.raises(ImageNotFound) as exc_info1: + await profile.get_info('the_name') + assert exc_info1.value.cause is exception + + async def test_cache(self, config): + mss_doc = create_mss_doc(7, 'image.jpg', 200, 300) + profile = create_profile(config, mss_doc, True, True, Exception('should not need this')) + with patch.object(profile, 'get_mss_doc', wraps=profile.get_mss_doc): + await profile.get_info('the_name') + assert profile.get_mss_doc.call_count == 1 + await profile.get_info('the_name') + assert profile.get_mss_doc.call_count == 1 + + async def test_source_error(self, config): + mss_doc = create_mss_doc(7, 'image.jpg') + exception = Exception('errrrooorr!') + profile = create_profile(config, mss_doc, True, 
True, exception) + with pytest.raises(ImageNotFound) as exc_info: + await profile.get_info('the_name') + assert exc_info.value.cause is exception + + async def test_get_size_error(self, config): + mss_doc = create_mss_doc(7, 'image.jpg') + image = create_image(config, 200, 300) + profile = create_profile(config, mss_doc, True, True, image) + exception = Exception('nope!') + with patch('iiif.profiles.mss.get_size', MagicMock(side_effect=exception)): + with pytest.raises(ImageNotFound) as exc_info: + await profile.get_info('the_name') + assert exc_info.value.cause is exception + + +async def test_close(config): + profile = create_profile(config, Exception('meh'), True, True, Exception('meh')) + await profile.close() + profile.es_handler.close.assert_called_once() + profile.store.close.assert_called_once() diff --git a/tests/profiles/test_mss_es.py b/tests/profiles/test_mss_es.py new file mode 100644 index 0000000..468d71b --- /dev/null +++ b/tests/profiles/test_mss_es.py @@ -0,0 +1,94 @@ +from contextlib import closing + +import pytest +from aioresponses import aioresponses +from fastapi import HTTPException + +from iiif.profiles.mss import MSSElasticsearchHandler + +MOCK_HOST = 'http://not.a.real.es.host' +MSS_INDEX = 'mss' + + +async def test_get_mss_doc(): + mock_doc = {'a': 'doc'} + mock_response = {'hits': {'total': 1, 'hits': [{'_source': mock_doc}]}} + handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX) + try: + with aioresponses() as m: + m.post(f'{MOCK_HOST}/{MSS_INDEX}/_search', payload=mock_response) + total, doc = await handler.get_mss_doc('some-guid') + assert total == 1 + assert doc == mock_doc + finally: + await handler.close() + + +async def test_get_mss_doc_none_found(): + mock_response = {'hits': {'total': 0, 'hits': []}} + handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX) + try: + with aioresponses() as m: + m.post(f'{MOCK_HOST}/{MSS_INDEX}/_search', payload=mock_response) + total, doc = await handler.get_mss_doc('some-guid') + assert total == 0 + assert doc is None + finally: + await handler.close() + + +async def test_get_mss_doc_many_found(): + mock_docs = [{'a': 'doc'}, {'another': 'doc'}] + mock_response = {'hits': {'total': len(mock_docs), + 'hits': [{'_source': mock_doc} for mock_doc in mock_docs]}} + handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX) + try: + with aioresponses() as m: + m.post(f'{MOCK_HOST}/{MSS_INDEX}/_search', payload=mock_response) + total, doc = await handler.get_mss_doc('some-guid') + assert total == len(mock_docs) + assert doc == mock_docs[0] + finally: + await handler.close() + + +@pytest.mark.parametrize('count', [1, 2, 10347]) +async def test_has_collection_record(count): + mock_response = {'count': count} + handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX) + try: + with aioresponses() as m: + m.post(f'{MOCK_HOST}/{handler.collection_indices}/_count', payload=mock_response) + assert await handler.has_collection_record('12345') + finally: + await handler.close() + + +async def test_has_collection_record_no_hits(): + mock_response = {'count': 0} + handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX) + try: + with aioresponses() as m: + m.post(f'{MOCK_HOST}/{handler.collection_indices}/_count', payload=mock_response) + assert not await handler.has_collection_record('12345') + finally: + await handler.close() + + +async def 
test_cycling_hosts(): + host_1 = 'http://not.a.real.es.host1' + mock_response_1 = {'count': 1} + host_2 = 'http://not.a.real.es.host2' + mock_response_2 = {'count': 0} + handler = MSSElasticsearchHandler([host_1, host_2], ['index-1', 'index-2'], + mss_index=MSS_INDEX) + try: + with aioresponses() as m: + m.post(f'{host_1}/{handler.collection_indices}/_count', payload=mock_response_1) + m.post(f'{host_2}/{handler.collection_indices}/_count', payload=mock_response_2) + m.post(f'{host_1}/{handler.collection_indices}/_count', payload=mock_response_1) + assert await handler.has_collection_record('12345') + assert not await handler.has_collection_record('12345') + assert await handler.has_collection_record('12345') + finally: + await handler.close() diff --git a/tests/profiles/test_mss_store.py b/tests/profiles/test_mss_store.py new file mode 100644 index 0000000..429716a --- /dev/null +++ b/tests/profiles/test_mss_store.py @@ -0,0 +1,164 @@ +import pytest +from PIL import Image +from aiohttp import ClientResponseError +from aioresponses import aioresponses +from fastapi import HTTPException +from io import BytesIO +from pathlib import Path + +from iiif.profiles.mss import MSSSourceStore, MSSSourceFile + +MOCK_HOST = 'http://not.the.real.mss.host' + + +@pytest.fixture +def source_root(tmpdir) -> Path: + return Path(tmpdir, 'test') + + +async def test_choose_convert_pool(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + try: + assert store._choose_convert_pool('beans.jpg') == store._fast_pool + assert store._choose_convert_pool('beans.jpeg') == store._fast_pool + assert store._choose_convert_pool('beans.tiff') == store._slow_pool + assert store._choose_convert_pool('beans.any') == store._slow_pool + finally: + await store.close() + + +async def test_check_access_ok(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + try: + with aioresponses() as m: + m.get(MSSSourceFile.check_url(emu_irn), status=204) + assert await store.check_access(emu_irn) + finally: + await store.close() + + +async def test_check_access_failed(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + try: + with aioresponses() as m: + m.get(MSSSourceFile.check_url(emu_irn), status=401) + assert not await store.check_access(emu_irn) + finally: + await store.close() + + +async def test_stream(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + content = b'1234 look at me i am a jpg image' + chunk_size = 2 + try: + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, False, chunk_size=chunk_size) + m.get(source.url, body=content) + buffer = BytesIO() + async for chunk in store.stream(source): + buffer.write(chunk) + assert len(chunk) <= chunk_size + assert buffer.getvalue() == content + finally: + await store.close() + + +async def test_stream_access_denied(source_root): + source_root: Path + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + try: + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, False) + m.get(source.url, status=401) + with pytest.raises(HTTPException): + async for chunk in store.stream(source): + pass + finally: + await store.close() + + +async def test_stream_missing(source_root): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + try: + with aioresponses() as m: + source = 
MSSSourceFile(emu_irn, file, False) + m.get(source.url, status=404) + with pytest.raises(ClientResponseError): + async for chunk in store.stream(source): + pass + finally: + await store.close() + + +async def test_stream_use_dams(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + dams_host = 'http://not.the.real.dams' + dams_url = f'{dams_host}/some_original.tiff' + content = b'1234 look at me i am a tiff image' + chunk_size = 2 + try: + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, True, chunk_size=chunk_size) + m.get(source.url, status=404) + m.get(source.dams_url, body=dams_url) + m.get(dams_url, body=content) + + buffer = BytesIO() + async for chunk in store.stream(source): + buffer.write(chunk) + assert len(chunk) <= chunk_size + assert buffer.getvalue() == content + finally: + await store.close() + + +async def test_stream_dams_fails(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + dams_host = 'http://not.the.real.dams' + dams_url = f'{dams_host}/some_original.tiff' + try: + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, True) + m.get(source.url, status=404) + m.get(source.dams_url, body=dams_url) + m.get(dams_url, status=404) + + with pytest.raises(ClientResponseError): + async for chunk in store.stream(source): + pass + finally: + await store.close() + + +async def test_use(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + + buffer = BytesIO() + image = Image.new('RGB', size=(100, 100), color=(255, 0, 0)) + image.save(buffer, 'png') + + try: + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, False) + m.get(source.url, body=buffer.getvalue()) + async with store.use(source) as path: + assert path.exists() + with Image.open(path) as image: + assert image.format.lower() == 'jpeg' + finally: + await store.close() diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index f393919..140e0c2 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -1,23 +1,22 @@ -from iiif.exceptions import profile_not_found, too_many_images, image_not_found, \ - invalid_iiif_parameter +from iiif.exceptions import ProfileNotFound, TooManyImages, ImageNotFound, InvalidIIIFParameter def test_profile_not_found(): - assert profile_not_found().status_code == 404 + assert ProfileNotFound('test').status_code == 404 def test_image_not_found(): - assert image_not_found().status_code == 404 + assert ImageNotFound('test', 'image').status_code == 404 def test_too_many_images(): - error = too_many_images(1029) + error = TooManyImages(1029) assert error.status_code == 400 - assert '1029' in error.detail + assert '1029' in error.public def test_invalid_iiif_parameter(): - error = invalid_iiif_parameter('Goats', 'Beans') + error = InvalidIIIFParameter('Goats', 'Beans') assert error.status_code == 400 - assert 'Goats' in error.detail - assert 'Beans' in error.detail + assert 'Goats' in error.public + assert 'Beans' in error.public diff --git a/tests/test_processing.py b/tests/test_processing.py index 8426188..ed4e15d 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -1,22 +1,17 @@ #!/usr/bin/env python3 # encoding: utf-8 -from asyncio import Future -from pathlib import Path -from queue import Queue -from typing import Union -from unittest.mock import patch, MagicMock, call - import hashlib -import 
io -import multiprocessing as mp import pytest from PIL import Image from jpegtran import JPEGImage +from pathlib import Path +from queue import Queue +from typing import Union -from iiif.ops import parse_params, Region, Size, Rotation, Quality, Format -from iiif.processing import Task, Worker, ImageProcessingDispatcher, process_region, process_size, \ - process_rotation, process_quality, process_format +from iiif.ops import Region, Size, Rotation, Quality, Format +from iiif.processing import process_region, process_size, process_rotation, process_quality, \ + process_format from iiif.profiles.base import ImageInfo from iiif.utils import to_jpegtran, to_pillow from tests.utils import create_image @@ -137,496 +132,4 @@ def test_format(fmt: Format, expected_format: str, tmp_path: Path, image: JPEGIm with Image.open(output_path) as image: assert image.format == expected_format - -class TestWorker: - - @pytest.fixture - def worker(self, result_queue): - worker = Worker(0, result_queue, 1) - worker.stop() - return worker - - @pytest.fixture - def task(self, source_path, cache_path, info): - return Task(source_path, cache_path, parse_params(info)) - - @pytest.fixture - def result_queue(self): - return mp.Queue() - - def test_add(self, worker, task): - assert worker.queue_size == 0 - - worker.add(task) - - assert worker.queue_size == 1 - assert task.source_path in worker.predicted_cache - assert worker.task_queue.qsize() == 1 - - def test_done(self, worker, task): - assert worker.queue_size == 0 - worker.add(task) - assert worker.queue_size == 1 - worker.done(task) - assert worker.queue_size == 0 - - def test_stop(self, result_queue): - worker = Worker(0, result_queue, 1) - worker.stop() - assert worker.task_queue.qsize() == 0 - assert not worker.process.is_alive() - - def test_is_warm_for(self, config, result_queue): - worker = Worker(0, result_queue, 2) - - image1_source = create_image(config, 100, 200, profile='test', name='image1') - info1 = ImageInfo('t1', 't1', 100, 200) - image2_source = create_image(config, 100, 200, profile='test', name='image2') - info2 = ImageInfo('t1', 't2', 100, 200) - image3_source = create_image(config, 100, 200, profile='test', name='image3') - info3 = ImageInfo('t1', 't3', 100, 200) - image4_source = create_image(config, 100, 200, profile='test', name='image4') - info4 = ImageInfo('t1', 't4', 100, 200) - image1_cache = config.cache_path / 'test' / 'image1' - image2_cache = config.cache_path / 'test' / 'image2' - image3_cache = config.cache_path / 'test' / 'image3' - image4_cache = config.cache_path / 'test' / 'image4' - - task1 = Task(image1_source, image1_cache, parse_params(info1)) - task2 = Task(image2_source, image2_cache, parse_params(info2)) - task3 = Task(image3_source, image3_cache, parse_params(info3)) - task4 = Task(image4_source, image4_cache, parse_params(info4)) - - worker.add(task1) - assert any(worker.is_warm_for(task) for task in (task1,)) - assert not any(worker.is_warm_for(task) for task in (task2, task3, task4)) - - worker.add(task2) - assert any(worker.is_warm_for(task) for task in (task1, task2)) - assert not any(worker.is_warm_for(task) for task in (task3, task4)) - - worker.add(task3) - assert any(worker.is_warm_for(task) for task in (task2, task3)) - assert not any(worker.is_warm_for(task) for task in (task1, task4)) - - worker.add(task4) - assert any(worker.is_warm_for(task) for task in (task3, task4)) - assert not any(worker.is_warm_for(task) for task in (task1, task2)) - - worker.stop() - - def test_result_queue_is_used(self, config, 
cache_path, result_queue): - worker = Worker(0, result_queue, 2) - source_path = create_image(config, 100, 200) - info = ImageInfo('t1', 't1', 100, 200) - task = Task(source_path, cache_path, parse_params(info)) - - worker.add(task) - worker.stop() - - assert result_queue.qsize() == 1 - assert result_queue.get() == (0, task, None) - - -class TestTask: - - def test_equality(self, config): - image1_source = create_image(config, 100, 200, profile='test', name='image1') - image2_source = create_image(config, 100, 200, profile='test', name='image2') - image1_cache = config.cache_path / 'test' / 'image1' - image2_cache = config.cache_path / 'test' / 'image2' - image1_info = ImageInfo('test', 'image1', 100, 200) - image2_info = ImageInfo('test', 'image2', 100, 200) - - # simple the checks that the same image is the same and different images are different - assert Task(image1_source, image1_cache, parse_params(image1_info)) == \ - Task(image1_source, image1_cache, parse_params(image1_info)) - assert Task(image1_source, image1_cache, parse_params(image1_info)) != \ - Task(image2_source, image2_cache, parse_params(image2_info)) - # simple checks that different ops on the same images are different things - assert Task(image2_source, image2_cache, parse_params(image2_info)) != \ - Task(image2_source, image2_cache, parse_params(image2_info, size='50,')) - assert Task(image2_source, image2_cache, parse_params(image2_info, region='square')) != \ - Task(image2_source, image2_cache, parse_params(image2_info)) - # check that even if the ops are different we know they are the same semantically - assert Task(image1_source, image1_cache, parse_params(image1_info)) == \ - Task(image1_source, image1_cache, parse_params(image1_info, region='0,0,100,200')) - assert Task(image1_source, image1_cache, parse_params(image1_info)) == \ - Task(image1_source, image1_cache, parse_params(image1_info, size='100,200')) - assert Task(image2_source, image2_cache, parse_params(image2_info, region='square')) == \ - Task(image2_source, image2_cache, parse_params(image2_info, region='0,50,100,100')) - - -@pytest.fixture -def dispatcher(event_loop): - with patch('iiif.processing.asyncio.get_event_loop', MagicMock(return_value=event_loop)): - dispatcher = ImageProcessingDispatcher() - yield dispatcher - dispatcher.stop() - - -@pytest.fixture -def default_ops(): - return parse_params(ImageInfo('test', 'image', 100, 200)) - - -class TestImageProcessingDispatcher: - - def test_result_listener(self): - mock_loop = MagicMock() - with patch('iiif.processing.asyncio.get_event_loop', MagicMock(return_value=mock_loop)): - dispatcher = ImageProcessingDispatcher() - mock_result = ('some', 'mock', 'data') - dispatcher.result_queue.put(mock_result) - dispatcher.result_queue.put(None) - dispatcher.result_thread.join() - - mock_loop.call_soon_threadsafe.assert_called_once_with(dispatcher.finish_task, *mock_result) - - def test_init_workers(self, dispatcher): - worker_mock = MagicMock() - with patch('iiif.processing.Worker', worker_mock): - dispatcher.init_workers(2, 3) - - worker_mock.assert_has_calls([ - call(0, dispatcher.result_queue, 3), - call(1, dispatcher.result_queue, 3) - ]) - assert len(dispatcher.workers) == 2 - - def test_stop(self): - dispatcher = ImageProcessingDispatcher() - dispatcher.init_workers(2, 3) - - worker0 = dispatcher.workers[0] - worker1 = dispatcher.workers[1] - - with patch.object(worker0, 'stop', wraps=worker0.stop) as stop0: - with patch.object(worker1, 'stop', wraps=worker1.stop) as stop1: - dispatcher.stop() - 
stop0.assert_called_once() - stop1.assert_called_once() - - assert dispatcher.result_queue.empty() - assert not dispatcher.result_thread.is_alive() - - def test_finish_task_success(self, dispatcher, config, cache_path, default_ops): - worker_id = 0 - source = create_image(config, 100, 200) - task = Task(source, cache_path, default_ops) - mock_future = MagicMock() - mock_worker = MagicMock() - dispatcher.workers[worker_id] = mock_worker - dispatcher.output_paths[task.output_path] = mock_future - - dispatcher.finish_task(worker_id, task, None) - - mock_worker.done.assert_called_once_with(task) - mock_future.set_result.called_once_with(None) - mock_future.set_exception.assert_not_called() - - def test_finish_task_failure(self, dispatcher, config, cache_path, default_ops): - worker_id = 0 - source = create_image(config, 100, 200) - task = Task(source, cache_path, default_ops) - mock_future = MagicMock() - mock_worker = MagicMock() - mock_exception = MagicMock() - dispatcher.workers[worker_id] = mock_worker - dispatcher.output_paths[task.output_path] = mock_future - - dispatcher.finish_task(worker_id, task, mock_exception) - - mock_worker.done.assert_called_once_with(task) - mock_future.set_exception.called_once_with(mock_exception) - mock_future.set_result.assert_not_called() - - def test_choose_worker_all_empty(self, dispatcher): - worker0 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - worker2 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - dispatcher.workers[2] = worker2 - - mock_choice = MagicMock() - with patch('iiif.processing.random', MagicMock(choice=mock_choice)): - dispatcher.choose_worker(MagicMock()) - - mock_choice.assert_called_once_with([worker0, worker1, worker2]) - - def test_choose_worker_all_really_full(self, dispatcher): - worker0 = MagicMock(queue_size=100, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=100, is_warm_for=MagicMock(return_value=False)) - worker2 = MagicMock(queue_size=100, is_warm_for=MagicMock(return_value=False)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - dispatcher.workers[2] = worker2 - - mock_choice = MagicMock() - with patch('iiif.processing.random', MagicMock(choice=mock_choice)): - dispatcher.choose_worker(MagicMock()) - - mock_choice.assert_called_once_with([worker0, worker1, worker2]) - - def test_choose_worker_some_warm(self, dispatcher): - worker0 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=True)) - worker2 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=True)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - dispatcher.workers[2] = worker2 - - mock_choice = MagicMock() - with patch('iiif.processing.random', MagicMock(choice=mock_choice)): - dispatcher.choose_worker(MagicMock()) - - mock_choice.assert_called_once_with([worker1, worker2]) - - def test_choose_worker_scenario_1(self, dispatcher): - """ - Worker states: - - 0: empty, not warm - - 1: empty, warm - - Worker 1 should be chosen as warm and empty is better than not warm and empty. 
- """ - worker0 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=True)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_2(self, dispatcher): - """ - Worker states: - - 0: full, not warm - - 1: full, warm - - Worker 1 should be chosen as warm and full is better than not warm and full. - """ - worker0 = MagicMock(queue_size=3, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=3, is_warm_for=MagicMock(return_value=True)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_3(self, dispatcher): - """ - Worker states: - - 0: some tasks, not warm - - 1: some tasks, warm - - Worker 1 should be chosen as warm with some tasks is better than not warm and some tasks. - """ - worker0 = MagicMock(queue_size=1, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=1, is_warm_for=MagicMock(return_value=True)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_4(self, dispatcher): - """ - Worker states: - - 0: full, not warm - - 1: empty, not warm - - Worker 1 should be chosen as empty but not warm is better than full and not warm - """ - worker0 = MagicMock(queue_size=3, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_5(self, dispatcher): - """ - Worker states: - - 0: some tasks, not warm - - 1: empty, not warm - - 2: full, not warm - - Worker 1 should be chosen as empty but not warm is better than full and not warm as well as - some tasks and not warm. - """ - worker0 = MagicMock(queue_size=3, is_warm_for=MagicMock(return_value=False)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - worker2 = MagicMock(queue_size=1, is_warm_for=MagicMock(return_value=False)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - dispatcher.workers[2] = worker2 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_6(self, dispatcher): - """ - Worker states: - - 0: some tasks, warm - - 1: empty, warm - - 2: full, warm - - Worker 1 should be chosen as empty but warm is better than full and warm as well as some - tasks and warm. 
- """ - worker0 = MagicMock(queue_size=3, is_warm_for=MagicMock(return_value=True)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=True)) - worker2 = MagicMock(queue_size=1, is_warm_for=MagicMock(return_value=True)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - dispatcher.workers[2] = worker2 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_7(self, dispatcher): - """ - Worker states: - - 0: full, warm - - 1: empty, not warm - - Worker 1 should be chosen as empty but not warm is better than full and warm - """ - worker0 = MagicMock(queue_size=3, is_warm_for=MagicMock(return_value=True)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - assert worker1 == dispatcher.choose_worker(MagicMock()) - - def test_choose_worker_scenario_8(self, dispatcher): - """ - Worker states: - - 0: some tasks, warm - - 1: empty, not warm - - A random choice between worker 0 and worker 1 should be made as warm but with some tasks is - equal to empty but not warm. - """ - worker0 = MagicMock(queue_size=1, is_warm_for=MagicMock(return_value=True)) - worker1 = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=False)) - dispatcher.workers[0] = worker0 - dispatcher.workers[1] = worker1 - mock_choice = MagicMock() - with patch('iiif.processing.random', MagicMock(choice=mock_choice)): - dispatcher.choose_worker(MagicMock()) - mock_choice.assert_called_once_with([worker0, worker1]) - - @pytest.mark.asyncio - async def test_submit_already_in_progress_success(self, event_loop, dispatcher, config, - cache_path, default_ops): - source = create_image(config, 100, 200) - image_task = Task(source, cache_path, default_ops) - - # create a future to mimic what would happen if a request for this task had already been - # submitted - previous_request_future = Future() - dispatcher.output_paths[image_task.output_path] = previous_request_future - - # launch a task to which submits the task, this should await the result of the previous - # request future created above - task = event_loop.create_task(dispatcher.submit(image_task)) - # indicate the the future is complete - previous_request_future.set_result(None) - # make sure our task is done - await task - # assert it - assert task.done() - - @pytest.mark.asyncio - async def test_submit_already_in_progress_failure(self, event_loop, dispatcher, config, - cache_path, default_ops): - source = create_image(config, 100, 200) - image_task = Task(source, cache_path, default_ops) - - # create a future to mimic what would happen if a request for this task had already been - # submitted - previous_request_future = Future() - dispatcher.output_paths[image_task.output_path] = previous_request_future - - # launch a task to which submits the task, this should await the result of the previous - # request future created above - task = event_loop.create_task(dispatcher.submit(image_task)) - # indicate the the future is complete but failed - mock_exception = Exception('test!') - previous_request_future.set_exception(mock_exception) - - with pytest.raises(Exception) as e: - await task - - assert e.value == mock_exception - - @pytest.mark.asyncio - async def test_submit_already_finished_success(self, event_loop, dispatcher, config, - cache_path, default_ops): - source = create_image(config, 100, 200) - image_task = Task(source, cache_path, default_ops) - - # create a future to mimic what would happen if a 
request for this task had already been - # submitted - previous_request_future = Future() - previous_request_future.set_result(None) - dispatcher.output_paths[image_task.output_path] = previous_request_future - - # launch a task to which submits the task, this should finish almost immediately - task = event_loop.create_task(dispatcher.submit(image_task)) - # make sure our task is done - await task - # assert it - assert task.done() - - @pytest.mark.asyncio - async def test_submit_already_finished_failure(self, dispatcher, config, cache_path, - default_ops): - source = create_image(config, 100, 200) - image_task = Task(source, cache_path, default_ops) - - # create a future to mimic what would happen if a request for this task had already been - # submitted - previous_request_future = Future() - mock_exception = Exception('test!') - previous_request_future.set_exception(mock_exception) - dispatcher.output_paths[image_task.output_path] = previous_request_future - - with pytest.raises(Exception) as e: - await dispatcher.submit(image_task) - - assert e.value == mock_exception - - @pytest.mark.asyncio - async def test_submit_path_exists(self, dispatcher, config, cache_path, default_ops): - source = create_image(config, 100, 200) - image_task = Task(source, cache_path, default_ops) - - # make sure the path exists and write some data there - image_task.output_path.parent.mkdir(parents=True, exist_ok=True) - with image_task.output_path.open('w') as f: - f.write('something!') - - mock_worker = MagicMock() - dispatcher.workers[0] = mock_worker - - await dispatcher.submit(image_task) - - assert dispatcher.output_paths[image_task.output_path].done() - mock_worker.add.assert_not_called() - - @pytest.mark.asyncio - async def test_submit_path_does_not_exist(self, event_loop, dispatcher, config, cache_path, - default_ops): - source = create_image(config, 100, 200) - image_task = Task(source, cache_path, default_ops) - - def mock_add(t): - # just immediately complete the task - dispatcher.output_paths[t.output_path].set_result(None) - - mock_worker = MagicMock(queue_size=0, is_warm_for=MagicMock(return_value=True), - add=MagicMock(side_effect=mock_add)) - dispatcher.workers[0] = mock_worker - - # launch a task to which submits the task, this should finish almost immediately - task = event_loop.create_task(dispatcher.submit(image_task)) - - # make sure our task is done - await task - # assert it - assert task.done() - - assert dispatcher.output_paths[image_task.output_path].done() - mock_worker.add.assert_called_once_with(image_task) +# TODO: write processor tests diff --git a/tests/test_utils.py b/tests/test_utils.py index 93b7a8c..feb3096 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,19 +1,19 @@ #!/usr/bin/env python3 # encoding: utf-8 +from dataclasses import dataclass import asyncio -from collections import Counter, defaultdict -from unittest.mock import patch, MagicMock - import humanize import pytest from PIL import Image from jpegtran import JPEGImage +from pathlib import Path +from unittest.mock import patch, MagicMock from wand.exceptions import MissingDelegateError -from iiif.utils import convert_image, generate_sizes, get_size, get_path_stats, OnceRunner, \ - get_mimetype, parse_identifier, to_pillow, to_jpegtran -from tests.utils import create_image +from iiif.utils import convert_image, generate_sizes, get_size, get_path_stats, get_mimetype, \ + parse_identifier, to_pillow, to_jpegtran, Locker, FetchCache, Fetchable +from tests.utils import create_image, create_file class 
TestConvertImage: @@ -160,197 +160,6 @@ def test_get_path_stats_empty(tmp_path): assert stats['size_pretty'] == humanize.naturalsize(0, binary=True) -class TestOnceRunner: - - @pytest.mark.asyncio - async def test_sequential_usage(self): - result = object() - counter = Counter() - - async def task(name): - counter[name] += 1 - await asyncio.sleep(1) - return result - - runner = OnceRunner('test', 100) - assert await runner.run('task1', task, 'task1') is result - assert await runner.run('task1', task, 'task1') is result - assert await runner.run('task2', task, 'task2') is result - assert counter['task1'] == 1 - assert counter['task2'] == 1 - - @pytest.mark.asyncio - async def test_parallel_usage(self): - results = defaultdict(object) - counter = Counter() - - async def task(name): - counter[name] += 1 - await asyncio.sleep(1) - return name, results[name] - - runner = OnceRunner('test', 100) - task_a = asyncio.create_task(runner.run('task1', task, 'task1')) - task_b = asyncio.create_task(runner.run('task2', task, 'task2')) - task_c = asyncio.create_task(runner.run('task1', task, 'task1')) - - run_results = await asyncio.gather(task_a, task_b, task_c) - assert set(run_results) == set(results.items()) - assert counter['task1'] == 1 - assert counter['task2'] == 1 - - @pytest.mark.asyncio - async def test_sequential_usage_with_error(self): - counter = Counter() - - async def task(name): - counter[name] += 1 - await asyncio.sleep(1) - raise Exception(f'heyo from {name}!') - - runner = OnceRunner('test', 100) - with pytest.raises(Exception, match=f'heyo from task1!') as exc_info: - await runner.run('task1', task, 'task1') - exception = exc_info.value - - with pytest.raises(Exception, match=f'heyo from task1!') as exc_info: - await runner.run('task1', task, 'task1') - assert exception is exc_info.value - - with pytest.raises(Exception, match=f'heyo from task2!') as exc_info: - await runner.run('task2', task, 'task2') - - assert counter['task1'] == 1 - assert counter['task2'] == 1 - - @pytest.mark.asyncio - async def test_parallel_usage_with_errors(self): - counter = Counter() - - async def task(name): - counter[name] += 1 - await asyncio.sleep(1) - raise Exception(f'heyo from {name}!') - - runner = OnceRunner('test', 100) - task_a = asyncio.create_task(runner.run('task1', task, 'task1')) - task_b = asyncio.create_task(runner.run('task2', task, 'task2')) - task_c = asyncio.create_task(runner.run('task1', task, 'task1')) - - run_results = await asyncio.gather(task_a, task_b, task_c, return_exceptions=True) - assert len(run_results) == 3 - exceptions = set(run_results) - assert len(exceptions) == 2 - assert 'heyo from task1!' in set(map(str, exceptions)) - assert 'heyo from task2!' 
in set(map(str, exceptions)) - assert counter['task1'] == 1 - assert counter['task2'] == 1 - - @pytest.mark.asyncio - async def test_parallel_mixed(self): - results = defaultdict(object) - counter = Counter() - - async def erroring_task(name): - counter[name] += 1 - await asyncio.sleep(1) - raise Exception(f'heyo from {name}!') - - async def working_task(name): - counter[name] += 1 - await asyncio.sleep(1.5) - return name, results[name] - - runner = OnceRunner('test', 100) - task_a = asyncio.create_task(runner.run('task1', erroring_task, 'task1')) - task_b = asyncio.create_task(runner.run('task2', working_task, 'task2')) - task_c = asyncio.create_task(runner.run('task1', erroring_task, 'task1')) - task_d = asyncio.create_task(runner.run('task2', working_task, 'task2')) - task_e = asyncio.create_task(runner.run('task3', working_task, 'task3')) - - all_results = await asyncio.gather(task_a, task_b, task_c, task_d, task_e, - return_exceptions=True) - assert len(all_results) == 5 - - exceptions = [result for result in all_results if isinstance(result, Exception)] - working = [result for result in all_results if not isinstance(result, Exception)] - - assert len(exceptions) == 2 - assert len(set(exceptions)) == 1 - assert len(working) == 3 - assert len(set(working)) == 2 - - assert 'heyo from task1!' in set(map(str, exceptions)) - assert set(working) == set(results.items()) - assert counter['task1'] == 1 - assert counter['task2'] == 1 - assert counter['task3'] == 1 - - @pytest.mark.asyncio - async def test_expire(self): - runner = OnceRunner('test', 100) - assert not runner.expire('task1') - await runner.run('task1', asyncio.sleep, 1) - assert runner.expire('task1') - - @pytest.mark.asyncio - async def test_exception_timeout(self): - counter = Counter() - - async def erroring_task(name): - counter[name] += 1 - await asyncio.sleep(0.5) - raise Exception(f'heyo!') - - runner = OnceRunner('test', 100, exception_timeout=5) - # run the erroring task, this should raise an exception - with pytest.raises(Exception, match='heyo!'): - await runner.run('task1', erroring_task, 'task1') - # run the erroring task again, this should also raise an exception but from the result cache - with pytest.raises(Exception, match='heyo!'): - await runner.run('task1', erroring_task, 'task1') - # sleep for 5 seconds to make sure the exception timeout has run out - await asyncio.sleep(5) - # run the erroring task, this should raise an exception again, not from the cache - with pytest.raises(Exception, match='heyo!'): - await runner.run('task1', erroring_task, 'task1') - - # the counter is 2 because the task was run twice due to the exception timeout - assert counter['task1'] == 2 - - def test_wait(self): - runner = OnceRunner('test', 100) - assert runner.waiting == 0 - with runner.wait(): - assert runner.waiting == 1 - assert runner.waiting == 0 - - def test_work(self): - runner = OnceRunner('test', 100) - assert runner.working == 0 - with runner.work(): - assert runner.working == 1 - assert runner.working == 0 - - @pytest.mark.asyncio - async def test_get_status(self): - runner = OnceRunner('test', 100) - stats = await runner.get_status() - assert stats['size'] == 0 - assert stats['waiting'] == 0 - assert stats['working'] == 0 - - with runner.wait(): - with runner.work(): - stats = await runner.get_status() - assert stats['waiting'] == 1 - assert stats['working'] == 1 - - await runner.run('task1', asyncio.sleep, 1) - stats = await runner.get_status() - assert stats['size'] == 1 - - class TestGetMimetype: def 
test_normal(self): @@ -371,3 +180,126 @@ def test_to_pillow_and_to_jpegtran(): assert isinstance(jpegtran_image, JPEGImage) pillow_image_again = to_pillow(jpegtran_image) assert isinstance(pillow_image_again, Image.Image) + + +@dataclass +class FetchableForTesting(Fetchable): + + def __init__(self, size: int): + self.size = size + self.name = f'{self.size}bytes.bin' + + @property + def public_name(self) -> str: + return self.name + + @property + def store_path(self) -> Path: + return Path('test', self.name) + + +class FetchCacheForTesting(FetchCache): + + def __init__(self, root: Path, ttl: float = 1, max_size: float = 20): + super().__init__(Path(root), ttl, max_size) + + def _fetch(self, fetchable: FetchableForTesting): + path = self.root / fetchable.store_path + path.parent.mkdir(parents=True, exist_ok=True) + create_file(path, fetchable.size) + + +@pytest.fixture +def cache(tmpdir: Path) -> FetchCacheForTesting: + return FetchCacheForTesting(tmpdir) + + +class TestFetchCache: + + async def test_simple_usage(self, cache: FetchCacheForTesting): + fetchable = FetchableForTesting(4) + async with cache.use(fetchable) as path: + assert cache.total_size == 4 + assert cache.errors == 0 + assert cache.requests == 1 + assert cache.completed == 0 + assert path.exists() + assert fetchable.store_path in cache + await asyncio.sleep(cache.ttl + 0.5) + assert path.exists() + assert fetchable.store_path in cache + assert cache.completed == 1 + await asyncio.sleep(cache.ttl + 0.5) + assert not path.exists() + assert fetchable.store_path not in cache + assert not path.parent.exist() + + +class TestLocker: + + async def test_is_locked(self): + locker = Locker() + async with locker.acquire('test'): + assert locker.is_locked('test') + + async def test_acquire_no_timeout(self): + locker = Locker() + + async def first(): + async with locker.acquire('test'): + await asyncio.sleep(3) + + async def second(): + try: + async with locker.acquire('test', timeout=0): + assert True + except asyncio.TimeoutError: + assert False + + task = asyncio.ensure_future(first()) + await asyncio.sleep(1) + await second() + await task + assert not locker.is_locked('test') + + async def test_acquire_with_shorter_timeout(self): + locker = Locker() + + async def first(): + async with locker.acquire('test'): + await asyncio.sleep(5) + + async def second(): + try: + async with locker.acquire('test', timeout=1): + # should never get here because we wait for the lock and then get cancelled + # before it's available + assert False + except asyncio.TimeoutError: + assert locker.is_locked('test') + + task = asyncio.ensure_future(first()) + await asyncio.sleep(1) + await second() + await task + assert not locker.is_locked('test') + + async def test_acquire_with_longer_timeout(self): + locker = Locker() + + async def first(): + async with locker.acquire('test'): + await asyncio.sleep(3) + + async def second(): + try: + async with locker.acquire('test', timeout=5): + assert True + except asyncio.TimeoutError: + assert False + + task = asyncio.ensure_future(first()) + await asyncio.sleep(1) + await second() + await task + assert not locker.is_locked('test') diff --git a/tests/utils.py b/tests/utils.py index bc6dc56..fd84e5e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -24,3 +24,9 @@ def create_image(config: Config, width: int, height: int, profile: str = 'test', img = Image.new('RGB', (width, height), color='red') img.save(path, format='jpeg') return path + + +def create_file(path: Path, size: int): + with path.open('wb') as f: + 
f.write(bytes(size))
+    return path

From c9908645984de328dd36eeb09761d457f74012e8 Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Wed, 11 May 2022 19:05:28 +0100
Subject: [PATCH 02/40] Remove MSS collection record requirement

This commit removes the requirement for an MSS profile image to have an
associated specimen/index lot/artefact record in Elasticsearch. This is
because we rely on the APS part of the MSS to do this check for us, and
because the IDs used for the images (specifically GUIDs) are not guessable,
so a user can't accidentally stumble upon images they shouldn't be able to
get to by incrementing the ID.

This has two beneficial side effects: requests are processed faster because
we no longer have to ask Elasticsearch about the collections, and we gain
the ability to serve images that aren't associated with collection records
through this interface, which is very useful!

---
 iiif/profiles/mss.py          | 43 +++--------------------------
 tests/profiles/test_mss.py    | 19 ++++++++--------
 tests/profiles/test_mss_es.py | 43 +++++++----------------------
 3 files changed, 20 insertions(+), 85 deletions(-)

diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py
index 52fb98a..ef0bccd 100644
--- a/iiif/profiles/mss.py
+++ b/iiif/profiles/mss.py
@@ -5,7 +5,6 @@
 import logging
 import shutil
 import tempfile
-import time
 from cachetools import TTLCache
 from contextlib import AsyncExitStack, asynccontextmanager
 from dataclasses import dataclass
@@ -25,14 +24,6 @@
 from iiif.utils import get_size
 
 
-class MissingCollectionRecord(ImageNotFound):
-
-    def __init__(self, profile: str, name: str, emu_irn: int):
-        super().__init__(profile, name,
-                         log=f"Couldn't find collection record associated with "
-                             f"multimedia IRN {emu_irn} [guid: {name}]")
-        self.emu_irn = emu_irn
-
-
 class MSSAccessDenied(ImageNotFound):
 
     def __init__(self, profile: str, name: str, emu_irn: int):
@@ -98,7 +89,6 @@ def __init__(self,
                  config: Config,
                  rights: str,
                  es_hosts: List[str],
-                 collection_indices: List[str],
                  mss_url: str,
                  mss_ssl: bool = True,
                  mss_index: str = 'mss',
@@ -122,7 +112,6 @@ def __init__(self,
         :param config: the Config object
         :param rights: the rights url for images served by this profile
         :param es_hosts: a list of elasticsearch hosts to use
-        :param collection_indices: the indices to search to confirm the images can be accessed
         :param mss_url: mss base url
         :param mss_ssl: boolean indicating whether ssl certificates should be checked when making
                         requests to mss
@@ -155,7 +144,7 @@ def __init__(self,
         self.get_mss_doc_locker = Locker(default_timeout=info_lock_ttl)
         self.mss_doc_cache = TTLCache(maxsize=info_cache_size, ttl=info_cache_ttl)
 
-        self.es_handler = MSSElasticsearchHandler(es_hosts, collection_indices, es_limit, mss_index)
+        self.es_handler = MSSElasticsearchHandler(es_hosts, es_limit, mss_index)
         self.store = MSSSourceStore(self.source_path, mss_url, source_cache_size, source_cache_ttl,
                                     mss_limit, dams_limit, mss_ssl, dams_ssl,
                                     convert_slow_pool_size, convert_fast_pool_size,
@@ -232,8 +221,6 @@ async def get_mss_doc(self, name: str) -> dict:
         - the GUID (i.e. the name) must be unique and exist in the mss elasticsearch index
         - the EMu IRN that the GUID maps to must be valid according to the MSS (specifically
           the APS)
-        - the GUID (i.e.
the name) must be found in either the specimen, index lot or - artefacts indices as an associated media item :param name: the image name (a GUID) :return: the mss doc as a dict or None @@ -257,13 +244,7 @@ async def get_mss_doc(self, name: str) -> dict: emu_irn = int(doc['id']) - # TODO: I think we can get rid of this requirement now that we're using GUIDs - # instead of irns - you can't guess a GUID - # check if there's an associated collection record - if not await self.es_handler.has_collection_record(emu_irn): - raise MissingCollectionRecord(self.name, name, emu_irn) - - # finally, check with mss that the irn is valid + # check with mss that the irn is valid if not await self.store.check_access(emu_irn): raise MSSAccessDenied(self.name, name, emu_irn) @@ -321,18 +302,15 @@ class MSSElasticsearchHandler: Class that handles requests to Elasticsearch for the MSS profile. """ - def __init__(self, es_hosts: List[str], collection_indices: List[str], limit: int = 20, - mss_index: str = 'mss'): + def __init__(self, es_hosts: List[str], limit: int = 20, mss_index: str = 'mss'): """ :param es_hosts: a list of elasticsearch hosts to use - :param collection_indices: the indices to search to confirm the images can be accessed :param limit: the maximum number of simultaneous connections that can be made to elasticsearch :param mss_index: the MSS index name in elasticsearch """ self.es_hosts = cycle(es_hosts) self.es_session = create_client_session(limit, ssl=False) - self.collection_indices = ','.join(collection_indices) self.mss_index = mss_index async def get_mss_doc(self, guid: str) -> Tuple[int, Optional[dict]]: @@ -351,21 +329,6 @@ async def get_mss_doc(self, guid: str) -> Tuple[int, Optional[dict]]: first_doc = next((doc['_source'] for doc in result['hits']['hits']), None) return total, first_doc - async def has_collection_record(self, emu_irn: int) -> bool: - """ - Check whether the given image IRN is associated with a collection record. 
- - :param emu_irn: the EMu IRN - :return: True if it is, False if not - """ - count_url = f'{next(self.es_hosts)}/{self.collection_indices}/_count' - search = Search() \ - .filter('term', **{'data.associatedMedia._id': emu_irn}) \ - .filter('term', **{'meta.versions': int(time.time() * 1000)}) - async with self.es_session.post(count_url, json=search.to_dict()) as response: - text = await response.text(encoding='utf-8') - return json.loads(text)['count'] > 0 - async def close(self): await self.es_session.close() diff --git a/tests/profiles/test_mss.py b/tests/profiles/test_mss.py index 036621d..0989cd1 100644 --- a/tests/profiles/test_mss.py +++ b/tests/profiles/test_mss.py @@ -53,10 +53,9 @@ def test_mss_choose_file_with_derivatives(): assert info.choose_file((2000, 4000)) == 'original.tif' -def create_profile(config: Config, mss_doc: Union[Exception, dict], has_record: bool, - mss_valid: bool, image: Union[Exception, Path], **kwargs) -> MSSProfile: +def create_profile(config: Config, mss_doc: Union[Exception, dict], mss_valid: bool, + image: Union[Exception, Path], **kwargs) -> MSSProfile: mock_es_handler = MagicMock( - has_collection_record=AsyncMock(return_value=has_record), close=AsyncMock() ) if isinstance(mss_doc, Exception): @@ -113,7 +112,7 @@ class TestGetInfo: async def test_no_source_required(self, config): mss_doc = create_mss_doc(7, 'image.jpg', 200, 300) - profile = create_profile(config, mss_doc, True, True, Exception('should not need this')) + profile = create_profile(config, mss_doc, True, Exception('should not need this')) info = await profile.get_info('the_name') assert info.emu_irn == 7 assert info.width == 200 @@ -124,7 +123,7 @@ async def test_no_source_required(self, config): async def test_source_required(self, config): mss_doc = create_mss_doc(7, 'image.jpg') image_path = create_image(config, 200, 300) - profile = create_profile(config, mss_doc, True, True, image_path) + profile = create_profile(config, mss_doc, True, image_path) info = await profile.get_info('the_name') assert info.emu_irn == 7 assert info.width == 200 @@ -135,14 +134,14 @@ async def test_source_required(self, config): async def test_doc_error(self, config): image = create_image(config, 200, 300) exception = Exception('narp') - profile = create_profile(config, exception, True, True, image) + profile = create_profile(config, exception, True, image) with pytest.raises(ImageNotFound) as exc_info1: await profile.get_info('the_name') assert exc_info1.value.cause is exception async def test_cache(self, config): mss_doc = create_mss_doc(7, 'image.jpg', 200, 300) - profile = create_profile(config, mss_doc, True, True, Exception('should not need this')) + profile = create_profile(config, mss_doc, True, Exception('should not need this')) with patch.object(profile, 'get_mss_doc', wraps=profile.get_mss_doc): await profile.get_info('the_name') assert profile.get_mss_doc.call_count == 1 @@ -152,7 +151,7 @@ async def test_cache(self, config): async def test_source_error(self, config): mss_doc = create_mss_doc(7, 'image.jpg') exception = Exception('errrrooorr!') - profile = create_profile(config, mss_doc, True, True, exception) + profile = create_profile(config, mss_doc, True, exception) with pytest.raises(ImageNotFound) as exc_info: await profile.get_info('the_name') assert exc_info.value.cause is exception @@ -160,7 +159,7 @@ async def test_source_error(self, config): async def test_get_size_error(self, config): mss_doc = create_mss_doc(7, 'image.jpg') image = create_image(config, 200, 300) - profile = 
create_profile(config, mss_doc, True, True, image)
+        profile = create_profile(config, mss_doc, True, image)
         exception = Exception('nope!')
         with patch('iiif.profiles.mss.get_size', MagicMock(side_effect=exception)):
             with pytest.raises(ImageNotFound) as exc_info:
@@ -169,7 +168,7 @@
 
 
 async def test_close(config):
-    profile = create_profile(config, Exception('meh'), True, True, Exception('meh'))
+    profile = create_profile(config, Exception('meh'), True, Exception('meh'))
     await profile.close()
     profile.es_handler.close.assert_called_once()
     profile.store.close.assert_called_once()

diff --git a/tests/profiles/test_mss_es.py b/tests/profiles/test_mss_es.py
index 468d71b..44c7601 100644
--- a/tests/profiles/test_mss_es.py
+++ b/tests/profiles/test_mss_es.py
@@ -1,8 +1,4 @@
-from contextlib import closing
-
-import pytest
 from aioresponses import aioresponses
-from fastapi import HTTPException
 
 from iiif.profiles.mss import MSSElasticsearchHandler
 
@@ -52,43 +48,20 @@ async def test_get_mss_doc_many_found():
         await handler.close()
 
 
-@pytest.mark.parametrize('count', [1, 2, 10347])
-async def test_has_collection_record(count):
-    mock_response = {'count': count}
-    handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX)
-    try:
-        with aioresponses() as m:
-            m.post(f'{MOCK_HOST}/{handler.collection_indices}/_count', payload=mock_response)
-            assert await handler.has_collection_record('12345')
-    finally:
-        await handler.close()
-
-
-async def test_has_collection_record_no_hits():
-    mock_response = {'count': 0}
-    handler = MSSElasticsearchHandler([MOCK_HOST], ['index-1', 'index-2'], mss_index=MSS_INDEX)
-    try:
-        with aioresponses() as m:
-            m.post(f'{MOCK_HOST}/{handler.collection_indices}/_count', payload=mock_response)
-            assert not await handler.has_collection_record('12345')
-    finally:
-        await handler.close()
-
-
 async def test_cycling_hosts():
     host_1 = 'http://not.a.real.es.host1'
-    mock_response_1 = {'count': 1}
+    mock_response_1 = {'hits': {'total': 1, 'hits': [{'_source': {'some': 'data'}}]}}
     host_2 = 'http://not.a.real.es.host2'
-    mock_response_2 = {'count': 0}
+    mock_response_2 = {'hits': {'total': 0, 'hits': []}}
-    handler = MSSElasticsearchHandler([host_1, host_2], ['index-1', 'index-2'],
-                                      mss_index=MSS_INDEX)
+    handler = MSSElasticsearchHandler([host_1, host_2], mss_index=MSS_INDEX)
     try:
         with aioresponses() as m:
-            m.post(f'{host_1}/{handler.collection_indices}/_count', payload=mock_response_1)
-            m.post(f'{host_2}/{handler.collection_indices}/_count', payload=mock_response_2)
-            m.post(f'{host_1}/{handler.collection_indices}/_count', payload=mock_response_1)
-            assert await handler.has_collection_record('12345')
-            assert not await handler.has_collection_record('12345')
-            assert await handler.has_collection_record('12345')
+            m.post(f'{host_1}/{handler.mss_index}/_search', payload=mock_response_1)
+            m.post(f'{host_2}/{handler.mss_index}/_search', payload=mock_response_2)
+            m.post(f'{host_1}/{handler.mss_index}/_search', payload=mock_response_1)
+            assert (await handler.get_mss_doc('12345'))[0] == 1
+            assert (await handler.get_mss_doc('12345'))[0] == 0
+            assert (await handler.get_mss_doc('12345'))[0] == 1
     finally:
         await handler.close()

From e81d67eddf03e9bab330394bc41378fd55856bd5 Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Wed, 11 May 2022 19:31:12 +0100
Subject: [PATCH 03/40] Pass int instead of str for emu_irn

---
 iiif/profiles/mss.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py
index ef0bccd..b19dc56 100644
--- a/iiif/profiles/mss.py
+++ b/iiif/profiles/mss.py
@@ -210,7 +210,7 @@ async def use_source(self, info: MSSImageInfo, size: Optional[Tuple[int, int]] =
         if size is None:
             size = info.size
         file = info.choose_file(size)
-        source = MSSSourceFile(str(info.emu_irn), file, info.original == file)
+        source = MSSSourceFile(info.emu_irn, file, info.original == file)
         async with self.store.use(source) as path:
             yield path
 
@@ -282,7 +282,7 @@ async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=
         """
         try:
             doc = await self.get_mss_doc(name)
-            source = MSSSourceFile(doc['id'], doc['file'], True, chunk_size)
+            source = MSSSourceFile(int(doc['id']), doc['file'], True, chunk_size)
             async for chunk in self.store.stream(source):
                 yield chunk
         except Exception as e:

From 3b5dbb2380c6552fd2b41d9448108756a98aaf9a Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Wed, 11 May 2022 19:32:29 +0100
Subject: [PATCH 04/40] Catch image errors and try again with pillow

This is useful, for example, for large images whose size exceeds the limits
specified in the ImageMagick policy.xml file.

---
 iiif/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/iiif/utils.py b/iiif/utils.py
index ad96dc8..f610102 100644
--- a/iiif/utils.py
+++ b/iiif/utils.py
@@ -16,7 +16,7 @@
 from jpegtran import JPEGImage
 from pathlib import Path
 from typing import Optional, Tuple, Union, Any
-from wand.exceptions import MissingDelegateError
+from wand.exceptions import MissingDelegateError, ImageError
 from wand.image import Image as WandImage
 
 from iiif.exceptions import IIIFServerException
@@ -153,7 +153,7 @@ def convert_image(image_path: Path, target_path: Path, quality: int = 80,
         target_path.parent.mkdir(parents=True, exist_ok=True)
         with target_path.open('wb') as f:
             image.save(file=f)
-    except MissingDelegateError:
+    except (MissingDelegateError, ImageError):
         with Image.open(image_path) as image:
             if image.format.lower() == 'jpeg':
                 exif = image.getexif()

From 9f96134ea64ce41918cf4122773cac7154d17a0a Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Wed, 11 May 2022 19:33:43 +0100
Subject: [PATCH 05/40] Throw fetch errors up from the FetchCache.use method

We don't know how they will be used, so don't wrap them.

---
 iiif/utils.py | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/iiif/utils.py b/iiif/utils.py
index f610102..ed23e1a 100644
--- a/iiif/utils.py
+++ b/iiif/utils.py
@@ -19,8 +19,6 @@
 from wand.exceptions import MissingDelegateError, ImageError
 from wand.image import Image as WandImage
 
-from iiif.exceptions import IIIFServerException
-
 mimetypes.init()
 
 # this is all assuming we're using uvicorn...
@@ -377,24 +375,17 @@ async def use(self, fetchable: Fetchable) -> Path: if path in self._cleaners: self._cleaners.pop(path).cancel() elif not path.exists(): - try: - async with self._locker.acquire(path, timeout=self.fetch_timeout): - await self._fetch(fetchable) - self._sizes[path] = path.stat().st_size - self.total_size += self._sizes[path] - - times = 0 - while self._cleaners and self.total_size > self.max_size and times < 10: - path_to_clean_up, handle = self._cleaners.popitem(last=False) - handle.cancel() - self._clean_up(path_to_clean_up) - times += 1 - except Exception as cause: - self.errors += 1 - e = IIIFServerException( - f'An error occurred while fetching {fetchable.public_name}', cause=cause - ) - raise e from cause + async with self._locker.acquire(path, timeout=self.fetch_timeout): + await self._fetch(fetchable) + self._sizes[path] = path.stat().st_size + self.total_size += self._sizes[path] + + times = 0 + while self._cleaners and self.total_size > self.max_size and times < 10: + path_to_clean_up, handle = self._cleaners.popitem(last=False) + handle.cancel() + self._clean_up(path_to_clean_up) + times += 1 self._in_use[path] += 1 try: From c6995b8b533fcea3511ba1b2019fa154a1b87dd5 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 19:34:38 +0100 Subject: [PATCH 06/40] Check again once the lock is acquired before fetching --- iiif/utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/iiif/utils.py b/iiif/utils.py index ed23e1a..0327b77 100644 --- a/iiif/utils.py +++ b/iiif/utils.py @@ -376,16 +376,17 @@ async def use(self, fetchable: Fetchable) -> Path: self._cleaners.pop(path).cancel() elif not path.exists(): async with self._locker.acquire(path, timeout=self.fetch_timeout): - await self._fetch(fetchable) - self._sizes[path] = path.stat().st_size - self.total_size += self._sizes[path] - - times = 0 - while self._cleaners and self.total_size > self.max_size and times < 10: - path_to_clean_up, handle = self._cleaners.popitem(last=False) - handle.cancel() - self._clean_up(path_to_clean_up) - times += 1 + if not path.exists(): + await self._fetch(fetchable) + self._sizes[path] = path.stat().st_size + self.total_size += self._sizes[path] + + times = 0 + while self._cleaners and self.total_size > self.max_size and times < 10: + path_to_clean_up, handle = self._cleaners.popitem(last=False) + handle.cancel() + self._clean_up(path_to_clean_up) + times += 1 self._in_use[path] += 1 try: From b3ba8e67c90c70162d1bc2003d1a10d7b5d73449 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 20:01:33 +0100 Subject: [PATCH 07/40] Update the status on mss profile to provide cache stats --- iiif/profiles/base.py | 5 +---- iiif/profiles/mss.py | 10 ++++++++++ iiif/utils.py | 17 ----------------- iiif/web.py | 6 ++---- 4 files changed, 13 insertions(+), 25 deletions(-) diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index b6823a1..1807e75 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -168,7 +168,7 @@ async def close(self): """ pass - async def get_status(self, full: bool = False) -> dict: + async def get_status(self) -> dict: """ Returns some stats about the profile. 
@@ -178,7 +178,4 @@ async def get_status(self, full: bool = False) -> dict: 'name': self.name, 'info_json_cache_size': len(self.info_json_cache), } - if full: - status['sources'] = get_path_stats(self.source_path) - status['cache'] = get_path_stats(self.cache_path) return status diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index b19dc56..4322981 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -1,6 +1,7 @@ from concurrent.futures import ProcessPoolExecutor, Executor import asyncio +import humanize import json import logging import shutil @@ -296,6 +297,15 @@ async def close(self): await self.es_handler.close() await self.store.close() + async def get_status(self) -> dict: + status = await super().get_status() + status['source_cache'] = { + 'size': humanize.naturalsize(self.store.total_size, binary=True), + 'max_size': humanize.naturalsize(self.store.max_size, binary=True), + 'percent_full': self.store.pct, + } + return status + class MSSElasticsearchHandler: """ diff --git a/iiif/utils.py b/iiif/utils.py index 0327b77..da08316 100644 --- a/iiif/utils.py +++ b/iiif/utils.py @@ -5,7 +5,6 @@ import abc import aiohttp import asyncio -import humanize import io import logging import mimetypes @@ -70,22 +69,6 @@ def is_locked(self, key: Any) -> bool: return key in self._locks and self._locks[key].locked() -def get_path_stats(path: Path) -> dict: - """ - Calculates some statistics about the on disk path's files. - - :param path: the path to investigate - :return: a dict of stats - """ - sizes = [f.stat().st_size for f in path.glob('**/*') if f.is_file()] - size = sum(sizes) - return { - 'count': len(sizes), - 'size_bytes': size, - 'size_pretty': humanize.naturalsize(size, binary=True), - } - - def get_size(path: Path) -> Tuple[int, int]: """ Returns the size of the image at the given path. diff --git a/iiif/web.py b/iiif/web.py index 0ddc783..a183630 100644 --- a/iiif/web.py +++ b/iiif/web.py @@ -57,13 +57,11 @@ async def on_shutdown(): @app.get('/status') -async def status(full: bool = False) -> JSONResponse: +async def status() -> JSONResponse: """ Returns the status of the server along with some stats about current resource usages. \f - :param full: boolean parameter indicating whether to provide a full status or just the - essentials. The full status may take longer to generate. Default: False. 
:return: a dict """ body = { @@ -71,7 +69,7 @@ async def status(full: bool = False) -> JSONResponse: 'default_profile': state.config.default_profile_name, 'processing': state.processor.get_status(), 'profiles': { - profile.name: await profile.get_status(full) + profile.name: await profile.get_status() for profile in state.profiles.values() } } From 0c8798061b891d0736fcaa42771d760face0ed74 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 20:05:42 +0100 Subject: [PATCH 08/40] Provide stats straight from the FetchCache instead of copying the logic around --- iiif/processing.py | 16 +++++----------- iiif/profiles/mss.py | 6 +----- iiif/utils.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/iiif/processing.py b/iiif/processing.py index 03b37eb..5dd5cd7 100644 --- a/iiif/processing.py +++ b/iiif/processing.py @@ -3,6 +3,7 @@ from concurrent.futures import ProcessPoolExecutor import asyncio +import humanize import os from dataclasses import dataclass from jpegtran import JPEGImage @@ -227,19 +228,12 @@ def stop(self): """ self.executor.shutdown() - def get_status(self) -> dict: + async def get_status(self) -> dict: """ Returns some basic stats info as a dict. :return: a dict of stats """ - return { - 'requests': self.requests, - 'completed': self.completed, - 'errors': self.errors, - 'pool_size': self.max_workers, - 'cache_size': self.total_size, - 'percentage_used': self.pct, - 'in_use': len(self._in_use), - 'waiting_clean_up': len(self._cleaners), - } + status = await super().get_status() + status['pool_size'] = self.max_workers + return status diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index 4322981..82d80b4 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -299,11 +299,7 @@ async def close(self): async def get_status(self) -> dict: status = await super().get_status() - status['source_cache'] = { - 'size': humanize.naturalsize(self.store.total_size, binary=True), - 'max_size': humanize.naturalsize(self.store.max_size, binary=True), - 'percent_full': self.store.pct, - } + status['source_cache'] = await self.store.get_status() return status diff --git a/iiif/utils.py b/iiif/utils.py index da08316..39d3db9 100644 --- a/iiif/utils.py +++ b/iiif/utils.py @@ -5,6 +5,7 @@ import abc import aiohttp import asyncio +import humanize import io import logging import mimetypes @@ -382,3 +383,20 @@ async def use(self, fetchable: Fetchable) -> Path: self._cleaners[path] = self._schedule_clean_up(path) self.completed += 1 + + async def get_status(self) -> dict: + """ + Returns some basic stats about the cache as a dict. 
+ + :return: a dict of stats + """ + return { + 'requests': self.requests, + 'completed': self.completed, + 'errors': self.errors, + 'cache_size': humanize.naturalsize(self.total_size, binary=True), + 'max_size': humanize.naturalsize(self.max_size, binary=True), + 'percentage_used': self.pct, + 'in_use': len(self._in_use), + 'waiting_clean_up': len(self._cleaners), + } From 3ddbc1f9338309ec6dabef822de1008619796777 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 20:07:09 +0100 Subject: [PATCH 09/40] Clear up get_path_stats usages and tests --- iiif/profiles/base.py | 2 +- tests/test_utils.py | 33 +++------------------------------ 2 files changed, 4 insertions(+), 31 deletions(-) diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index 1807e75..8ea544e 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -5,7 +5,7 @@ from typing import Tuple, Optional, Any, AsyncIterable from iiif.config import Config -from iiif.utils import get_path_stats, generate_sizes +from iiif.utils import generate_sizes class ImageInfo: diff --git a/tests/test_utils.py b/tests/test_utils.py index feb3096..4f12eef 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,18 +1,16 @@ #!/usr/bin/env python3 # encoding: utf-8 -from dataclasses import dataclass - import asyncio -import humanize import pytest from PIL import Image +from dataclasses import dataclass from jpegtran import JPEGImage from pathlib import Path from unittest.mock import patch, MagicMock from wand.exceptions import MissingDelegateError -from iiif.utils import convert_image, generate_sizes, get_size, get_path_stats, get_mimetype, \ - parse_identifier, to_pillow, to_jpegtran, Locker, FetchCache, Fetchable +from iiif.utils import convert_image, generate_sizes, get_size, get_mimetype, parse_identifier, \ + to_pillow, to_jpegtran, Locker, FetchCache, Fetchable from tests.utils import create_image, create_file @@ -135,31 +133,6 @@ def test_get_size(config): ] -def test_get_path_stats(tmp_path): - total = 0 - count = 0 - for i in range(10): - with (tmp_path / f'{i}.text').open('w') as f: - total += f.write('beans!') - count += 1 - (tmp_path / f'{i}').mkdir() - with (tmp_path / f'{i}' / f'{i}.text').open('w') as f: - total += f.write('beans again!') - count += 1 - - stats = get_path_stats(tmp_path) - assert stats['count'] == count - assert stats['size_bytes'] == total - assert stats['size_pretty'] == humanize.naturalsize(total, binary=True) - - -def test_get_path_stats_empty(tmp_path): - stats = get_path_stats(tmp_path / 'empty') - assert stats['count'] == 0 - assert stats['size_bytes'] == 0 - assert stats['size_pretty'] == humanize.naturalsize(0, binary=True) - - class TestGetMimetype: def test_normal(self): From 619071fd615910c69f69e8a5a7c402f2e280af76 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 20:08:10 +0100 Subject: [PATCH 10/40] Await the processor status --- iiif/web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iiif/web.py b/iiif/web.py index a183630..295aada 100644 --- a/iiif/web.py +++ b/iiif/web.py @@ -67,7 +67,7 @@ async def status() -> JSONResponse: body = { 'status': ':)', 'default_profile': state.config.default_profile_name, - 'processing': state.processor.get_status(), + 'processing': await state.processor.get_status(), 'profiles': { profile.name: await profile.get_status() for profile in state.profiles.values() From 190c6b4a89065e0a2d537aa59d7bd92799766e3b Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Wed, 11 May 2022 23:46:29 +0100 
Subject: [PATCH 11/40] Add doc for favicon endpoint

---
 iiif/web.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/iiif/web.py b/iiif/web.py
index 295aada..e65cb7d 100644
--- a/iiif/web.py
+++ b/iiif/web.py
@@ -78,6 +78,10 @@ async def status() -> JSONResponse:
 
 @app.get('/favicon.ico')
 async def favicon() -> StreamingResponse:
+    """
+    This only exists to stop requests to /favicon.ico from erroring when running the server
+    under /. It's probably only useful in the dev env tbh and just returns the IIIF favicon.
+    """
     async def get():
         async with aiohttp.ClientSession() as session:
             async with session.get('https://iiif.io/favicon.ico') as response:

From 987ec4f1158756a91c6b2a94eb12a493abd1244e Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Wed, 11 May 2022 23:49:07 +0100
Subject: [PATCH 12/40] Don't log IIIF request format errors

---
 iiif/exceptions.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/iiif/exceptions.py b/iiif/exceptions.py
index 699bea2..264ac65 100644
--- a/iiif/exceptions.py
+++ b/iiif/exceptions.py
@@ -15,7 +15,8 @@ class IIIFServerException(Exception):
 
     def __init__(self, public: str, status_code: int = 500, log: Optional[str] = None,
-                 level: int = logging.WARNING, cause: Optional[Exception] = None):
+                 level: int = logging.WARNING, cause: Optional[Exception] = None,
+                 use_public_as_log: bool = True):
         super().__init__(public)
         self.status_code = status_code
         self.public = public
@@ -27,7 +28,7 @@ def __init__(self, public: str, status_code: int = 500, log: Optional[str] = Non
         else:
             if self.cause is not None:
                 self.log = f'An error occurred: {self.cause}'
-            else:
+            elif use_public_as_log:
                 self.log = self.public
 
 
@@ -63,8 +64,8 @@ def __init__(self, max_files: int, *args, **kwargs):
 class InvalidIIIFParameter(IIIFServerException):
 
     def __init__(self, name: str, value: str, *args, **kwargs):
-        super().__init__(f'Invalid IIIF option: {name} value "{value}" is invalid', 400, *args,
-                         **kwargs)
+        super().__init__(f'Invalid IIIF option: {name} value "{value}" is invalid', 400,
+                         use_public_as_log=False, *args, **kwargs)
         self.name = name
         self.value = value

From d5e882301164bb65287f50e113bd072a59cfb6e1 Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Thu, 12 May 2022 00:04:52 +0100
Subject: [PATCH 13/40] Add some exception handler tests

---
 iiif/exceptions.py       |  2 +-
 tests/test_exceptions.py | 55 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/iiif/exceptions.py b/iiif/exceptions.py
index 264ac65..17b51fd 100644
--- a/iiif/exceptions.py
+++ b/iiif/exceptions.py
@@ -76,6 +76,6 @@ async def handler(request: Request, exception: IIIFServerException) -> JSONRespo
     return JSONResponse(
         status_code=exception.status_code,
         content={
-            "error": exception.public
+            'error': exception.public
         },
     )

diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
index 140e0c2..b88c2a9 100644
--- a/tests/test_exceptions.py
+++ b/tests/test_exceptions.py
@@ -1,4 +1,10 @@
-from iiif.exceptions import ProfileNotFound, TooManyImages, ImageNotFound, InvalidIIIFParameter
+import logging
+
+import json
+from unittest.mock import MagicMock, patch
+
+from iiif.exceptions import ProfileNotFound, TooManyImages, ImageNotFound, InvalidIIIFParameter, \
+    handler, IIIFServerException
 
 
 def test_profile_not_found():
@@ -20,3 +26,50 @@ def test_invalid_iiif_parameter():
     assert error.status_code == 400
     assert 'Goats' in error.public
     assert 'Beans' in error.public
+
+
+class TestExceptionHandler:
+
+    
@patch('iiif.exceptions.logger') + async def test_no_log_use_public(self, mock_logger): + exception = IIIFServerException('public message', use_public_as_log=True) + response = await handler(MagicMock(), exception) + assert response.status_code == exception.status_code + assert json.loads(response.body)['error'] == exception.public + mock_logger.log.assert_called_with(exception.level, exception.log) + + @patch('iiif.exceptions.logger') + async def test_custom_log(self, mock_logger): + exception = IIIFServerException('public message', log='log message') + response = await handler(MagicMock(), exception) + assert response.status_code == exception.status_code + assert json.loads(response.body)['error'] == exception.public + mock_logger.log.assert_called_with(exception.level, exception.log) + + @patch('iiif.exceptions.logger') + async def test_no_log(self, mock_logger): + exception = IIIFServerException('public message', use_public_as_log=False) + response = await handler(MagicMock(), exception) + assert response.status_code == exception.status_code + assert json.loads(response.body)['error'] == exception.public + assert not mock_logger.log.called + + async def test_status_code(self): + exception = IIIFServerException('public message', status_code=418) + response = await handler(MagicMock(), exception) + assert response.status_code == exception.status_code + + @patch('iiif.exceptions.logger') + async def test_cause(self, mock_logger): + cause = Exception('oh no, something went wrong') + exception = IIIFServerException('public message', cause=cause) + response = await handler(MagicMock(), exception) + assert json.loads(response.body)['error'] == exception.public + mock_logger.log.assert_called_with(exception.level, f'An error occurred: {cause}') + + @patch('iiif.exceptions.logger') + async def test_level(self, mock_logger): + exception = IIIFServerException('public message', level=logging.CRITICAL) + response = await handler(MagicMock(), exception) + assert json.loads(response.body)['error'] == exception.public + mock_logger.log.assert_called_with(exception.level, 'public message') From 207daece056cf54b7e6a9d1ae1b91395fada75af Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 May 2022 00:17:00 +0100 Subject: [PATCH 14/40] Catch all kinds of wand exceptions before trying pillow --- iiif/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iiif/utils.py b/iiif/utils.py index 39d3db9..5979de4 100644 --- a/iiif/utils.py +++ b/iiif/utils.py @@ -16,7 +16,7 @@ from jpegtran import JPEGImage from pathlib import Path from typing import Optional, Tuple, Union, Any -from wand.exceptions import MissingDelegateError, ImageError +from wand.exceptions import WandException from wand.image import Image as WandImage mimetypes.init() @@ -135,7 +135,7 @@ def convert_image(image_path: Path, target_path: Path, quality: int = 80, target_path.parent.mkdir(parents=True, exist_ok=True) with target_path.open('wb') as f: image.save(file=f) - except (MissingDelegateError, ImageError): + except WandException: with Image.open(image_path) as image: if image.format.lower() == 'jpeg': exif = image.getexif() From 3c9a427b8fabe1bf13b89fc4b948991bdf8925b0 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 May 2022 00:41:36 +0100 Subject: [PATCH 15/40] Remove old import --- iiif/processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/iiif/processing.py b/iiif/processing.py index 5dd5cd7..240bd63 100644 --- a/iiif/processing.py +++ b/iiif/processing.py @@ -3,7 +3,6 @@ from 
concurrent.futures import ProcessPoolExecutor import asyncio -import humanize import os from dataclasses import dataclass from jpegtran import JPEGImage From 3907020f427b754e97753b8a327e0b6bba5ccd5a Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 May 2022 00:51:19 +0100 Subject: [PATCH 16/40] Fix exception usage breaking ops tests --- tests/test_ops.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index 7cb2df1..e1368bc 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -1,8 +1,8 @@ from pathlib import Path import pytest -from fastapi import HTTPException +from iiif.exceptions import InvalidIIIFParameter from iiif.ops import parse_region, Region, parse_size, Size, parse_rotation, Rotation, \ parse_quality, Quality, parse_format, Format, parse_params, IIIFOps, IIIF_LEVEL from iiif.profiles.base import ImageInfo @@ -61,7 +61,7 @@ def test_level2_regionByPct_oob(self, info): @pytest.mark.parametrize('region', ['', '10,50', '-10,-15', '10,10,0,0']) def test_invalid(self, info, region): - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(InvalidIIIFParameter) as exc_info: parse_region(region, info) assert exc_info.value.status_code == 400 @@ -143,7 +143,7 @@ def test_level2_sizeByConfinedWh_5(self, full_region: Region): @pytest.mark.parametrize('size', invalid_scenarios) def test_invalid(self, full_region, size): - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(InvalidIIIFParameter) as exc_info: parse_size(size, full_region) assert exc_info.value.status_code == 400 @@ -168,12 +168,12 @@ def test_level2_mirroring(self, angle: int): @pytest.mark.parametrize('angle', [-90, 10, 3, 289, 360, 500, 3600, 'beans']) def test_invalid_angles(self, angle: int): rotation = str(angle) - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(InvalidIIIFParameter) as exc_info: parse_rotation(rotation) assert exc_info.value.status_code == 400 rotation = f'!{angle}' - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(InvalidIIIFParameter) as exc_info: parse_rotation(rotation) assert exc_info.value.status_code == 400 @@ -193,7 +193,7 @@ def test_bitonal(self): assert parse_quality('bitonal') == Quality.bitonal def test_invalid(self): - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(InvalidIIIFParameter) as exc_info: parse_quality('banana') assert exc_info.value.status_code == 400 @@ -208,7 +208,7 @@ def test_level2_png(self): @pytest.mark.parametrize('fmt', ['tif', 'gif', 'pdf', 'jp2', 'webp', 'arms!']) def test_other_formats_are_invalid(self, fmt): - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(InvalidIIIFParameter) as exc_info: parse_format(fmt) assert exc_info.value.status_code == 400 From 30558fae49fcbc871ac636fe9403f5f3fb0161b0 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 May 2022 00:57:09 +0100 Subject: [PATCH 17/40] Make base _fetch async --- iiif/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iiif/utils.py b/iiif/utils.py index 5979de4..bb54f37 100644 --- a/iiif/utils.py +++ b/iiif/utils.py @@ -334,7 +334,7 @@ def __contains__(self, relative_path: Path) -> bool: return self.root / relative_path in self._in_use @abc.abstractmethod - def _fetch(self, fetchable: Fetchable): + async def _fetch(self, fetchable: Fetchable): """ Abstract method which, when called, puts the requested fetchable file on disk. 
From 819e5f8a2fd507c71d790f4441fca98ebd68ba1a Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Thu, 12 May 2022 00:57:54 +0100
Subject: [PATCH 18/40] Make test _fetch async

---
 tests/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 4f12eef..336f289 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -176,7 +176,7 @@ class FetchCacheForTesting(FetchCache):
     def __init__(self, root: Path, ttl: float = 1, max_size: float = 20):
         super().__init__(Path(root), ttl, max_size)
 
-    def _fetch(self, fetchable: FetchableForTesting):
+    async def _fetch(self, fetchable: FetchableForTesting):
         path = self.root / fetchable.store_path
         path.parent.mkdir(parents=True, exist_ok=True)
         create_file(path, fetchable.size)

From 0fd026fbbe84f06ea39ce792f93e9b94250d2781 Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Thu, 12 May 2022 00:58:12 +0100
Subject: [PATCH 19/40] Fix test typo that was causing a test failure

---
 tests/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 336f289..6496d96 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -205,7 +205,7 @@ async def test_simple_usage(self, cache: FetchCacheForTesting):
         await asyncio.sleep(cache.ttl + 0.5)
         assert not path.exists()
         assert fetchable.store_path not in cache
-        assert not path.parent.exist()
+        assert not path.parent.exists()

From 54f7fff970b211c3085e13adf5b649976cf31a7a Mon Sep 17 00:00:00 2001
From: Josh Humphries
Date: Thu, 12 May 2022 01:02:07 +0100
Subject: [PATCH 20/40] Fix disk profile tests

---
 tests/profiles/test_disk.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/tests/profiles/test_disk.py b/tests/profiles/test_disk.py
index 701d4c1..6168d43 100644
--- a/tests/profiles/test_disk.py
+++ b/tests/profiles/test_disk.py
@@ -1,11 +1,8 @@
-from fastapi import HTTPException
-from unittest.mock import patch, MagicMock
-
 import math
 import pytest
 
 from iiif.profiles.base import ImageInfo
-from iiif.profiles.disk import OnDiskProfile
+from iiif.profiles.disk import OnDiskProfile, MissingFile
 from tests.utils import create_image
 
 
@@ -17,7 +14,7 @@ def disk_profile(config):
 class TestOnDiskProfile:
 
     async def test_get_info_no_file(self, disk_profile):
-        with pytest.raises(HTTPException) as exc_info:
+        with pytest.raises(MissingFile) as exc_info:
             await disk_profile.get_info('image')
         assert exc_info.value.status_code == 404
 
@@ -26,23 +23,24 @@ async def test_get_info_with_file(self, disk_profile, config):
         info = await disk_profile.get_info('image')
         assert info.size == (100, 100)
 
-    async def test_fetch_source_no_file(self, config, disk_profile):
+    async def test_use_source_no_file(self, config, disk_profile):
         info = ImageInfo('test', 'image', 100, 100)
-        with pytest.raises(HTTPException) as exc_info:
-            await disk_profile.fetch_source(info)
+        with pytest.raises(MissingFile) as exc_info:
+            async with disk_profile.use_source(info) as _:
+                pass
         assert exc_info.value.status_code == 404
 
-    async def test_fetch_source_with_file(self, config, disk_profile):
+    async def test_use_source_with_file(self, config, disk_profile):
         info = ImageInfo('test', 'image', 100, 100)
         create_image(config, 100, 100, 'test', 'image')
-        source_path = await disk_profile.fetch_source(info)
-        assert source_path == disk_profile.source_path / 'image'
+        async with disk_profile.use_source(info) as source_path:
+            assert source_path == disk_profile.source_path / 'image'
 
         info 
= ImageInfo('test', 'image', 100, 100) create_image(config, 100, 100, 'test', 'image') # target size doesn't matter for on disk images - source_path = await disk_profile.fetch_source(info, (50, 50)) - assert source_path == disk_profile.source_path / 'image' + async with disk_profile.use_source(info, (50, 50)) as source_path: + assert source_path == disk_profile.source_path / 'image' async def test_resolve_filename_no_file(self, disk_profile): filename = await disk_profile.resolve_filename('image') @@ -55,7 +53,7 @@ async def test_stream_original_no_file_errors_off(self, disk_profile): assert count == 0 async def test_stream_original_no_file_errors_on(self, disk_profile): - with pytest.raises(HTTPException) as exc_info: + with pytest.raises(MissingFile) as exc_info: async for _ in disk_profile.stream_original('image', raise_errors=True): pass assert exc_info.value.status_code == 404 From 45044746a6cd6fd975e336ada5284c99e73c1f6c Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 May 2022 14:29:14 +0100 Subject: [PATCH 21/40] Update abstract resolve_filename method's return type We don't use None returns anymore, we use exceptions. --- iiif/profiles/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index 8ea544e..214434a 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -98,7 +98,7 @@ async def use_source(self, info: ImageInfo, size: Optional[Tuple[int, int]] = No pass @abc.abstractmethod - async def resolve_filename(self, name: str) -> str: + async def resolve_filename(self, name: str) -> Optional[str]: """ Given the name of an image produced by this profile, returns the filename that should be used for it. This is used when original images are downloaded to give them the right name. From 4fb12eb8ad9f71544830a5d6b83d1a83a7dbc27b Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 May 2022 14:31:50 +0100 Subject: [PATCH 22/40] Add content-length to the /original endpoint Now it's not a mystery how big the file is. --- iiif/profiles/base.py | 11 +++++++++ iiif/profiles/disk.py | 12 ++++++++++ iiif/profiles/mss.py | 54 ++++++++++++++++++++++++++++++++++++++---- iiif/routers/simple.py | 8 ++++--- 4 files changed, 77 insertions(+), 8 deletions(-) diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index 214434a..c1acd56 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -108,6 +108,17 @@ async def resolve_filename(self, name: str) -> Optional[str]: """ pass + @abc.abstractmethod + async def resolve_original_size(self, name: str) -> int: + """ + Given the name of an image managed by this profile, returns the size of the original source + image. + + :param name: the name of the image + :return: the size of the original image in bytes + """ + ... + @abc.abstractmethod async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True) -> AsyncIterable[bytes]: diff --git a/iiif/profiles/disk.py b/iiif/profiles/disk.py index a889a08..91e7599 100644 --- a/iiif/profiles/disk.py +++ b/iiif/profiles/disk.py @@ -69,6 +69,18 @@ async def resolve_filename(self, name: str) -> str: """ return self._get_source(name).name + async def resolve_original_size(self, name: str) -> int: + """ + Given an image, returns the size of the source image. 
+ + :param name: the image name + :return: the size of the source file in bytes + """ + source = self._get_source(name) + if not source.exists(): + raise MissingFile(self.name, name, source) + return source.stat().st_size + async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True): """ Streams the source file for the given image name from disk to the requester. This function diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index 82d80b4..783a6cb 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -1,11 +1,11 @@ from concurrent.futures import ProcessPoolExecutor, Executor import asyncio -import humanize import json import logging import shutil import tempfile +from aiohttp import ClientResponse from cachetools import TTLCache from contextlib import AsyncExitStack, asynccontextmanager from dataclasses import dataclass @@ -271,6 +271,21 @@ async def resolve_filename(self, name: str) -> str: doc = await self.get_mss_doc(name) return doc['file'] + async def resolve_original_size(self, name: str) -> int: + """ + Given an image name (i.e. a GUID), return the size of the original image. This relies on the + server which is actually serving up the original file data telling us how big the file is + through a content-length header. This means we're using the actual filesize, not relying on + the data EMu has to hand (which, if the file has been modified directly on disk, may not be + correct and would cause issues if we presented an incorrect content-length). + + :param name: the image name (in this case the GUID) + :return: the size of the original image in bytes + """ + doc = await self.get_mss_doc(name) + source = MSSSourceFile(int(doc['id']), doc['file'], True) + return await self.store.get_file_size(source) + async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True): """ Async generator which yields the bytes of the original image for the given image name (EMu @@ -426,17 +441,22 @@ async def _fetch(self, source: MSSSourceFile): cache_path.parent.mkdir(parents=True, exist_ok=True) shutil.move(target_path, cache_path) - async def stream(self, source: MSSSourceFile): + @asynccontextmanager + async def _open_stream(self, source: MSSSourceFile) -> ClientResponse: """ - Stream the given source file directly from the MSS. + Opens a stream to the requested source data file. This is returned in the form of an aiohttp + ClientResponse object which will be correctly closed once this context manager exits. The + response may come from the MSS or it may come from the old dams service (via the damsurl + file). - :param source: the source - :return: yields chunks of bytes + :param source: the source file requested + :return: a ClientResponse object """ async with AsyncExitStack() as stack: response = await stack.enter_async_context(self.mss_session.get(source.url)) if response.status == 401: + # TODO: this should probably be a different kind of error raise HTTPException(status_code=401, detail=f'Access denied') if response.status == 404 and source.is_original: @@ -449,10 +469,34 @@ async def stream(self, source: MSSSourceFile): response = await stack.enter_async_context(self.dm_session.get(damsurl)) response.raise_for_status() + yield response + async def stream(self, source: MSSSourceFile): + """ + Stream the given source file directly from the MSS/dams. 
+ + :param source: the source + :return: yields chunks of bytes + """ + async with self._open_stream(source) as response: while chunk := await response.content.read(source.chunk_size): yield chunk + async def get_file_size(self, source: MSSSourceFile) -> int: + """ + Returns the file size of the given source by connecting to the backend service that can + provide the source and returning the content-length. No body data is read, just the headers. + + :param source: the source + :return: the size of the file in bytes + """ + async with self._open_stream(source) as response: + size = response.headers.get('content-length') + if size is None: + # TODO: this should be a better exception + raise Exception(f'The MSS backend returned no content-length') + return int(size) + def _choose_convert_pool(self, file: str) -> Executor: """ Pick the pool to use for converting the given file. JPEGs are converted in the fast pool diff --git a/iiif/routers/simple.py b/iiif/routers/simple.py index b0848f4..138a84d 100644 --- a/iiif/routers/simple.py +++ b/iiif/routers/simple.py @@ -70,12 +70,14 @@ async def original(identifier: str) -> StreamingResponse: profile_name, name = parse_identifier(identifier) profile = state.get_profile(profile_name) filename = await profile.resolve_filename(name) - if filename is None: - raise ImageNotFound(profile_name, name) + length = await profile.resolve_original_size(name) response = StreamingResponse( profile.stream_original(name, chunk_size=state.config.download_chunk_size), media_type=get_mimetype(filename), # note the quoted file name, this avoids client-side errors if the filename contains a comma - headers={'content-disposition': f'attachment; filename="{filename}"'} + headers={ + 'content-disposition': f'attachment; filename="{filename}"', + 'content-length': str(length) + } ) return response From 2f873e32cc24bdc5adb2432cc3abce075920de35 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 16:16:35 +0100 Subject: [PATCH 23/40] Split the error log handling out so that it can be used elsewhere --- iiif/exceptions.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/iiif/exceptions.py b/iiif/exceptions.py index 17b51fd..85d98f8 100644 --- a/iiif/exceptions.py +++ b/iiif/exceptions.py @@ -71,11 +71,15 @@ def __init__(self, name: str, value: str, *args, **kwargs): async def handler(request: Request, exception: IIIFServerException) -> JSONResponse: - if exception.log: - logger.log(exception.level, exception.log) + log_error(exception) return JSONResponse( status_code=exception.status_code, content={ 'error': exception.public }, ) + + +def log_error(exception: IIIFServerException): + if exception.log: + logger.log(exception.level, exception.log) From 8147bfd4db6f786b192155e795dac473d80d9af8 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 16:21:53 +0100 Subject: [PATCH 24/40] Rewrite the way we get the source data stream to be more robust It's better if we don't use an AsyncExitStack for this as it means we're leaving connections open after we're really done with them. For example, if we end up needing to go to the dams to get the data, we're leaving 2 connections open to the mss even though we're done with them. Instead, we just open each url in turn and close the connection out when we're done with it. Additionally, this commit adds exception handling so that we can track the URL we were attempting to access up the stack. 
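For illustration, a minimal sketch of this open-each-url-in-turn pattern using aiohttp (the names here are invented for the example and are not the real MSS code; the actual change is in the diff below). Each response lives in its own async with block, so its connection is released before the next request is opened, and any client error is re-raised carrying the URL that was being attempted:

    import aiohttp
    from contextlib import asynccontextmanager

    class FetchError(Exception):
        # carries the URL that was being fetched when the underlying client error happened
        def __init__(self, url: str, cause: aiohttp.ClientError):
            super().__init__(f'failed fetching {url}: {cause}')
            self.url = url
            self.cause = cause

    @asynccontextmanager
    async def open_stream(session: aiohttp.ClientSession, primary_url: str, fallback_url: str):
        current_url = primary_url
        try:
            async with session.get(primary_url) as response:
                if response.status != 404:
                    response.raise_for_status()
                    yield response
                    return
            # the primary connection is closed at this point, only now try the fallback
            current_url = fallback_url
            async with session.get(fallback_url) as response:
                response.raise_for_status()
                yield response
        except aiohttp.ClientError as e:
            raise FetchError(current_url, e)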
--- iiif/profiles/mss.py | 55 +++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index 783a6cb..c7db581 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -5,12 +5,11 @@ import logging import shutil import tempfile -from aiohttp import ClientResponse +from aiohttp import ClientResponse, ClientError from cachetools import TTLCache -from contextlib import AsyncExitStack, asynccontextmanager +from contextlib import asynccontextmanager from dataclasses import dataclass from elasticsearch_dsl import Search -from fastapi import HTTPException from functools import partial from itertools import cycle from pathlib import Path @@ -385,6 +384,15 @@ def check_url(emu_irn: int) -> str: return f'/nhmlive/{emu_irn}' +class StoreStreamError(Exception): + + def __init__(self, source: MSSSourceFile, url: str, cause: ClientError): + super().__init__() + self.source = source + self.url = url + self.cause = cause + + class MSSSourceStore(FetchCache): """ Class that controls fetching source files from the MSS and then storing them in a cache and @@ -452,24 +460,29 @@ async def _open_stream(self, source: MSSSourceFile) -> ClientResponse: :param source: the source file requested :return: a ClientResponse object """ - async with AsyncExitStack() as stack: - response = await stack.enter_async_context(self.mss_session.get(source.url)) - - if response.status == 401: - # TODO: this should probably be a different kind of error - raise HTTPException(status_code=401, detail=f'Access denied') - - if response.status == 404 and source.is_original: - # check for a damsurl file - response = await stack.enter_async_context(self.mss_session.get(source.dams_url)) - response.raise_for_status() - - # load the url in the response and fetch it - damsurl = await response.text(encoding='utf-8') - response = await stack.enter_async_context(self.dm_session.get(damsurl)) - - response.raise_for_status() - yield response + check_dams = False + current_url = source.url + try: + async with self.mss_session.get(source.url) as mss_response: + if mss_response.status == 404: + check_dams = True + else: + mss_response.raise_for_status() + yield mss_response + + if check_dams: + current_url = source.dams_url + async with self.mss_session.get(source.dams_url) as mss_response: + mss_response.raise_for_status() + # load the url in the response and fetch it + dams_url = await mss_response.text(encoding='utf-8') + + current_url = dams_url + async with self.dm_session.get(dams_url) as dams_response: + dams_response.raise_for_status() + yield dams_response + except ClientError as e: + raise StoreStreamError(source, current_url, e) async def stream(self, source: MSSSourceFile): """ From 6ab80d8933073c71cc176ed1d5ba5ff66bd81d20 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 16:27:25 +0100 Subject: [PATCH 25/40] Sort out error propagation on streaming originals Because once we've started streaming the body of the response we can't go back and change the headers we've already sent to show a nice error, we basically just have to hang up the phone and stop sending bits. To this end, there's no point in having various options for handling this, there's only really one choice: raise the exception anyway but before we do (cause it will disappear into the ether), log the error. 
So now when you stream an original and the backend (mss or dams) fails, we get a log of what happened and the user gets the same experience they had before. Neater and simpler to understand/manage/maintain. --- iiif/profiles/base.py | 4 +--- iiif/profiles/disk.py | 22 ++++++++------------- iiif/profiles/mss.py | 34 ++++++++++++++++++++++---------- iiif/routers/originals.py | 17 +++++----------- tests/profiles/test_disk.py | 10 ++-------- tests/profiles/test_mss.py | 2 -- tests/profiles/test_mss_store.py | 22 ++++++++++++++++----- 7 files changed, 57 insertions(+), 54 deletions(-) diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index c1acd56..25e7425 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -120,8 +120,7 @@ async def resolve_original_size(self, name: str) -> int: ... @abc.abstractmethod - async def stream_original(self, name: str, chunk_size: int = 4096, - raise_errors=True) -> AsyncIterable[bytes]: + async def stream_original(self, name: str, chunk_size: int = 4096) -> AsyncIterable[bytes]: """ Streams the original file associated with the given name, if valid. The chunk size can be configured to define how much data is yielded each time this function is scheduled to run :param name: the name of the image :param chunk_size: the number of bytes to yield at a time - :param raise_errors: whether to raise errors as they occur or just stop (default: True) :return: an asynchronous generator of bytes """ pass diff --git a/iiif/profiles/disk.py b/iiif/profiles/disk.py index 91e7599..e9eed05 100644 --- a/iiif/profiles/disk.py +++ b/iiif/profiles/disk.py @@ -81,28 +81,22 @@ async def resolve_original_size(self, name: str) -> int: raise MissingFile(self.name, name, source) return source.stat().st_size - async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True): + async def stream_original(self, name: str, chunk_size: int = 4096): """ Streams the source file for the given image name from disk to the requester. This function uses aiofiles to avoid locking up the server.
:param name: the image name :param chunk_size: the size in bytes of each chunk - :param raise_errors: whether to raise errors if they occur, or just stop (default: True) :return: yields chunks of bytes """ source = self._get_source(name) if source.exists(): - try: - async with aiofiles.open(file=str(source), mode='rb') as f: - while True: - chunk = await f.read(chunk_size) - if not chunk: - break - yield chunk - except Exception as e: - if raise_errors: - raise e + async with aiofiles.open(file=str(source), mode='rb') as f: + while True: + chunk = await f.read(chunk_size) + if not chunk: + break + yield chunk else: - if raise_errors: - raise MissingFile(self.name, name, source) + raise MissingFile(self.name, name, source) diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index c7db581..2ea5705 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -18,7 +18,7 @@ from urllib.parse import quote from iiif.config import Config -from iiif.exceptions import Timeout, IIIFServerException, ImageNotFound +from iiif.exceptions import Timeout, IIIFServerException, ImageNotFound, log_error from iiif.profiles.base import AbstractProfile, ImageInfo from iiif.utils import Locker, convert_image, create_client_session, FetchCache, Fetchable from iiif.utils import get_size @@ -45,6 +45,15 @@ def __init__(self, profile: str, name: str): super().__init__(profile, name, log=f"No MSS doc found for the guid {name}") +class MSSStoreFailure(IIIFServerException): + + def __init__(self, profile: str, name: str, cause: 'StoreStreamError'): + super().__init__(f'Failed to retrieve the requested image data for {name} from the ' + f'{profile} backend, try again', status_code=503, + log=f'Failed to stream the source file for {profile}:{name} from ' + f'{cause.url} due to {cause.cause}') + + class MSSImageInfo(ImageInfo): """ MSS variant of the ImageInfo class. @@ -281,28 +290,33 @@ async def resolve_original_size(self, name: str) -> int: :param name: the image name (in this case the GUID) :return: the size of the original image in bytes """ - doc = await self.get_mss_doc(name) - source = MSSSourceFile(int(doc['id']), doc['file'], True) - return await self.store.get_file_size(source) + try: + doc = await self.get_mss_doc(name) + source = MSSSourceFile(int(doc['id']), doc['file'], True) + return await self.store.get_file_size(source) + except StoreStreamError as cause: + raise MSSStoreFailure(self.name, name, cause) - async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=True): + async def stream_original(self, name: str, chunk_size: int = 4096): """ Async generator which yields the bytes of the original image for the given image name (EMu IRN). If the image isn't available then nothing is yielded. 
:param name: the name of the image (EMu IRN) :param chunk_size: the size of the chunks to yield - :param raise_errors: whether to raise errors when they happen or simply stop, swallowing the - error """ try: doc = await self.get_mss_doc(name) source = MSSSourceFile(int(doc['id']), doc['file'], True, chunk_size) async for chunk in self.store.stream(source): yield chunk - except Exception as e: - if raise_errors: - raise e + except StoreStreamError as cause: + e = MSSStoreFailure(self.name, name, cause) + # it's highly likely this error won't get surfaced through the exception handler because + # the response has likely already begun, so log it before raising it to ensure we can + # see what happened + log_error(e) + raise e async def close(self): """ diff --git a/iiif/routers/originals.py b/iiif/routers/originals.py index c5c6410..f577b50 100644 --- a/iiif/routers/originals.py +++ b/iiif/routers/originals.py @@ -1,8 +1,7 @@ -from pathlib import Path -from typing import Tuple, Iterable - from fastapi import APIRouter +from pathlib import Path from starlette.responses import StreamingResponse +from typing import Tuple, Iterable from zipstream import AioZipStream from iiif.exceptions import TooManyImages @@ -33,8 +32,7 @@ def parse_identifiers(identifiers: str) -> Iterable[Tuple[AbstractProfile, str]] @router.get('/originals') -async def zip_originals(identifiers: str, stop_on_error: bool = True, - use_original_filenames: bool = True) -> StreamingResponse: +async def zip_originals(identifiers: str, use_original_filenames: bool = True) -> StreamingResponse: """ Endpoint which streams a zip containing the original versions of the requested images. The zip is created on the fly which in theory means it could be unlimited in size, however, to @@ -46,16 +44,12 @@ async def zip_originals(identifiers: str, stop_on_error: bool = True, :param identifiers: a comma separated list of identifiers (:) and/or just names in which case the default profile is used - :param stop_on_error: whether to stop streaming and return an error if there is a problem while - streaming a file (True, the default) or finish the file and continue with - the next file (if the file hasn't started streaming yet then this will - result in an empty file, if the file has started streaming this will - result in a partial file) :param use_original_filenames: whether to use the original file names in the zip (True, the default) or name the files after the image name :return: a StreamingResponse object streaming a dynamically generated zip of the requested original files """ + chunk_size = state.config.download_chunk_size profiles_and_names = list(parse_identifiers(identifiers)) if len(profiles_and_names) > state.config.download_max_files: raise TooManyImages(state.config.download_max_files) @@ -69,8 +63,7 @@ async def zip_originals(identifiers: str, stop_on_error: bool = True, filename = f'{name}{Path(filename).suffix.lower()}' files.append({ 'name': filename, - 'stream': profile.stream_original(name, chunk_size=state.config.download_chunk_size, - raise_errors=stop_on_error), + 'stream': profile.stream_original(name, chunk_size=chunk_size), 'compression': 'deflate' }) diff --git a/tests/profiles/test_disk.py b/tests/profiles/test_disk.py index 6168d43..fe65582 100644 --- a/tests/profiles/test_disk.py +++ b/tests/profiles/test_disk.py @@ -46,15 +46,9 @@ async def test_resolve_filename_no_file(self, disk_profile): filename = await disk_profile.resolve_filename('image') assert filename == 'image' - async def 
test_stream_original_no_file_errors_off(self, disk_profile): - count = 0 - async for _ in disk_profile.stream_original('image', raise_errors=False): - count += 1 - assert count == 0 - - async def test_stream_original_no_file_errors_on(self, disk_profile): + async def test_stream_original_no_file(self, disk_profile): with pytest.raises(MissingFile) as exc_info: - async for _ in disk_profile.stream_original('image', raise_errors=True): + async for _ in disk_profile.stream_original('image'): pass assert exc_info.value.status_code == 404 diff --git a/tests/profiles/test_mss.py b/tests/profiles/test_mss.py index 0989cd1..8d8d120 100644 --- a/tests/profiles/test_mss.py +++ b/tests/profiles/test_mss.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 -import asyncio import pytest from contextlib import asynccontextmanager -from fastapi import HTTPException from pathlib import Path from typing import Optional, Union from unittest.mock import AsyncMock, patch, Mock, MagicMock diff --git a/tests/profiles/test_mss_store.py b/tests/profiles/test_mss_store.py index 429716a..9ded330 100644 --- a/tests/profiles/test_mss_store.py +++ b/tests/profiles/test_mss_store.py @@ -2,11 +2,10 @@ from PIL import Image from aiohttp import ClientResponseError from aioresponses import aioresponses -from fastapi import HTTPException from io import BytesIO from pathlib import Path -from iiif.profiles.mss import MSSSourceStore, MSSSourceFile +from iiif.profiles.mss import MSSSourceStore, MSSSourceFile, StoreStreamError MOCK_HOST = 'http://not.the.real.mss.host' @@ -77,9 +76,13 @@ async def test_stream_access_denied(source_root): with aioresponses() as m: source = MSSSourceFile(emu_irn, file, False) m.get(source.url, status=401) - with pytest.raises(HTTPException): + with pytest.raises(StoreStreamError) as exc_info: async for chunk in store.stream(source): pass + assert exc_info.value.url.endswith(source.url) + assert exc_info.value.source is source + assert isinstance(exc_info.value.cause, ClientResponseError) + assert exc_info.value.cause.status == 401 finally: await store.close() @@ -92,9 +95,14 @@ async def test_stream_missing(source_root): with aioresponses() as m: source = MSSSourceFile(emu_irn, file, False) m.get(source.url, status=404) - with pytest.raises(ClientResponseError): + m.get(source.dams_url, status=404) + with pytest.raises(StoreStreamError) as exc_info: async for chunk in store.stream(source): pass + assert exc_info.value.url.endswith(source.dams_url) + assert exc_info.value.source is source + assert isinstance(exc_info.value.cause, ClientResponseError) + assert exc_info.value.cause.status == 404 finally: await store.close() @@ -136,9 +144,13 @@ async def test_stream_dams_fails(source_root: Path): m.get(source.dams_url, body=dams_url) m.get(dams_url, status=404) - with pytest.raises(ClientResponseError): + with pytest.raises(StoreStreamError) as exc_info: async for chunk in store.stream(source): pass + assert exc_info.value.url == dams_url + assert exc_info.value.source is source + assert isinstance(exc_info.value.cause, ClientResponseError) + assert exc_info.value.cause.status == 404 finally: await store.close() From 27b66581d469cbc3d1fa57d03f3810da497b506a Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 16:27:33 +0100 Subject: [PATCH 26/40] Remove old import --- iiif/routers/simple.py | 1 - 1 file changed, 1 deletion(-) diff --git a/iiif/routers/simple.py b/iiif/routers/simple.py index 138a84d..5098c9e 100644 --- a/iiif/routers/simple.py +++ b/iiif/routers/simple.py @@ 
-1,7 +1,6 @@ from fastapi import APIRouter from starlette.responses import FileResponse, StreamingResponse -from iiif.exceptions import ImageNotFound from iiif.routers.iiif import get_image_data from iiif.state import state from iiif.utils import parse_identifier, get_mimetype From 4a15711e21187d90187d7edd622755d87460eaa0 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 16:30:49 +0100 Subject: [PATCH 27/40] Upgrade comment that is out of date --- iiif/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iiif/ops.py b/iiif/ops.py index 3c1b064..bec5bf2 100644 --- a/iiif/ops.py +++ b/iiif/ops.py @@ -7,7 +7,7 @@ from iiif.exceptions import InvalidIIIFParameter from iiif.profiles.base import ImageInfo -# this server currently supports IIIF level1 +# this server currently supports IIIF level2 IIIF_LEVEL = 2 From 46194a43bea17ffeb5385a6beb7fbdfb5c71202c Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 17:15:23 +0100 Subject: [PATCH 28/40] Tidy up some comments and abstract method defs --- iiif/profiles/base.py | 17 +++++++++-------- iiif/profiles/disk.py | 6 +++--- iiif/profiles/mss.py | 8 ++++---- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index 25e7425..4c1bbd2 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -80,22 +80,23 @@ async def get_info(self, name: str) -> ImageInfo: this function does not generate the info.json, see generate_info_json below for that! :param name: the name of the image - :return: an info object or None if the image wasn't available (for any reason) + :return: an info object """ - pass + ... @abc.abstractmethod @asynccontextmanager async def use_source(self, info: ImageInfo, size: Optional[Tuple[int, int]] = None) -> Path: """ Ensures the source file required for the image is available, using the optional size hint to - choose the best file if multiple sizes are available. + choose the best file if multiple sizes are available, and then yields the path to it. This + function should ensure that while the context is up the source is available at the path. :param info: the ImageInfo :param size: an 2-tuple of ints to hint at the target size required for subsequent ops - :return: the Path to the source file or None if one could not be found + :return: the Path to the source file """ - pass + ... @abc.abstractmethod async def resolve_filename(self, name: str) -> Optional[str]: @@ -104,9 +105,9 @@ async def resolve_filename(self, name: str) -> Optional[str]: used for it. This is used when original images are downloaded to give them the right name. :param name: the image name - :return: the filename or None if the name is invalid + :return: the filename """ - pass + ... @abc.abstractmethod async def resolve_original_size(self, name: str) -> int: @@ -131,7 +132,7 @@ async def stream_original(self, name: str, chunk_size: int = 4096) -> AsyncItera :param chunk_size: the number of bytes to yield at a time :return: an asynchronous generator of bytes """ - pass + ... 
async def generate_info_json(self, info: ImageInfo, iiif_level: int) -> dict: """ diff --git a/iiif/profiles/disk.py b/iiif/profiles/disk.py index e9eed05..7cb3c86 100644 --- a/iiif/profiles/disk.py +++ b/iiif/profiles/disk.py @@ -10,8 +10,8 @@ class MissingFile(ImageNotFound): def __init__(self, profile: str, name: str, source: Path): - super().__init__(profile, name, log=f"Couldn't find the image file for {name} on disk at" - f" {source}") + super().__init__(profile, name, + log=f"Couldn't find the image file for {name} on disk at {source}") self.source = source @@ -28,7 +28,7 @@ async def get_info(self, name: str) -> ImageInfo: :param name: the image name :return: an ImageInfo instance - :raises: HTTPException if the file is missing + :raises: MissingFile if the file is missing """ source = self._get_source(name) if not source.exists(): diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index 2ea5705..b80d048 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -27,8 +27,8 @@ class MSSAccessDenied(ImageNotFound): def __init__(self, profile: str, name: str, emu_irn: int): - super().__init__(profile, name, log=f"MSS denied access to multimedia IRN {emu_irn} " - f"[guid: {name}]") + super().__init__(profile, name, + log=f"MSS denied access to multimedia IRN {emu_irn} [guid: {name}]") self.emu_irn = emu_irn @@ -161,7 +161,7 @@ def __init__(self, async def get_info(self, name: str) -> MSSImageInfo: """ - Given an image name (a GUID) returns a MSSImageInfo object or raise an HTTPException if the + Given an image name (a GUID) returns a MSSImageInfo object or raise an exception if the image can't be found/isn't allowed to be accessed. If the image doesn't have width and height stored in the elasticsearch index for whatever reason then the original source image will be retrieved and the size extracted. @@ -232,7 +232,7 @@ async def get_mss_doc(self, name: str) -> dict: the APS) :param name: the image name (a GUID) - :return: the mss doc as a dict or None + :return: the mss doc as a dict """ if name in self.mss_doc_cache: return self.mss_doc_cache[name] From a8b11652a4c1ae6e07de127c531b628e20689786 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 17:18:20 +0100 Subject: [PATCH 29/40] DRY up the disk profile --- iiif/profiles/disk.py | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/iiif/profiles/disk.py b/iiif/profiles/disk.py index 7cb3c86..8cfe469 100644 --- a/iiif/profiles/disk.py +++ b/iiif/profiles/disk.py @@ -31,10 +31,7 @@ async def get_info(self, name: str) -> ImageInfo: :raises: MissingFile if the file is missing """ source = self._get_source(name) - if not source.exists(): - raise MissingFile(self.name, name, source) - else: - return ImageInfo(self.name, name, *get_size(source)) + return ImageInfo(self.name, name, *get_size(source)) @asynccontextmanager async def use_source(self, info: ImageInfo, *args, **kwargs) -> Path: @@ -46,19 +43,21 @@ async def use_source(self, info: ImageInfo, *args, **kwargs) -> Path: :return: the path to the source image on disk :raises: HTTPException if the file is missing """ - source = self._get_source(info.name) - if not source.exists(): - raise MissingFile(self.name, info.name, source) - yield source + yield self._get_source(info.name) def _get_source(self, name: str) -> Path: """ - Returns the path to the given name in this profile. + Returns the path to the given name in this profile. If the file doesn't exist, raises a + MissingFile error. 
:param name: the name of the image :return: the path to the image + :raises: MissingFile exception if the file doesn't exist """ - return self.source_path / name + source = self.source_path / name + if not source.exists(): + raise MissingFile(self.name, name, source) + return source async def resolve_filename(self, name: str) -> str: """ @@ -66,6 +65,7 @@ async def resolve_filename(self, name: str) -> str: :param name: the image name :return: the source filename + :raises: MissingFile exception if the file doesn't exist """ return self._get_source(name).name @@ -75,11 +75,9 @@ async def resolve_original_size(self, name: str) -> int: :param name: the image name :return: the size of the source file in bytes + :raises: MissingFile exception if the file doesn't exist """ - source = self._get_source(name) - if not source.exists(): - raise MissingFile(self.name, name, source) - return source.stat().st_size + return self._get_source(name).stat().st_size async def stream_original(self, name: str, chunk_size: int = 4096): """ @@ -89,14 +87,12 @@ async def stream_original(self, name: str, chunk_size: int = 4096): :param name: the image name :param chunk_size: the size in bytes of each chunk :return: yields chunks of bytes + :raises: MissingFile exception if the file doesn't exist """ source = self._get_source(name) - if source.exists(): - async with aiofiles.open(file=str(source), mode='rb') as f: - while True: - chunk = await f.read(chunk_size) - if not chunk: - break - yield chunk - else: - raise MissingFile(self.name, name, source) + async with aiofiles.open(file=str(source), mode='rb') as f: + while True: + chunk = await f.read(chunk_size) + if not chunk: + break + yield chunk From aa4d4937b3e5de97c53bca00d88af1fe46c4bfa5 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 17:20:03 +0100 Subject: [PATCH 30/40] Alter disk filename test to use real file --- tests/profiles/test_disk.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/profiles/test_disk.py b/tests/profiles/test_disk.py index fe65582..8d6c1b6 100644 --- a/tests/profiles/test_disk.py +++ b/tests/profiles/test_disk.py @@ -42,7 +42,8 @@ async def test_use_source_with_file(self, config, disk_profile): async with disk_profile.use_source(info, (50, 50)) as source_path: assert source_path == disk_profile.source_path / 'image' - async def test_resolve_filename_no_file(self, disk_profile): + async def test_resolve_filename_no_file(self, config, disk_profile): + create_image(config, 100, 100, 'test', 'image') filename = await disk_profile.resolve_filename('image') assert filename == 'image' From 74af6beab581e81bf0bab7d611263347421889af Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 13 May 2022 23:54:12 +0100 Subject: [PATCH 31/40] Put in a proper error for no length responses --- iiif/profiles/mss.py | 15 +++++++++++++-- tests/profiles/test_mss_store.py | 26 +++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index b80d048..016b5f3 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -54,6 +54,12 @@ def __init__(self, profile: str, name: str, cause: 'StoreStreamError'): f'{cause.url} due to {cause.cause}') +class MSSStoreNoLength(IIIFServerException): + + def __init__(self, profile: str, name: str): + super().__init__(f'Failed to get data length for {name} from the {profile} backend.') + + class MSSImageInfo(ImageInfo): """ MSS variant of the ImageInfo class. 
@@ -294,6 +300,8 @@ async def resolve_original_size(self, name: str) -> int: doc = await self.get_mss_doc(name) source = MSSSourceFile(int(doc['id']), doc['file'], True) return await self.store.get_file_size(source) + except StoreStreamNoLength as e: + raise MSSStoreNoLength(self.name, name) except StoreStreamError as cause: raise MSSStoreFailure(self.name, name, cause) @@ -407,6 +415,10 @@ def __init__(self, source: MSSSourceFile, url: str, cause: ClientError): self.cause = cause +class StoreStreamNoLength(Exception): + pass + + class MSSSourceStore(FetchCache): """ Class that controls fetching source files from the MSS and then storing them in a cache and @@ -520,8 +532,7 @@ async def get_file_size(self, source: MSSSourceFile) -> int: async with self._open_stream(source) as response: size = response.headers.get('content-length') if size is None: - # TODO: this should be a better exception - raise Exception(f'The MSS backend returned no content-length') + raise StoreStreamNoLength('The backend returned no content-length') return int(size) def _choose_convert_pool(self, file: str) -> Executor: diff --git a/tests/profiles/test_mss_store.py b/tests/profiles/test_mss_store.py index 9ded330..1d9f8ae 100644 --- a/tests/profiles/test_mss_store.py +++ b/tests/profiles/test_mss_store.py @@ -5,7 +5,7 @@ from io import BytesIO from pathlib import Path -from iiif.profiles.mss import MSSSourceStore, MSSSourceFile, StoreStreamError +from iiif.profiles.mss import MSSSourceStore, MSSSourceFile, StoreStreamError, StoreStreamNoLength MOCK_HOST = 'http://not.the.real.mss.host' @@ -174,3 +174,27 @@ async def test_use(source_root: Path): assert image.format.lower() == 'jpeg' finally: await store.close() + + +async def test_get_file_size(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + content_length = 489345 + + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, False) + m.get(source.url, headers={'content-length': str(content_length)}) + assert await store.get_file_size(source) == content_length + + +async def test_get_file_size_fail(source_root: Path): + store = MSSSourceStore(source_root, MOCK_HOST, 10, 10, 10) + emu_irn = 12345 + file = 'some_file.jpg' + + with aioresponses() as m: + source = MSSSourceFile(emu_irn, file, False) + m.get(source.url, headers={}) + with pytest.raises(StoreStreamNoLength): + assert await store.get_file_size(source) From 53df5a570e8f9b9d9f50a11d293c3a71bc956b02 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Sat, 14 May 2022 19:04:37 +0100 Subject: [PATCH 32/40] Remove spurious e --- iiif/profiles/mss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index 016b5f3..90cae88 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -300,7 +300,7 @@ async def resolve_original_size(self, name: str) -> int: doc = await self.get_mss_doc(name) source = MSSSourceFile(int(doc['id']), doc['file'], True) return await self.store.get_file_size(source) - except StoreStreamNoLength as e: + except StoreStreamNoLength: raise MSSStoreNoLength(self.name, name) except StoreStreamError as cause: raise MSSStoreFailure(self.name, name, cause) From 47eaf06f0ab4e2482e754a923a5e2362f9743151 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Sat, 14 May 2022 19:13:56 +0100 Subject: [PATCH 33/40] Add an update into the test workflow --- .github/workflows/main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yml 
b/.github/workflows/main.yml index 1c472a9..6ad6f8e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,6 +12,9 @@ jobs: - name: Checkout source code uses: actions/checkout@v2 + - name: Update OS package list + run: sudo apt-get update + - name: Install OS dependencies run: sudo apt-get -y install libffi-dev libturbojpeg0-dev libjpeg-dev libpcre3 libpcre3-dev libcurl4-openssl-dev libssl-dev build-essential libmagickwand-dev From c166ff2725f0845c235cc170e153099657d5e49a Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Sat, 14 May 2022 20:00:05 +0100 Subject: [PATCH 34/40] Use pillow for rotations instead of jpegtran Just easier really and avoids the errors. We could write some logic to only rotate/flip with jpegtran if the image width and height are divisible by 16 but quite frankly it's just easier not to. If performance becomes an issue here then that's a thing we can do. --- iiif/ops.py | 8 +++----- iiif/processing.py | 14 ++++++++------ tests/test_processing.py | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/iiif/ops.py b/iiif/ops.py index bec5bf2..baccc1b 100644 --- a/iiif/ops.py +++ b/iiif/ops.py @@ -1,9 +1,8 @@ +from contextlib import suppress from dataclasses import dataclass from enum import Enum from pathlib import Path -from contextlib import suppress - from iiif.exceptions import InvalidIIIFParameter from iiif.profiles.base import ImageInfo @@ -152,9 +151,8 @@ def parse_size(size: str, region: Region) -> Size: @dataclass class Rotation: - # jpegtran only supports rotating in 90 degree increments and IIIF suggests only supporting - # rotating by 90 unless the png format is supported so that the background can be made - # transparent + # IIIF suggests only supporting rotating by 90 unless the png format is supported so that the + # background can be made transparent angle: int mirror: bool = False diff --git a/iiif/processing.py b/iiif/processing.py index 240bd63..d2786a5 100644 --- a/iiif/processing.py +++ b/iiif/processing.py @@ -4,6 +4,7 @@ import asyncio import os +from PIL import ImageOps from dataclasses import dataclass from jpegtran import JPEGImage from jpegtran.lib import Transformation @@ -65,19 +66,20 @@ def process_size(image: JPEGImage, size: Size) -> JPEGImage: def process_rotation(image: JPEGImage, rotation: Rotation) -> JPEGImage: """ - Processes a IIIF rotation parameter which can involve mirroring and/or rotating. + Processes a IIIF rotation parameter which can involve mirroring and/or rotating. We could do + this in jpegtran but only if the width and height were divisible by 16 so we'll just do it in + pillow for ease. 
:param image: a jpegtran JPEGImage object :param rotation: the IIIF rotate parameter :return: a jpegtran JPEGImage object """ + pillow_image = to_pillow(image) if rotation.mirror: - image = image.flip('horizontal') + pillow_image = ImageOps.mirror(pillow_image) if rotation.angle > 0: - # note that jpegtran can only do 90 degree rotations, if we want to support arbitrary - # rotation we'll have to use pillow/pywand - image = image.rotate(rotation.angle) - return image + pillow_image = pillow_image.rotate(rotation.angle) + return to_jpegtran(pillow_image) def process_quality(image: JPEGImage, quality: Quality) -> JPEGImage: diff --git a/tests/test_processing.py b/tests/test_processing.py index ed4e15d..cb85a7e 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -3,7 +3,7 @@ import hashlib import pytest -from PIL import Image +from PIL import Image, ImageOps from jpegtran import JPEGImage from pathlib import Path from queue import Queue @@ -96,7 +96,7 @@ class TestProcessRotation: def test_rotate(self, image: JPEGImage): result = process_rotation(image, Rotation(90)) - assert_same(result, image.rotate(90)) + assert_same(result, to_pillow(image).rotate(90)) def test_mirror(self, image: JPEGImage): result = process_rotation(image, Rotation(0, mirror=True)) @@ -104,7 +104,7 @@ def test_mirror(self, image: JPEGImage): def test_rotate_and_mirror(self, image: JPEGImage): result = process_rotation(image, Rotation(90, mirror=True)) - assert_same(result, image.flip('horizontal').rotate(90)) + assert_same(result, ImageOps.mirror(to_pillow(image)).rotate(90)) class TestQuality: From 7267171bb51b1d9033e28bf8550196d2862b9759 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Sat, 14 May 2022 20:29:00 +0100 Subject: [PATCH 35/40] Allow optional formats and qualities quality: color, colour & gray, grey format: jpg, jpeg --- iiif/ops.py | 30 +++++++++++++++++++++--------- tests/test_ops.py | 9 +++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/iiif/ops.py b/iiif/ops.py index baccc1b..89d12f3 100644 --- a/iiif/ops.py +++ b/iiif/ops.py @@ -191,10 +191,16 @@ def parse_rotation(rotation: str) -> Rotation: class Quality(Enum): - default = 'default' - color = 'color' - gray = 'gray' - bitonal = 'bitonal' + default = ('default',) + color = ('color', 'colour') + gray = ('gray', 'grey') + bitonal = ('bitonal',) + + def matches(self, value: str) -> bool: + return value in self.value + + def __str__(self) -> str: + return self.value[0] def parse_quality(quality: str) -> Quality: @@ -208,15 +214,21 @@ def parse_quality(quality: str) -> Quality: :return: a Quality """ for option in Quality: - if option.value == quality: + if option.matches(quality): return option raise InvalidIIIFParameter('Quality', quality) class Format(Enum): - jpg = 'jpg' - png = 'png' + jpg = ('jpg', 'jpeg') + png = ('png',) + + def matches(self, value: str) -> bool: + return value in self.value + + def __str__(self) -> str: + return self.value[0] def parse_format(fmt: str) -> Format: @@ -230,7 +242,7 @@ def parse_format(fmt: str) -> Format: :return: a Format """ for option in Format: - if option.value == fmt: + if option.matches(fmt): return option raise InvalidIIIFParameter('Format', fmt) @@ -274,4 +286,4 @@ def location(self) -> Path: :return: the path where the image produced by this set of ops should be stored. 
""" return Path(str(self.region), str(self.size), str(self.rotation), - f'{self.quality.value}.{self.format.value}') + f'{self.quality}.{self.format}') diff --git a/tests/test_ops.py b/tests/test_ops.py index e1368bc..cbe21a0 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -186,9 +186,15 @@ def test_level0(self): def test_level2_color(self): assert parse_quality('color') == Quality.color + def test_level2_color(self): + assert parse_quality('colour') == Quality.color + def test_level2_gray(self): assert parse_quality('gray') == Quality.gray + def test_level2_grey(self): + assert parse_quality('grey') == Quality.gray + def test_bitonal(self): assert parse_quality('bitonal') == Quality.bitonal @@ -203,6 +209,9 @@ class TestParseFormat: def test_level0(self): assert parse_format('jpg') == Format.jpg + def test_level0(self): + assert parse_format('jpeg') == Format.jpg + def test_level2_png(self): assert parse_format('png') == Format.png From e2a15b96a24b8b51638b34e8cbb73b5889f858cc Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Sat, 14 May 2022 20:55:44 +0100 Subject: [PATCH 36/40] Fix rotation bugs - pillow rotates counter-clockwise, so we need to negate the angle for IIIF which rotates clockwise - use the expand parameter for pillow to do the right thing when rotating non-square images --- iiif/processing.py | 2 +- tests/test_processing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/iiif/processing.py b/iiif/processing.py index d2786a5..06b9efa 100644 --- a/iiif/processing.py +++ b/iiif/processing.py @@ -78,7 +78,7 @@ def process_rotation(image: JPEGImage, rotation: Rotation) -> JPEGImage: if rotation.mirror: pillow_image = ImageOps.mirror(pillow_image) if rotation.angle > 0: - pillow_image = pillow_image.rotate(rotation.angle) + pillow_image = pillow_image.rotate(-rotation.angle, expand=True) return to_jpegtran(pillow_image) diff --git a/tests/test_processing.py b/tests/test_processing.py index cb85a7e..a51764b 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -96,7 +96,7 @@ class TestProcessRotation: def test_rotate(self, image: JPEGImage): result = process_rotation(image, Rotation(90)) - assert_same(result, to_pillow(image).rotate(90)) + assert_same(result, to_pillow(image).rotate(-90, expand=True)) def test_mirror(self, image: JPEGImage): result = process_rotation(image, Rotation(0, mirror=True)) @@ -104,7 +104,7 @@ def test_mirror(self, image: JPEGImage): def test_rotate_and_mirror(self, image: JPEGImage): result = process_rotation(image, Rotation(90, mirror=True)) - assert_same(result, ImageOps.mirror(to_pillow(image)).rotate(90)) + assert_same(result, ImageOps.mirror(to_pillow(image)).rotate(-90, expand=True)) class TestQuality: From 5389a489849855c5447c524f64ffcd99127e7ce3 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Sat, 14 May 2022 21:10:14 +0100 Subject: [PATCH 37/40] Upgrade dependencies and fix a test that broke due to them --- iiif/processing.py | 6 +++--- setup.py | 27 +++++++++++++-------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/iiif/processing.py b/iiif/processing.py index 06b9efa..0b06ca1 100644 --- a/iiif/processing.py +++ b/iiif/processing.py @@ -190,7 +190,6 @@ def __init__(self, root: Path, ttl: float, max_size: float, max_workers: int = o :param max_workers: maximum number of worker processes to use to work on processing images """ super().__init__(root, ttl, max_size) - self.loop = asyncio.get_event_loop() self.max_workers = max_workers self.executor = 
ProcessPoolExecutor(max_workers=max_workers) @@ -219,9 +218,10 @@ async def _fetch(self, task: ImageProcessorTask): :param task: the task information """ + loop = asyncio.get_event_loop() async with task.profile.use_source(task.info, task.size_hint) as source_path: - await self.loop.run_in_executor(self.executor, process_image_request, source_path, - task.store_path, task.ops) + await loop.run_in_executor(self.executor, process_image_request, source_path, + task.store_path, task.ops) def stop(self): """ diff --git a/setup.py b/setup.py index 7817a0a..38890de 100644 --- a/setup.py +++ b/setup.py @@ -11,26 +11,25 @@ jpegtran_url = 'git+https://github.com/jbaiter/jpegtran-cffi.git@70928eb#egg=jpegtran-cffi' install_dependencies = [ - 'aiofiles~=0.6.0', - 'aiohttp[speedups]~=3.8.1', - 'aiozipstream~=0.4', - 'cachetools~=5.0.0', - 'cffi~=1.14.5', - 'elasticsearch-dsl>=6.0.0,<7.0.0', - 'fastapi~=0.63.0', - 'humanize~=3.4.1', + 'aiofiles==0.8.0', + 'aiohttp[speedups]==3.8.1', + 'aiozipstream==0.4', + 'cachetools==5.0.0', + 'cffi==1.15.0', + 'elasticsearch-dsl==6.4.0', + 'fastapi==0.77.1', + 'humanize==4.1.0', f'jpegtran-cffi @ {jpegtran_url}', - 'pillow~=8.2.0', - 'pyyaml~=5.4.1', - 'uvicorn[standard]~=0.17.6', - 'wand~=0.6.7', - 'anyio~=3.5.0', + 'pillow==9.1.0', + 'pyyaml==6.0', + 'uvicorn[standard]==0.17.6', + 'wand==0.6.7', ] test_dependencies = [ 'pytest', 'pytest-asyncio', 'pytest-cov', - 'aioresponses~=0.7.3', + 'aioresponses', # needed by starlette's test client 'requests', ] From ebe72f4677e63b241cd269adc52db01f2a5423a3 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Mon, 16 May 2022 10:00:27 +0100 Subject: [PATCH 38/40] Remove jpeg option Just no need really --- iiif/ops.py | 11 ++++------- tests/test_ops.py | 3 --- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/iiif/ops.py b/iiif/ops.py index 89d12f3..db07094 100644 --- a/iiif/ops.py +++ b/iiif/ops.py @@ -221,14 +221,11 @@ def parse_quality(quality: str) -> Quality: class Format(Enum): - jpg = ('jpg', 'jpeg') - png = ('png',) - - def matches(self, value: str) -> bool: - return value in self.value + jpg = 'jpg' + png = 'png' def __str__(self) -> str: - return self.value[0] + return self.value def parse_format(fmt: str) -> Format: @@ -242,7 +239,7 @@ def parse_format(fmt: str) -> Format: :return: a Format """ for option in Format: - if option.matches(fmt): + if fmt == option.value: return option raise InvalidIIIFParameter('Format', fmt) diff --git a/tests/test_ops.py b/tests/test_ops.py index cbe21a0..a23d199 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -209,9 +209,6 @@ class TestParseFormat: def test_level0(self): assert parse_format('jpg') == Format.jpg - def test_level0(self): - assert parse_format('jpeg') == Format.jpg - def test_level2_png(self): assert parse_format('png') == Format.png From 6584cea4e156fae41ac00f1dc6496edb3a645a2c Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Mon, 16 May 2022 10:02:51 +0100 Subject: [PATCH 39/40] Fix test name --- tests/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index a23d199..9704933 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -186,7 +186,7 @@ def test_level0(self): def test_level2_color(self): assert parse_quality('color') == Quality.color - def test_level2_color(self): + def test_level2_colour(self): assert parse_quality('colour') == Quality.color def test_level2_gray(self): From 16f19885310b356dbd8a970cb69f00405b83de24 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: 
Mon, 16 May 2022 10:04:48 +0100 Subject: [PATCH 40/40] Move info.json generation out of profile and add extras This solves a circular import issue which arose from generating the extra qualities in the ops module but then needing to import it into the profile base module. Additionally, it simplifies the info.json generation by removing the profile override ability (currently unused) and caching (which was not well thought out, as the part of the process that might take time, getting the info object, was not cached). --- iiif/ops.py | 14 ++++++++++++++ iiif/profiles/base.py | 45 +------------------------------------------ iiif/routers/iiif.py | 30 ++++++++++++++++++++++++++--- tests/test_ops.py | 9 +++++++++ 4 files changed, 51 insertions(+), 47 deletions(-) diff --git a/iiif/ops.py b/iiif/ops.py index db07094..122c206 100644 --- a/iiif/ops.py +++ b/iiif/ops.py @@ -1,7 +1,9 @@ from contextlib import suppress from dataclasses import dataclass from enum import Enum +from itertools import chain from pathlib import Path +from typing import List from iiif.exceptions import InvalidIIIFParameter from iiif.profiles.base import ImageInfo @@ -199,6 +201,18 @@ class Quality(Enum): def matches(self, value: str) -> bool: return value in self.value + @staticmethod + def extras() -> List[str]: + """ + Returns the values that should be used in the info.json response. This should include + everything except the default value. + + :return: a list of extra qualities available on this IIIF server + """ + return list(chain.from_iterable( + quality.value for quality in Quality if quality != Quality.default) + ) + def __str__(self) -> str: return self.value[0] diff --git a/iiif/profiles/base.py b/iiif/profiles/base.py index 4c1bbd2..aa1b412 100644 --- a/iiif/profiles/base.py +++ b/iiif/profiles/base.py @@ -1,11 +1,9 @@ import abc -from cachetools import TTLCache from contextlib import asynccontextmanager from pathlib import Path from typing import Tuple, Optional, Any, AsyncIterable from iiif.config import Config -from iiif.utils import generate_sizes class ImageInfo: @@ -51,13 +49,11 @@ class AbstractProfile(abc.ABC): jpeg file, it can be processed in a common way (see the processing module). """ - def __init__(self, name: str, config: Config, rights: str, info_json_cache_size: int = 1000, - cache_for: float = 60): + def __init__(self, name: str, config: Config, rights: str, cache_for: float = 60): """ :param name: the name of the profile, should be unique across profiles :param config: the config object :param rights: the rights definition for all images handled by this profile - :param info_json_cache_size: the size of the info.json cache :param cache_for: how long in seconds a client should cache the results from this profile (both info.json and image data) """ @@ -70,7 +66,6 @@ def __init__(self, name: str, config: Config, rights: str, info_json_cache_size: self.rights = rights self.source_path.mkdir(exist_ok=True) self.cache_path.mkdir(exist_ok=True) - self.info_json_cache = TTLCache(maxsize=info_json_cache_size, ttl=cache_for) self.cache_for = cache_for @abc.abstractmethod @@ -134,43 +129,6 @@ async def stream_original(self, name: str, chunk_size: int = 4096) -> AsyncIterable[bytes]: """ ... - async def generate_info_json(self, info: ImageInfo, iiif_level: int) -> dict: - """ - Generates an info.json dict for the given image. The info.json is cached locally in this - profile's attributes.
- - :param info: the ImageInfo object to create the info.json dict for - :param iiif_level: the IIIF image server compliance level to include in the info.json - :return: the generated or cached info.json dict for the image - """ - # if the image's info.json isn't cached, create and add the complete info.json to the cache - if info not in self.info_json_cache: - id_url = f'{self.config.base_url}/{info.identifier}' - self.info_json_cache[info] = { - '@context': 'http://iiif.io/api/image/3/context.json', - 'id': id_url, - # mirador/openseadragon seems to need this to work even though I don't think it's - # correct under the IIIF image API v3 - '@id': id_url, - 'type': 'ImageService3', - 'protocol': 'http://iiif.io/api/image', - 'width': info.width, - 'height': info.height, - 'rights': self.rights, - 'profile': f'level{iiif_level}', - 'tiles': [ - {'width': 512, 'scaleFactors': [1, 2, 4, 8, 16]}, - {'width': 256, 'scaleFactors': [1, 2, 4, 8, 16]}, - {'width': 1024, 'scaleFactors': [1, 2, 4, 8, 16]}, - ], - 'sizes': generate_sizes(info.width, info.height, self.config.min_sizes_size), - # suggest to clients that upscaling isn't supported - 'maxWidth': info.width, - 'maxHeight': info.height, - } - - return self.info_json_cache[info] - async def close(self): """ Close down the profile ensuring any resources are released. This will be called before @@ -186,6 +144,5 @@ async def get_status(self) -> dict: """ status = { 'name': self.name, - 'info_json_cache_size': len(self.info_json_cache), } return status diff --git a/iiif/routers/iiif.py b/iiif/routers/iiif.py index 0b0c095..724bb94 100644 --- a/iiif/routers/iiif.py +++ b/iiif/routers/iiif.py @@ -1,9 +1,9 @@ from fastapi import APIRouter from starlette.responses import FileResponse, JSONResponse -from iiif.ops import IIIF_LEVEL, parse_params +from iiif.ops import IIIF_LEVEL, parse_params, Quality from iiif.state import state -from iiif.utils import get_mimetype +from iiif.utils import get_mimetype, generate_sizes router = APIRouter() @@ -22,7 +22,31 @@ async def get_image_info(identifier: str) -> JSONResponse: :return: the info.json as a dict """ profile, info = await state.get_profile_and_info(identifier) - info_json = await profile.generate_info_json(info, IIIF_LEVEL) + id_url = f'{state.config.base_url}/{info.identifier}' + info_json = { + '@context': 'http://iiif.io/api/image/3/context.json', + 'id': id_url, + # mirador/openseadragon seems to need this to work even though I don't think it's correct + # under the IIIF image API v3 + '@id': id_url, + 'type': 'ImageService3', + 'protocol': 'http://iiif.io/api/image', + 'width': info.width, + 'height': info.height, + 'rights': profile.rights, + 'profile': f'level{IIIF_LEVEL}', + 'tiles': [ + {'width': 512, 'scaleFactors': [1, 2, 4, 8, 16]}, + {'width': 256, 'scaleFactors': [1, 2, 4, 8, 16]}, + {'width': 1024, 'scaleFactors': [1, 2, 4, 8, 16]}, + ], + 'sizes': generate_sizes(info.width, info.height, state.config.min_sizes_size), + # suggest to clients that upscaling isn't supported + 'maxWidth': info.width, + 'maxHeight': info.height, + 'extraQualities': Quality.extras(), + 'extraFeatures': ['mirroring'], + } # add a cache-control header and iiif header headers = { 'cache-control': f'max-age={profile.cache_for}', diff --git a/tests/test_ops.py b/tests/test_ops.py index 9704933..76e7c2f 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -203,6 +203,15 @@ def test_invalid(self): parse_quality('banana') assert exc_info.value.status_code == 400 + def test_extras(self): + extras = Quality.extras() 
+ for quality in Quality: + for value in quality.value: + if value == 'default': + assert value not in extras + else: + assert value in extras + class TestParseFormat:
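For illustration, a standalone sketch of what the new Quality.extras() helper in the final patch evaluates to and how it surfaces in the info.json response (the enum is copied from the patch above; the assert shows the computed value):

    from enum import Enum
    from itertools import chain
    from typing import List

    class Quality(Enum):
        default = ('default',)
        color = ('color', 'colour')
        gray = ('gray', 'grey')
        bitonal = ('bitonal',)

        @staticmethod
        def extras() -> List[str]:
            # every alias except the default quality, flattened into a single list
            return list(chain.from_iterable(
                quality.value for quality in Quality if quality != Quality.default
            ))

    assert Quality.extras() == ['color', 'colour', 'gray', 'grey', 'bitonal']
    # which appears in the generated info.json as:
    #   'extraQualities': ['color', 'colour', 'gray', 'grey', 'bitonal'],
    #   'extraFeatures': ['mirroring'],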