From ac1e63209268b30f94fd423bf1fac0f8a979d49f Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 15 Jul 2024 14:40:11 -0400 Subject: [PATCH 01/46] add zarr route --- tiled/server/dependencies.py | 1 + tiled/server/router.py | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index f13c676fc..f0af8e72f 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -71,6 +71,7 @@ async def inner( session_state is an optional dictionary passed in the session token """ + # breakpoint() path_parts = [segment for segment in path.split("/") if segment] entry = root_tree diff --git a/tiled/server/router.py b/tiled/server/router.py index a81a2fea3..a8bedce91 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -1672,6 +1672,67 @@ async def get_asset_manifest( manifest.extend(Path(root, file) for file in files) return json_or_msgpack(request, {"manifest": manifest}) +@router.get( + "/zarr/full/{path:path}", response_model=schemas.Response, name="full array as zarr" +) +async def zarr_array_full( + request: Request, + entry=SecureEntry( + scopes=["read:data"], + structure_families={StructureFamily.array, StructureFamily.sparse}, + ), + slice=Depends(slice_), + expected_shape=Depends(expected_shape), + format: Optional[str] = None, + filename: Optional[str] = None, + serialization_registry=Depends(get_serialization_registry), + settings: BaseSettings = Depends(get_settings), +): + """ + Fetch a slice of array-like data. + """ + structure_family = entry.structure_family + # Deferred import because this is not a required dependency of the server + # for some use cases. + import numpy + + try: + with record_timing(request.state.metrics, "read"): + array = await ensure_awaitable(entry.read, slice) + if structure_family == StructureFamily.array: + array = numpy.asarray(array) # Force dask or PIMS or ... to do I/O. + except IndexError: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + ) + if (expected_shape is not None) and (expected_shape != array.shape): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail=f"The expected_shape {expected_shape} does not match the actual shape {array.shape}", + ) + if array.nbytes > settings.response_bytesize_limit: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail=( + f"Response would exceed {settings.response_bytesize_limit}. " + "Use slicing ('?slice=...') to request smaller chunks." + ), + ) + try: + with record_timing(request.state.metrics, "pack"): + return await construct_data_response( + structure_family, + serialization_registry, + array, + entry.metadata(), + request, + format, + specs=getattr(entry, "specs", []), + expires=getattr(entry, "content_stale_at", None), + filename=filename, + ) + except UnsupportedMediaTypes as err: + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=err.args[0]) async def validate_metadata( metadata: dict, From dab92e35bb290a46b9020ffe46c34474046989a2 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 5 Aug 2024 10:34:59 -0400 Subject: [PATCH 02/46] ENH: basic zarr functionality --- tiled/examples/generated.py | 14 ++- tiled/server/app.py | 26 ++++++ tiled/server/router.py | 63 ------------- tiled/server/utils.py | 8 ++ tiled/server/zarr.py | 180 ++++++++++++++++++++++++++++++++++++ 5 files changed, 227 insertions(+), 64 deletions(-) create mode 100644 tiled/server/zarr.py diff --git a/tiled/examples/generated.py b/tiled/examples/generated.py index 014c9c82c..9caaacea0 100644 --- a/tiled/examples/generated.py +++ b/tiled/examples/generated.py @@ -43,9 +43,21 @@ print("Done generating example data.", file=sys.stderr) mapping = { + "nested": MapAdapter( + {"small_image": ArrayAdapter.from_array(data["small_image"]), + "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), + "inner": MapAdapter( + {"small_image": ArrayAdapter.from_array(data["small_image"]), + "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), + }, + metadata = {"animal": "cat", "color": "green"}, + ), + }, + metadata = {"animal": "cat", "color": "green"}, + ), "big_image": ArrayAdapter.from_array(data["big_image"]), "small_image": ArrayAdapter.from_array(data["small_image"]), - "medium_image": ArrayAdapter.from_array(data["medium_image"]), + "medium_image": ArrayAdapter.from_array(data["medium_image"], chunks=((250, )*4, (100, )*10)), "sparse_image": COOAdapter.from_coo(sparse.COO(sparse_arr)), "awkward_array": AwkwardAdapter.from_array(awkward_arr), "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), diff --git a/tiled/server/app.py b/tiled/server/app.py index e5c953b3a..1f04003f2 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -11,6 +11,7 @@ from functools import lru_cache, partial from pathlib import Path from typing import List +import re import anyio import packaging.version @@ -60,6 +61,7 @@ get_root_url, record_timing, ) +from .zarr import router as zarr_router SAFE_METHODS = {"GET", "HEAD", "OPTIONS", "TRACE"} SENSITIVE_COOKIES = { @@ -67,6 +69,7 @@ } CSRF_HEADER_NAME = "x-csrf" CSRF_QUERY_PARAMETER = "csrf" +ZARR_PREFIX = '/zarr/v2' MINIMUM_SUPPORTED_PYTHON_CLIENT_VERSION = packaging.version.parse("0.1.0a104") @@ -344,6 +347,7 @@ async def unhandled_exception_handler( ) app.include_router(router, prefix="/api/v1") + app.include_router(zarr_router, prefix=ZARR_PREFIX) # The Tree and Authenticator have the opportunity to add custom routes to # the server here. (Just for example, a Tree of BlueskyRuns uses this @@ -882,6 +886,28 @@ async def current_principal_logging_filter(request: Request, call_next): current_principal.set(request.state.principal) return response + @app.middleware("http") + async def resolve_zarr_uris(request: Request, call_next): + response = await call_next(request) + + # If a zarr block is requested, e.g. http://zarr.com/array/0.1.2.3, replace the block spec with a properly + # formatted query parameter: http://zarr.com/array?block=0,1,2,3 (with ',' encoded) + if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: + # Extract the last bit of the path + zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip('/').split('/') + zarr_block = zarr_path[-1] if len(zarr_path) > 0 else '' + if re.compile(r'^(?:\d+\.)*\d+$').fullmatch(zarr_block): + # Create a query string if the last part is in the zarr block forma, e.g. `m.n.p. ... .q` + request.scope['query_string'] = f"block={zarr_block.replace('.', '%2C')}".encode() + request.scope['path'] = ZARR_PREFIX + '/' + '/'.join(zarr_path[:-1]) + response = await call_next(request) + + # TODO: Try compiling a single RE for matching and replacement -- possible speedup? + print("In middleware resolve_zarr_uris -- replaced query params") + + response.__class__ = PatchedStreamingResponse # tolerate memoryview + return response + app.add_middleware( CorrelationIdMiddleware, header_name="X-Tiled-Request-ID", diff --git a/tiled/server/router.py b/tiled/server/router.py index a8bedce91..f2494c0a6 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -460,7 +460,6 @@ async def array_full( # Deferred import because this is not a required dependency of the server # for some use cases. import numpy - try: with record_timing(request.state.metrics, "read"): array = await ensure_awaitable(entry.read, slice) @@ -1672,68 +1671,6 @@ async def get_asset_manifest( manifest.extend(Path(root, file) for file in files) return json_or_msgpack(request, {"manifest": manifest}) -@router.get( - "/zarr/full/{path:path}", response_model=schemas.Response, name="full array as zarr" -) -async def zarr_array_full( - request: Request, - entry=SecureEntry( - scopes=["read:data"], - structure_families={StructureFamily.array, StructureFamily.sparse}, - ), - slice=Depends(slice_), - expected_shape=Depends(expected_shape), - format: Optional[str] = None, - filename: Optional[str] = None, - serialization_registry=Depends(get_serialization_registry), - settings: BaseSettings = Depends(get_settings), -): - """ - Fetch a slice of array-like data. - """ - structure_family = entry.structure_family - # Deferred import because this is not a required dependency of the server - # for some use cases. - import numpy - - try: - with record_timing(request.state.metrics, "read"): - array = await ensure_awaitable(entry.read, slice) - if structure_family == StructureFamily.array: - array = numpy.asarray(array) # Force dask or PIMS or ... to do I/O. - except IndexError: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" - ) - if (expected_shape is not None) and (expected_shape != array.shape): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail=f"The expected_shape {expected_shape} does not match the actual shape {array.shape}", - ) - if array.nbytes > settings.response_bytesize_limit: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail=( - f"Response would exceed {settings.response_bytesize_limit}. " - "Use slicing ('?slice=...') to request smaller chunks." - ), - ) - try: - with record_timing(request.state.metrics, "pack"): - return await construct_data_response( - structure_family, - serialization_registry, - array, - entry.metadata(), - request, - format, - specs=getattr(entry, "specs", []), - expires=getattr(entry, "content_stale_at", None), - filename=filename, - ) - except UnsupportedMediaTypes as err: - raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=err.args[0]) - async def validate_metadata( metadata: dict, structure_family: StructureFamily, diff --git a/tiled/server/utils.py b/tiled/server/utils.py index 0c4368d07..29233e134 100644 --- a/tiled/server/utils.py +++ b/tiled/server/utils.py @@ -1,5 +1,6 @@ import contextlib import time +from typing import Literal from ..access_policies import NO_ACCESS from ..adapters.mapping import MapAdapter @@ -41,6 +42,13 @@ def get_base_url(request): return f"{get_root_url(request)}/api/v1" +def get_zarr_url(request, version: Literal['v2', 'v3'] = 'v2'): + """ + Base URL for the Zarr API + """ + return f"{get_root_url(request)}/zarr/{version}" + + def get_root_url_low_level(request_headers, scope): # We want to get the scheme, host, and root_path (if any) # *as it appears to the client* for use in assembling links to diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py new file mode 100644 index 000000000..4c3343c84 --- /dev/null +++ b/tiled/server/zarr.py @@ -0,0 +1,180 @@ +import dataclasses +import inspect +import os +import re +import warnings +from datetime import datetime, timedelta +from functools import partial, wraps +from pathlib import Path +from typing import Any, List, Optional, Tuple +import json + +import anyio +from fastapi import APIRouter, Body, Depends, HTTPException, Query, Request, Security +from jmespath.exceptions import JMESPathError +from json_merge_patch import merge as apply_merge_patch +from jsonpatch import apply_patch as apply_json_patch +from pydantic_settings import BaseSettings +from starlette.responses import Response +from starlette.status import ( + HTTP_200_OK, + HTTP_206_PARTIAL_CONTENT, + HTTP_400_BAD_REQUEST, + HTTP_403_FORBIDDEN, + HTTP_404_NOT_FOUND, + HTTP_405_METHOD_NOT_ALLOWED, + HTTP_406_NOT_ACCEPTABLE, + HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE, + HTTP_422_UNPROCESSABLE_ENTITY, + HTTP_500_INTERNAL_SERVER_ERROR, +) + +from .. import __version__ +from ..structures.core import Spec, StructureFamily +from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri +from ..validation_registration import ValidationError +from . import schemas +from .authentication import Mode, get_authenticators, get_current_principal +from .core import ( + DEFAULT_PAGE_SIZE, + DEPTH_LIMIT, + MAX_PAGE_SIZE, + NoEntry, + UnsupportedMediaTypes, + WrongTypeForRoute, + apply_search, + construct_data_response, + construct_entries_response, + construct_resource, + construct_revisions_response, + json_or_msgpack, + resolve_media_type, +) +from .dependencies import ( + SecureEntry, + block, + expected_shape, + get_deserialization_registry, + get_query_registry, + get_serialization_registry, + get_validation_registry, + slice_, +) +from .file_response_with_range import FileResponseWithRange +from .links import links_for_node +from .settings import get_settings +from .utils import filter_for_access, get_base_url, record_timing + +ZARR_BLOCK_SIZE = 10 + +router = APIRouter() + +def convert_chunks_for_zarr(chunks: Tuple[Tuple[int]]): + """Convert full chunk specification into zarr format + + Zarr only accepts chunks of constant size; this function finds a unique representation of (possibly variable- + sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. + """ + # return [min(ZARR_BLOCK_SIZE, i[0]) for i in chunks] + return [ZARR_BLOCK_SIZE for _ in chunks] + +def slice_for_zarr_block(chunks: Tuple[Tuple[int]], zblock: Tuple[int]): + ... + +# @router.get("/.zgroup", name="Root .zgroup metadata") +@router.get("/{path:path}/.zgroup", name="Zarr .zgroup metadata") +async def get_zarr_group_metadata( + request: Request, + entry=SecureEntry( + scopes=["read:data", "read:metadata"], + structure_families={StructureFamily.table, StructureFamily.container}, + ), +): + + return Response(json.dumps({"zarr_format": 2}), status_code=200) + +@router.get("/{path:path}/.zarray", name="Zarr .zarray metadata") +async def get_zarr_array_metadata( + request: Request, + path: str, + entry=SecureEntry(scopes=["read:data", "read:metadata"]), +): + if entry.structure_family not in {StructureFamily.array, StructureFamily.sparse}: + # This is normal behaviour; zarr will try to open .zarray and, if 404 is received, it will move on assuming + # that the requested resource is a group (`.../path/.zgroup` would be requested next). + # TODO: Perhaps, checking this within SecureEntry is sufficient? What happens to tables? + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail="Requested resource does not have .zarray") + + try: + zarray_spec = {} + metadata = entry.metadata() + structure = entry.structure() + zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), + 'compressor': {'blocksize': 0, + 'clevel': 5, + 'cname': 'lz4', + 'id': 'blosc', + 'shuffle': 1}, + 'dtype': structure.data_type.to_numpy_str(), + 'fill_value': 0, + 'filters': None, + 'order': 'C', + 'shape': list(structure.shape), + 'zarr_format': 2} + except Exception as err: + print(f"Can not create .zarray metadata, {err}") + raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) + + return Response(json.dumps(zarray_spec), status_code=200) + + +@router.get("/{path:path}", name="Zarr .zgroup directory structure or a chunk of a zarr array") +async def get_zarr_array( + request: Request, + block: str | None = None, + entry=SecureEntry(scopes=["read:data"], + # structure_families={StructureFamily.array, StructureFamily.sparse}, + # structure_families={StructureFamily.table, StructureFamily.container}, + ), +): + if entry.structure_family in {StructureFamily.table, StructureFamily.container}: + # List the contents of a simulated zarr directory (excluding .zarray and .zgroup files) + url = str(request.url).split('?')[0].rstrip('/') # Remove query params and trailing slash + body = json.dumps([url + '/' + key for key in entry.keys()]) + + return Response(body, status_code=200, media_type='application/json') + + elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: + if block is not None: + import zarr + print(f"Here, {block=}") + + block = [int(i) for i in block.split(',')] + chunks = entry.structure().chunks + + if block == (): + # Handle special case of numpy scalar + with record_timing(request.state.metrics, "read"): + array = await ensure_awaitable(entry.read) + else: + try: + with record_timing(request.state.metrics, "read"): + # array = await ensure_awaitable(entry.read_block, block) + array = await ensure_awaitable(entry.read) + x, y = block + array = array[x*ZARR_BLOCK_SIZE:(x+1)*ZARR_BLOCK_SIZE, y*ZARR_BLOCK_SIZE:(y+1)*ZARR_BLOCK_SIZE] + except IndexError: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + ) + + # TODO: This must be cached! + zarray = zarr.array(array) + + return Response(zarray.store['0.0'], status_code=200) + + else: + # TODO: + # Entire array (root uri) is requested -- never happens, but need to decide what to return here + return Response(json.dumps({}), status_code=200) + From b7e39a62fb5722adfab85cd43606ec0012539255 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 6 Aug 2024 17:39:31 -0400 Subject: [PATCH 03/46] ENH: map tiled chunks to zarr blocks --- tiled/server/app.py | 1 - tiled/server/zarr.py | 91 ++++++++++++++++++++++++-------------------- 2 files changed, 50 insertions(+), 42 deletions(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index 1f04003f2..c65f7c654 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -903,7 +903,6 @@ async def resolve_zarr_uris(request: Request, call_next): response = await call_next(request) # TODO: Try compiling a single RE for matching and replacement -- possible speedup? - print("In middleware resolve_zarr_uris -- replaced query params") response.__class__ = PatchedStreamingResponse # tolerate memoryview return response diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index 4c3343c84..32a8481d7 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -65,23 +65,28 @@ from .settings import get_settings from .utils import filter_for_access, get_base_url, record_timing -ZARR_BLOCK_SIZE = 10 +ZARR_BLOCK_SIZE = 10000 +ZARR_BYTE_ORDER = 'C' +ZARR_CODEC_SPEC = {'blocksize': 0, + 'clevel': 5, + 'cname': 'lz4', + 'id': 'blosc', + 'shuffle': 1} + +import numcodecs +zarr_codec = numcodecs.get_codec(ZARR_CODEC_SPEC) router = APIRouter() -def convert_chunks_for_zarr(chunks: Tuple[Tuple[int]]): - """Convert full chunk specification into zarr format +def convert_chunks_for_zarr(tiled_chunks: Tuple[Tuple[int]]): + """Convert full tiled/dask chunk specification into zarr format - Zarr only accepts chunks of constant size; this function finds a unique representation of (possibly variable- - sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. + Zarr only accepts chunks of constant size along each dimension; this function finds a unique representation of + (possibly variable-sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. """ - # return [min(ZARR_BLOCK_SIZE, i[0]) for i in chunks] - return [ZARR_BLOCK_SIZE for _ in chunks] + return [min(ZARR_BLOCK_SIZE, max(c)) for c in tiled_chunks] -def slice_for_zarr_block(chunks: Tuple[Tuple[int]], zblock: Tuple[int]): - ... - -# @router.get("/.zgroup", name="Root .zgroup metadata") +@router.get("{path:path}.zgroup", name="Root .zgroup metadata") @router.get("/{path:path}/.zgroup", name="Zarr .zgroup metadata") async def get_zarr_group_metadata( request: Request, @@ -110,15 +115,11 @@ async def get_zarr_array_metadata( metadata = entry.metadata() structure = entry.structure() zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), - 'compressor': {'blocksize': 0, - 'clevel': 5, - 'cname': 'lz4', - 'id': 'blosc', - 'shuffle': 1}, + 'compressor': ZARR_CODEC_SPEC, 'dtype': structure.data_type.to_numpy_str(), 'fill_value': 0, 'filters': None, - 'order': 'C', + 'order': ZARR_BYTE_ORDER, 'shape': list(structure.shape), 'zarr_format': 2} except Exception as err: @@ -138,7 +139,7 @@ async def get_zarr_array( ), ): if entry.structure_family in {StructureFamily.table, StructureFamily.container}: - # List the contents of a simulated zarr directory (excluding .zarray and .zgroup files) + # List the contents of a "simulated" zarr directory (excluding .zarray and .zgroup files) url = str(request.url).split('?')[0].rstrip('/') # Remove query params and trailing slash body = json.dumps([url + '/' + key for key in entry.keys()]) @@ -147,34 +148,42 @@ async def get_zarr_array( elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: if block is not None: import zarr - print(f"Here, {block=}") + import numpy as np + + block_indx = [int(i) for i in block.split(',')] + zarr_chunks = convert_chunks_for_zarr(entry.structure().chunks) + block_slice = tuple([slice(i*c, (i+1)*c) for c, i in zip(zarr_chunks, block_indx)]) + padding_size = [max(0, sl.stop-sh) for sh, sl in zip(entry.structure().shape, block_slice)] - block = [int(i) for i in block.split(',')] - chunks = entry.structure().chunks + # if block == (): + # # Handle special case of numpy scalar + # with record_timing(request.state.metrics, "read"): + # array = await ensure_awaitable(entry.read) + # else: - if block == (): - # Handle special case of numpy scalar + # breakpoint() + try: with record_timing(request.state.metrics, "read"): - array = await ensure_awaitable(entry.read) - else: - try: - with record_timing(request.state.metrics, "read"): - # array = await ensure_awaitable(entry.read_block, block) - array = await ensure_awaitable(entry.read) - x, y = block - array = array[x*ZARR_BLOCK_SIZE:(x+1)*ZARR_BLOCK_SIZE, y*ZARR_BLOCK_SIZE:(y+1)*ZARR_BLOCK_SIZE] - except IndexError: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" - ) - - # TODO: This must be cached! - zarray = zarr.array(array) - - return Response(zarray.store['0.0'], status_code=200) + array = await ensure_awaitable(entry.read, slice=block_slice) + if sum(padding_size) > 0: + array = np.pad(array, [(0, p) for p in padding_size], mode='constant') + except IndexError: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + ) + + # buf = zarr.array(array).store['0.0'] # Define a zarr array as a single block + + # breakpoint() + + array = array.astype(array.dtype, order=ZARR_BYTE_ORDER, copy=False) # ensure array is contiguous + buf = zarr_codec.encode(array) + if not isinstance(buf, bytes): + buf = array.tobytes(order="A") + + return Response(buf, status_code=200) else: # TODO: # Entire array (root uri) is requested -- never happens, but need to decide what to return here return Response(json.dumps({}), status_code=200) - From 784f391bab0276c3fd853cef95a378a651debd57 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 6 Aug 2024 17:57:40 -0400 Subject: [PATCH 04/46] MNT: Clean-up comments --- tiled/server/app.py | 5 +++-- tiled/server/dependencies.py | 1 - tiled/server/router.py | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index c65f7c654..40c05d959 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -890,8 +890,9 @@ async def current_principal_logging_filter(request: Request, call_next): async def resolve_zarr_uris(request: Request, call_next): response = await call_next(request) - # If a zarr block is requested, e.g. http://zarr.com/array/0.1.2.3, replace the block spec with a properly - # formatted query parameter: http://zarr.com/array?block=0,1,2,3 (with ',' encoded) + # If a zarr block is requested, e.g. http://localhost:8000/zarr/v2/array/0.1.2.3, replace the block spec + # with a properly formatted query parameter: http://localhost:8000/zarr/v2/array?block=0,1,2,3 (with ',' + # safely encoded) if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: # Extract the last bit of the path zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip('/').split('/') diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index f0af8e72f..f13c676fc 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -71,7 +71,6 @@ async def inner( session_state is an optional dictionary passed in the session token """ - # breakpoint() path_parts = [segment for segment in path.split("/") if segment] entry = root_tree diff --git a/tiled/server/router.py b/tiled/server/router.py index f2494c0a6..a81a2fea3 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -460,6 +460,7 @@ async def array_full( # Deferred import because this is not a required dependency of the server # for some use cases. import numpy + try: with record_timing(request.state.metrics, "read"): array = await ensure_awaitable(entry.read, slice) @@ -1671,6 +1672,7 @@ async def get_asset_manifest( manifest.extend(Path(root, file) for file in files) return json_or_msgpack(request, {"manifest": manifest}) + async def validate_metadata( metadata: dict, structure_family: StructureFamily, From 08f255d687118b1983cf1019b375d7d6f948ce2e Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 7 Aug 2024 13:01:16 -0400 Subject: [PATCH 05/46] ENH: support tables --- tiled/examples/generated.py | 74 +++++++++++++++++++----------------- tiled/server/zarr.py | 76 +++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 59 deletions(-) diff --git a/tiled/examples/generated.py b/tiled/examples/generated.py index 9caaacea0..4fa76f0c1 100644 --- a/tiled/examples/generated.py +++ b/tiled/examples/generated.py @@ -55,6 +55,45 @@ }, metadata = {"animal": "cat", "color": "green"}, ), + "tables": MapAdapter( + { + "short_table": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + "A": data["short_column"], + "B": 2 * data["short_column"], + "C": 3 * data["short_column"], + }, + index=pandas.Index(numpy.arange(len(data["short_column"])), name="index"), + ), + npartitions=1, + metadata={"animal": "dog", "color": "red"}, + ), + "long_table": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + "A": data["long_column"], + "B": 2 * data["long_column"], + "C": 3 * data["long_column"], + }, + index=pandas.Index(numpy.arange(len(data["long_column"])), name="index"), + ), + npartitions=5, + metadata={"animal": "dog", "color": "green"}, + ), + "wide_table": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + letter: i * data["tiny_column"] + for i, letter in enumerate(string.ascii_uppercase, start=1) + }, + index=pandas.Index(numpy.arange(len(data["tiny_column"])), name="index"), + ), + npartitions=1, + metadata={"animal": "dog", "color": "red"}, + ), + } + ), "big_image": ArrayAdapter.from_array(data["big_image"]), "small_image": ArrayAdapter.from_array(data["small_image"]), "medium_image": ArrayAdapter.from_array(data["medium_image"], chunks=((250, )*4, (100, )*10)), @@ -63,41 +102,6 @@ "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), "tiny_cube": ArrayAdapter.from_array(data["tiny_cube"]), "tiny_hypercube": ArrayAdapter.from_array(data["tiny_hypercube"]), - "short_table": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - "A": data["short_column"], - "B": 2 * data["short_column"], - "C": 3 * data["short_column"], - }, - index=pandas.Index(numpy.arange(len(data["short_column"])), name="index"), - ), - npartitions=1, - metadata={"animal": "dog", "color": "red"}, - ), - "long_table": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - "A": data["long_column"], - "B": 2 * data["long_column"], - "C": 3 * data["long_column"], - }, - index=pandas.Index(numpy.arange(len(data["long_column"])), name="index"), - ), - npartitions=5, - metadata={"animal": "dog", "color": "green"}, - ), - "wide_table": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - letter: i * data["tiny_column"] - for i, letter in enumerate(string.ascii_uppercase, start=1) - }, - index=pandas.Index(numpy.arange(len(data["tiny_column"])), name="index"), - ), - npartitions=1, - metadata={"animal": "dog", "color": "red"}, - ), "structured_data": MapAdapter( { "pets": ArrayAdapter.from_array( diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index 32a8481d7..785366d1e 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -102,30 +102,48 @@ async def get_zarr_group_metadata( async def get_zarr_array_metadata( request: Request, path: str, - entry=SecureEntry(scopes=["read:data", "read:metadata"]), + column: str = '', + entry=SecureEntry(scopes=["read:data", "read:metadata"], + structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table}), ): - if entry.structure_family not in {StructureFamily.array, StructureFamily.sparse}: + if entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: + try: + metadata = entry.metadata() + structure = entry.structure() + zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), + 'compressor': ZARR_CODEC_SPEC, + 'dtype': structure.data_type.to_numpy_str(), + 'fill_value': 0, + 'filters': None, + 'order': ZARR_BYTE_ORDER, + 'shape': list(structure.shape), + 'zarr_format': 2} + except Exception as err: + print(f"Can not create .zarray metadata, {err}") + raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) + + # elif entry.structure_family == StructureFamily.table: + # try: + # zarray_spec = {} + # metadata = entry.metadata() + # structure = entry.structure() + # # zarray_spec = {'chunks': [100, 1], #convert_chunks_for_zarr(structure.chunks), + # # 'compressor': ZARR_CODEC_SPEC, + # # 'dtype': entry.structure().meta.dtypes[column].str, + # # 'fill_value': 0, + # # 'filters': None, + # # 'order': ZARR_BYTE_ORDER, + # # # 'shape': list(structure.shape), + # # 'zarr_format': 2} + # except Exception as err: + # print(f"Can not create .zarray metadata, {err}") + # raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) + + else: # This is normal behaviour; zarr will try to open .zarray and, if 404 is received, it will move on assuming # that the requested resource is a group (`.../path/.zgroup` would be requested next). - # TODO: Perhaps, checking this within SecureEntry is sufficient? What happens to tables? raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail="Requested resource does not have .zarray") - - try: - zarray_spec = {} - metadata = entry.metadata() - structure = entry.structure() - zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), - 'compressor': ZARR_CODEC_SPEC, - 'dtype': structure.data_type.to_numpy_str(), - 'fill_value': 0, - 'filters': None, - 'order': ZARR_BYTE_ORDER, - 'shape': list(structure.shape), - 'zarr_format': 2} - except Exception as err: - print(f"Can not create .zarray metadata, {err}") - raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) - + return Response(json.dumps(zarray_spec), status_code=200) @@ -134,16 +152,26 @@ async def get_zarr_array( request: Request, block: str | None = None, entry=SecureEntry(scopes=["read:data"], - # structure_families={StructureFamily.array, StructureFamily.sparse}, - # structure_families={StructureFamily.table, StructureFamily.container}, + structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table, StructureFamily.container}, ), ): - if entry.structure_family in {StructureFamily.table, StructureFamily.container}: + url = str(request.url).split('?')[0].rstrip('/') # Remove query params and the trailing slash + + # breakpoint() + if entry.structure_family == StructureFamily.container: # List the contents of a "simulated" zarr directory (excluding .zarray and .zgroup files) - url = str(request.url).split('?')[0].rstrip('/') # Remove query params and trailing slash body = json.dumps([url + '/' + key for key in entry.keys()]) return Response(body, status_code=200, media_type='application/json') + + elif entry.structure_family == StructureFamily.table: + url = str(request.url).split('?')[0].rstrip('/') # Remove query params and the trailing slash + # breakpoint() + body = json.dumps([url + '/' + key for key in entry.structure().columns]) + + # entry.structure().meta.dtypes + + return Response(body, status_code=200, media_type='application/json') elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: if block is not None: From d10e2f18e55bef9ee6f5e18b4beb42f9f49e5345 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 15 Jul 2024 14:40:11 -0400 Subject: [PATCH 06/46] add zarr route --- tiled/server/dependencies.py | 1 + tiled/server/router.py | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index f13c676fc..f0af8e72f 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -71,6 +71,7 @@ async def inner( session_state is an optional dictionary passed in the session token """ + # breakpoint() path_parts = [segment for segment in path.split("/") if segment] entry = root_tree diff --git a/tiled/server/router.py b/tiled/server/router.py index a81a2fea3..a8bedce91 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -1672,6 +1672,67 @@ async def get_asset_manifest( manifest.extend(Path(root, file) for file in files) return json_or_msgpack(request, {"manifest": manifest}) +@router.get( + "/zarr/full/{path:path}", response_model=schemas.Response, name="full array as zarr" +) +async def zarr_array_full( + request: Request, + entry=SecureEntry( + scopes=["read:data"], + structure_families={StructureFamily.array, StructureFamily.sparse}, + ), + slice=Depends(slice_), + expected_shape=Depends(expected_shape), + format: Optional[str] = None, + filename: Optional[str] = None, + serialization_registry=Depends(get_serialization_registry), + settings: BaseSettings = Depends(get_settings), +): + """ + Fetch a slice of array-like data. + """ + structure_family = entry.structure_family + # Deferred import because this is not a required dependency of the server + # for some use cases. + import numpy + + try: + with record_timing(request.state.metrics, "read"): + array = await ensure_awaitable(entry.read, slice) + if structure_family == StructureFamily.array: + array = numpy.asarray(array) # Force dask or PIMS or ... to do I/O. + except IndexError: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + ) + if (expected_shape is not None) and (expected_shape != array.shape): + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail=f"The expected_shape {expected_shape} does not match the actual shape {array.shape}", + ) + if array.nbytes > settings.response_bytesize_limit: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail=( + f"Response would exceed {settings.response_bytesize_limit}. " + "Use slicing ('?slice=...') to request smaller chunks." + ), + ) + try: + with record_timing(request.state.metrics, "pack"): + return await construct_data_response( + structure_family, + serialization_registry, + array, + entry.metadata(), + request, + format, + specs=getattr(entry, "specs", []), + expires=getattr(entry, "content_stale_at", None), + filename=filename, + ) + except UnsupportedMediaTypes as err: + raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=err.args[0]) async def validate_metadata( metadata: dict, From 5b9e786e97657cb5e56a11184b5e13f53147717a Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 5 Aug 2024 10:34:59 -0400 Subject: [PATCH 07/46] ENH: basic zarr functionality --- tiled/examples/generated.py | 14 ++- tiled/server/app.py | 26 ++++++ tiled/server/router.py | 63 ------------- tiled/server/utils.py | 8 ++ tiled/server/zarr.py | 180 ++++++++++++++++++++++++++++++++++++ 5 files changed, 227 insertions(+), 64 deletions(-) create mode 100644 tiled/server/zarr.py diff --git a/tiled/examples/generated.py b/tiled/examples/generated.py index 014c9c82c..9caaacea0 100644 --- a/tiled/examples/generated.py +++ b/tiled/examples/generated.py @@ -43,9 +43,21 @@ print("Done generating example data.", file=sys.stderr) mapping = { + "nested": MapAdapter( + {"small_image": ArrayAdapter.from_array(data["small_image"]), + "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), + "inner": MapAdapter( + {"small_image": ArrayAdapter.from_array(data["small_image"]), + "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), + }, + metadata = {"animal": "cat", "color": "green"}, + ), + }, + metadata = {"animal": "cat", "color": "green"}, + ), "big_image": ArrayAdapter.from_array(data["big_image"]), "small_image": ArrayAdapter.from_array(data["small_image"]), - "medium_image": ArrayAdapter.from_array(data["medium_image"]), + "medium_image": ArrayAdapter.from_array(data["medium_image"], chunks=((250, )*4, (100, )*10)), "sparse_image": COOAdapter.from_coo(sparse.COO(sparse_arr)), "awkward_array": AwkwardAdapter.from_array(awkward_arr), "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), diff --git a/tiled/server/app.py b/tiled/server/app.py index a21162867..70648ba12 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -11,6 +11,7 @@ from functools import lru_cache, partial from pathlib import Path from typing import List +import re import anyio import packaging.version @@ -60,6 +61,7 @@ get_root_url, record_timing, ) +from .zarr import router as zarr_router SAFE_METHODS = {"GET", "HEAD", "OPTIONS", "TRACE"} SENSITIVE_COOKIES = { @@ -67,6 +69,7 @@ } CSRF_HEADER_NAME = "x-csrf" CSRF_QUERY_PARAMETER = "csrf" +ZARR_PREFIX = '/zarr/v2' MINIMUM_SUPPORTED_PYTHON_CLIENT_VERSION = packaging.version.parse("0.1.0a104") @@ -350,6 +353,7 @@ async def unhandled_exception_handler( ) app.include_router(router, prefix="/api/v1") + app.include_router(zarr_router, prefix=ZARR_PREFIX) # The Tree and Authenticator have the opportunity to add custom routes to # the server here. (Just for example, a Tree of BlueskyRuns uses this @@ -888,6 +892,28 @@ async def current_principal_logging_filter(request: Request, call_next): current_principal.set(request.state.principal) return response + @app.middleware("http") + async def resolve_zarr_uris(request: Request, call_next): + response = await call_next(request) + + # If a zarr block is requested, e.g. http://zarr.com/array/0.1.2.3, replace the block spec with a properly + # formatted query parameter: http://zarr.com/array?block=0,1,2,3 (with ',' encoded) + if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: + # Extract the last bit of the path + zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip('/').split('/') + zarr_block = zarr_path[-1] if len(zarr_path) > 0 else '' + if re.compile(r'^(?:\d+\.)*\d+$').fullmatch(zarr_block): + # Create a query string if the last part is in the zarr block forma, e.g. `m.n.p. ... .q` + request.scope['query_string'] = f"block={zarr_block.replace('.', '%2C')}".encode() + request.scope['path'] = ZARR_PREFIX + '/' + '/'.join(zarr_path[:-1]) + response = await call_next(request) + + # TODO: Try compiling a single RE for matching and replacement -- possible speedup? + print("In middleware resolve_zarr_uris -- replaced query params") + + response.__class__ = PatchedStreamingResponse # tolerate memoryview + return response + app.add_middleware( CorrelationIdMiddleware, header_name="X-Tiled-Request-ID", diff --git a/tiled/server/router.py b/tiled/server/router.py index a8bedce91..f2494c0a6 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -460,7 +460,6 @@ async def array_full( # Deferred import because this is not a required dependency of the server # for some use cases. import numpy - try: with record_timing(request.state.metrics, "read"): array = await ensure_awaitable(entry.read, slice) @@ -1672,68 +1671,6 @@ async def get_asset_manifest( manifest.extend(Path(root, file) for file in files) return json_or_msgpack(request, {"manifest": manifest}) -@router.get( - "/zarr/full/{path:path}", response_model=schemas.Response, name="full array as zarr" -) -async def zarr_array_full( - request: Request, - entry=SecureEntry( - scopes=["read:data"], - structure_families={StructureFamily.array, StructureFamily.sparse}, - ), - slice=Depends(slice_), - expected_shape=Depends(expected_shape), - format: Optional[str] = None, - filename: Optional[str] = None, - serialization_registry=Depends(get_serialization_registry), - settings: BaseSettings = Depends(get_settings), -): - """ - Fetch a slice of array-like data. - """ - structure_family = entry.structure_family - # Deferred import because this is not a required dependency of the server - # for some use cases. - import numpy - - try: - with record_timing(request.state.metrics, "read"): - array = await ensure_awaitable(entry.read, slice) - if structure_family == StructureFamily.array: - array = numpy.asarray(array) # Force dask or PIMS or ... to do I/O. - except IndexError: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" - ) - if (expected_shape is not None) and (expected_shape != array.shape): - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail=f"The expected_shape {expected_shape} does not match the actual shape {array.shape}", - ) - if array.nbytes > settings.response_bytesize_limit: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail=( - f"Response would exceed {settings.response_bytesize_limit}. " - "Use slicing ('?slice=...') to request smaller chunks." - ), - ) - try: - with record_timing(request.state.metrics, "pack"): - return await construct_data_response( - structure_family, - serialization_registry, - array, - entry.metadata(), - request, - format, - specs=getattr(entry, "specs", []), - expires=getattr(entry, "content_stale_at", None), - filename=filename, - ) - except UnsupportedMediaTypes as err: - raise HTTPException(status_code=HTTP_406_NOT_ACCEPTABLE, detail=err.args[0]) - async def validate_metadata( metadata: dict, structure_family: StructureFamily, diff --git a/tiled/server/utils.py b/tiled/server/utils.py index 0c4368d07..29233e134 100644 --- a/tiled/server/utils.py +++ b/tiled/server/utils.py @@ -1,5 +1,6 @@ import contextlib import time +from typing import Literal from ..access_policies import NO_ACCESS from ..adapters.mapping import MapAdapter @@ -41,6 +42,13 @@ def get_base_url(request): return f"{get_root_url(request)}/api/v1" +def get_zarr_url(request, version: Literal['v2', 'v3'] = 'v2'): + """ + Base URL for the Zarr API + """ + return f"{get_root_url(request)}/zarr/{version}" + + def get_root_url_low_level(request_headers, scope): # We want to get the scheme, host, and root_path (if any) # *as it appears to the client* for use in assembling links to diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py new file mode 100644 index 000000000..4c3343c84 --- /dev/null +++ b/tiled/server/zarr.py @@ -0,0 +1,180 @@ +import dataclasses +import inspect +import os +import re +import warnings +from datetime import datetime, timedelta +from functools import partial, wraps +from pathlib import Path +from typing import Any, List, Optional, Tuple +import json + +import anyio +from fastapi import APIRouter, Body, Depends, HTTPException, Query, Request, Security +from jmespath.exceptions import JMESPathError +from json_merge_patch import merge as apply_merge_patch +from jsonpatch import apply_patch as apply_json_patch +from pydantic_settings import BaseSettings +from starlette.responses import Response +from starlette.status import ( + HTTP_200_OK, + HTTP_206_PARTIAL_CONTENT, + HTTP_400_BAD_REQUEST, + HTTP_403_FORBIDDEN, + HTTP_404_NOT_FOUND, + HTTP_405_METHOD_NOT_ALLOWED, + HTTP_406_NOT_ACCEPTABLE, + HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE, + HTTP_422_UNPROCESSABLE_ENTITY, + HTTP_500_INTERNAL_SERVER_ERROR, +) + +from .. import __version__ +from ..structures.core import Spec, StructureFamily +from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri +from ..validation_registration import ValidationError +from . import schemas +from .authentication import Mode, get_authenticators, get_current_principal +from .core import ( + DEFAULT_PAGE_SIZE, + DEPTH_LIMIT, + MAX_PAGE_SIZE, + NoEntry, + UnsupportedMediaTypes, + WrongTypeForRoute, + apply_search, + construct_data_response, + construct_entries_response, + construct_resource, + construct_revisions_response, + json_or_msgpack, + resolve_media_type, +) +from .dependencies import ( + SecureEntry, + block, + expected_shape, + get_deserialization_registry, + get_query_registry, + get_serialization_registry, + get_validation_registry, + slice_, +) +from .file_response_with_range import FileResponseWithRange +from .links import links_for_node +from .settings import get_settings +from .utils import filter_for_access, get_base_url, record_timing + +ZARR_BLOCK_SIZE = 10 + +router = APIRouter() + +def convert_chunks_for_zarr(chunks: Tuple[Tuple[int]]): + """Convert full chunk specification into zarr format + + Zarr only accepts chunks of constant size; this function finds a unique representation of (possibly variable- + sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. + """ + # return [min(ZARR_BLOCK_SIZE, i[0]) for i in chunks] + return [ZARR_BLOCK_SIZE for _ in chunks] + +def slice_for_zarr_block(chunks: Tuple[Tuple[int]], zblock: Tuple[int]): + ... + +# @router.get("/.zgroup", name="Root .zgroup metadata") +@router.get("/{path:path}/.zgroup", name="Zarr .zgroup metadata") +async def get_zarr_group_metadata( + request: Request, + entry=SecureEntry( + scopes=["read:data", "read:metadata"], + structure_families={StructureFamily.table, StructureFamily.container}, + ), +): + + return Response(json.dumps({"zarr_format": 2}), status_code=200) + +@router.get("/{path:path}/.zarray", name="Zarr .zarray metadata") +async def get_zarr_array_metadata( + request: Request, + path: str, + entry=SecureEntry(scopes=["read:data", "read:metadata"]), +): + if entry.structure_family not in {StructureFamily.array, StructureFamily.sparse}: + # This is normal behaviour; zarr will try to open .zarray and, if 404 is received, it will move on assuming + # that the requested resource is a group (`.../path/.zgroup` would be requested next). + # TODO: Perhaps, checking this within SecureEntry is sufficient? What happens to tables? + raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail="Requested resource does not have .zarray") + + try: + zarray_spec = {} + metadata = entry.metadata() + structure = entry.structure() + zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), + 'compressor': {'blocksize': 0, + 'clevel': 5, + 'cname': 'lz4', + 'id': 'blosc', + 'shuffle': 1}, + 'dtype': structure.data_type.to_numpy_str(), + 'fill_value': 0, + 'filters': None, + 'order': 'C', + 'shape': list(structure.shape), + 'zarr_format': 2} + except Exception as err: + print(f"Can not create .zarray metadata, {err}") + raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) + + return Response(json.dumps(zarray_spec), status_code=200) + + +@router.get("/{path:path}", name="Zarr .zgroup directory structure or a chunk of a zarr array") +async def get_zarr_array( + request: Request, + block: str | None = None, + entry=SecureEntry(scopes=["read:data"], + # structure_families={StructureFamily.array, StructureFamily.sparse}, + # structure_families={StructureFamily.table, StructureFamily.container}, + ), +): + if entry.structure_family in {StructureFamily.table, StructureFamily.container}: + # List the contents of a simulated zarr directory (excluding .zarray and .zgroup files) + url = str(request.url).split('?')[0].rstrip('/') # Remove query params and trailing slash + body = json.dumps([url + '/' + key for key in entry.keys()]) + + return Response(body, status_code=200, media_type='application/json') + + elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: + if block is not None: + import zarr + print(f"Here, {block=}") + + block = [int(i) for i in block.split(',')] + chunks = entry.structure().chunks + + if block == (): + # Handle special case of numpy scalar + with record_timing(request.state.metrics, "read"): + array = await ensure_awaitable(entry.read) + else: + try: + with record_timing(request.state.metrics, "read"): + # array = await ensure_awaitable(entry.read_block, block) + array = await ensure_awaitable(entry.read) + x, y = block + array = array[x*ZARR_BLOCK_SIZE:(x+1)*ZARR_BLOCK_SIZE, y*ZARR_BLOCK_SIZE:(y+1)*ZARR_BLOCK_SIZE] + except IndexError: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + ) + + # TODO: This must be cached! + zarray = zarr.array(array) + + return Response(zarray.store['0.0'], status_code=200) + + else: + # TODO: + # Entire array (root uri) is requested -- never happens, but need to decide what to return here + return Response(json.dumps({}), status_code=200) + From 198e24be784587ea5304e1896eb03e0beaf22821 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 6 Aug 2024 17:39:31 -0400 Subject: [PATCH 08/46] ENH: map tiled chunks to zarr blocks --- tiled/server/app.py | 1 - tiled/server/zarr.py | 91 ++++++++++++++++++++++++-------------------- 2 files changed, 50 insertions(+), 42 deletions(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index 70648ba12..46c98a58c 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -909,7 +909,6 @@ async def resolve_zarr_uris(request: Request, call_next): response = await call_next(request) # TODO: Try compiling a single RE for matching and replacement -- possible speedup? - print("In middleware resolve_zarr_uris -- replaced query params") response.__class__ = PatchedStreamingResponse # tolerate memoryview return response diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index 4c3343c84..32a8481d7 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -65,23 +65,28 @@ from .settings import get_settings from .utils import filter_for_access, get_base_url, record_timing -ZARR_BLOCK_SIZE = 10 +ZARR_BLOCK_SIZE = 10000 +ZARR_BYTE_ORDER = 'C' +ZARR_CODEC_SPEC = {'blocksize': 0, + 'clevel': 5, + 'cname': 'lz4', + 'id': 'blosc', + 'shuffle': 1} + +import numcodecs +zarr_codec = numcodecs.get_codec(ZARR_CODEC_SPEC) router = APIRouter() -def convert_chunks_for_zarr(chunks: Tuple[Tuple[int]]): - """Convert full chunk specification into zarr format +def convert_chunks_for_zarr(tiled_chunks: Tuple[Tuple[int]]): + """Convert full tiled/dask chunk specification into zarr format - Zarr only accepts chunks of constant size; this function finds a unique representation of (possibly variable- - sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. + Zarr only accepts chunks of constant size along each dimension; this function finds a unique representation of + (possibly variable-sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. """ - # return [min(ZARR_BLOCK_SIZE, i[0]) for i in chunks] - return [ZARR_BLOCK_SIZE for _ in chunks] + return [min(ZARR_BLOCK_SIZE, max(c)) for c in tiled_chunks] -def slice_for_zarr_block(chunks: Tuple[Tuple[int]], zblock: Tuple[int]): - ... - -# @router.get("/.zgroup", name="Root .zgroup metadata") +@router.get("{path:path}.zgroup", name="Root .zgroup metadata") @router.get("/{path:path}/.zgroup", name="Zarr .zgroup metadata") async def get_zarr_group_metadata( request: Request, @@ -110,15 +115,11 @@ async def get_zarr_array_metadata( metadata = entry.metadata() structure = entry.structure() zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), - 'compressor': {'blocksize': 0, - 'clevel': 5, - 'cname': 'lz4', - 'id': 'blosc', - 'shuffle': 1}, + 'compressor': ZARR_CODEC_SPEC, 'dtype': structure.data_type.to_numpy_str(), 'fill_value': 0, 'filters': None, - 'order': 'C', + 'order': ZARR_BYTE_ORDER, 'shape': list(structure.shape), 'zarr_format': 2} except Exception as err: @@ -138,7 +139,7 @@ async def get_zarr_array( ), ): if entry.structure_family in {StructureFamily.table, StructureFamily.container}: - # List the contents of a simulated zarr directory (excluding .zarray and .zgroup files) + # List the contents of a "simulated" zarr directory (excluding .zarray and .zgroup files) url = str(request.url).split('?')[0].rstrip('/') # Remove query params and trailing slash body = json.dumps([url + '/' + key for key in entry.keys()]) @@ -147,34 +148,42 @@ async def get_zarr_array( elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: if block is not None: import zarr - print(f"Here, {block=}") + import numpy as np + + block_indx = [int(i) for i in block.split(',')] + zarr_chunks = convert_chunks_for_zarr(entry.structure().chunks) + block_slice = tuple([slice(i*c, (i+1)*c) for c, i in zip(zarr_chunks, block_indx)]) + padding_size = [max(0, sl.stop-sh) for sh, sl in zip(entry.structure().shape, block_slice)] - block = [int(i) for i in block.split(',')] - chunks = entry.structure().chunks + # if block == (): + # # Handle special case of numpy scalar + # with record_timing(request.state.metrics, "read"): + # array = await ensure_awaitable(entry.read) + # else: - if block == (): - # Handle special case of numpy scalar + # breakpoint() + try: with record_timing(request.state.metrics, "read"): - array = await ensure_awaitable(entry.read) - else: - try: - with record_timing(request.state.metrics, "read"): - # array = await ensure_awaitable(entry.read_block, block) - array = await ensure_awaitable(entry.read) - x, y = block - array = array[x*ZARR_BLOCK_SIZE:(x+1)*ZARR_BLOCK_SIZE, y*ZARR_BLOCK_SIZE:(y+1)*ZARR_BLOCK_SIZE] - except IndexError: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" - ) - - # TODO: This must be cached! - zarray = zarr.array(array) - - return Response(zarray.store['0.0'], status_code=200) + array = await ensure_awaitable(entry.read, slice=block_slice) + if sum(padding_size) > 0: + array = np.pad(array, [(0, p) for p in padding_size], mode='constant') + except IndexError: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + ) + + # buf = zarr.array(array).store['0.0'] # Define a zarr array as a single block + + # breakpoint() + + array = array.astype(array.dtype, order=ZARR_BYTE_ORDER, copy=False) # ensure array is contiguous + buf = zarr_codec.encode(array) + if not isinstance(buf, bytes): + buf = array.tobytes(order="A") + + return Response(buf, status_code=200) else: # TODO: # Entire array (root uri) is requested -- never happens, but need to decide what to return here return Response(json.dumps({}), status_code=200) - From 9264d8e95cecf9e688591085323ee7bd15d4154e Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 6 Aug 2024 17:57:40 -0400 Subject: [PATCH 09/46] MNT: Clean-up comments --- tiled/server/app.py | 5 +++-- tiled/server/dependencies.py | 1 - tiled/server/router.py | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index 46c98a58c..cb100cab4 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -896,8 +896,9 @@ async def current_principal_logging_filter(request: Request, call_next): async def resolve_zarr_uris(request: Request, call_next): response = await call_next(request) - # If a zarr block is requested, e.g. http://zarr.com/array/0.1.2.3, replace the block spec with a properly - # formatted query parameter: http://zarr.com/array?block=0,1,2,3 (with ',' encoded) + # If a zarr block is requested, e.g. http://localhost:8000/zarr/v2/array/0.1.2.3, replace the block spec + # with a properly formatted query parameter: http://localhost:8000/zarr/v2/array?block=0,1,2,3 (with ',' + # safely encoded) if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: # Extract the last bit of the path zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip('/').split('/') diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index f0af8e72f..f13c676fc 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -71,7 +71,6 @@ async def inner( session_state is an optional dictionary passed in the session token """ - # breakpoint() path_parts = [segment for segment in path.split("/") if segment] entry = root_tree diff --git a/tiled/server/router.py b/tiled/server/router.py index f2494c0a6..a81a2fea3 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -460,6 +460,7 @@ async def array_full( # Deferred import because this is not a required dependency of the server # for some use cases. import numpy + try: with record_timing(request.state.metrics, "read"): array = await ensure_awaitable(entry.read, slice) @@ -1671,6 +1672,7 @@ async def get_asset_manifest( manifest.extend(Path(root, file) for file in files) return json_or_msgpack(request, {"manifest": manifest}) + async def validate_metadata( metadata: dict, structure_family: StructureFamily, From 2da31d319db7487550620d0f6d8bd051cff38338 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 7 Aug 2024 13:01:16 -0400 Subject: [PATCH 10/46] ENH: support tables --- tiled/examples/generated.py | 74 +++++++++++++++++++----------------- tiled/server/zarr.py | 76 +++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 59 deletions(-) diff --git a/tiled/examples/generated.py b/tiled/examples/generated.py index 9caaacea0..4fa76f0c1 100644 --- a/tiled/examples/generated.py +++ b/tiled/examples/generated.py @@ -55,6 +55,45 @@ }, metadata = {"animal": "cat", "color": "green"}, ), + "tables": MapAdapter( + { + "short_table": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + "A": data["short_column"], + "B": 2 * data["short_column"], + "C": 3 * data["short_column"], + }, + index=pandas.Index(numpy.arange(len(data["short_column"])), name="index"), + ), + npartitions=1, + metadata={"animal": "dog", "color": "red"}, + ), + "long_table": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + "A": data["long_column"], + "B": 2 * data["long_column"], + "C": 3 * data["long_column"], + }, + index=pandas.Index(numpy.arange(len(data["long_column"])), name="index"), + ), + npartitions=5, + metadata={"animal": "dog", "color": "green"}, + ), + "wide_table": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + letter: i * data["tiny_column"] + for i, letter in enumerate(string.ascii_uppercase, start=1) + }, + index=pandas.Index(numpy.arange(len(data["tiny_column"])), name="index"), + ), + npartitions=1, + metadata={"animal": "dog", "color": "red"}, + ), + } + ), "big_image": ArrayAdapter.from_array(data["big_image"]), "small_image": ArrayAdapter.from_array(data["small_image"]), "medium_image": ArrayAdapter.from_array(data["medium_image"], chunks=((250, )*4, (100, )*10)), @@ -63,41 +102,6 @@ "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), "tiny_cube": ArrayAdapter.from_array(data["tiny_cube"]), "tiny_hypercube": ArrayAdapter.from_array(data["tiny_hypercube"]), - "short_table": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - "A": data["short_column"], - "B": 2 * data["short_column"], - "C": 3 * data["short_column"], - }, - index=pandas.Index(numpy.arange(len(data["short_column"])), name="index"), - ), - npartitions=1, - metadata={"animal": "dog", "color": "red"}, - ), - "long_table": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - "A": data["long_column"], - "B": 2 * data["long_column"], - "C": 3 * data["long_column"], - }, - index=pandas.Index(numpy.arange(len(data["long_column"])), name="index"), - ), - npartitions=5, - metadata={"animal": "dog", "color": "green"}, - ), - "wide_table": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - letter: i * data["tiny_column"] - for i, letter in enumerate(string.ascii_uppercase, start=1) - }, - index=pandas.Index(numpy.arange(len(data["tiny_column"])), name="index"), - ), - npartitions=1, - metadata={"animal": "dog", "color": "red"}, - ), "structured_data": MapAdapter( { "pets": ArrayAdapter.from_array( diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index 32a8481d7..785366d1e 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -102,30 +102,48 @@ async def get_zarr_group_metadata( async def get_zarr_array_metadata( request: Request, path: str, - entry=SecureEntry(scopes=["read:data", "read:metadata"]), + column: str = '', + entry=SecureEntry(scopes=["read:data", "read:metadata"], + structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table}), ): - if entry.structure_family not in {StructureFamily.array, StructureFamily.sparse}: + if entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: + try: + metadata = entry.metadata() + structure = entry.structure() + zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), + 'compressor': ZARR_CODEC_SPEC, + 'dtype': structure.data_type.to_numpy_str(), + 'fill_value': 0, + 'filters': None, + 'order': ZARR_BYTE_ORDER, + 'shape': list(structure.shape), + 'zarr_format': 2} + except Exception as err: + print(f"Can not create .zarray metadata, {err}") + raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) + + # elif entry.structure_family == StructureFamily.table: + # try: + # zarray_spec = {} + # metadata = entry.metadata() + # structure = entry.structure() + # # zarray_spec = {'chunks': [100, 1], #convert_chunks_for_zarr(structure.chunks), + # # 'compressor': ZARR_CODEC_SPEC, + # # 'dtype': entry.structure().meta.dtypes[column].str, + # # 'fill_value': 0, + # # 'filters': None, + # # 'order': ZARR_BYTE_ORDER, + # # # 'shape': list(structure.shape), + # # 'zarr_format': 2} + # except Exception as err: + # print(f"Can not create .zarray metadata, {err}") + # raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) + + else: # This is normal behaviour; zarr will try to open .zarray and, if 404 is received, it will move on assuming # that the requested resource is a group (`.../path/.zgroup` would be requested next). - # TODO: Perhaps, checking this within SecureEntry is sufficient? What happens to tables? raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail="Requested resource does not have .zarray") - - try: - zarray_spec = {} - metadata = entry.metadata() - structure = entry.structure() - zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), - 'compressor': ZARR_CODEC_SPEC, - 'dtype': structure.data_type.to_numpy_str(), - 'fill_value': 0, - 'filters': None, - 'order': ZARR_BYTE_ORDER, - 'shape': list(structure.shape), - 'zarr_format': 2} - except Exception as err: - print(f"Can not create .zarray metadata, {err}") - raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) - + return Response(json.dumps(zarray_spec), status_code=200) @@ -134,16 +152,26 @@ async def get_zarr_array( request: Request, block: str | None = None, entry=SecureEntry(scopes=["read:data"], - # structure_families={StructureFamily.array, StructureFamily.sparse}, - # structure_families={StructureFamily.table, StructureFamily.container}, + structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table, StructureFamily.container}, ), ): - if entry.structure_family in {StructureFamily.table, StructureFamily.container}: + url = str(request.url).split('?')[0].rstrip('/') # Remove query params and the trailing slash + + # breakpoint() + if entry.structure_family == StructureFamily.container: # List the contents of a "simulated" zarr directory (excluding .zarray and .zgroup files) - url = str(request.url).split('?')[0].rstrip('/') # Remove query params and trailing slash body = json.dumps([url + '/' + key for key in entry.keys()]) return Response(body, status_code=200, media_type='application/json') + + elif entry.structure_family == StructureFamily.table: + url = str(request.url).split('?')[0].rstrip('/') # Remove query params and the trailing slash + # breakpoint() + body = json.dumps([url + '/' + key for key in entry.structure().columns]) + + # entry.structure().meta.dtypes + + return Response(body, status_code=200, media_type='application/json') elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: if block is not None: From 09d6808531f7ce733729b81a6166d907d791a1f1 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 9 Aug 2024 10:03:52 -0400 Subject: [PATCH 11/46] ENH: Add data type to sparse --- tiled/adapters/sparse.py | 3 +++ tiled/client/container.py | 2 ++ tiled/server/pydantic_sparse.py | 2 ++ tiled/server/router.py | 1 + tiled/server/zarr.py | 3 ++- tiled/structures/sparse.py | 8 ++++++++ 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tiled/adapters/sparse.py b/tiled/adapters/sparse.py index 60b122d5c..5aa53a90c 100644 --- a/tiled/adapters/sparse.py +++ b/tiled/adapters/sparse.py @@ -8,6 +8,7 @@ from ..structures.core import Spec, StructureFamily from ..structures.sparse import COOStructure +from ..structures.array import BuiltinDtype from .array import slice_and_shape_from_block_and_chunks from .protocols import AccessPolicy from .type_alliases import JSON, NDSlice @@ -49,6 +50,7 @@ def from_arrays( dims=dims, shape=shape, chunks=tuple((dim,) for dim in shape), + data_type=BuiltinDtype.from_numpy_dtype(data.dtype), resizable=False, ) return cls( @@ -133,6 +135,7 @@ def from_global_ref( dims=dims, shape=shape, chunks=chunks, + data_type=BuiltinDtype.from_numpy_dtype(data.dtype), resizable=False, ) return cls( diff --git a/tiled/client/container.py b/tiled/client/container.py index 7a87ce507..51b466ba4 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -887,12 +887,14 @@ def write_sparse( >>> x.write_block(coords=[[0, 1]], data=[6.7, 1.2], block=(1,)) """ from ..structures.sparse import COOStructure + from ..structures.array import BuiltinDtype structure = COOStructure( shape=shape, # This method only supports single-chunk COO arrays. chunks=tuple((dim,) for dim in shape), dims=dims, + data_type=BuiltinDtype.from_numpy_dtype(data.dtype), ) client = self.new( StructureFamily.sparse, diff --git a/tiled/server/pydantic_sparse.py b/tiled/server/pydantic_sparse.py index 145f272d0..59883e94a 100644 --- a/tiled/server/pydantic_sparse.py +++ b/tiled/server/pydantic_sparse.py @@ -3,11 +3,13 @@ import pydantic from ..structures.sparse import SparseLayout +from ..structures.array import BuiltinDtype, StructDtype class COOStructure(pydantic.BaseModel): shape: Tuple[int, ...] # tuple of ints like (3, 3) chunks: Tuple[Tuple[int, ...], ...] # tuple-of-tuples-of-ints like ((3,), (3,)) + data_type: Optional[Union[BuiltinDtype, StructDtype]] = None dims: Optional[Tuple[str, ...]] = None # None or tuple of names like ("x", "y") resizable: Union[bool, Tuple[bool, ...]] = False layout: SparseLayout = SparseLayout.COO diff --git a/tiled/server/router.py b/tiled/server/router.py index a81a2fea3..c469190d7 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -456,6 +456,7 @@ async def array_full( """ Fetch a slice of array-like data. """ + breakpoint() structure_family = entry.structure_family # Deferred import because this is not a required dependency of the server # for some use cases. diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index 785366d1e..68da15434 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -84,7 +84,7 @@ def convert_chunks_for_zarr(tiled_chunks: Tuple[Tuple[int]]): Zarr only accepts chunks of constant size along each dimension; this function finds a unique representation of (possibly variable-sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. """ - return [min(ZARR_BLOCK_SIZE, max(c)) for c in tiled_chunks] + return [min(ZARR_BLOCK_SIZE, max(tc)) for tc in tiled_chunks] @router.get("{path:path}.zgroup", name="Root .zgroup metadata") @router.get("/{path:path}/.zgroup", name="Zarr .zgroup metadata") @@ -106,6 +106,7 @@ async def get_zarr_array_metadata( entry=SecureEntry(scopes=["read:data", "read:metadata"], structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table}), ): + breakpoint() if entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: try: metadata = entry.metadata() diff --git a/tiled/structures/sparse.py b/tiled/structures/sparse.py index 354d150d7..91ec0ee51 100644 --- a/tiled/structures/sparse.py +++ b/tiled/structures/sparse.py @@ -1,6 +1,7 @@ import enum from dataclasses import dataclass from typing import Optional, Tuple, Union +from .array import BuiltinDtype, StructDtype class SparseLayout(str, enum.Enum): @@ -13,6 +14,7 @@ class SparseLayout(str, enum.Enum): class COOStructure: chunks: Tuple[Tuple[int, ...], ...] # tuple-of-tuples-of-ints like ((3,), (3,)) shape: Tuple[int, ...] # tuple of ints like (3, 3) + data_type: Optional[Union[BuiltinDtype, StructDtype]] = None dims: Optional[Tuple[str, ...]] = None # None or tuple of names like ("x", "y") resizable: Union[bool, Tuple[bool, ...]] = False layout: SparseLayout = SparseLayout.COO @@ -20,7 +22,13 @@ class COOStructure: @classmethod def from_json(cls, structure): + data_type = structure.get("data_type", None) + if "fields" in data_type: + data_type = StructDtype.from_json(data_type) + else: + data_type = BuiltinDtype.from_json(data_type) return cls( + data_type=data_type, chunks=tuple(map(tuple, structure["chunks"])), shape=tuple(structure["shape"]), dims=structure["dims"], From 35cf1a85cd77ac6ad974cc9b8311bbbd94bb9ca1 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 15:06:28 -0400 Subject: [PATCH 12/46] ENH: support units for numpy datetime types --- tiled/structures/array.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tiled/structures/array.py b/tiled/structures/array.py index 9581763b1..73ca3cda0 100644 --- a/tiled/structures/array.py +++ b/tiled/structures/array.py @@ -1,11 +1,10 @@ import enum import os import sys +import re from dataclasses import dataclass from typing import List, Optional, Tuple, Union -import numpy - class Endianness(str, enum.Enum): """ @@ -76,6 +75,7 @@ class BuiltinDtype: endianness: Endianness kind: Kind itemsize: int + units: str __endianness_map = { ">": "big", @@ -88,13 +88,22 @@ class BuiltinDtype: @classmethod def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": + + # Extract datetime units from the dtype string representation, + # e.g. `' numpy.dtype: + def to_numpy_dtype(self): import numpy return numpy.dtype(self.to_numpy_str()) @@ -111,7 +120,8 @@ def to_numpy_str(self): # so the reported itemsize is 4x the char count. To get back to the string # we need to divide by 4. size = self.itemsize if self.kind != Kind.unicode else self.itemsize // 4 - return f"{endianness}{self.kind.value}{size}" + units = f"[{self.units}]" if self.units else '' + return f"{endianness}{self.kind.value}{size}{units}" @classmethod def from_json(cls, structure): @@ -119,6 +129,7 @@ def from_json(cls, structure): kind=Kind(structure["kind"]), itemsize=structure["itemsize"], endianness=Endianness(structure["endianness"]), + units=structure.get('units', '') ) From a31bd9436ae8afe932c25feb5918115c9222c8bf Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 15:10:20 -0400 Subject: [PATCH 13/46] MNT: removed unnecessary imports --- tiled/structures/array.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/tiled/structures/array.py b/tiled/structures/array.py index 73ca3cda0..8453111dc 100644 --- a/tiled/structures/array.py +++ b/tiled/structures/array.py @@ -1,10 +1,12 @@ import enum import os -import sys import re +import sys from dataclasses import dataclass from typing import List, Optional, Tuple, Union +import numpy + class Endianness(str, enum.Enum): """ @@ -88,24 +90,21 @@ class BuiltinDtype: @classmethod def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": - # Extract datetime units from the dtype string representation, # e.g. `' numpy.dtype: return numpy.dtype(self.to_numpy_str()) def to_numpy_str(self): @@ -120,7 +119,7 @@ def to_numpy_str(self): # so the reported itemsize is 4x the char count. To get back to the string # we need to divide by 4. size = self.itemsize if self.kind != Kind.unicode else self.itemsize // 4 - units = f"[{self.units}]" if self.units else '' + units = f"[{self.units}]" if self.units else "" return f"{endianness}{self.kind.value}{size}{units}" @classmethod @@ -129,7 +128,7 @@ def from_json(cls, structure): kind=Kind(structure["kind"]), itemsize=structure["itemsize"], endianness=Endianness(structure["endianness"]), - units=structure.get('units', '') + units=structure.get("units", ""), ) @@ -141,8 +140,6 @@ class Field: @classmethod def from_numpy_descr(cls, field): - import numpy - name, *rest = field if name == "": raise ValueError( @@ -200,8 +197,6 @@ def from_numpy_dtype(cls, dtype): ) def to_numpy_dtype(self): - import numpy - return numpy.dtype(self.to_numpy_descr()) def to_numpy_descr(self): @@ -252,8 +247,6 @@ def from_array(cls, array, shape=None, chunks=None, dims=None) -> "ArrayStructur if not hasattr(array, "__array__"): # may be a list of something; convert to array - import numpy - array = numpy.asanyarray(array) # Why would shape ever be different from array.shape, you ask? From b36c6b451a6226bb9ba61e5af8000826ac0d881e Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 17:19:43 -0400 Subject: [PATCH 14/46] ENH: add default value for units --- tiled/structures/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiled/structures/array.py b/tiled/structures/array.py index 8453111dc..cf1736900 100644 --- a/tiled/structures/array.py +++ b/tiled/structures/array.py @@ -77,7 +77,7 @@ class BuiltinDtype: endianness: Endianness kind: Kind itemsize: int - units: str + units: str = '' __endianness_map = { ">": "big", From 260e83ee71ba006001e7b5f755997aa0cc6da655 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 17:20:25 -0400 Subject: [PATCH 15/46] ENH: update BuiltinDtype in pydantic --- tiled/server/pydantic_array.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tiled/server/pydantic_array.py b/tiled/server/pydantic_array.py index 6bc2090e8..661c1543c 100644 --- a/tiled/server/pydantic_array.py +++ b/tiled/server/pydantic_array.py @@ -14,6 +14,7 @@ it. """ +import re import sys from typing import List, Optional, Tuple, Union @@ -27,6 +28,7 @@ class BuiltinDtype(BaseModel): endianness: Endianness kind: Kind itemsize: int + units: str = "" __endianness_map = { ">": "big", @@ -39,10 +41,18 @@ class BuiltinDtype(BaseModel): @classmethod def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": + # Extract datetime units from the dtype string representation, + # e.g. `' Date: Thu, 5 Sep 2024 17:21:22 -0400 Subject: [PATCH 16/46] ENH: update BuiltinDtype in pydantic --- tiled/server/pydantic_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiled/server/pydantic_array.py b/tiled/server/pydantic_array.py index 661c1543c..08a258bf2 100644 --- a/tiled/server/pydantic_array.py +++ b/tiled/server/pydantic_array.py @@ -79,7 +79,7 @@ def from_json(cls, structure): kind=Kind(structure["kind"]), itemsize=structure["itemsize"], endianness=Endianness(structure["endianness"]), - units=structure.get('units', 's') + units=structure.get("units", "s"), ) From 4ffa2bfdf01c92739560d6e48d373f85a9262a33 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 17:21:28 -0400 Subject: [PATCH 17/46] ENH: add default value for units --- tiled/structures/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiled/structures/array.py b/tiled/structures/array.py index cf1736900..ae606af75 100644 --- a/tiled/structures/array.py +++ b/tiled/structures/array.py @@ -77,7 +77,7 @@ class BuiltinDtype: endianness: Endianness kind: Kind itemsize: int - units: str = '' + units: str = "" __endianness_map = { ">": "big", From 917319d2eb28a5019cb878d8c5197aaa47ec2abd Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 17:21:45 -0400 Subject: [PATCH 18/46] TST: datetime dtypes in test_array --- tiled/_tests/test_array.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tiled/_tests/test_array.py b/tiled/_tests/test_array.py index 6cb0d21e1..ed1a55463 100644 --- a/tiled/_tests/test_array.py +++ b/tiled/_tests/test_array.py @@ -23,11 +23,9 @@ "uint64": numpy.arange(10, dtype="uint64"), "f": numpy.arange(10, dtype="f"), "c": (numpy.arange(10) * 1j).astype("c"), - # "m": ( - # numpy.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64') - - # numpy.datetime64('2008-01-01'), - # ) - # "M": numpy.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64'), + "m": numpy.array(["2007-07-13", "2006-01-13", "2010-08-13"], dtype="datetime64[D]") + - numpy.datetime64("2008-01-01"), + "M": numpy.array(["2007-07-13", "2006-01-13", "2010-08-13"], dtype="datetime64[D]"), "S": numpy.array([letter * 3 for letter in string.ascii_letters], dtype="S3"), "U": numpy.array([letter * 3 for letter in string.ascii_letters], dtype="U3"), } From 91f0f98dc4a51b9514c143508616144d242356d9 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 17:24:25 -0400 Subject: [PATCH 19/46] MNT: Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e68e29f4..926b39f96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Write the date in place of the "Unreleased" in the case a new version is release ### Added - Add method to `TableAdapter` which accepts a Python dictionary. - Added an `Arrow` adapter which supports reading/writing arrow tables via `RecordBatchFileReader`/`RecordBatchFileWriter`. +- Added support for explicit units in numpy datetime64 dtypes. ### Changed - Make `tiled.client` accept a Python dictionary when fed to `write_dataframe()`. From b6bef553402cbd32771630689b3a05b6255c712a Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 12 Aug 2024 16:51:43 -0400 Subject: [PATCH 20/46] resolve conflict FIX: Recursion error when pickling with dill TST: Initial tests for zarr endpoints Clean, refactor, and lint TST: tests for arrays and tables TST: tests for arrays and tables ENH: restructure demo examples ENH: (partial) support for StructDtype TST: fix tests ENH: support for datetime types --- alembic.ini | 116 ++++++++++++ tiled/_tests/test_zarr.py | 301 ++++++++++++++++++++++++++++++++ tiled/adapters/sparse.py | 2 +- tiled/adapters/zarr.py | 33 ++-- tiled/client/container.py | 2 +- tiled/examples/generated.py | 100 ++++++----- tiled/server/app.py | 30 ++-- tiled/server/dependencies.py | 1 - tiled/server/pydantic_sparse.py | 2 +- tiled/server/router.py | 1 - tiled/server/schemas.py | 4 + tiled/server/utils.py | 2 +- tiled/server/zarr.py | 214 +++++++++++++---------- tiled/structures/sparse.py | 1 + 14 files changed, 631 insertions(+), 178 deletions(-) create mode 100644 alembic.ini create mode 100644 tiled/_tests/test_zarr.py diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 000000000..e7e80abc8 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +# Use forward slashes (/) also on windows to provide an os agnostic path +script_location = /Users/eugene/code/tiled/tiled/catalog/migrations + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to catalog/migrations/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:catalog/migrations/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# sqlalchemy.url = driver://user:pass@localhost/dbname +sqlalchemy.url = sqlite+aiosqlite:////Users/eugene/code/demo_stream_documents/catalog/catalog.db + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/tiled/_tests/test_zarr.py b/tiled/_tests/test_zarr.py new file mode 100644 index 000000000..c3fa49256 --- /dev/null +++ b/tiled/_tests/test_zarr.py @@ -0,0 +1,301 @@ +import contextlib +import math +import string +import threading +import time +import warnings + +import dask.array +import numpy +import pytest +import numpy +import pandas.testing +import uvicorn +import zarr +from fsspec.implementations.http import HTTPFileSystem +from httpx import ASGITransport, AsyncClient +from starlette.status import HTTP_200_OK, HTTP_404_NOT_FOUND + +from ..adapters.array import ArrayAdapter +from ..adapters.dataframe import DataFrameAdapter +from ..adapters.mapping import MapAdapter +from ..server.app import build_app + +rng = numpy.random.default_rng(seed=42) +array_cases = { + "dtype_b": (numpy.arange(10) % 2).astype("b"), + "dtype_i": numpy.arange(-10, 10, dtype="i"), + "dtype_uint8": numpy.arange(10, dtype="uint8"), + "dtype_uint16": numpy.arange(10, dtype="uint16"), + "dtype_uint64": numpy.arange(10, dtype="uint64"), + "dtype_f": numpy.arange(10, dtype="f"), + "dtype_c": (numpy.arange(10) * 1j).astype("c"), + "dtype_S": numpy.array([letter * 3 for letter in string.ascii_letters], dtype="S3"), + "dtype_U": numpy.array([letter * 3 for letter in string.ascii_letters], dtype="U3"), + "dtype_m": numpy.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64') - numpy.datetime64('2008-01-01'), + "dtype_M": numpy.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64'), + "random_2d": rng.random((10, 10)), +} +# TODO bitfield "t", void "v", and object "O" (which is not supported by default) +scalar_cases = { + k: numpy.array(v[0], dtype=v.dtype) + for k, v in array_cases.items() + if k.startswith("dtype_") +} +for v in scalar_cases.values(): + assert v.shape == () +array_tree = MapAdapter({k: ArrayAdapter.from_array(v) for k, v in array_cases.items()}) +scalar_tree = MapAdapter( + {k: ArrayAdapter.from_array(v) for k, v in scalar_cases.items()} +) + +cube_cases = { + "tiny_cube": rng.random((10, 10, 10)), + "tiny_hypercube": rng.random((10, 10, 10, 10, 10)), +} +cube_tree = MapAdapter({k: ArrayAdapter.from_array(v) for k, v in cube_cases.items()}) +arr_with_inf = numpy.array([0, 1, numpy.nan, -numpy.inf, numpy.inf]) +inf_tree = MapAdapter( + { + "example": ArrayAdapter.from_array( + arr_with_inf, + metadata={"infinity": math.inf, "-infinity": -math.inf, "nan": numpy.nan}, + ) + }, + metadata={"infinity": math.inf, "-infinity": -math.inf, "nan": numpy.nan}, +) +arr_with_zero_dim = numpy.array([]).reshape((0, 100, 1, 10)) +# Suppress RuntimeWarning: divide by zero encountered in true_divide from dask.array.core. +with warnings.catch_warnings(): + zero_tree = MapAdapter( + { + "example": ArrayAdapter.from_array( + dask.array.from_array(arr_with_zero_dim, chunks=arr_with_zero_dim.shape) + ) + } + ) +df = pandas.DataFrame( + { + "x": rng.random(size=10, dtype='float64'), + "y": rng.integers(10, size=10, dtype='uint'), + "z": rng.integers(-10, 10, size=10, dtype='int64'), + } + ) +table_tree = MapAdapter( + { + # a dataframe divided into three partitions + "divided": DataFrameAdapter.from_pandas(df, npartitions=3), + # a dataframe with just one partition + "single": DataFrameAdapter.from_pandas(df, npartitions=1), + } +) + +tree = MapAdapter( + { + "nested": MapAdapter({"array": array_tree, "cube": cube_tree}), + "inf": inf_tree, + "scalar": scalar_tree, + "zero": zero_tree, + "table": table_tree, + "random_2d": array_tree["random_2d"], + } +) + + +def traverse_tree(tree, parent='', result = None): + result = result or {} + for key, val in tree.items(): + if isinstance(val, ArrayAdapter): + result.update({f'{parent}/{key}' : 'array'}) + elif isinstance(val, DataFrameAdapter): + result.update({f'{parent}/{key}' : 'group'}) + for col, _ in val.items(): + result.update({f'{parent}/{key}/{col}' : 'array'}) + else: + result.update({f'{parent}/{key}' : 'group'}) + traverse_tree(val, parent=f'{parent}/{key}', result=result) + return result + + +@pytest.fixture(scope="module") +def app(): + app = build_app(tree, authentication={"single_user_api_key": "secret"}) + return app + + +class ThreadedServer(uvicorn.Server): + @contextlib.contextmanager + def run_in_thread(self): + thread = threading.Thread(target=self.run) + thread.start() + try: + while not self.started: + time.sleep(1e-3) + self.port = ( + self.servers[0].sockets[0].getsockname()[1] + ) # Actual port number + yield + finally: + self.should_exit = True + thread.join() + + +@pytest.fixture(scope="module") +def server(app): + config = uvicorn.Config(app, host="127.0.0.1", port=0, log_level="info") + server = ThreadedServer(config) + with server.run_in_thread(): + yield server + + +@pytest.fixture(scope="module") +def fs(): + headers = {"Authorization": "Apikey secret", "Content-Type": "application/json"} + fs = HTTPFileSystem(client_kwargs={"headers": headers}) + return fs + + +@pytest.mark.parametrize("path", ["/zarr/v2/", "/zarr/v2", "/zarr/v2/nested", "/zarr/v2/table/single"]) +@pytest.mark.asyncio +async def test_zarr_group_routes(path, app): + async with AsyncClient( + transport=ASGITransport(app=app), + base_url="http://test", + headers={"Authorization": "Apikey secret"}, + follow_redirects=True, + ) as client: + response = await client.get(path) + assert response.status_code == HTTP_200_OK + + response = await client.get(path + '/.zarray') + assert response.status_code == HTTP_404_NOT_FOUND + + response = await client.get(path + '/.zgroup') + assert response.status_code == HTTP_200_OK + + +@pytest.mark.parametrize("path", ["/zarr/v2/nested/cube/tiny_cube", "/zarr/v2/table/single/x"]) +@pytest.mark.asyncio +async def test_zarr_array_routes(path, app): + async with AsyncClient( + transport=ASGITransport(app=app), + base_url="http://test", + headers={"Authorization": "Apikey secret"}, + follow_redirects=True, + ) as client: + response = await client.get(path) + assert response.status_code == HTTP_200_OK + + response = await client.get(path + '/.zgroup') + assert response.status_code == HTTP_404_NOT_FOUND + + response = await client.get(path + '/.zarray') + assert response.status_code == HTTP_200_OK + + ndim = len(response.json().get('shape')) + indx = '.'.join( ['0']*max(ndim, 0) ) + response = await client.get(path + f'/{indx}') + assert response.status_code == HTTP_200_OK + +def test_zarr_integration(server, fs): + url = f"http://localhost:{server.port}/zarr/v2/" + grp = zarr.open(fs.get_mapper(url), mode="r") + + assert grp.store.fs == fs + assert set(grp.keys()) == set(tree.keys()) + assert len(set(grp.group_keys())) == 5 + assert len(set(grp.array_keys())) == 1 + + +@pytest.mark.parametrize( + "suffix, path", + [ + ("", "random_2d"), + ("", "nested/array/random_2d"), + ("nested", "array/random_2d"), + ("nested/array", "random_2d"), + ("nested/array/random_2d", ""), + ], +) +@pytest.mark.parametrize("slash", ["", "/"]) +def test_zarr_groups(suffix, path, slash, server, fs): + expected = array_cases["random_2d"] + url = f"http://localhost:{server.port}/zarr/v2/{suffix}{slash}" + arr = zarr.open(fs.get_mapper(url), mode="r") + if path: + arr = arr[path] + assert numpy.array_equal(arr[...], expected) + + +@pytest.mark.parametrize("kind", list(array_cases)) +def test_array_dtypes(kind, server, fs): + expected = array_cases[kind] + url = f"http://localhost:{server.port}/zarr/v2/nested/array" + grp = zarr.open(fs.get_mapper(url), mode="r") + actual = grp[kind][...] + assert numpy.array_equal(actual, expected) + + +@pytest.mark.parametrize("kind", list(scalar_cases)) +def test_scalar_dtypes(kind, server, fs): + expected = scalar_cases[kind] + url = f"http://localhost:{server.port}/zarr/v2/scalar" + grp = zarr.open(fs.get_mapper(url), mode="r") + actual = grp[kind][...] + assert numpy.array_equal(actual, expected) + + +@pytest.mark.parametrize("kind", list(cube_cases)) +def test_cube_cases(kind, server, fs): + expected = cube_cases[kind] + url = f"http://localhost:{server.port}/zarr/v2/nested/cube" + grp = zarr.open(fs.get_mapper(url), mode="r") + actual = grp[kind][...] + assert numpy.array_equal(actual, expected) + + +def test_infinity(server, fs): + url = f"http://localhost:{server.port}/zarr/v2/inf/example" + actual = zarr.open(fs.get_mapper(url), mode="r")[...] + mask = numpy.isnan(arr_with_inf) + assert numpy.array_equal(actual[~mask], arr_with_inf[~mask]) + assert numpy.isnan(actual[mask]).all() + + +def test_shape_with_zero(server, fs): + url = f"http://localhost:{server.port}/zarr/v2/zero/example" + actual = zarr.open(fs.get_mapper(url), mode="r")[...] + assert numpy.array_equal(actual, arr_with_zero_dim) + + +def test_dataframe_group(server, fs): + url = f"http://localhost:{server.port}/zarr/v2/table" + grp = zarr.open(fs.get_mapper(url), mode="r") + assert set(grp.keys()) == set(table_tree.keys()) + + for key in grp.keys(): + for col in grp[key].keys(): + actual = grp[key][col][...] + expected = df[col] + assert numpy.array_equal(actual, expected) + + +@pytest.mark.parametrize("key", list(table_tree.keys())) +def test_dataframe_single(key, server, fs): + url = f"http://localhost:{server.port}/zarr/v2/table/{key}" + grp = zarr.open(fs.get_mapper(url), mode="r") + + for col in df.columns: + actual = grp[col][...] + expected = df[col] + assert numpy.array_equal(actual, expected) + + +@pytest.mark.parametrize("key", list(table_tree.keys())) +def test_dataframe_column(key, server, fs): + for col in df.columns: + url = f"http://localhost:{server.port}/zarr/v2/table/{key}/{col}" + arr = zarr.open(fs.get_mapper(url), mode="r") + actual = arr[...] + expected = df[col] + assert numpy.array_equal(actual, expected) diff --git a/tiled/adapters/sparse.py b/tiled/adapters/sparse.py index 5aa53a90c..c6771f942 100644 --- a/tiled/adapters/sparse.py +++ b/tiled/adapters/sparse.py @@ -6,9 +6,9 @@ import sparse from numpy._typing import NDArray +from ..structures.array import BuiltinDtype from ..structures.core import Spec, StructureFamily from ..structures.sparse import COOStructure -from ..structures.array import BuiltinDtype from .array import slice_and_shape_from_block_and_chunks from .protocols import AccessPolicy from .type_alliases import JSON, NDSlice diff --git a/tiled/adapters/zarr.py b/tiled/adapters/zarr.py index 7a914965c..e8761fecd 100644 --- a/tiled/adapters/zarr.py +++ b/tiled/adapters/zarr.py @@ -1,5 +1,4 @@ import builtins -import collections.abc import os import sys from typing import Any, Iterator, List, Optional, Tuple, Union @@ -19,6 +18,11 @@ from .protocols import AccessPolicy from .type_alliases import JSON, NDSlice +if sys.version_info < (3, 9): + from typing_extensions import Mapping as MappingType +else: + from collections.abc import Mapping as MappingType + INLINED_DEPTH = int(os.getenv("TILED_HDF5_INLINED_CONTENTS_MAX_DEPTH", "7")) @@ -27,17 +31,17 @@ def read_zarr( structure: Optional[ArrayStructure] = None, **kwargs: Any, ) -> Union["ZarrGroupAdapter", ArrayAdapter]: - """ + """Create an adapter for zarr Group or Array Parameters ---------- - data_uri : - structure : - kwargs : + data_uri : location of the zarr resource, e.g. 'file://localhost/data/arr1' + structure : specification of the shape, chunks, and data type + kwargs : any kwargs accepted by ZarrGroupAdapter or ZarrArrayAdapter Returns ------- - + Initialized ZarrGroupAdapter or ZarrArrayAdapter. """ filepath = path_from_uri(data_uri) zarr_obj = zarr.open(filepath) # Group or Array @@ -91,14 +95,17 @@ def init_storage(cls, data_uri: str, structure: ArrayStructure) -> List[Asset]: ] def _stencil(self) -> Tuple[slice, ...]: - """ - Trims overflow because Zarr always has equal-sized chunks. + """Trims overflow because Zarr always has equal-sized chunks. + Returns ------- """ return tuple(builtins.slice(0, dim) for dim in self.structure().shape) + def get(self, key: str): + return None + def read( self, slice: NDSlice = ..., @@ -184,16 +191,6 @@ async def write_block( self._array[block_slice] = data -if sys.version_info < (3, 9): - from typing_extensions import Mapping - - MappingType = Mapping -else: - import collections - - MappingType = collections.abc.Mapping - - class ZarrGroupAdapter( MappingType[str, Union["ArrayAdapter", "ZarrGroupAdapter"]], IndexersMixin, diff --git a/tiled/client/container.py b/tiled/client/container.py index 51b466ba4..2434185ee 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -886,8 +886,8 @@ def write_sparse( >>> x.write_block(coords=[[2, 4]], data=[3.1, 2.8], block=(0,)) >>> x.write_block(coords=[[0, 1]], data=[6.7, 1.2], block=(1,)) """ - from ..structures.sparse import COOStructure from ..structures.array import BuiltinDtype + from ..structures.sparse import COOStructure structure = COOStructure( shape=shape, diff --git a/tiled/examples/generated.py b/tiled/examples/generated.py index 4fa76f0c1..2d296fd87 100644 --- a/tiled/examples/generated.py +++ b/tiled/examples/generated.py @@ -17,24 +17,25 @@ from tiled.adapters.xarray import DatasetAdapter print("Generating large example data...", file=sys.stderr) +rng = numpy.random.default_rng(seed=42) data = { - "big_image": numpy.random.random((10_000, 10_000)), - "small_image": numpy.random.random((300, 300)), - "medium_image": numpy.random.random((1000, 1000)), - "tiny_image": numpy.random.random((50, 50)), - "tiny_cube": numpy.random.random((50, 50, 50)), - "tiny_hypercube": numpy.random.random((50, 50, 50, 50, 50)), - "high_entropy": numpy.random.random((100, 100)), - "low_entropy": numpy.ones((100, 100)), - "short_column": numpy.random.random(100), - "tiny_column": numpy.random.random(10), - "long_column": numpy.random.random(100_000), + "big_image": rng.random((10_000, 10_000)), + "small_image": rng.random((300, 300)), + "medium_image": rng.random((1000, 1000)), + "tiny_image": rng.random((50, 50)), + "tiny_cube": rng.random((50, 50, 50)), + "tiny_hypercube": rng.random((50, 50, 50, 50, 50)), + "high_entropy": rng.integers(-10, 10, size=(100, 100)), + "low_entropy": numpy.ones((100, 100), dtype='int32'), + "short_column": rng.integers(10, size=100, dtype=numpy.dtype('uint8')), + "tiny_column": rng.random(10), + "long_column": rng.random(100_000), } -temp = 15 + 8 * numpy.random.randn(2, 2, 3) -precip = 10 * numpy.random.rand(2, 2, 3) +temp = 15 + 8 * rng.normal(size=(2, 2, 3)) +precip = 10 * rng.uniform(size=(2, 2, 3)) lon = [[-99.83, -99.32], [-99.79, -99.23]] lat = [[42.25, 42.21], [42.63, 42.59]] -sparse_arr = numpy.random.random((100, 100)) +sparse_arr = rng.random((100, 100)) sparse_arr[sparse_arr < 0.9] = 0 # fill most of the array with zeros awkward_arr = awkward.Array( [[{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [1, 2]}], [], [{"x": 3.3, "y": [1, 2, 3]}]] @@ -43,17 +44,39 @@ print("Done generating example data.", file=sys.stderr) mapping = { - "nested": MapAdapter( - {"small_image": ArrayAdapter.from_array(data["small_image"]), - "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), - "inner": MapAdapter( - {"small_image": ArrayAdapter.from_array(data["small_image"]), - "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), - }, - metadata = {"animal": "cat", "color": "green"}, + "scalars": MapAdapter( + { + "pi": ArrayAdapter.from_array(3.14159), + "e_arr": ArrayAdapter.from_array(["2.71828"]), + "fsc": ArrayAdapter.from_array("1/137"), + "fortytwo": ArrayAdapter.from_array(42), + }, + metadata={"numbers": "constants", "precision": 5}, ), - }, - metadata = {"animal": "cat", "color": "green"}, + "nested": MapAdapter( + { + "images": MapAdapter( + { + "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), + "small_image": ArrayAdapter.from_array(data["small_image"]), + "medium_image": ArrayAdapter.from_array( + data["medium_image"], chunks=((250,) * 4, (100,) * 10) + ), + "big_image": ArrayAdapter.from_array(data["big_image"]), + }, + metadata={"animal": "cat", "color": "green"}, + ), + "cubes": MapAdapter( + { + "tiny_cube": ArrayAdapter.from_array(data["tiny_cube"]), + "tiny_hypercube": ArrayAdapter.from_array(data["tiny_hypercube"]), + }, + metadata={"animal": "dog", "color": "red"}, + ), + "sparse_image": COOAdapter.from_coo(sparse.COO(sparse_arr)), + "awkward_array": AwkwardAdapter.from_array(awkward_arr), + }, + metadata={"animal": "cat", "color": "green"}, ), "tables": MapAdapter( { @@ -64,7 +87,9 @@ "B": 2 * data["short_column"], "C": 3 * data["short_column"], }, - index=pandas.Index(numpy.arange(len(data["short_column"])), name="index"), + index=pandas.Index( + numpy.arange(len(data["short_column"])), name="index" + ), ), npartitions=1, metadata={"animal": "dog", "color": "red"}, @@ -76,7 +101,9 @@ "B": 2 * data["long_column"], "C": 3 * data["long_column"], }, - index=pandas.Index(numpy.arange(len(data["long_column"])), name="index"), + index=pandas.Index( + numpy.arange(len(data["long_column"])), name="index" + ), ), npartitions=5, metadata={"animal": "dog", "color": "green"}, @@ -87,28 +114,19 @@ letter: i * data["tiny_column"] for i, letter in enumerate(string.ascii_uppercase, start=1) }, - index=pandas.Index(numpy.arange(len(data["tiny_column"])), name="index"), + index=pandas.Index( + numpy.arange(len(data["tiny_column"])), name="index" + ), ), npartitions=1, metadata={"animal": "dog", "color": "red"}, ), } - ), - "big_image": ArrayAdapter.from_array(data["big_image"]), - "small_image": ArrayAdapter.from_array(data["small_image"]), - "medium_image": ArrayAdapter.from_array(data["medium_image"], chunks=((250, )*4, (100, )*10)), - "sparse_image": COOAdapter.from_coo(sparse.COO(sparse_arr)), - "awkward_array": AwkwardAdapter.from_array(awkward_arr), - "tiny_image": ArrayAdapter.from_array(data["tiny_image"]), - "tiny_cube": ArrayAdapter.from_array(data["tiny_cube"]), - "tiny_hypercube": ArrayAdapter.from_array(data["tiny_hypercube"]), + ), "structured_data": MapAdapter( { "pets": ArrayAdapter.from_array( - numpy.array( - [("Rex", 9, 81.0), ("Fido", 3, 27.0)], - dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], - ) + numpy.array([("Rex", 9, 81.0), ("Fido", 3, 27.0)],dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")]) ), "xarray_dataset": DatasetAdapter.from_dataset( xarray.Dataset( @@ -126,7 +144,7 @@ }, metadata={"animal": "cat", "color": "green"}, ), - "flat_array": ArrayAdapter.from_array(numpy.random.random(100)), + "flat_array": ArrayAdapter.from_array(rng.random(100)), "low_entropy": ArrayAdapter.from_array(data["low_entropy"]), "high_entropy": ArrayAdapter.from_array(data["high_entropy"]), # Below, an asynchronous task modifies this value over time. diff --git a/tiled/server/app.py b/tiled/server/app.py index cb100cab4..4fd0ff4d1 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -3,15 +3,16 @@ import contextvars import logging import os +import re import secrets import sys import urllib.parse +import urllib.parse as urlparse import warnings from contextlib import asynccontextmanager from functools import lru_cache, partial from pathlib import Path -from typing import List -import re +from typing import Dict, List import anyio import packaging.version @@ -69,7 +70,7 @@ } CSRF_HEADER_NAME = "x-csrf" CSRF_QUERY_PARAMETER = "csrf" -ZARR_PREFIX = '/zarr/v2' +ZARR_PREFIX = "/zarr/v2" MINIMUM_SUPPORTED_PYTHON_CLIENT_VERSION = packaging.version.parse("0.1.0a104") @@ -432,11 +433,7 @@ async def unhandled_exception_handler( # opporunity to register custom query types before startup. app.get( "/api/v1/search/{path:path}", - response_model=schemas.Response[ - List[schemas.Resource[schemas.NodeAttributes, dict, dict]], - schemas.PaginationLinks, - dict, - ], + response_model=schemas.SearchResponse, )(patch_route_signature(search, query_registry)) app.get( "/api/v1/distinct/{path:path}", @@ -901,15 +898,16 @@ async def resolve_zarr_uris(request: Request, call_next): # safely encoded) if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: # Extract the last bit of the path - zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip('/').split('/') - zarr_block = zarr_path[-1] if len(zarr_path) > 0 else '' - if re.compile(r'^(?:\d+\.)*\d+$').fullmatch(zarr_block): - # Create a query string if the last part is in the zarr block forma, e.g. `m.n.p. ... .q` - request.scope['query_string'] = f"block={zarr_block.replace('.', '%2C')}".encode() - request.scope['path'] = ZARR_PREFIX + '/' + '/'.join(zarr_path[:-1]) - response = await call_next(request) + zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip("/").split("/") + zarr_block = zarr_path[-1] if len(zarr_path) > 0 else "" + if re.compile(r"^(?:\d+\.)*\d+$").fullmatch(zarr_block): + # Create a query string if the last part is in the zarr block form, e.g. `m.n.p. ... .q` + query = dict(urlparse.parse_qsl(request.url.query)) + query.update({"block": zarr_block.replace(".", ",")}) + request.scope["query_string"] = urlparse.urlencode(query).encode() + request.scope["path"] = ZARR_PREFIX + "/" + "/".join(zarr_path[:-1]) - # TODO: Try compiling a single RE for matching and replacement -- possible speedup? + response = await call_next(request) response.__class__ = PatchedStreamingResponse # tolerate memoryview return response diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index f13c676fc..c756a80d3 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -73,7 +73,6 @@ async def inner( """ path_parts = [segment for segment in path.split("/") if segment] entry = root_tree - # If the entry/adapter can take a session state, pass it in. # The entry/adapter may return itself or a different object. if hasattr(entry, "with_session_state") and session_state: diff --git a/tiled/server/pydantic_sparse.py b/tiled/server/pydantic_sparse.py index 59883e94a..6c7d35e05 100644 --- a/tiled/server/pydantic_sparse.py +++ b/tiled/server/pydantic_sparse.py @@ -2,8 +2,8 @@ import pydantic -from ..structures.sparse import SparseLayout from ..structures.array import BuiltinDtype, StructDtype +from ..structures.sparse import SparseLayout class COOStructure(pydantic.BaseModel): diff --git a/tiled/server/router.py b/tiled/server/router.py index c469190d7..a81a2fea3 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -456,7 +456,6 @@ async def array_full( """ Fetch a slice of array-like data. """ - breakpoint() structure_family = entry.structure_family # Deferred import because this is not a required dependency of the server # for some use cases. diff --git a/tiled/server/schemas.py b/tiled/server/schemas.py index fa7f039e9..3cc757b84 100644 --- a/tiled/server/schemas.py +++ b/tiled/server/schemas.py @@ -567,4 +567,8 @@ class PatchMetadataResponse(pydantic.BaseModel, Generic[ResourceLinksT]): data_sources: Optional[List[DataSource]] +SearchResponse = Response[ + List[Resource[NodeAttributes, Dict, Dict]], PaginationLinks, Dict +] + NodeStructure.model_rebuild() diff --git a/tiled/server/utils.py b/tiled/server/utils.py index 29233e134..bc7e55d91 100644 --- a/tiled/server/utils.py +++ b/tiled/server/utils.py @@ -42,7 +42,7 @@ def get_base_url(request): return f"{get_root_url(request)}/api/v1" -def get_zarr_url(request, version: Literal['v2', 'v3'] = 'v2'): +def get_zarr_url(request, version: Literal["v2", "v3"] = "v2"): """ Base URL for the Zarr API """ diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index c6b991320..f62fd5d9b 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -1,5 +1,6 @@ import dataclasses import inspect +import json import os import re import warnings @@ -7,7 +8,6 @@ from functools import partial, wraps from pathlib import Path from typing import Any, List, Optional, Tuple -import json import anyio from fastapi import APIRouter, Body, Depends, HTTPException, Query, Request, Security @@ -31,9 +31,11 @@ from .. import __version__ from ..structures.core import Spec, StructureFamily +from ..structures.array import StructDtype from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri from ..validation_registration import ValidationError -from . import schemas + +# from . import schemas from .authentication import Mode, get_authenticators, get_current_principal from .core import ( DEFAULT_PAGE_SIZE, @@ -66,29 +68,33 @@ from .utils import filter_for_access, get_base_url, record_timing ZARR_BLOCK_SIZE = 10000 -ZARR_BYTE_ORDER = 'C' -ZARR_CODEC_SPEC = {'blocksize': 0, - 'clevel': 5, - 'cname': 'lz4', - 'id': 'blosc', - 'shuffle': 1} +ZARR_BYTE_ORDER = "C" +ZARR_CODEC_SPEC = { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1, +} +ZARR_DATETIME64_PRECISION = 'ns' import numcodecs + zarr_codec = numcodecs.get_codec(ZARR_CODEC_SPEC) router = APIRouter() + def convert_chunks_for_zarr(tiled_chunks: Tuple[Tuple[int]]): """Convert full tiled/dask chunk specification into zarr format - + Zarr only accepts chunks of constant size along each dimension; this function finds a unique representation of (possibly variable-sized chunks) internal to Tiled ArrayAdapter in terms of zarr blocks. + + Zarr chunks must be at least of size 1 (even for zero-dimensional arrays). """ -<<<<<<< HEAD - return [min(ZARR_BLOCK_SIZE, max(tc)) for tc in tiled_chunks] -======= - return [min(ZARR_BLOCK_SIZE, max(c)) for c in tiled_chunks] ->>>>>>> 08f255d687118b1983cf1019b375d7d6f948ce2e + return [min(ZARR_BLOCK_SIZE, max(*tc, 1)) for tc in tiled_chunks] + @router.get("{path:path}.zgroup", name="Root .zgroup metadata") @router.get("/{path:path}/.zgroup", name="Zarr .zgroup metadata") @@ -96,123 +102,137 @@ async def get_zarr_group_metadata( request: Request, entry=SecureEntry( scopes=["read:data", "read:metadata"], - structure_families={StructureFamily.table, StructureFamily.container}, + structure_families={StructureFamily.table, StructureFamily.container, StructureFamily.array}, ), ): + # Usual (unstructured) array; should respond to /.zarray instead + if entry.structure_family == StructureFamily.array and not isinstance(entry.structure().data_type, StructDtype): + raise HTTPException(status_code=HTTP_404_NOT_FOUND) + # Structured numpy array, Container, or Table return Response(json.dumps({"zarr_format": 2}), status_code=200) @router.get("/{path:path}/.zarray", name="Zarr .zarray metadata") async def get_zarr_array_metadata( request: Request, - path: str, - column: str = '', - entry=SecureEntry(scopes=["read:data", "read:metadata"], - structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table}), + entry=SecureEntry( + scopes=["read:data", "read:metadata"], + structure_families={StructureFamily.array, StructureFamily.sparse}, + ), ): -<<<<<<< HEAD - breakpoint() -======= ->>>>>>> 08f255d687118b1983cf1019b375d7d6f948ce2e - if entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: - try: - metadata = entry.metadata() - structure = entry.structure() - zarray_spec = {'chunks': convert_chunks_for_zarr(structure.chunks), - 'compressor': ZARR_CODEC_SPEC, - 'dtype': structure.data_type.to_numpy_str(), - 'fill_value': 0, - 'filters': None, - 'order': ZARR_BYTE_ORDER, - 'shape': list(structure.shape), - 'zarr_format': 2} - except Exception as err: - print(f"Can not create .zarray metadata, {err}") - raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) - - # elif entry.structure_family == StructureFamily.table: - # try: - # zarray_spec = {} - # metadata = entry.metadata() - # structure = entry.structure() - # # zarray_spec = {'chunks': [100, 1], #convert_chunks_for_zarr(structure.chunks), - # # 'compressor': ZARR_CODEC_SPEC, - # # 'dtype': entry.structure().meta.dtypes[column].str, - # # 'fill_value': 0, - # # 'filters': None, - # # 'order': ZARR_BYTE_ORDER, - # # # 'shape': list(structure.shape), - # # 'zarr_format': 2} - # except Exception as err: - # print(f"Can not create .zarray metadata, {err}") - # raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0]) - - else: - # This is normal behaviour; zarr will try to open .zarray and, if 404 is received, it will move on assuming - # that the requested resource is a group (`.../path/.zgroup` would be requested next). - raise HTTPException(status_code=HTTP_404_NOT_FOUND, detail="Requested resource does not have .zarray") - - return Response(json.dumps(zarray_spec), status_code=200) - - -@router.get("/{path:path}", name="Zarr .zgroup directory structure or a chunk of a zarr array") + # Only StructureFamily.array and StructureFamily.sparse can respond to `/.zarray` querries. Zarr will try to + # request .zarray on all other nodes in Tiled (not included in SecureEntry above), in which case the server + # will return an 404 error; this is the expected behaviour, which will signal zarr to try /.zgroup instead. + structure = entry.structure() + if isinstance(structure.data_type, StructDtype): + # Structured numpy array should be treated as a DataFrame and will respond to /.zgroup instead + raise HTTPException(status_code=HTTP_404_NOT_FOUND) + try: + zarray_spec = { + "chunks": convert_chunks_for_zarr(structure.chunks), + "compressor": ZARR_CODEC_SPEC, + "dtype": structure.data_type.to_numpy_str(), + "fill_value": 0, + "filters": None, + "order": ZARR_BYTE_ORDER, + "shape": list(structure.shape), + "zarr_format": 2, + } + return Response(json.dumps(zarray_spec), status_code=200) + except Exception as err: + print(f"Can not create .zarray metadata, {err}") + raise HTTPException( + status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=err.args[0] + ) + + +@router.get( + "/{path:path}", name="Zarr group (directory) structure or a chunk of a zarr array" +) async def get_zarr_array( request: Request, block: str | None = None, - entry=SecureEntry(scopes=["read:data"], - structure_families={StructureFamily.array, StructureFamily.sparse, StructureFamily.table, StructureFamily.container}, + entry=SecureEntry( + scopes=["read:data"], + structure_families={ + StructureFamily.array, + StructureFamily.sparse, + StructureFamily.table, + StructureFamily.container, + }, ), ): - url = str(request.url).split('?')[0].rstrip('/') # Remove query params and the trailing slash + # Remove query params and the trailing slash from the url + url = str(request.url).split("?")[0].rstrip("/") - # breakpoint() if entry.structure_family == StructureFamily.container: # List the contents of a "simulated" zarr directory (excluding .zarray and .zgroup files) - body = json.dumps([url + '/' + key for key in entry.keys()]) + if hasattr(entry, "keys_range"): + keys = await entry.keys_range(offset=0, limit=None) + else: + keys = entry.keys() + body = json.dumps([url + "/" + key for key in keys]) + + return Response(body, status_code=200, media_type="application/json") - return Response(body, status_code=200, media_type='application/json') - elif entry.structure_family == StructureFamily.table: - url = str(request.url).split('?')[0].rstrip('/') # Remove query params and the trailing slash - # breakpoint() - body = json.dumps([url + '/' + key for key in entry.structure().columns]) + # List the columns of the table -- they will be accessed separately as arrays + body = json.dumps([url + "/" + key for key in entry.structure().columns]) - # entry.structure().meta.dtypes + return Response(body, status_code=200, media_type="application/json") + + elif entry.structure_family == StructureFamily.array and isinstance(entry.structure().data_type, StructDtype): + # List the column names of the structured array -- they will be accessed separately + body = json.dumps([url + "/" + f.name for f in entry.structure().data_type.fields]) - return Response(body, status_code=200, media_type='application/json') + return Response(body, status_code=200, media_type="application/json") elif entry.structure_family in {StructureFamily.array, StructureFamily.sparse}: + # Return the actual array values for a single block of zarr array if block is not None: - import zarr import numpy as np + from sparse import SparseArray + + zarr_block_indx = [int(i) for i in block.split(",")] + zarr_block_spec = convert_chunks_for_zarr(entry.structure().chunks) + if (not (zarr_block_spec == [] and zarr_block_indx == [0])) and ( + len(zarr_block_spec) != len(zarr_block_indx) + ): + # Not a scalar and shape doesn't match + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail=f"Requested zarr block index {zarr_block_indx} is inconsistent with the shape of array, {entry.structure().shape}.", # noqa + ) - block_indx = [int(i) for i in block.split(',')] - zarr_chunks = convert_chunks_for_zarr(entry.structure().chunks) - block_slice = tuple([slice(i*c, (i+1)*c) for c, i in zip(zarr_chunks, block_indx)]) - padding_size = [max(0, sl.stop-sh) for sh, sl in zip(entry.structure().shape, block_slice)] - - # if block == (): - # # Handle special case of numpy scalar - # with record_timing(request.state.metrics, "read"): - # array = await ensure_awaitable(entry.read) - # else: - - # breakpoint() + # Indices of the array slices in each dimension that correspond to the requested zarr block + block_slices = tuple( + [ + slice(i * c, (i + 1) * c) + for i, c in zip(zarr_block_indx, zarr_block_spec) + ] + ) try: with record_timing(request.state.metrics, "read"): - array = await ensure_awaitable(entry.read, slice=block_slice) - if sum(padding_size) > 0: - array = np.pad(array, [(0, p) for p in padding_size], mode='constant') + array = await ensure_awaitable(entry.read, slice=block_slices) except IndexError: raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, detail="Block index out of range" + status_code=HTTP_400_BAD_REQUEST, + detail=f"Index of zarr block {zarr_block_indx} is out of range.", ) - # buf = zarr.array(array).store['0.0'] # Define a zarr array as a single block + if isinstance(array, SparseArray): + array = array.todense() - # breakpoint() + # Padd the last slices with zeros if needed to ensure all zarr blocks have same shapes + padding_size = [ + max(0, sl.stop - sh) + for sl, sh in zip(block_slices, entry.structure().shape) + ] + if sum(padding_size) > 0: + array = np.pad(array, [(0, p) for p in padding_size], mode="constant") - array = array.astype(array.dtype, order=ZARR_BYTE_ORDER, copy=False) # ensure array is contiguous + # Ensure the array is contiguous and encode it; equivalent to `buf = zarr.array(array).store['0.0']` + array = array.astype(array.dtype, order=ZARR_BYTE_ORDER, copy=False) buf = zarr_codec.encode(array) if not isinstance(buf, bytes): buf = array.tobytes(order="A") diff --git a/tiled/structures/sparse.py b/tiled/structures/sparse.py index 91ec0ee51..5b1e34d96 100644 --- a/tiled/structures/sparse.py +++ b/tiled/structures/sparse.py @@ -1,6 +1,7 @@ import enum from dataclasses import dataclass from typing import Optional, Tuple, Union + from .array import BuiltinDtype, StructDtype From 00d0f5f0b1e5ddf8fe4935fd97d73881ea0f2dc3 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 5 Sep 2024 19:45:26 -0400 Subject: [PATCH 21/46] Clean-up --- alembic.ini | 116 ---------------------------------------------------- 1 file changed, 116 deletions(-) delete mode 100644 alembic.ini diff --git a/alembic.ini b/alembic.ini deleted file mode 100644 index e7e80abc8..000000000 --- a/alembic.ini +++ /dev/null @@ -1,116 +0,0 @@ -# A generic, single database configuration. - -[alembic] -# path to migration scripts -# Use forward slashes (/) also on windows to provide an os agnostic path -script_location = /Users/eugene/code/tiled/tiled/catalog/migrations - -# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s -# Uncomment the line below if you want the files to be prepended with date and time -# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file -# for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present. -# defaults to the current working directory. -prepend_sys_path = . - -# timezone to use when rendering the date within the migration file -# as well as the filename. -# If specified, requires the python>=3.9 or backports.zoneinfo library. -# Any required deps can installed by adding `alembic[tz]` to the pip requirements -# string value is passed to ZoneInfo() -# leave blank for localtime -# timezone = - -# max length of characters to apply to the "slug" field -# truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -# revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# a source .py file to be detected as revisions in the -# versions/ directory -# sourceless = false - -# version location specification; This defaults -# to catalog/migrations/versions. When using multiple version -# directories, initial revisions must be specified with --version-path. -# The path separator used here should be the separator specified by "version_path_separator" below. -# version_locations = %(here)s/bar:%(here)s/bat:catalog/migrations/versions - -# version path separator; As mentioned above, this is the character used to split -# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. -# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. -# Valid values for version_path_separator are: -# -# version_path_separator = : -# version_path_separator = ; -# version_path_separator = space -version_path_separator = os # Use os.pathsep. Default configuration used for new projects. - -# set to 'true' to search source files recursively -# in each "version_locations" directory -# new in Alembic version 1.10 -# recursive_version_locations = false - -# the output encoding used when revision files -# are written from script.py.mako -# output_encoding = utf-8 - -# sqlalchemy.url = driver://user:pass@localhost/dbname -sqlalchemy.url = sqlite+aiosqlite:////Users/eugene/code/demo_stream_documents/catalog/catalog.db - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. See the documentation for further -# detail and examples - -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - -# lint with attempts to fix using "ruff" - use the exec runner, execute a binary -# hooks = ruff -# ruff.type = exec -# ruff.executable = %(here)s/.venv/bin/ruff -# ruff.options = --fix REVISION_SCRIPT_FILENAME - -# Logging configuration -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARN -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARN -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S From ee0d2cce67d55a2523bdc71120baa30d72909bd6 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 5 Sep 2024 19:45:59 -0400 Subject: [PATCH 22/46] MNT: gitignore alembic.ini --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b10648632..d08e25abb 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ config.yml prometheus_data grafana_data data +alembic.ini tiled/_version.py From e68e0e59e5a183fa084e5f5f42dc6088144f6d94 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 10:34:37 -0400 Subject: [PATCH 23/46] FIX: typo in comment --- tiled/server/pydantic_array.py | 2 +- tiled/structures/array.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tiled/server/pydantic_array.py b/tiled/server/pydantic_array.py index 08a258bf2..a85f54e4e 100644 --- a/tiled/server/pydantic_array.py +++ b/tiled/server/pydantic_array.py @@ -42,7 +42,7 @@ class BuiltinDtype(BaseModel): @classmethod def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": # Extract datetime units from the dtype string representation, - # e.g. `' "BuiltinDtype": # Extract datetime units from the dtype string representation, - # e.g. `' Date: Fri, 9 Aug 2024 10:03:52 -0400 Subject: [PATCH 24/46] ENH: Add data type to sparse --- tiled/adapters/sparse.py | 3 +++ tiled/client/container.py | 2 ++ tiled/server/pydantic_sparse.py | 2 ++ tiled/server/router.py | 1 + tiled/structures/sparse.py | 8 ++++++++ 5 files changed, 16 insertions(+) diff --git a/tiled/adapters/sparse.py b/tiled/adapters/sparse.py index 60b122d5c..5aa53a90c 100644 --- a/tiled/adapters/sparse.py +++ b/tiled/adapters/sparse.py @@ -8,6 +8,7 @@ from ..structures.core import Spec, StructureFamily from ..structures.sparse import COOStructure +from ..structures.array import BuiltinDtype from .array import slice_and_shape_from_block_and_chunks from .protocols import AccessPolicy from .type_alliases import JSON, NDSlice @@ -49,6 +50,7 @@ def from_arrays( dims=dims, shape=shape, chunks=tuple((dim,) for dim in shape), + data_type=BuiltinDtype.from_numpy_dtype(data.dtype), resizable=False, ) return cls( @@ -133,6 +135,7 @@ def from_global_ref( dims=dims, shape=shape, chunks=chunks, + data_type=BuiltinDtype.from_numpy_dtype(data.dtype), resizable=False, ) return cls( diff --git a/tiled/client/container.py b/tiled/client/container.py index 7a87ce507..51b466ba4 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -887,12 +887,14 @@ def write_sparse( >>> x.write_block(coords=[[0, 1]], data=[6.7, 1.2], block=(1,)) """ from ..structures.sparse import COOStructure + from ..structures.array import BuiltinDtype structure = COOStructure( shape=shape, # This method only supports single-chunk COO arrays. chunks=tuple((dim,) for dim in shape), dims=dims, + data_type=BuiltinDtype.from_numpy_dtype(data.dtype), ) client = self.new( StructureFamily.sparse, diff --git a/tiled/server/pydantic_sparse.py b/tiled/server/pydantic_sparse.py index 145f272d0..59883e94a 100644 --- a/tiled/server/pydantic_sparse.py +++ b/tiled/server/pydantic_sparse.py @@ -3,11 +3,13 @@ import pydantic from ..structures.sparse import SparseLayout +from ..structures.array import BuiltinDtype, StructDtype class COOStructure(pydantic.BaseModel): shape: Tuple[int, ...] # tuple of ints like (3, 3) chunks: Tuple[Tuple[int, ...], ...] # tuple-of-tuples-of-ints like ((3,), (3,)) + data_type: Optional[Union[BuiltinDtype, StructDtype]] = None dims: Optional[Tuple[str, ...]] = None # None or tuple of names like ("x", "y") resizable: Union[bool, Tuple[bool, ...]] = False layout: SparseLayout = SparseLayout.COO diff --git a/tiled/server/router.py b/tiled/server/router.py index a81a2fea3..c469190d7 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -456,6 +456,7 @@ async def array_full( """ Fetch a slice of array-like data. """ + breakpoint() structure_family = entry.structure_family # Deferred import because this is not a required dependency of the server # for some use cases. diff --git a/tiled/structures/sparse.py b/tiled/structures/sparse.py index 354d150d7..91ec0ee51 100644 --- a/tiled/structures/sparse.py +++ b/tiled/structures/sparse.py @@ -1,6 +1,7 @@ import enum from dataclasses import dataclass from typing import Optional, Tuple, Union +from .array import BuiltinDtype, StructDtype class SparseLayout(str, enum.Enum): @@ -13,6 +14,7 @@ class SparseLayout(str, enum.Enum): class COOStructure: chunks: Tuple[Tuple[int, ...], ...] # tuple-of-tuples-of-ints like ((3,), (3,)) shape: Tuple[int, ...] # tuple of ints like (3, 3) + data_type: Optional[Union[BuiltinDtype, StructDtype]] = None dims: Optional[Tuple[str, ...]] = None # None or tuple of names like ("x", "y") resizable: Union[bool, Tuple[bool, ...]] = False layout: SparseLayout = SparseLayout.COO @@ -20,7 +22,13 @@ class COOStructure: @classmethod def from_json(cls, structure): + data_type = structure.get("data_type", None) + if "fields" in data_type: + data_type = StructDtype.from_json(data_type) + else: + data_type = BuiltinDtype.from_json(data_type) return cls( + data_type=data_type, chunks=tuple(map(tuple, structure["chunks"])), shape=tuple(structure["shape"]), dims=structure["dims"], From 1fbb021a92b47fdbe008d839d0d43dd490f28a54 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 10:58:39 -0400 Subject: [PATCH 25/46] FIX: assignment error --- tiled/structures/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiled/structures/sparse.py b/tiled/structures/sparse.py index 91ec0ee51..ea773ce3b 100644 --- a/tiled/structures/sparse.py +++ b/tiled/structures/sparse.py @@ -23,7 +23,7 @@ class COOStructure: @classmethod def from_json(cls, structure): data_type = structure.get("data_type", None) - if "fields" in data_type: + if data_type is not None and "fields" in data_type: data_type = StructDtype.from_json(data_type) else: data_type = BuiltinDtype.from_json(data_type) From 1ec2425f1350e6b402ce7622c368f04765c9c3ab Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 11:12:05 -0400 Subject: [PATCH 26/46] MNT: clean and lint --- tiled/adapters/sparse.py | 2 +- tiled/client/container.py | 2 +- tiled/server/pydantic_sparse.py | 2 +- tiled/server/router.py | 1 - tiled/structures/sparse.py | 1 + 5 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tiled/adapters/sparse.py b/tiled/adapters/sparse.py index 5aa53a90c..c6771f942 100644 --- a/tiled/adapters/sparse.py +++ b/tiled/adapters/sparse.py @@ -6,9 +6,9 @@ import sparse from numpy._typing import NDArray +from ..structures.array import BuiltinDtype from ..structures.core import Spec, StructureFamily from ..structures.sparse import COOStructure -from ..structures.array import BuiltinDtype from .array import slice_and_shape_from_block_and_chunks from .protocols import AccessPolicy from .type_alliases import JSON, NDSlice diff --git a/tiled/client/container.py b/tiled/client/container.py index 51b466ba4..2434185ee 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -886,8 +886,8 @@ def write_sparse( >>> x.write_block(coords=[[2, 4]], data=[3.1, 2.8], block=(0,)) >>> x.write_block(coords=[[0, 1]], data=[6.7, 1.2], block=(1,)) """ - from ..structures.sparse import COOStructure from ..structures.array import BuiltinDtype + from ..structures.sparse import COOStructure structure = COOStructure( shape=shape, diff --git a/tiled/server/pydantic_sparse.py b/tiled/server/pydantic_sparse.py index 59883e94a..6c7d35e05 100644 --- a/tiled/server/pydantic_sparse.py +++ b/tiled/server/pydantic_sparse.py @@ -2,8 +2,8 @@ import pydantic -from ..structures.sparse import SparseLayout from ..structures.array import BuiltinDtype, StructDtype +from ..structures.sparse import SparseLayout class COOStructure(pydantic.BaseModel): diff --git a/tiled/server/router.py b/tiled/server/router.py index c469190d7..a81a2fea3 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -456,7 +456,6 @@ async def array_full( """ Fetch a slice of array-like data. """ - breakpoint() structure_family = entry.structure_family # Deferred import because this is not a required dependency of the server # for some use cases. diff --git a/tiled/structures/sparse.py b/tiled/structures/sparse.py index ea773ce3b..da50c3a7b 100644 --- a/tiled/structures/sparse.py +++ b/tiled/structures/sparse.py @@ -1,6 +1,7 @@ import enum from dataclasses import dataclass from typing import Optional, Tuple, Union + from .array import BuiltinDtype, StructDtype From 808ebcbc39f2b136866d0a9f4ac9eaf480588511 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 11:18:10 -0400 Subject: [PATCH 27/46] MNT: update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52d581718..39488ebc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Write the date in place of the "Unreleased" in the case a new version is release # Changelog +## v0.1.0b9 (Unreleased) + +### Added + +- `dtype` property for sparse arrays in `COOAdapter` and `COOStructure`. + ## v0.1.0b8 (2024-09-06) ### Fixed From 66eabcf3a512a06bf1e9227679c3d23b77b2c7fa Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 11:42:51 -0400 Subject: [PATCH 28/46] MNT: fix changelog --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 912a5acaa..347761673 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Write the date in place of the "Unreleased" in the case a new version is release # Changelog +## Unreleased + +### Added + +- Added support for explicit units in numpy datetime64 dtypes. + ## v0.1.0b8 (2024-09-06) ### Fixed @@ -15,7 +21,6 @@ Write the date in place of the "Unreleased" in the case a new version is release - Add method to `TableAdapter` which accepts a Python dictionary. - Added an `Arrow` adapter which supports reading/writing arrow tables via `RecordBatchFileReader`/`RecordBatchFileWriter`. -- Added support for explicit units in numpy datetime64 dtypes. ### Changed From 57137b154b7b474e4facc1d4f9dbd51b40ca7179 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 13:36:26 -0400 Subject: [PATCH 29/46] FIX: tests with COOStructure --- tiled/_tests/test_protocols.py | 8 ++++++-- tiled/_tests/test_writing.py | 9 ++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tiled/_tests/test_protocols.py b/tiled/_tests/test_protocols.py index e54921cd5..a458ae106 100644 --- a/tiled/_tests/test_protocols.py +++ b/tiled/_tests/test_protocols.py @@ -247,9 +247,13 @@ def test_sparseadapter_protocol(mocker: MockFixture) -> None: mock_call4 = mocker.patch.object(CustomSparseAdapter, "specs") mock_call5 = mocker.patch.object(CustomSparseAdapter, "metadata") - structure = COOStructure(shape=(2 * 5,), chunks=((5, 5),)) - array = numpy.random.rand(2, 512, 512) + + structure = COOStructure( + shape=(2 * 5,), + chunks=((5, 5),), + data_type=BuiltinDtype.from_numpy_dtype(array.dtype), + ) blocks: Dict[Tuple[int, ...], Tuple[NDArray[Any], Any]] = {(1,): (array, (1,))} metadata: JSON = {"foo": "bar"} anyslice = (1, 1, 1) diff --git a/tiled/_tests/test_writing.py b/tiled/_tests/test_writing.py index 571aaad59..320dba4e3 100644 --- a/tiled/_tests/test_writing.py +++ b/tiled/_tests/test_writing.py @@ -26,6 +26,7 @@ from ..mimetypes import PARQUET_MIMETYPE from ..queries import Key from ..server.app import build_app +from ..structures.array import BuiltinDtype from ..structures.core import Spec, StructureFamily from ..structures.data_source import DataSource from ..structures.sparse import COOStructure @@ -245,7 +246,13 @@ def test_write_sparse_chunked(tree): "sparse", [ DataSource( - structure=COOStructure(shape=(2 * N,), chunks=((N, N),)), + structure=COOStructure( + shape=(2 * N,), + chunks=((N, N),), + data_type=BuiltinDtype.from_numpy_dtype( + numpy.dtype("float64") + ), + ), structure_family="sparse", ) ], From 527dad6c515d5deb3457f2e95368dd3fe874be58 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 15:09:37 -0400 Subject: [PATCH 30/46] FIX: typing --- tiled/server/zarr.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index f62fd5d9b..f0d3c0afa 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -30,8 +30,8 @@ ) from .. import __version__ -from ..structures.core import Spec, StructureFamily from ..structures.array import StructDtype +from ..structures.core import Spec, StructureFamily from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri from ..validation_registration import ValidationError @@ -76,7 +76,7 @@ "id": "blosc", "shuffle": 1, } -ZARR_DATETIME64_PRECISION = 'ns' +ZARR_DATETIME64_PRECISION = "ns" import numcodecs @@ -102,16 +102,23 @@ async def get_zarr_group_metadata( request: Request, entry=SecureEntry( scopes=["read:data", "read:metadata"], - structure_families={StructureFamily.table, StructureFamily.container, StructureFamily.array}, + structure_families={ + StructureFamily.table, + StructureFamily.container, + StructureFamily.array, + }, ), ): # Usual (unstructured) array; should respond to /.zarray instead - if entry.structure_family == StructureFamily.array and not isinstance(entry.structure().data_type, StructDtype): + if entry.structure_family == StructureFamily.array and not isinstance( + entry.structure().data_type, StructDtype + ): raise HTTPException(status_code=HTTP_404_NOT_FOUND) # Structured numpy array, Container, or Table return Response(json.dumps({"zarr_format": 2}), status_code=200) + @router.get("/{path:path}/.zarray", name="Zarr .zarray metadata") async def get_zarr_array_metadata( request: Request, @@ -151,7 +158,7 @@ async def get_zarr_array_metadata( ) async def get_zarr_array( request: Request, - block: str | None = None, + block: Optional[str] = None, entry=SecureEntry( scopes=["read:data"], structure_families={ @@ -180,10 +187,14 @@ async def get_zarr_array( body = json.dumps([url + "/" + key for key in entry.structure().columns]) return Response(body, status_code=200, media_type="application/json") - - elif entry.structure_family == StructureFamily.array and isinstance(entry.structure().data_type, StructDtype): + + elif entry.structure_family == StructureFamily.array and isinstance( + entry.structure().data_type, StructDtype + ): # List the column names of the structured array -- they will be accessed separately - body = json.dumps([url + "/" + f.name for f in entry.structure().data_type.fields]) + body = json.dumps( + [url + "/" + f.name for f in entry.structure().data_type.fields] + ) return Response(body, status_code=200, media_type="application/json") From 4801407b4ebbcc465ff3a614b32f18aba34fcd08 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 15:11:05 -0400 Subject: [PATCH 31/46] BLD: add aiohttp package to server requirements --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 1b74b8a17..66df566e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ tiled = "tiled.commandline.main:main" # This is the union of all optional dependencies. all = [ + "aiohttp", "aiofiles", "aiosqlite", "alembic", @@ -190,6 +191,7 @@ minimal-client = [ ] # These are the requirements needed for basic server functionality. minimal-server = [ + "aiohttp", "aiofiles", "aiosqlite", "alembic", @@ -225,6 +227,7 @@ minimal-server = [ ] # This is the "kichen sink" fully-featured server dependency set. server = [ + "aiohttp", "aiofiles", "aiosqlite", "alembic", From 8a7c510b6d50feed2c926871936181b46c29d67d Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 6 Sep 2024 15:21:30 -0400 Subject: [PATCH 32/46] MNT: clean and lint --- tiled/_tests/test_zarr.py | 59 +++++++++++++++++++++---------------- tiled/examples/generated.py | 9 ++++-- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/tiled/_tests/test_zarr.py b/tiled/_tests/test_zarr.py index c3fa49256..20b8309ee 100644 --- a/tiled/_tests/test_zarr.py +++ b/tiled/_tests/test_zarr.py @@ -7,9 +7,8 @@ import dask.array import numpy -import pytest -import numpy import pandas.testing +import pytest import uvicorn import zarr from fsspec.implementations.http import HTTPFileSystem @@ -32,8 +31,13 @@ "dtype_c": (numpy.arange(10) * 1j).astype("c"), "dtype_S": numpy.array([letter * 3 for letter in string.ascii_letters], dtype="S3"), "dtype_U": numpy.array([letter * 3 for letter in string.ascii_letters], dtype="U3"), - "dtype_m": numpy.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64') - numpy.datetime64('2008-01-01'), - "dtype_M": numpy.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64'), + "dtype_m": numpy.array( + ["2007-07-13", "2006-01-13", "2010-08-13"], dtype="datetime64" + ) + - numpy.datetime64("2008-01-01"), + "dtype_M": numpy.array( + ["2007-07-13", "2006-01-13", "2010-08-13"], dtype="datetime64" + ), "random_2d": rng.random((10, 10)), } # TODO bitfield "t", void "v", and object "O" (which is not supported by default) @@ -75,12 +79,12 @@ } ) df = pandas.DataFrame( - { - "x": rng.random(size=10, dtype='float64'), - "y": rng.integers(10, size=10, dtype='uint'), - "z": rng.integers(-10, 10, size=10, dtype='int64'), - } - ) + { + "x": rng.random(size=10, dtype="float64"), + "y": rng.integers(10, size=10, dtype="uint"), + "z": rng.integers(-10, 10, size=10, dtype="int64"), + } +) table_tree = MapAdapter( { # a dataframe divided into three partitions @@ -102,18 +106,18 @@ ) -def traverse_tree(tree, parent='', result = None): +def traverse_tree(tree, parent="", result=None): result = result or {} for key, val in tree.items(): if isinstance(val, ArrayAdapter): - result.update({f'{parent}/{key}' : 'array'}) + result.update({f"{parent}/{key}": "array"}) elif isinstance(val, DataFrameAdapter): - result.update({f'{parent}/{key}' : 'group'}) + result.update({f"{parent}/{key}": "group"}) for col, _ in val.items(): - result.update({f'{parent}/{key}/{col}' : 'array'}) + result.update({f"{parent}/{key}/{col}": "array"}) else: - result.update({f'{parent}/{key}' : 'group'}) - traverse_tree(val, parent=f'{parent}/{key}', result=result) + result.update({f"{parent}/{key}": "group"}) + traverse_tree(val, parent=f"{parent}/{key}", result=result) return result @@ -155,7 +159,9 @@ def fs(): return fs -@pytest.mark.parametrize("path", ["/zarr/v2/", "/zarr/v2", "/zarr/v2/nested", "/zarr/v2/table/single"]) +@pytest.mark.parametrize( + "path", ["/zarr/v2/", "/zarr/v2", "/zarr/v2/nested", "/zarr/v2/table/single"] +) @pytest.mark.asyncio async def test_zarr_group_routes(path, app): async with AsyncClient( @@ -167,14 +173,16 @@ async def test_zarr_group_routes(path, app): response = await client.get(path) assert response.status_code == HTTP_200_OK - response = await client.get(path + '/.zarray') + response = await client.get(path + "/.zarray") assert response.status_code == HTTP_404_NOT_FOUND - response = await client.get(path + '/.zgroup') + response = await client.get(path + "/.zgroup") assert response.status_code == HTTP_200_OK -@pytest.mark.parametrize("path", ["/zarr/v2/nested/cube/tiny_cube", "/zarr/v2/table/single/x"]) +@pytest.mark.parametrize( + "path", ["/zarr/v2/nested/cube/tiny_cube", "/zarr/v2/table/single/x"] +) @pytest.mark.asyncio async def test_zarr_array_routes(path, app): async with AsyncClient( @@ -186,17 +194,18 @@ async def test_zarr_array_routes(path, app): response = await client.get(path) assert response.status_code == HTTP_200_OK - response = await client.get(path + '/.zgroup') + response = await client.get(path + "/.zgroup") assert response.status_code == HTTP_404_NOT_FOUND - response = await client.get(path + '/.zarray') + response = await client.get(path + "/.zarray") assert response.status_code == HTTP_200_OK - ndim = len(response.json().get('shape')) - indx = '.'.join( ['0']*max(ndim, 0) ) - response = await client.get(path + f'/{indx}') + ndim = len(response.json().get("shape")) + indx = ".".join(["0"] * max(ndim, 0)) + response = await client.get(path + f"/{indx}") assert response.status_code == HTTP_200_OK + def test_zarr_integration(server, fs): url = f"http://localhost:{server.port}/zarr/v2/" grp = zarr.open(fs.get_mapper(url), mode="r") diff --git a/tiled/examples/generated.py b/tiled/examples/generated.py index 2d296fd87..0a6d1343d 100644 --- a/tiled/examples/generated.py +++ b/tiled/examples/generated.py @@ -26,8 +26,8 @@ "tiny_cube": rng.random((50, 50, 50)), "tiny_hypercube": rng.random((50, 50, 50, 50, 50)), "high_entropy": rng.integers(-10, 10, size=(100, 100)), - "low_entropy": numpy.ones((100, 100), dtype='int32'), - "short_column": rng.integers(10, size=100, dtype=numpy.dtype('uint8')), + "low_entropy": numpy.ones((100, 100), dtype="int32"), + "short_column": rng.integers(10, size=100, dtype=numpy.dtype("uint8")), "tiny_column": rng.random(10), "long_column": rng.random(100_000), } @@ -126,7 +126,10 @@ "structured_data": MapAdapter( { "pets": ArrayAdapter.from_array( - numpy.array([("Rex", 9, 81.0), ("Fido", 3, 27.0)],dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")]) + numpy.array( + [("Rex", 9, 81.0), ("Fido", 3, 27.0)], + dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], + ) ), "xarray_dataset": DatasetAdapter.from_dataset( xarray.Dataset( From 484c6a0ef9871bbbe9ced2a151bc289ab1df031f Mon Sep 17 00:00:00 2001 From: Eugene Date: Tue, 10 Sep 2024 14:26:53 -0400 Subject: [PATCH 33/46] FIX: default value of units to empty string. --- tiled/server/pydantic_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiled/server/pydantic_array.py b/tiled/server/pydantic_array.py index a85f54e4e..952a5c8ba 100644 --- a/tiled/server/pydantic_array.py +++ b/tiled/server/pydantic_array.py @@ -79,7 +79,7 @@ def from_json(cls, structure): kind=Kind(structure["kind"]), itemsize=structure["itemsize"], endianness=Endianness(structure["endianness"]), - units=structure.get("units", "s"), + units=structure.get("units", ""), ) From c9fa03e34212bb8f3528aa1e937ac72a558089aa Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 10 Sep 2024 20:29:05 -0400 Subject: [PATCH 34/46] FIX: use None as the sentinel for the units kwarg --- tiled/server/pydantic_array.py | 6 +++--- tiled/structures/array.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tiled/server/pydantic_array.py b/tiled/server/pydantic_array.py index 952a5c8ba..4e466f7fb 100644 --- a/tiled/server/pydantic_array.py +++ b/tiled/server/pydantic_array.py @@ -28,7 +28,7 @@ class BuiltinDtype(BaseModel): endianness: Endianness kind: Kind itemsize: int - units: str = "" + units: Optional[str] = None __endianness_map = { ">": "big", @@ -43,7 +43,7 @@ class BuiltinDtype(BaseModel): def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": # Extract datetime units from the dtype string representation, # e.g. `'": "big", @@ -92,7 +92,7 @@ class BuiltinDtype: def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": # Extract datetime units from the dtype string representation, # e.g. `' Date: Wed, 11 Sep 2024 16:45:31 -0400 Subject: [PATCH 35/46] ENH: use np.datetime_data to extract units --- tiled/server/pydantic_array.py | 18 ++++++++---------- tiled/structures/array.py | 18 ++++++++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/tiled/server/pydantic_array.py b/tiled/server/pydantic_array.py index 4e466f7fb..2c37a51dc 100644 --- a/tiled/server/pydantic_array.py +++ b/tiled/server/pydantic_array.py @@ -14,7 +14,6 @@ it. """ -import re import sys from typing import List, Optional, Tuple, Union @@ -28,7 +27,7 @@ class BuiltinDtype(BaseModel): endianness: Endianness kind: Kind itemsize: int - units: Optional[str] = None + dt_units: Optional[str] = None __endianness_map = { ">": "big", @@ -42,17 +41,17 @@ class BuiltinDtype(BaseModel): @classmethod def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": # Extract datetime units from the dtype string representation, - # e.g. `' 1 else ''}{unit}]" return cls( endianness=cls.__endianness_map[dtype.byteorder], kind=Kind(dtype.kind), itemsize=dtype.itemsize, - units=units, + dt_units=dt_units, ) def to_numpy_dtype(self): @@ -70,8 +69,7 @@ def to_numpy_str(self): # so the reported itemsize is 4x the char count. To get back to the string # we need to divide by 4. size = self.itemsize if self.kind != Kind.unicode else self.itemsize // 4 - units = f"[{self.units}]" if self.units else "" - return f"{endianness}{self.kind.value}{size}{units}" + return f"{endianness}{self.kind.value}{size}{self.dt_units or ''}" @classmethod def from_json(cls, structure): @@ -79,7 +77,7 @@ def from_json(cls, structure): kind=Kind(structure["kind"]), itemsize=structure["itemsize"], endianness=Endianness(structure["endianness"]), - units=structure.get("units"), + units=structure.get("dt_units"), ) diff --git a/tiled/structures/array.py b/tiled/structures/array.py index e8e06e491..23b625d51 100644 --- a/tiled/structures/array.py +++ b/tiled/structures/array.py @@ -1,6 +1,5 @@ import enum import os -import re import sys from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -77,7 +76,7 @@ class BuiltinDtype: endianness: Endianness kind: Kind itemsize: int - units: Optional[str] = None + dt_units: Optional[str] = None __endianness_map = { ">": "big", @@ -91,17 +90,17 @@ class BuiltinDtype: @classmethod def from_numpy_dtype(cls, dtype) -> "BuiltinDtype": # Extract datetime units from the dtype string representation, - # e.g. `' 1 else ''}{unit}]" return cls( endianness=cls.__endianness_map[dtype.byteorder], kind=Kind(dtype.kind), itemsize=dtype.itemsize, - units=units, + dt_units=dt_units, ) def to_numpy_dtype(self) -> numpy.dtype: @@ -119,8 +118,7 @@ def to_numpy_str(self): # so the reported itemsize is 4x the char count. To get back to the string # we need to divide by 4. size = self.itemsize if self.kind != Kind.unicode else self.itemsize // 4 - units = f"[{self.units}]" if self.units else "" - return f"{endianness}{self.kind.value}{size}{units}" + return f"{endianness}{self.kind.value}{size}{self.dt_units or ''}" @classmethod def from_json(cls, structure): @@ -128,7 +126,7 @@ def from_json(cls, structure): kind=Kind(structure["kind"]), itemsize=structure["itemsize"], endianness=Endianness(structure["endianness"]), - units=structure.get("units"), + dt_units=structure.get("dt_units"), ) From a742bd99a8ef1fa009b12213368c57ac0951d42a Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Sep 2024 14:11:44 -0400 Subject: [PATCH 36/46] TST: Fix failing authorization test -- empty password --- tiled/_tests/test_authentication.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tiled/_tests/test_authentication.py b/tiled/_tests/test_authentication.py index 519ec5261..cd5b60520 100644 --- a/tiled/_tests/test_authentication.py +++ b/tiled/_tests/test_authentication.py @@ -10,7 +10,6 @@ from starlette.status import ( HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED, - HTTP_422_UNPROCESSABLE_ENTITY, ) from ..adapters.array import ArrayAdapter @@ -93,7 +92,7 @@ def test_password_auth(enter_password, config): from_context(context, username="alice") # Empty password should not work. - with fail_with_status_code(HTTP_422_UNPROCESSABLE_ENTITY): + with fail_with_status_code(HTTP_401_UNAUTHORIZED): with enter_password(""): from_context(context, username="alice") From 73b2b40ae23ac780d0b472106c665fae04324e61 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Sep 2024 14:15:23 -0400 Subject: [PATCH 37/46] MNT: format and lint --- tiled/_tests/test_authentication.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tiled/_tests/test_authentication.py b/tiled/_tests/test_authentication.py index cd5b60520..1bc2cfb6c 100644 --- a/tiled/_tests/test_authentication.py +++ b/tiled/_tests/test_authentication.py @@ -7,10 +7,7 @@ import numpy import pytest -from starlette.status import ( - HTTP_400_BAD_REQUEST, - HTTP_401_UNAUTHORIZED, -) +from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED from ..adapters.array import ArrayAdapter from ..adapters.mapping import MapAdapter From 24c86d5a12f439bb14e13667cf17c492316c0cfd Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 22 Oct 2024 10:08:37 -0400 Subject: [PATCH 38/46] MNT: remove deprecated PatchedStreamingResponse --- tiled/server/app.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index 960dedfc7..2a6e7f362 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -902,7 +902,6 @@ async def resolve_zarr_uris(request: Request, call_next): response = await call_next(request) - response.__class__ = PatchedStreamingResponse # tolerate memoryview return response app.add_middleware( From 4ace2920c54d821778bedb6ba2da3420a20e1de9 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 22 Oct 2024 10:58:54 -0400 Subject: [PATCH 39/46] MNT: lint --- tiled/adapters/zarr.py | 2 +- tiled/server/app.py | 1 - tiled/server/zarr.py | 67 +++++------------------------------------- 3 files changed, 8 insertions(+), 62 deletions(-) diff --git a/tiled/adapters/zarr.py b/tiled/adapters/zarr.py index e8761fecd..e156549bd 100644 --- a/tiled/adapters/zarr.py +++ b/tiled/adapters/zarr.py @@ -103,7 +103,7 @@ def _stencil(self) -> Tuple[slice, ...]: """ return tuple(builtins.slice(0, dim) for dim in self.structure().shape) - def get(self, key: str): + def get(self, key: str) -> Union[ArrayAdapter, None]: return None def read( diff --git a/tiled/server/app.py b/tiled/server/app.py index 2a6e7f362..80ac9fc65 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -12,7 +12,6 @@ from contextlib import asynccontextmanager from functools import lru_cache, partial from pathlib import Path -from typing import Dict, List import anyio import packaging.version diff --git a/tiled/server/zarr.py b/tiled/server/zarr.py index f0d3c0afa..e8d536114 100644 --- a/tiled/server/zarr.py +++ b/tiled/server/zarr.py @@ -1,71 +1,20 @@ -import dataclasses -import inspect import json -import os -import re -import warnings -from datetime import datetime, timedelta -from functools import partial, wraps -from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Optional, Tuple -import anyio -from fastapi import APIRouter, Body, Depends, HTTPException, Query, Request, Security -from jmespath.exceptions import JMESPathError -from json_merge_patch import merge as apply_merge_patch -from jsonpatch import apply_patch as apply_json_patch -from pydantic_settings import BaseSettings +import numcodecs +from fastapi import APIRouter, HTTPException, Request from starlette.responses import Response from starlette.status import ( - HTTP_200_OK, - HTTP_206_PARTIAL_CONTENT, HTTP_400_BAD_REQUEST, - HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND, - HTTP_405_METHOD_NOT_ALLOWED, - HTTP_406_NOT_ACCEPTABLE, - HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE, - HTTP_422_UNPROCESSABLE_ENTITY, HTTP_500_INTERNAL_SERVER_ERROR, ) -from .. import __version__ from ..structures.array import StructDtype -from ..structures.core import Spec, StructureFamily -from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri -from ..validation_registration import ValidationError - -# from . import schemas -from .authentication import Mode, get_authenticators, get_current_principal -from .core import ( - DEFAULT_PAGE_SIZE, - DEPTH_LIMIT, - MAX_PAGE_SIZE, - NoEntry, - UnsupportedMediaTypes, - WrongTypeForRoute, - apply_search, - construct_data_response, - construct_entries_response, - construct_resource, - construct_revisions_response, - json_or_msgpack, - resolve_media_type, -) -from .dependencies import ( - SecureEntry, - block, - expected_shape, - get_deserialization_registry, - get_query_registry, - get_serialization_registry, - get_validation_registry, - slice_, -) -from .file_response_with_range import FileResponseWithRange -from .links import links_for_node -from .settings import get_settings -from .utils import filter_for_access, get_base_url, record_timing +from ..structures.core import StructureFamily +from ..utils import ensure_awaitable +from .dependencies import SecureEntry +from .utils import record_timing ZARR_BLOCK_SIZE = 10000 ZARR_BYTE_ORDER = "C" @@ -78,8 +27,6 @@ } ZARR_DATETIME64_PRECISION = "ns" -import numcodecs - zarr_codec = numcodecs.get_codec(ZARR_CODEC_SPEC) router = APIRouter() From 9fd4d35a3020753c9e6ecfa62f96a6e63b25e213 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 22 Oct 2024 12:22:43 -0400 Subject: [PATCH 40/46] TST: add authentication tests --- tiled/_tests/test_zarr.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tiled/_tests/test_zarr.py b/tiled/_tests/test_zarr.py index 20b8309ee..f253c714d 100644 --- a/tiled/_tests/test_zarr.py +++ b/tiled/_tests/test_zarr.py @@ -13,7 +13,7 @@ import zarr from fsspec.implementations.http import HTTPFileSystem from httpx import ASGITransport, AsyncClient -from starlette.status import HTTP_200_OK, HTTP_404_NOT_FOUND +from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_404_NOT_FOUND from ..adapters.array import ArrayAdapter from ..adapters.dataframe import DataFrameAdapter @@ -206,6 +206,35 @@ async def test_zarr_array_routes(path, app): assert response.status_code == HTTP_200_OK +@pytest.mark.parametrize( + "path", + [ + "/zarr/v2/", + "/zarr/v2", + "/zarr/v2/nested", + "/zarr/v2/table/single", + "/zarr/v2/nested/cube/tiny_cube", + "/zarr/v2/table/single/x", + ], +) +@pytest.mark.asyncio +async def test_authentication(path, app): + async with AsyncClient( + transport=ASGITransport(app=app), + base_url="http://test", + headers={"Authorization": "Apikey not-secret"}, + follow_redirects=True, + ) as client: + response = await client.get(path) + assert response.status_code == HTTP_401_UNAUTHORIZED + + response = await client.get(path + "/.zarray") + assert response.status_code == HTTP_401_UNAUTHORIZED + + response = await client.get(path + "/.zgroup") + assert response.status_code == HTTP_401_UNAUTHORIZED + + def test_zarr_integration(server, fs): url = f"http://localhost:{server.port}/zarr/v2/" grp = zarr.open(fs.get_mapper(url), mode="r") From 83351367544bb3a0f04540c757404423d8ad8a0a Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 22 Oct 2024 12:34:16 -0400 Subject: [PATCH 41/46] FIX: ensure support for py3.8 --- tiled/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index 80ac9fc65..1dc33c302 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -890,7 +890,7 @@ async def resolve_zarr_uris(request: Request, call_next): # safely encoded) if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: # Extract the last bit of the path - zarr_path = request.url.path.removeprefix(ZARR_PREFIX).strip("/").split("/") + zarr_path = request.url.path[len(ZARR_PREFIX):].strip("/").split("/") zarr_block = zarr_path[-1] if len(zarr_path) > 0 else "" if re.compile(r"^(?:\d+\.)*\d+$").fullmatch(zarr_block): # Create a query string if the last part is in the zarr block form, e.g. `m.n.p. ... .q` From fc2a4cea967af5fd239cedfa93f42810faad6380 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 22 Oct 2024 12:43:13 -0400 Subject: [PATCH 42/46] MNT: lint --- tiled/server/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tiled/server/app.py b/tiled/server/app.py index 1dc33c302..304ed263d 100644 --- a/tiled/server/app.py +++ b/tiled/server/app.py @@ -890,7 +890,11 @@ async def resolve_zarr_uris(request: Request, call_next): # safely encoded) if request.url.path.startswith(ZARR_PREFIX) and response.status_code == 404: # Extract the last bit of the path - zarr_path = request.url.path[len(ZARR_PREFIX):].strip("/").split("/") + zarr_path = ( + request.url.path[len(ZARR_PREFIX) :] # noqa: #E203 + .strip("/") + .split("/") + ) zarr_block = zarr_path[-1] if len(zarr_path) > 0 else "" if re.compile(r"^(?:\d+\.)*\d+$").fullmatch(zarr_block): # Create a query string if the last part is in the zarr block form, e.g. `m.n.p. ... .q` From 9c5388b49f49c67d125e85b16844fa0f89e0722b Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 22 Oct 2024 12:54:58 -0400 Subject: [PATCH 43/46] MNT: add changelog entry --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ae138eaf..c90f1ab3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Write the date in place of the "Unreleased" in the case a new version is release # Changelog +## Unreleased + +### Added + +- zarr v2 endpoints for read access + ## v0.1.0b10 (2024-10-11) - Add kwarg to client logout to auto-clear default identity. From 851c1557c1e46927c46e05b6c1b883b2f8850d04 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 23 Oct 2024 13:27:49 -0400 Subject: [PATCH 44/46] MNT: moved aiohttp from required to dev dependencies --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 66df566e9..a00664062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,6 @@ tiled = "tiled.commandline.main:main" # This is the union of all optional dependencies. all = [ - "aiohttp", "aiofiles", "aiosqlite", "alembic", @@ -146,6 +145,7 @@ dataframe = [ # These are required for developing the package (running the tests, building # the documentation) but not necessarily required for _using_ it. dev = [ + "aiohttp", "coverage", "flake8", "importlib_resources;python_version < \"3.9\"", @@ -191,7 +191,6 @@ minimal-client = [ ] # These are the requirements needed for basic server functionality. minimal-server = [ - "aiohttp", "aiofiles", "aiosqlite", "alembic", @@ -227,7 +226,6 @@ minimal-server = [ ] # This is the "kichen sink" fully-featured server dependency set. server = [ - "aiohttp", "aiofiles", "aiosqlite", "alembic", From 4afd5e4d164e75a0688839fea9b1060c4aff621a Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 24 Oct 2024 11:00:31 -0400 Subject: [PATCH 45/46] TST: refactor ThreadedServer class for tests --- tiled/_tests/test_server.py | 33 ++------------------------------- tiled/_tests/test_zarr.py | 21 +-------------------- tiled/_tests/utils.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 51 deletions(-) diff --git a/tiled/_tests/test_server.py b/tiled/_tests/test_server.py index a22e8b7b8..4dde50969 100644 --- a/tiled/_tests/test_server.py +++ b/tiled/_tests/test_server.py @@ -1,7 +1,3 @@ -import contextlib -import threading -import time - import pytest import uvicorn from fastapi import APIRouter @@ -11,35 +7,10 @@ from ..client import from_uri from ..server.app import build_app from ..server.logging_config import LOGGING_CONFIG +from .utils import ThreadedServer router = APIRouter() - -class Server(uvicorn.Server): - # https://github.com/encode/uvicorn/discussions/1103#discussioncomment-941726 - - def install_signal_handlers(self): - pass - - @contextlib.contextmanager - def run_in_thread(self): - thread = threading.Thread(target=self.run) - thread.start() - try: - # Wait for server to start up, or raise TimeoutError. - for _ in range(100): - time.sleep(0.1) - if self.started: - break - else: - raise TimeoutError("Server did not start in 10 seconds.") - host, port = self.servers[0].sockets[0].getsockname() - yield f"http://{host}:{port}" - finally: - self.should_exit = True - thread.join() - - API_KEY = "secret" @@ -49,7 +20,7 @@ def server(tmpdir): app = build_app(catalog, {"single_user_api_key": API_KEY}) app.include_router(router) config = uvicorn.Config(app, port=0, loop="asyncio", log_config=LOGGING_CONFIG) - server = Server(config) + server = ThreadedServer(config) with server.run_in_thread() as url: yield url diff --git a/tiled/_tests/test_zarr.py b/tiled/_tests/test_zarr.py index f253c714d..0c8309396 100644 --- a/tiled/_tests/test_zarr.py +++ b/tiled/_tests/test_zarr.py @@ -1,8 +1,5 @@ -import contextlib import math import string -import threading -import time import warnings import dask.array @@ -19,6 +16,7 @@ from ..adapters.dataframe import DataFrameAdapter from ..adapters.mapping import MapAdapter from ..server.app import build_app +from .utils import ThreadedServer rng = numpy.random.default_rng(seed=42) array_cases = { @@ -127,23 +125,6 @@ def app(): return app -class ThreadedServer(uvicorn.Server): - @contextlib.contextmanager - def run_in_thread(self): - thread = threading.Thread(target=self.run) - thread.start() - try: - while not self.started: - time.sleep(1e-3) - self.port = ( - self.servers[0].sockets[0].getsockname()[1] - ) # Actual port number - yield - finally: - self.should_exit = True - thread.join() - - @pytest.fixture(scope="module") def server(app): config = uvicorn.Config(app, host="127.0.0.1", port=0, log_level="info") diff --git a/tiled/_tests/utils.py b/tiled/_tests/utils.py index b3a4bc9df..f00bc68a4 100644 --- a/tiled/_tests/utils.py +++ b/tiled/_tests/utils.py @@ -3,12 +3,15 @@ import sqlite3 import sys import tempfile +import threading +import time import uuid from enum import IntEnum from pathlib import Path import httpx import pytest +import uvicorn from sqlalchemy import text from sqlalchemy.ext.asyncio import create_async_engine @@ -92,3 +95,30 @@ def sqlite_from_dump(filename): conn.executescript(path.read_text()) conn.close() yield database_path + + +class ThreadedServer(uvicorn.Server): + # https://github.com/encode/uvicorn/discussions/1103#discussioncomment-941726 + + def install_signal_handlers(self): + pass + + @contextlib.contextmanager + def run_in_thread(self): + thread = threading.Thread(target=self.run) + thread.start() + try: + # Wait for server to start up, or raise TimeoutError. + for _ in range(100): + time.sleep(0.1) + if self.started: + break + else: + raise TimeoutError("Server did not start in 10 seconds.") + + # Get the actual hostname and port number + self.host, self.port = self.servers[0].sockets[0].getsockname() + yield f"http://{self.host}:{self.port}" + finally: + self.should_exit = True + thread.join() From 202f10eb161282c95173ab049073e727e60cbc82 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 24 Oct 2024 12:25:45 -0400 Subject: [PATCH 46/46] TST: test no writing in read-only mode --- tiled/_tests/test_zarr.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tiled/_tests/test_zarr.py b/tiled/_tests/test_zarr.py index 0c8309396..77632eaf0 100644 --- a/tiled/_tests/test_zarr.py +++ b/tiled/_tests/test_zarr.py @@ -318,3 +318,14 @@ def test_dataframe_column(key, server, fs): actual = arr[...] expected = df[col] assert numpy.array_equal(actual, expected) + + +def test_writing(server, fs): + url = f"http://localhost:{server.port}/zarr/v2/nested/array" + + with pytest.raises(NotImplementedError): + grp = zarr.open(fs.get_mapper(url), mode="w") + + with pytest.raises(zarr.errors.ReadOnlyError): + grp = zarr.open(fs.get_mapper(url), mode="r") + grp["random_2d"][0, 0] = 0.0