Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JSON deserialization support #257

Merged
merged 1 commit into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
336 changes: 164 additions & 172 deletions python-default.lock

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions python-packages/smithy-core/smithy_core/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def create_serializer(self, sink: BytesWriter) -> "ShapeSerializer":
"""
...

def create_deserializer(self, source: BytesReader) -> "ShapeDeserializer":
def create_deserializer(self, source: bytes | BytesReader) -> "ShapeDeserializer":
"""Create a deserializer that reads from the given bytes reader.

:param source: The source to read bytes from.
Expand Down Expand Up @@ -59,7 +59,5 @@ def deserialize[
:param shape: The shape class to deserialize into.
:returns: An instance of the given shape class with the data from the source.
"""
if isinstance(source, bytes):
source = BytesIO(source)
deserializer = self.create_deserializer(source=source)
return shape.deserialize(deserializer=deserializer)
13 changes: 10 additions & 3 deletions python-packages/smithy-core/smithy_core/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,14 @@ def _wrap_list(self, value: Sequence[DocumentValue]) -> list["Document"]:
schema = self._schema
if schema.shape_type is ShapeType.LIST:
schema = self._schema.members["member"]
return [Document(e, schema=schema) for e in value]
return [self._new_document(e, schema) for e in value]

def _new_document(
self,
value: DocumentValue | dict[str, "Document"] | list["Document"],
schema: Schema,
) -> "Document":
return Document(value, schema=schema)

def as_map(self) -> dict[str, "Document"]:
"""Asserts the document is a map and returns it."""
Expand All @@ -221,11 +228,11 @@ def _wrap_map(self, value: Mapping[str, DocumentValue]) -> dict[str, "Document"]
member_schema = self._schema
if self._schema.shape_type is ShapeType.MAP:
member_schema = self._schema.members["value"]
return {k: Document(v, schema=member_schema) for k, v in value.items()}
return {k: self._new_document(v, member_schema) for k, v in value.items()}

result: dict[str, "Document"] = {}
for k, v in value.items():
result[k] = Document(v, schema=self._schema.members[k])
result[k] = self._new_document(v, self._schema.members[k])
return result

def as_value(self) -> DocumentValue:
Expand Down
8 changes: 8 additions & 0 deletions python-packages/smithy-json/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,11 @@ python_distribution(
version="0.0.1",
),
)

# We shouldn't need this, but pants will assume that smithy_core is an external
# dependency since it's in pyproject.toml and there's no way to exclude it, so
# for now we need to duplicate things.
python_requirements(
name="requirements",
source="requirements.txt",
)
3 changes: 2 additions & 1 deletion python-packages/smithy-json/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ classifiers = [
"Topic :: Software Development :: Libraries"
]
dependencies = [
"smithy_core==0.0.1"
"smithy_core==0.0.1",
"ijson==3.2.3",
]

[project.urls]
Expand Down
4 changes: 4 additions & 0 deletions python-packages/smithy-json/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# We shouldn't need this, but pants will assume that smithy_core is an external
# dependency since it's in pyproject.toml and there's no way to exclude it, so
# for now we need to duplicate things.
ijson==3.2.3
13 changes: 11 additions & 2 deletions python-packages/smithy-json/smithy_json/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
# SPDX-License-Identifier: Apache-2.0
__version__ = "0.0.1"

from io import BytesIO

from smithy_core.codecs import Codec
from smithy_core.deserializers import ShapeDeserializer
from smithy_core.interfaces import BytesReader, BytesWriter
from smithy_core.serializers import ShapeSerializer
from smithy_core.types import TimestampFormat

from ._private.deserializers import JSONShapeDeserializer as _JSONShapeDeserializer
from ._private.serializers import JSONShapeSerializer as _JSONShapeSerializer


Expand Down Expand Up @@ -50,5 +52,12 @@ def create_serializer(self, sink: BytesWriter) -> "ShapeSerializer":
default_timestamp_format=self._default_timestamp_format,
)

def create_deserializer(self, source: BytesReader) -> "ShapeDeserializer":
raise NotImplementedError()
def create_deserializer(self, source: bytes | BytesReader) -> "ShapeDeserializer":
if isinstance(source, bytes):
source = BytesIO(source)
return _JSONShapeDeserializer(
source,
use_json_name=self._use_json_name,
use_timestamp_format=self._use_timestamp_format,
default_timestamp_format=self._default_timestamp_format,
)
269 changes: 269 additions & 0 deletions python-packages/smithy-json/smithy_json/_private/deserializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

import datetime
from base64 import b64decode
from collections.abc import Callable, Iterator, Mapping, Sequence
from decimal import Decimal
from typing import Literal, NamedTuple, Protocol, cast

import ijson # type: ignore
from ijson.common import ObjectBuilder # type: ignore
from smithy_core.deserializers import ShapeDeserializer
from smithy_core.documents import Document
from smithy_core.exceptions import SmithyException
from smithy_core.interfaces import BytesReader
from smithy_core.schemas import Schema
from smithy_core.shapes import ShapeID
from smithy_core.types import TimestampFormat

from .documents import JSONDocument
from .traits import JSON_NAME, TIMESTAMP_FORMAT

# TODO: put these type hints in a pyi somewhere. There here because ijson isn't
# typed.
type JSONParseEventType = Literal[
"string",
"number",
"boolean",
"start_array",
"end_array",
"start_map",
"map_key",
"end_map",
]

type JSONParseEventValue = str | int | float | Decimal | bool | None

type JSON = Mapping[str, "JSON"] | Sequence["JSON"] | JSONParseEventValue


class JSONParseEvent(NamedTuple):
    """A single parse event, unpacked from the 3-tuples the ijson stream yields."""

    # Location of the event within the JSON text, as reported by the parser.
    path: str

    # The kind of token this event represents.
    type: JSONParseEventType

    # The scalar payload of the token.
    value: JSONParseEventValue


class JSONTokenError(SmithyException):
    """Raised when the next JSON token is not of the kind a reader expected."""

    def __init__(self, expected: str, event: JSONParseEvent) -> None:
        """Build the error message from the expectation and the offending event.

        :param expected: Name of the token type the caller was looking for.
        :param event: The parse event that was actually encountered.
        """
        message = (
            f"Error parsing JSON. Expected token of type `{expected}` at path "
            f"`{event.path}`, but found: `{event.type}`: {event.value}"
        )
        super().__init__(message)


class TypedObjectBuilder(Protocol):
    """Structural type for the (untyped) ijson ``ObjectBuilder``.

    Only the members this module uses are declared: the ``value`` attribute
    and the ``event`` method.
    """

    # The JSON value assembled from the events fed to the builder so far.
    value: JSON

    def event(self, event: JSONParseEventType, value: JSONParseEventValue):
        """Feed one parse event (its type and payload) to the builder."""
        ...


class BufferedParser:
    """A wrapper around the ijson parser that allows peeking.

    The wrapper holds at most one look-ahead event. ``peek`` returns the next
    event without consuming it; iteration (``next``) returns the buffered
    event first, if there is one, before pulling from the underlying stream.
    """

    def __init__(
        self, stream: Iterator[tuple[str, JSONParseEventType, JSONParseEventValue]]
    ) -> None:
        """Wrap the given event stream.

        :param stream: An iterator of ``(path, type, value)`` tuples.
        """
        self._stream = stream
        # The event returned by the last ``peek`` that has not been consumed yet.
        self._pending: JSONParseEvent | None = None

    def __iter__(self) -> "BufferedParser":
        return self

    def __next__(self) -> JSONParseEvent:
        """Return the next event, consuming the peeked event if one is buffered."""
        if self._pending is not None:
            result = self._pending
            self._pending = None
            return result
        return self._next()

    def _next(self) -> JSONParseEvent:
        """Pull one raw tuple from the underlying stream and wrap it."""
        return JSONParseEvent(*next(self._stream))

    def peek(self) -> JSONParseEvent:
        """Return the next event without consuming it.

        Calling ``peek`` repeatedly returns the same event until it is consumed
        with ``next``. Previously every call advanced the underlying stream and
        overwrote the buffered event, so back-to-back peeks dropped events.
        """
        if self._pending is None:
            self._pending = self._next()
        return self._pending


class JSONShapeDeserializer(ShapeDeserializer):
def __init__(
self,
source: BytesReader,
*,
use_json_name: bool = True,
use_timestamp_format: bool = True,
default_timestamp_format: TimestampFormat = TimestampFormat.DATE_TIME,
) -> None:
self._stream = BufferedParser(ijson.parse(source))
self._use_json_name = use_json_name
self._use_timestamp_format = use_timestamp_format
self._default_timestamp_format = default_timestamp_format

# A mapping of json name to member name for each shape. Since the deserializer
# is shared and we don't know which shapes will be deserialized, this is
# populated on an as-needed basis.
self._json_names: dict[ShapeID, dict[str, str]] = {}

def read_null(self, schema: Schema) -> None:
event = next(self._stream)
if event.value is not None:
raise JSONTokenError("null", event)
return None

def read_boolean(self, schema: Schema) -> bool:
event = next(self._stream)
if not isinstance(event.value, bool):
raise JSONTokenError("boolean", event)
return event.value

def read_blob(self, schema: Schema) -> bytes:
event = next(self._stream)
if event.type != "string" or not isinstance(event.value, str):
raise JSONTokenError("string", event)
return b64decode(event.value)

def read_integer(self, schema: Schema) -> int:
event = next(self._stream)
if not isinstance(event.value, int):
raise JSONTokenError("number", event)
return event.value

def read_float(self, schema: Schema) -> float:
event = next(self._stream)
match event.value:
case Decimal():
return float(event.value)
case int() | float():
return event.value
case _:
raise JSONTokenError("number", event)

def read_big_decimal(self, schema: Schema) -> Decimal:
event = next(self._stream)
match event.value:
case Decimal():
return event.value
case int() | float():
return Decimal.from_float(event.value)
case _:
raise JSONTokenError("number", event)

def read_string(self, schema: Schema) -> str:
event = next(self._stream)
if event.type not in ("string", "map_key") or not isinstance(event.value, str):
raise JSONTokenError("string | map_key", event)
return event.value

def read_document(self, schema: Schema) -> Document:
start = next(self._stream)
if start.type not in ("start_map", "start_array"):
return JSONDocument(
start.value,
schema=schema,
use_json_name=self._use_json_name,
default_timestamp_format=self._default_timestamp_format,
use_timestamp_format=self._use_timestamp_format,
)

end_type = "end_map" if start.type == "start_map" else "end_array"
builder = cast(TypedObjectBuilder, ObjectBuilder())
builder.event(start.type, start.value)
while (
event := next(self._stream)
).path != start.path or event.type != end_type:
builder.event(event.type, event.value)

return JSONDocument(
builder.value,
schema=schema,
use_json_name=self._use_json_name,
default_timestamp_format=self._default_timestamp_format,
use_timestamp_format=self._use_timestamp_format,
)

def read_timestamp(self, schema: Schema) -> datetime.datetime:
format = self._default_timestamp_format
if self._use_timestamp_format:
if format_trait := schema.traits.get(TIMESTAMP_FORMAT):
format = TimestampFormat(format_trait.value)

match format:
case TimestampFormat.EPOCH_SECONDS:
return format.deserialize(self.read_float(schema=schema))
case _:
return format.deserialize(self.read_string(schema=schema))

def read_struct(
self, schema: Schema, consumer: Callable[[Schema, "ShapeDeserializer"], None]
):
event = next(self._stream)
if event.type != "start_map":
raise JSONTokenError("start_map", event)

while self._stream.peek().type != "end_map":
key = self.read_string(schema=schema)
member = self._resolve_member(schema=schema, key=key)
if not member:
self._skip()
continue
consumer(member, self)

next(self._stream)

def _resolve_member(self, schema: Schema, key: str) -> Schema | None:
if self._use_json_name:
if schema.id not in self._json_names:
self._cache_json_names(schema=schema)
if key in self._json_names[schema.id]:
return schema.members.get(self._json_names[schema.id][key])
return None

return schema.members.get(key)

def _cache_json_names(self, schema: Schema):
result: dict[str, str] = {}
for member_name, member_schema in schema.members.items():
name: str = member_name
if json_name := member_schema.traits.get(JSON_NAME):
name = cast(str, json_name.value)
result[name] = member_name
self._json_names[schema.id] = result

def read_list(
self, schema: Schema, consumer: Callable[["ShapeDeserializer"], None]
):
event = next(self._stream)
if event.type != "start_array":
raise JSONTokenError("start_array", event)

while self._stream.peek().type != "end_array":
consumer(self)

next(self._stream)

def read_map(
self,
schema: Schema,
consumer: Callable[[str, "ShapeDeserializer"], None],
):
event = next(self._stream)
if event.type != "start_map":
raise JSONTokenError("start_map", event)

key_schema = schema.members["key"]
while self._stream.peek().type != "end_map":
consumer(self.read_string(schema=key_schema), self)

next(self._stream)

def _skip(self) -> None:
start = next(self._stream)
if start.type not in ("start_map", "start_array"):
return

end_type = "end_map" if start.type == "start_map" else "end_array"

while (
event := next(self._stream)
).path != start.path or event.type != end_type:
continue
Loading
Loading