Skip to content

Commit

Permalink
Revisit typing around add_metadata and validate_metadata + create new…
Browse files Browse the repository at this point in the history
… convert_and_check_metadata
  • Loading branch information
benoit74 committed Jul 30, 2024
1 parent 286fd07 commit 2d2c547
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 27 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Automatically index PDF documents content #167
- Automatically set proper title on PDF documents #168
- Expose new `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
- Add `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
- New `creator.Creator.convert_and_check_metadata` to convert metadata to bytes or str for known use cases and check proper type is passed to libzim

## Changed

Expand All @@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- **BREAKING** Force all boolean arguments (and some other non-obvious parameters) to be keyword-only in function calls for clarity / disambiguation (see ruff rule FBT002)
- Prefer to use `IO[bytes]` to `io.BytesIO` when possible since it is more generic
- **BREAKING** `i18n.NotFound` renamed `i18n.NotFoundError`
- **BREAKING** `types.get_mime_for_name` now returns `str | None`
- **BREAKING** `creator.Creator.add_metadata` and `creator.Creator.validate_metadata` now only accepts `bytes | str` as value (it must have been converted before call)
- **BREAKING** second argument of `creator.Creator.add_metadata` has been renamed to `value` instead of `content` to align with other methods
- When a type issue arises in metadata checks, wrong value type is displayed in exception

### Fixed

Expand Down
55 changes: 33 additions & 22 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,22 +226,14 @@ def start(self):
del self._metadata["Illustration_48x48@1"]
for name, value in self._metadata.items():
if value:
self.add_metadata(name, value)
self.add_metadata(name, self.convert_and_check_metadata(name, value))

return self

def validate_metadata(
self,
name: str,
value: (
int
| float
| bytes
| str
| datetime.datetime
| datetime.date
| Iterable[str]
),
value: bytes | str,
):
"""Ensures metadata value for name is conform with the openZIM spec on Metadata
Expand All @@ -260,10 +252,37 @@ def validate_metadata(
validate_tags(name, value) # pyright: ignore
validate_illustrations(name, value) # pyright: ignore

def convert_and_check_metadata(
self,
name: str,
value: str | bytes | datetime.date | datetime.datetime | Iterable[str],
) -> str | bytes:
"""Convert metadata to appropriate type for few known usecase and check type
Date: converts date and datetime to string YYYY-MM-DD
Tags: converts iterable to string with semi-colon separator
Also checks that final type is appropriate for libzim (str or bytes)
"""
if name == "Date" and isinstance(value, (datetime.date, datetime.datetime)):
value = value.strftime("%Y-%m-%d")
if (
name == "Tags"
and not isinstance(value, str)
and not isinstance(value, bytes)
and isinstance(value, Iterable)
):
value = ";".join(value)

if not isinstance(value, str) and not isinstance(value, bytes):
raise ValueError(f"Invalid type for {name}: {type(value)}")

return value

def add_metadata(
self,
name: str,
content: str | bytes | datetime.date | datetime.datetime | Iterable[str],
value: str | bytes,
mimetype: str = "text/plain;charset=UTF-8",
):
# drop control characters before passing them to libzim
Expand All @@ -272,17 +291,9 @@ def add_metadata(
" \r\n\t"
)
if not self.disable_metadata_checks:
self.validate_metadata(name, content)
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
content = content.strftime("%Y-%m-%d").encode("UTF-8")
if (
name == "Tags"
and not isinstance(content, str)
and not isinstance(content, bytes)
and isinstance(content, Iterable)
):
content = ";".join(content)
super().add_metadata(name, content, mimetype)
self.validate_metadata(name, value)

super().add_metadata(name, value, mimetype)

# there are many N803 problems, but they are intentional to match real tag name
def config_metadata(
Expand Down
8 changes: 3 additions & 5 deletions src/zimscraperlib/zim/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ def validate_required_values(name: str, value: Any):

def validate_standard_str_types(
name: str,
value: (
int | float | bytes | str | datetime.datetime | datetime.date | Iterable[str]
),
value: str | bytes,
):
"""ensures standard string metadata are indeed str"""
if name in (
Expand All @@ -50,7 +48,7 @@ def validate_standard_str_types(
"Source",
"Scraper",
) and not isinstance(value, str):
raise ValueError(f"Invalid type for {name}")
raise ValueError(f"Invalid type for {name}: {type(value)}")


def validate_title(name: str, value: str):
Expand All @@ -63,7 +61,7 @@ def validate_date(name: str, value: datetime.datetime | datetime.date | str):
"""ensures Date metadata can be casted to an ISO 8601 string"""
if name == "Date":
if not isinstance(value, (datetime.datetime, datetime.date, str)):
raise ValueError(f"Invalid type for {name}.")
raise ValueError(f"Invalid type for {name}: {type(value)}")
elif isinstance(value, str):
match = re.match(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", value)
if not match:
Expand Down

0 comments on commit 2d2c547

Please sign in to comment.