-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
221 additions
and
159 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1,114 @@ | ||
from typing import List, Optional, Dict, Any | ||
""" | ||
# Semantic-Code Text - Datamodel | ||
This module provides the pydantic metadata schema for Semantic Text Code results. | ||
The schema is conformant with https://schema.iscc.codes/ | ||
The `features` property of the top-level Metadata Object supports two different formats for | ||
representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**. | ||
These formats are designed to offer flexibility in how feature data is structured and processed, | ||
catering to different use cases where either performance or clarity is prioritized. | ||
## Features Index-Format (Compact Array Structure): | ||
In this compact format, features are represented as a list of strings, with optional parallel arrays to | ||
store related attributes such as `offsets`, `sizes`, and `contents`. | ||
**Example**: | ||
```json | ||
{ | ||
"maintype": "semantic", | ||
"subtype": "text", | ||
"version": 0, | ||
"simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"], | ||
"offsets": [0, 12], | ||
"sizes": [12, 48], | ||
"contents": ["textchunk no one", "textchunk no two"] | ||
} | ||
``` | ||
**Use Case**: | ||
- Best suited for scenarios where storage efficiency is critical, and the overhead of processing | ||
multiple parallel arrays is acceptable. | ||
- Useful when all features share the same set of attributes, allowing for faster bulk processing. | ||
## Features Object-Format (Self-Descriptive Object Structure): | ||
In this convenient format, each feature is represented as an individual object containing its | ||
attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but | ||
easier to read and work with. | ||
**Example**: | ||
```json | ||
{ | ||
"maintype": "semantic", | ||
"subtype": "text", | ||
"version": 0, | ||
"simprints": [ | ||
{ | ||
"simprint": "lUjuScFYBik", | ||
"offset": 0, | ||
"size": 25, | ||
"content": "ISCC - Semantic Text-Code" | ||
} | ||
] | ||
} | ||
``` | ||
**Use Case**: | ||
- Ideal for scenarios where clarity and readability are prioritized. | ||
- Each feature is self-contained, making it easier to understand, extend, and debug. | ||
- Flexibility in including or omitting optional attributes per feature. | ||
### Unified FeatureSet Schema: | ||
The `FeatureSet` model unifies these two formats by allowing either structure to be used. | ||
To use the `FeatureSet` model, provide data in either the Index-Format or the Object-Format. | ||
""" | ||
|
||
from typing import List, Optional, Dict, Any, Union | ||
from pydantic import BaseModel | ||
|
||
__all__ = ["SctFeature", "SctMeta"] | ||
|
||
__all__ = ["Feature", "FeatureSet", "Metadata"] | ||
|
||
|
||
class PrettyBaseModel(BaseModel): | ||
def __repr__(self): | ||
return self.pretty_repr() | ||
|
||
def pretty_repr(self): | ||
return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True) | ||
return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False) | ||
|
||
|
||
class SctFeature(PrettyBaseModel): | ||
feature: Optional[str] = None | ||
class Feature(PrettyBaseModel): | ||
simprint: str | ||
offset: Optional[int] = None | ||
size: Optional[int] = None | ||
text: Optional[str] = None | ||
content: Optional[str] = None | ||
|
||
|
||
class FeatureSet(PrettyBaseModel): | ||
maintype: str = "semantic" | ||
subtype: str = "text" | ||
version: int = 0 | ||
embedding: Optional[List[float]] = None | ||
simprints: Optional[ | ||
Union[ | ||
List[str], # Index-Format | ||
List[Feature], # Object-Format | ||
] | ||
] = None | ||
offsets: Optional[List[int]] = None | ||
sizes: Optional[List[int]] = None | ||
contents: Optional[List[str]] = None | ||
|
||
class SctMeta(PrettyBaseModel): | ||
|
||
class Metadata(PrettyBaseModel): | ||
iscc: str | ||
characters: Optional[int] = None | ||
embedding: Optional[List[float]] = None | ||
features: Optional[List[SctFeature]] = None | ||
|
||
@classmethod | ||
def from_dict(cls, data: Dict[str, Any]) -> "SctMeta": | ||
features = [] | ||
feature_list = data.get("features", []) | ||
offset_list = data.get("offsets", []) | ||
size_list = data.get("sizes", []) | ||
text_list = data.get("chunks", []) | ||
|
||
max_len = max(len(feature_list), len(offset_list), len(size_list), len(text_list)) | ||
|
||
for i in range(max_len): | ||
features.append( | ||
SctFeature( | ||
feature=feature_list[i] if feature_list else None, | ||
offset=offset_list[i] if offset_list else None, | ||
size=size_list[i] if size_list else None, | ||
text=text_list[i] if text_list else None, | ||
) | ||
) | ||
return cls( | ||
iscc=data["iscc"], | ||
characters=data.get("characters"), | ||
embedding=data.get("embedding"), | ||
features=features if features else None, | ||
) | ||
features: Optional[List[FeatureSet]] = None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.