Skip to content

Commit

Permalink
Refactor data model
Browse files Browse the repository at this point in the history
  • Loading branch information
titusz committed Aug 13, 2024
1 parent bc13e76 commit 8d33ec4
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 159 deletions.
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,21 @@ poet For granular (per chunk) feature outputs:
"characters": 77,
"features": [
{
"feature": "XZjeSfdyVi0",
"offset": 0,
"size": 77,
"text": "This is some sample text. It can be a longer document or even an entire book."
"maintype": "semantic",
"subtype": "text",
"version": 0,
"simprints": [
"XZjeSfdyVi0"
],
"offsets": [
0
],
"sizes": [
77
],
"contents": [
"This is some sample text. It can be a longer document or even an entire book."
]
}
]
}
Expand Down
34 changes: 21 additions & 13 deletions iscc_sct/code_semantic_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,40 +104,48 @@ def gen_text_code_semantic(text, **options):
:key max_tokens (int): Max tokens per chunk (default 127).
:key overlap (int): Max tokens allowed to overlap between chunks (default 48).
:key trim (bool): Trim whitespace from chunks (default False).
:return: Dict with ISCC processing results
:return: Dict with ISCC processing results (using Index-Format for granular features)
"""

if not text:
raise ValueError("Input text cannot be empty.")

opts = sct.sct_opts.override(options)

result = {"iscc": None} # Initialize first so `iscc` key is first in dict
result = {"iscc": None} # Initialize first so `iscc` key is "first" in dict

if opts.characters:
result["characters"] = len(text)

# Text splitting
splits = split_text(text, **opts.model_dump())
offsets, chunks = [list(item) for item in zip(*splits)]
if opts.chunks:
result["chunks"] = chunks
if opts.offsets:
result["offsets"] = offsets
if opts.sizes:
result["sizes"] = [len(chunk) for chunk in chunks]

# Chunk embedding
with sct.timer("EMBEDDING time"):
embeddings = embed_chunks(chunks)
if opts.features:
feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
result["features"] = [sct.encode_base64(digest) for digest in feature_digests]

# Create global document embedding
embedding = mean_pooling(embeddings)
if opts.embedding:
result["embedding"] = compress(embedding, opts.precision)

if any([opts.simprints, opts.offsets, opts.sizes, opts.contents, opts.embedding]):
feature_set = {
"maintype": "semantic",
"subtype": "text",
"version": 0,
}
if opts.embedding:
feature_set["embedding"] = compress(embedding, opts.precision)
if opts.simprints:
feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
feature_set["simprints"] = [sct.encode_base64(digest) for digest in feature_digests]
if opts.offsets:
feature_set["offsets"] = offsets
if opts.sizes:
feature_set["sizes"] = [len(chunk) for chunk in chunks]
if opts.contents:
feature_set["contents"] = chunks
result["features"] = [feature_set]

# Encode global document embedding
length = BIT_LEN_MAP[opts.bits]
Expand Down
6 changes: 3 additions & 3 deletions iscc_sct/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from iscc_sct.models import SctMeta
from iscc_sct.models import Metadata
from iscc_sct.code_semantic_text import gen_text_code_semantic
from iscc_sct.options import sct_opts

Expand All @@ -21,11 +21,11 @@ def create(text, granular=False, **options):
"""

# Override global options with individual options derived from `granular` parameter
granular = dict(features=True, offsets=True, sizes=True, chunks=True) if granular else {}
granular = dict(simprints=True, offsets=True, sizes=True, contents=True) if granular else {}
opts = sct_opts.override(granular)

# Override local options with individual options form additional keyword arguments
opts = opts.override(options)

data = gen_text_code_semantic(text, **opts.model_dump())
return SctMeta.from_dict(data)
return Metadata(**data)
132 changes: 97 additions & 35 deletions iscc_sct/models.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,114 @@
from typing import List, Optional, Dict, Any
"""
# Semantic-Code Text - Datamodel
This module provides the pydantic metadata schema for Semantic Text Code results.
The schema is conformant with https://schema.iscc.codes/
The `features` property of the top-level Metadata object supports two different formats for
representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**.
These formats are designed to offer flexibility in how feature data is structured and processed,
catering to different use cases where either performance or clarity is prioritized.
## Features Index-Format (Compact Array Structure):
In this compact format, features are represented as a list of strings, with optional parallel arrays to
store related attributes such as `offsets`, `sizes`, and `contents`.
**Example**:
```json
{
"maintype": "semantic",
"subtype": "text",
"version": 0,
"simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
"offsets": [0, 12],
"sizes": [12, 48],
"contents": ["textchunk no one", "textchunk no two"]
}
```
**Use Case**:
- Best suited for scenarios where storage efficiency is critical, and the overhead of processing
multiple parallel arrays is acceptable.
- Useful when all features share the same set of attributes, allowing for faster bulk processing.
## Features Object-Format (Self-Descriptive Object Structure):
In this convenient format, each feature is represented as an individual object containing its
attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but
easier to read and work with.
**Example**:
```json
{
"maintype": "content",
"subtype": "text",
"version": 0,
"simprints": [
{
"simprint": "lUjuScFYBik",
"offset": 0,
"size": 25,
"content": "ISCC - Semantic Text-Code"
}
]
}
```
**Use Case**:
- Ideal for scenarios where clarity and readability are prioritized.
- Each feature is self-contained, making it easier to understand, extend, and debug.
- Flexibility in including or omitting optional attributes per feature.
### Unified FeatureSet Schema:
The `FeatureSet` model unifies these two formats by allowing either structure to be used.
To use the `FeatureSet` model, provide data in either the Index-Format or the Object-Format.
"""

from typing import List, Optional, Dict, Any, Union
from pydantic import BaseModel

__all__ = ["SctFeature", "SctMeta"]

__all__ = ["Feature", "FeatureSet", "Metadata"]


class PrettyBaseModel(BaseModel):
def __repr__(self):
return self.pretty_repr()

def pretty_repr(self):
return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True)
return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False)


class SctFeature(PrettyBaseModel):
feature: Optional[str] = None
class Feature(PrettyBaseModel):
simprint: str
offset: Optional[int] = None
size: Optional[int] = None
text: Optional[str] = None
content: Optional[str] = None


class FeatureSet(PrettyBaseModel):
maintype: str = "semantic"
subtype: str = "text"
version: int = 0
embedding: Optional[List[float]] = None
simprints: Optional[
Union[
List[str], # Index-Format
List[Feature], # Object-Format
]
] = None
offsets: Optional[List[int]] = None
sizes: Optional[List[int]] = None
contents: Optional[List[str]] = None

class SctMeta(PrettyBaseModel):

class Metadata(PrettyBaseModel):
iscc: str
characters: Optional[int] = None
embedding: Optional[List[float]] = None
features: Optional[List[SctFeature]] = None

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SctMeta":
features = []
feature_list = data.get("features", [])
offset_list = data.get("offsets", [])
size_list = data.get("sizes", [])
text_list = data.get("chunks", [])

max_len = max(len(feature_list), len(offset_list), len(size_list), len(text_list))

for i in range(max_len):
features.append(
SctFeature(
feature=feature_list[i] if feature_list else None,
offset=offset_list[i] if offset_list else None,
size=size_list[i] if size_list else None,
text=text_list[i] if text_list else None,
)
)
return cls(
iscc=data["iscc"],
characters=data.get("characters"),
embedding=data.get("embedding"),
features=features if features else None,
)
features: Optional[List[FeatureSet]] = None
4 changes: 2 additions & 2 deletions iscc_sct/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ class SctOptions(BaseSettings):

precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)")

features: bool = Field(False, description="ISCC_SCT_FEATURES - Include granular feature simprints")
simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints")
offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features")

sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)")

chunks: bool = Field(False, description="ISCC_SCT_CHUNKS - Include granular text chunks")
contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")

max_tokens: int = Field(
127,
Expand Down
Loading

0 comments on commit 8d33ec4

Please sign in to comment.