Refactor data model

iscc · Aug 13, 2024 · 8d33ec4 · 8d33ec4
1 parent bc13e76
commit 8d33ec4
Show file tree

Hide file tree

Showing 8 changed files with 221 additions and 159 deletions.
diff --git a/README.md b/README.md
@@ -75,10 +75,21 @@ poet For granular (per chunk) feature outputs:
   "characters": 77,
   "features": [
     {
-      "feature": "XZjeSfdyVi0",
-      "offset": 0,
-      "size": 77,
-      "text": "This is some sample text. It can be a longer document or even an entire book."
+      "maintype": "semantic",
+      "subtype": "text",
+      "version": 0,
+      "simprints": [
+        "XZjeSfdyVi0"
+      ],
+      "offsets": [
+        0
+      ],
+      "sizes": [
+        77
+      ],
+      "contents": [
+        "This is some sample text. It can be a longer document or even an entire book."
+      ]
     }
   ]
 }

diff --git a/iscc_sct/code_semantic_text.py b/iscc_sct/code_semantic_text.py
@@ -104,40 +104,48 @@ def gen_text_code_semantic(text, **options):
     :key max_tokens (int): Max tokens per chunk (default 127).
     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
     :key trim (bool): Trim whitespace from chunks (default False).
-    :return: Dict with ISCC processing results
+    :return: Dict with ISCC processing results (using Index-Format for granular features)
     """
 
     if not text:
         raise ValueError("Input text cannot be empty.")
 
     opts = sct.sct_opts.override(options)
 
-    result = {"iscc": None}  # Initialize first so `iscc` key is first in dict
+    result = {"iscc": None}  # Initialize first so `iscc` key is "first" in dict
 
     if opts.characters:
         result["characters"] = len(text)
 
     # Text splitting
     splits = split_text(text, **opts.model_dump())
     offsets, chunks = [list(item) for item in zip(*splits)]
-    if opts.chunks:
-        result["chunks"] = chunks
-    if opts.offsets:
-        result["offsets"] = offsets
-    if opts.sizes:
-        result["sizes"] = [len(chunk) for chunk in chunks]
 
     # Chunk embedding
     with sct.timer("EMBEDDING time"):
         embeddings = embed_chunks(chunks)
-    if opts.features:
-        feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
-        result["features"] = [sct.encode_base64(digest) for digest in feature_digests]
 
     # Create global document embedding
     embedding = mean_pooling(embeddings)
-    if opts.embedding:
-        result["embedding"] = compress(embedding, opts.precision)
+
+    if any([opts.simprints, opts.offsets, opts.sizes, opts.contents, opts.embedding]):
+        feature_set = {
+            "maintype": "semantic",
+            "subtype": "text",
+            "version": 0,
+        }
+        if opts.embedding:
+            feature_set["embedding"] = compress(embedding, opts.precision)
+        if opts.simprints:
+            feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
+            feature_set["simprints"] = [sct.encode_base64(digest) for digest in feature_digests]
+        if opts.offsets:
+            feature_set["offsets"] = offsets
+        if opts.sizes:
+            feature_set["sizes"] = [len(chunk) for chunk in chunks]
+        if opts.contents:
+            feature_set["contents"] = chunks
+        result["features"] = [feature_set]
 
     # Encode global document embedding
     length = BIT_LEN_MAP[opts.bits]

diff --git a/iscc_sct/main.py b/iscc_sct/main.py
@@ -1,4 +1,4 @@
-from iscc_sct.models import SctMeta
+from iscc_sct.models import Metadata
 from iscc_sct.code_semantic_text import gen_text_code_semantic
 from iscc_sct.options import sct_opts
 
@@ -21,11 +21,11 @@ def create(text, granular=False, **options):
     """
 
     # Override global options with individual options derived from `granular` parameter
-    granular = dict(features=True, offsets=True, sizes=True, chunks=True) if granular else {}
+    granular = dict(simprints=True, offsets=True, sizes=True, contents=True) if granular else {}
     opts = sct_opts.override(granular)
 
     # Override local options with individual options from additional keyword arguments
     opts = opts.override(options)
 
     data = gen_text_code_semantic(text, **opts.model_dump())
-    return SctMeta.from_dict(data)
+    return Metadata(**data)
diff --git a/iscc_sct/models.py b/iscc_sct/models.py
@@ -1,52 +1,114 @@
-from typing import List, Optional, Dict, Any
+"""
+# Semantic-Code Text - Datamodel
+
+This module provides the pydantic metadata schema for Semantic Text Code results.
+The schema is conformant with https://schema.iscc.codes/
+
+The `features` property of the top-level Metadata object supports two different formats for
+representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**.
+These formats are designed to offer flexibility in how feature data is structured and processed,
+catering to different use cases where either performance or clarity is prioritized.
+
+## Features Index-Format (Compact Array Structure):
+
+In this compact format, features are represented as a list of strings, with optional parallel arrays to
+store related attributes such as `offsets`, `sizes`, and `contents`.
+
+**Example**:
+
+```json
+{
+    "maintype": "semantic",
+    "subtype": "text",
+    "version": 0,
+    "simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
+    "offsets": [0, 12],
+    "sizes": [12, 48],
+    "contents": ["textchunk no one", "textchunk no two"]
+}
+
+```
+
+**Use Case**:
+- Best suited for scenarios where storage efficiency is critical, and the overhead of processing
+  multiple parallel arrays is acceptable.
+- Useful when all features share the same set of attributes, allowing for faster bulk processing.
+
+## Features Object-Format (Self-Descriptive Object Structure):
+
+In this convenient format, each feature is represented as an individual object containing its
+attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but
+easier to read and work with.
+
+**Example**:
+
+```json
+{
+    "maintype": "semantic",
+    "subtype": "text",
+    "version": 0,
+    "simprints": [
+        {
+            "simprint": "lUjuScFYBik",
+            "offset": 0,
+            "size": 25,
+            "content": "ISCC - Semantic Text-Code"
+        }
+    ]
+}
+
+```
+**Use Case**:
+- Ideal for scenarios where clarity and readability are prioritized.
+- Each feature is self-contained, making it easier to understand, extend, and debug.
+- Flexibility in including or omitting optional attributes per feature.
+
+
+### Unified FeatureSet Schema:
+
+The `FeatureSet` model unifies these two formats by allowing either structure to be used.
+To use the `FeatureSet` model, provide data in either the Index-Format or the Object-Format.
+"""
+
+from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel
 
-__all__ = ["SctFeature", "SctMeta"]
+
+__all__ = ["Feature", "FeatureSet", "Metadata"]
 
 
 class PrettyBaseModel(BaseModel):
     def __repr__(self):
         return self.pretty_repr()
 
     def pretty_repr(self):
-        return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True)
+        return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False)
 
 
-class SctFeature(PrettyBaseModel):
-    feature: Optional[str] = None
+class Feature(PrettyBaseModel):
+    simprint: str
     offset: Optional[int] = None
     size: Optional[int] = None
-    text: Optional[str] = None
+    content: Optional[str] = None
+
 
+class FeatureSet(PrettyBaseModel):
+    maintype: str = "semantic"
+    subtype: str = "text"
+    version: int = 0
+    embedding: Optional[List[float]] = None
+    simprints: Optional[
+        Union[
+            List[str],  # Index-Format
+            List[Feature],  # Object-Format
+        ]
+    ] = None
+    offsets: Optional[List[int]] = None
+    sizes: Optional[List[int]] = None
+    contents: Optional[List[str]] = None
 
-class SctMeta(PrettyBaseModel):
+
+class Metadata(PrettyBaseModel):
     iscc: str
     characters: Optional[int] = None
-    embedding: Optional[List[float]] = None
-    features: Optional[List[SctFeature]] = None
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "SctMeta":
-        features = []
-        feature_list = data.get("features", [])
-        offset_list = data.get("offsets", [])
-        size_list = data.get("sizes", [])
-        text_list = data.get("chunks", [])
-
-        max_len = max(len(feature_list), len(offset_list), len(size_list), len(text_list))
-
-        for i in range(max_len):
-            features.append(
-                SctFeature(
-                    feature=feature_list[i] if feature_list else None,
-                    offset=offset_list[i] if offset_list else None,
-                    size=size_list[i] if size_list else None,
-                    text=text_list[i] if text_list else None,
-                )
-            )
-        return cls(
-            iscc=data["iscc"],
-            characters=data.get("characters"),
-            embedding=data.get("embedding"),
-            features=features if features else None,
-        )
+    features: Optional[List[FeatureSet]] = None
diff --git a/iscc_sct/options.py b/iscc_sct/options.py
@@ -34,12 +34,12 @@ class SctOptions(BaseSettings):
 
     precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)")
 
-    features: bool = Field(False, description="ISCC_SCT_FEATURES - Include granular feature simprints")
+    simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints")
     offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features")
 
     sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)")
 
-    chunks: bool = Field(False, description="ISCC_SCT_CHUNKS - Include granular text chunks")
+    contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
 
     max_tokens: int = Field(
         127,