diff --git a/README.md b/README.md index 8987a57..0d5b854 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,21 @@ poet For granular (per chunk) feature outputs: "characters": 77, "features": [ { - "feature": "XZjeSfdyVi0", - "offset": 0, - "size": 77, - "text": "This is some sample text. It can be a longer document or even an entire book." + "maintype": "semantic", + "subtype": "text", + "version": 0, + "simprints": [ + "XZjeSfdyVi0" + ], + "offsets": [ + 0 + ], + "sizes": [ + 77 + ], + "contents": [ + "This is some sample text. It can be a longer document or even an entire book." + ] } ] } diff --git a/iscc_sct/code_semantic_text.py b/iscc_sct/code_semantic_text.py index 76814f9..b731068 100644 --- a/iscc_sct/code_semantic_text.py +++ b/iscc_sct/code_semantic_text.py @@ -104,7 +104,7 @@ def gen_text_code_semantic(text, **options): :key max_tokens (int): Max tokens per chunk (default 127). :key overlap (int): Max tokens allowed to overlap between chunks (default 48). :key trim (int): Trim whitespace from chunks (default False). 
- :return: Dict with ISCC processing results + :return: Dict with ISCC processing results (using Index-Format for granular features) """ if not text: @@ -112,7 +112,7 @@ def gen_text_code_semantic(text, **options): opts = sct.sct_opts.override(options) - result = {"iscc": None} # Initialize first so `iscc` key is first in dict + result = {"iscc": None} # Initialize first so `iscc` key is "first" in dict if opts.characters: result["characters"] = len(text) @@ -120,24 +120,32 @@ def gen_text_code_semantic(text, **options): # Text splitting splits = split_text(text, **opts.model_dump()) offsets, chunks = [list(item) for item in zip(*splits)] - if opts.chunks: - result["chunks"] = chunks - if opts.offsets: - result["offsets"] = offsets - if opts.sizes: - result["sizes"] = [len(chunk) for chunk in chunks] # Chunk embedding with sct.timer("EMBEDDING time"): embeddings = embed_chunks(chunks) - if opts.features: - feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings] - result["features"] = [sct.encode_base64(digest) for digest in feature_digests] # Create global document embedding embedding = mean_pooling(embeddings) - if opts.embedding: - result["embedding"] = compress(embedding, opts.precision) + + if any([opts.simprints, opts.offsets, opts.sizes, opts.contents, opts.embedding]): + feature_set = { + "maintype": "semantic", + "subtype": "text", + "version": 0, + } + if opts.embedding: + feature_set["embedding"] = compress(embedding, opts.precision) + if opts.simprints: + feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings] + feature_set["simprints"] = [sct.encode_base64(digest) for digest in feature_digests] + if opts.offsets: + feature_set["offsets"] = offsets + if opts.sizes: + feature_set["sizes"] = [len(chunk) for chunk in chunks] + if opts.contents: + feature_set["contents"] = chunks + result["features"] = [feature_set] # Encode global document embedding length = BIT_LEN_MAP[opts.bits] diff --git 
a/iscc_sct/main.py b/iscc_sct/main.py index e4c1f35..9f6b950 100644 --- a/iscc_sct/main.py +++ b/iscc_sct/main.py @@ -1,4 +1,4 @@ -from iscc_sct.models import SctMeta +from iscc_sct.models import Metadata from iscc_sct.code_semantic_text import gen_text_code_semantic from iscc_sct.options import sct_opts @@ -21,11 +21,11 @@ def create(text, granular=False, **options): """ # Override global options with individual options derived from `granular` parameter - granular = dict(features=True, offsets=True, sizes=True, chunks=True) if granular else {} + granular = dict(simprints=True, offsets=True, sizes=True, contents=True) if granular else {} opts = sct_opts.override(granular) # Override local options with individual options form additional keyword arguments opts = opts.override(options) data = gen_text_code_semantic(text, **opts.model_dump()) - return SctMeta.from_dict(data) + return Metadata(**data) diff --git a/iscc_sct/models.py b/iscc_sct/models.py index 4c4a246..beff308 100644 --- a/iscc_sct/models.py +++ b/iscc_sct/models.py @@ -1,7 +1,80 @@ -from typing import List, Optional, Dict, Any +""" +# Semantic-Code Text - Datamodel + +This module provides the pydantic metadata schema for Semantic Text Code results. +The schema is conformant with https://schema.iscc.codes/ + +The `features` property of the top level Metadata Object supports two different formats for +representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**. +These formats are designed to offer flexibility in how feature data is structured and processed, +catering to different use cases where either performance or clarity is prioritized. + +## Features Index-Format (Compact Array Structure): + +In this compact format, features are represented as a list of strings, with optional parallel arrays to +store related attributes such as `offsets`, `sizes`, and `contents`. 
+ +**Example**: + +```json +{ + "maintype": "semantic", + "subtype": "text", + "version": 0, + "simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"], + "offsets": [0, 12], + "sizes": [12, 48], + "contents": ["textchunk no one", "textchunk no two"] +} + +``` + +**Use Case**: +- Best suited for scenarios where storage efficiency is critical, and the overhead of processing + multiple parallel arrays is acceptable. +- Useful when all features share the same set of attributes, allowing for faster bulk processing. + +## Features Object-Format (Self-Descriptive Object Structure): + +In this convenient format, each feature is represented as an individual object containing its +attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but +easier to read and work with. + +**Example**: + +```json +{ + "maintype": "content", + "subtype": "text", + "version": 0, + "simprints": [ + { + "simprint": "lUjuScFYBik", + "offset": 0, + "size": 25, + "content": "ISCC - Semantic Text-Code" + } + ] +} + +``` +**Use Case**: +- Ideal for scenarios where clarity and readability are prioritized. +- Each feature is self-contained, making it easier to understand, extend, and debug. +- Flexibility in including or omitting optional attributes per feature. + + +### Unified FeatureSet Schema: + +The `FeatureSet` model unifies these two formats by allowing either structure to be used. +To use the `FeatureSet` model, you can provide data in either the Index-Format or the Object-Format. 
+""" + +from typing import List, Optional, Dict, Any, Union from pydantic import BaseModel -__all__ = ["SctFeature", "SctMeta"] + +__all__ = ["Feature", "FeatureSet", "Metadata"] class PrettyBaseModel(BaseModel): @@ -9,44 +82,33 @@ def __repr__(self): return self.pretty_repr() def pretty_repr(self): - return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True) + return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False) -class SctFeature(PrettyBaseModel): - feature: Optional[str] = None +class Feature(PrettyBaseModel): + simprint: str offset: Optional[int] = None size: Optional[int] = None - text: Optional[str] = None + content: Optional[str] = None + +class FeatureSet(PrettyBaseModel): + maintype: str = "semantic" + subtype: str = "text" + version: int = 0 + embedding: Optional[List[float]] = None + simprints: Optional[ + Union[ + List[str], # Index-Format + List[Feature], # Object-Format + ] + ] = None + offsets: Optional[List[int]] = None + sizes: Optional[List[int]] = None + contents: Optional[List[str]] = None -class SctMeta(PrettyBaseModel): + +class Metadata(PrettyBaseModel): iscc: str characters: Optional[int] = None - embedding: Optional[List[float]] = None - features: Optional[List[SctFeature]] = None - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SctMeta": - features = [] - feature_list = data.get("features", []) - offset_list = data.get("offsets", []) - size_list = data.get("sizes", []) - text_list = data.get("chunks", []) - - max_len = max(len(feature_list), len(offset_list), len(size_list), len(text_list)) - - for i in range(max_len): - features.append( - SctFeature( - feature=feature_list[i] if feature_list else None, - offset=offset_list[i] if offset_list else None, - size=size_list[i] if size_list else None, - text=text_list[i] if text_list else None, - ) - ) - return cls( - iscc=data["iscc"], - characters=data.get("characters"), - embedding=data.get("embedding"), - 
features=features if features else None, - ) + features: Optional[List[FeatureSet]] = None diff --git a/iscc_sct/options.py b/iscc_sct/options.py index 68d42e7..29299ef 100644 --- a/iscc_sct/options.py +++ b/iscc_sct/options.py @@ -34,12 +34,12 @@ class SctOptions(BaseSettings): precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)") - features: bool = Field(False, description="ISCC_SCT_FEATURES - Include granular feature simprints") + simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints") offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features") sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)") - chunks: bool = Field(False, description="ISCC_SCT_CHUNKS - Include granular text chunks") + contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks") max_tokens: int = Field( 127, diff --git a/tests/test_iscc_sct.py b/tests/test_iscc_sct.py index 8ab53c5..68e80d6 100644 --- a/tests/test_iscc_sct.py +++ b/tests/test_iscc_sct.py @@ -53,37 +53,37 @@ def test_code_text_semantic_embedding(): fp = HERE / "en.txt" result = sct.code_text_semantic(fp, embedding=True) assert result["iscc"] == "ISCC:CAA636IXQD736IGJ" - assert len(result["embedding"]) == 384 + assert len(result["features"][0]["embedding"]) == 384 def test_code_text_semantic_features(): fp = HERE / "en.txt" - result = sct.code_text_semantic(fp, features=True) + result = sct.code_text_semantic(fp, simprints=True) assert result["iscc"] == "ISCC:CAA636IXQD736IGJ" assert result["characters"] == 12076 - assert result["features"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"] - assert result["features"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"] + assert result["features"][0]["simprints"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"] + assert 
result["features"][0]["simprints"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"] def test_code_text_semantic_offsets(): fp = HERE / "en.txt" result = sct.code_text_semantic(fp, offsets=True) - assert result["offsets"][:3] == [0, 277, 612] + assert result["features"][0]["offsets"][:3] == [0, 277, 612] def test_code_text_semantic_chunks(): fp = HERE / "en.txt" - result = sct.code_text_semantic(fp, chunks=True) - assert len(result["chunks"]) == 39 - assert result["chunks"][0].startswith("\n Thank ") - assert result["chunks"][-1].endswith("(Applause)\n") + result = sct.code_text_semantic(fp, contents=True) + assert len(result["features"][0]["contents"]) == 39 + assert result["features"][0]["contents"][0].startswith("\n Thank ") + assert result["features"][0]["contents"][-1].endswith("(Applause)\n") def test_code_text_semantic_sizes(): fp = HERE / "en.txt" result = sct.code_text_semantic(fp, sizes=True) # fmt: off - assert result["sizes"] == [ + assert result["features"][0]["sizes"] == [ 440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446, 442, 443, 444, 451, 485, 477, 439, 517, 430, 468, 394, 531, 448, 421, 503, 376, 403, 513, 477, 393, 375, 555, 533, 312, 455, 413 @@ -100,38 +100,45 @@ def test_gen_text_code_semantic_empty(): def test_gen_text_code_semantic_granular(): result = sct.gen_text_code_semantic( TEXT, - features=True, + simprints=True, offsets=True, - chunks=True, + contents=True, ) assert ( result == { "characters": 726, "iscc": "ISCC:CAARISHPJHEXQAYL", - "features": ["FWjtTcl4Aws", "lAjHSc1wAws"], - "offsets": [0, 297], - "chunks": [ - "\n" - "`iscc-sct` is a **proof of concept implementation** of a semantic " - "Text-Code for the\n" - "[ISCC](https://core.iscc.codes) (*International Standard Content " - "Code*). 
Semantic Text-Codes are\n" - "designed to capture and represent the language agnostic semantic " - "content of text for improved\n" - "similarity detection.\n" - "\n", # NOTE: end of first chunk (see comma :) - "\n" - "\n" - "The ISCC framework already comes with a Text-Code that is based " - "on lexical similarity and can match\n" - "near duplicates. The ISCC Semantic Text-Code is planned as a new " - "additional ISCC-UNIT focused on\n" - "capturing a more abstract and broad semantic similarity. As such " - "the Semantic Text-Code is\n" - "engineered to be robust against a broader range of variations and " - "translations of text that cannot\n" - "be matched based on lexical similarity.\n", + "features": [ + { + "maintype": "semantic", + "subtype": "text", + "version": 0, + "simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"], + "offsets": [0, 297], + "contents": [ + "\n" + "`iscc-sct` is a **proof of concept implementation** of a semantic " + "Text-Code for the\n" + "[ISCC](https://core.iscc.codes) (*International Standard Content " + "Code*). Semantic Text-Codes are\n" + "designed to capture and represent the language agnostic semantic " + "content of text for improved\n" + "similarity detection.\n" + "\n", # NOTE: end of first chunk (see comma :) + "\n" + "\n" + "The ISCC framework already comes with a Text-Code that is based " + "on lexical similarity and can match\n" + "near duplicates. The ISCC Semantic Text-Code is planned as a new " + "additional ISCC-UNIT focused on\n" + "capturing a more abstract and broad semantic similarity. 
As such " + "the Semantic Text-Code is\n" + "engineered to be robust against a broader range of variations and " + "translations of text that cannot\n" + "be matched based on lexical similarity.\n", + ], + } ], } ) @@ -184,7 +191,7 @@ def test_embed_chunks(): def test_gen_text_code_semantic(text_en): result = sct.gen_text_code_semantic(text_en, embedding=True) assert result["iscc"] == "ISCC:CAA636IXQD736IGJ" - assert result["embedding"][:3] == pytest.approx( + assert result["features"][0]["embedding"][:3] == pytest.approx( [0.03241169825196266, 0.022712377831339836, 0.050273094326257706], rel=1e-3, ) @@ -235,4 +242,4 @@ def test_compress(): def test_embedding_precision(): d16 = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4) - assert d16["embedding"][0] == 0.0087 + assert d16["features"][0]["embedding"][0] == 0.0087 diff --git a/tests/test_main.py b/tests/test_main.py index 5381261..9df6ee4 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -3,24 +3,33 @@ def test_create_returns_sct_meta(): result = sct.create("Hello World") - assert isinstance(result, sct.SctMeta) + assert isinstance(result, sct.Metadata) def test_create_default(): result = sct.create("Hello World") - assert result == sct.SctMeta(iscc="ISCC:CAA7GZ4J3DI3XY2R", characters=11) + assert result == sct.Metadata(iscc="ISCC:CAA7GZ4J3DI3XY2R", characters=11) def test_create_granular(): result = sct.create("Hello World", granular=True) - assert result == sct.SctMeta( - iscc="ISCC:CAA7GZ4J3DI3XY2R", - characters=11, - embedding=None, - features=[sct.SctFeature(feature="82eJ2NG741E", offset=0, size=11, text="Hello World")], - ) + assert result.model_dump(exclude_none=True) == { + "iscc": "ISCC:CAA7GZ4J3DI3XY2R", + "characters": 11, + "features": [ + { + "maintype": "semantic", + "subtype": "text", + "version": 0, + "simprints": ["82eJ2NG741E"], + "offsets": [0], + "sizes": [11], + "contents": ["Hello World"], + } + ], + } def test_create_embedding(): result = sct.create("Hello 
World", embedding=True) - assert len(result.embedding) == 384 + assert len(result.features[0].embedding) == 384 diff --git a/tests/test_models.py b/tests/test_models.py index 77faccf..6bd1e3b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,75 +1,40 @@ import pytest +from pydantic import ValidationError +from iscc_sct.models import Metadata, Feature, FeatureSet -from iscc_sct.models import SctMeta, SctFeature - -def test_sct_feature_initialization(): - # Test initialization with all fields None - feature = SctFeature() - assert feature.feature is None +def test_feature_initialization(): + # Test empty initialization + with pytest.raises(ValidationError): + Feature() + feature = Feature(simprint="XZjeSfdyVi0") + assert feature.simprint == "XZjeSfdyVi0" assert feature.offset is None - assert feature.text is None + assert feature.content is None # Test initialization with values - feature = SctFeature(feature="feature", offset=5, text="example text") - assert feature.feature == "feature" + feature = Feature(simprint="feature", offset=5, content="example text") + assert feature.simprint == "feature" assert feature.offset == 5 - assert feature.text == "example text" + assert feature.content == "example text" + + +def test_feature_set_initialization(): + fs = FeatureSet() + assert fs.model_dump(exclude_none=True) == {"maintype": "semantic", "subtype": "text", "version": 0} def test_sct_meta_initialization(): # Test initialization with minimal required fields - meta = SctMeta(iscc="ISCC1234567890") + meta = Metadata(iscc="ISCC1234567890") assert meta.iscc == "ISCC1234567890" assert meta.characters is None - assert meta.embedding is None assert meta.features is None # Test initialization with all fields - features = [SctFeature(feature="feature1", offset=0, text="text1")] - meta = SctMeta(iscc="ISCC1234567890", characters=1000, embedding=[0.1, 0.2], features=features) + features = [FeatureSet(simprints=[Feature(simprint="feature1", offset=0, 
content="text1")], embedding=[0.1, 0.2])] + meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features) assert meta.iscc == "ISCC1234567890" assert meta.characters == 1000 - assert meta.embedding == [0.1, 0.2] assert meta.features == features - - -def test_sct_meta_from_dict(): - data = { - "iscc": "ISCC1234567890", - "characters": 100, - "embedding": [0.1, 0.2], - "features": ["feature1", "feature2"], - "chunks": ["chunk1", "chunk2"], - } - - meta = SctMeta.from_dict(data) - assert meta.iscc == "ISCC1234567890" - assert meta.characters == 100 - assert meta.embedding == [0.1, 0.2] - assert len(meta.features) == 2 - assert meta.features[0].feature == "feature1" - assert meta.features[0].offset is None - assert meta.features[0].text == "chunk1" - assert meta.features[1].feature == "feature2" - assert meta.features[1].offset is None - assert meta.features[1].text == "chunk2" - - # Test with missing optional fields - minimal_data = {"iscc": "ISCC1234567890"} - minimal_meta = SctMeta.from_dict(minimal_data) - assert minimal_meta.iscc == "ISCC1234567890" - assert minimal_meta.characters is None - assert minimal_meta.embedding is None - assert minimal_meta.features is None - - -def test_sct_meta_from_dict_with_incomplete_data(): - # Incomplete feature-chunk pairs - data = { - "iscc": "ISCC1234567890", - "features": ["feature1"], - "chunks": ["chunk1", "chunk2"], # More chunks than features - } - with pytest.raises(IndexError): - SctMeta.from_dict(data) + assert meta.features[0].embedding == [0.1, 0.2]