Add doctests for readme
titusz committed Jun 25, 2024
1 parent aecabf5 commit 5ba6a5a
Showing 4 changed files with 56 additions and 23 deletions.
50 changes: 33 additions & 17 deletions README.md
@@ -1,14 +1,11 @@
# ISCC - Semantic Text-Code

[![Version](https://img.shields.io/pypi/v/iscc-sct.svg)](https://pypi.python.org/pypi/iscc-sct/)
[![Downloads](https://pepy.tech/badge/iscc-sct)](https://pepy.tech/project/iscc-sct)

`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the [ISCC](https://core.iscc.codes)
(*International Standard Content Code*). Semantic Text-Codes are designed to capture and represent the
language-agnostic semantic content of text for improved similarity detection.

> [!CAUTION]
> This is an early proof of concept. All releases with release numbers below v1.0.0 may break backward
> **This is an early proof of concept.** All releases with release numbers below v1.0.0 may break backward
> compatibility and produce incompatible Semantic Text-Codes.
## What is ISCC Semantic Text-Code
@@ -43,23 +40,42 @@ pip install iscc-sct[gpu]

## Usage

To generate a Semantic Text-Code use the `code_text_semantic` function. You can specify the bit length of the code to
control the level of granularity in the semantic representation.

```python
import iscc_sct as sci

# Generate a 64-bit ISCC Semantic Text-Code for a text
text = "This is some sample text. It can be a longer document or even an entire book."
semantic_code = sci.gen_text_code_semantic(text, bits=64)
To generate a Semantic Text-Code, use the `create` function.

print(semantic_code)
```python-repl
>>> import iscc_sct as sci
>>> text = "This is some sample text. It can be a longer document or even an entire book."
>>> sci.create(text, bits=64)
{
"iscc": "ISCC:CAAVZHGOJH3XUFRF",
"characters": 89
}
```

```shell
{'iscc': 'ISCC:CAAV3GG6JH3XEVRN', 'characters': 77}
You can also generate granular (per-chunk) feature outputs:

```python-repl
>>> import iscc_sct as sci
>>> text = "This is some sample text. It can be a longer document or even an entire book."
>>> sci.create(text, granular=True)
{
"iscc": "ISCC:CAAV3GG6JH3XEVRN",
"characters": 77,
"features": [
{
"feature": "LWMN4SPXOJLC2",
"offset": 0,
"size": 77,
"text": "This is some sample text. It can be a longer document or even an entire book."
}
]
}
```
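Each `feature` above is a compact hash of one text chunk. To illustrate how such granular features could be compared for similarity, here is a stdlib-only sketch that treats them as unpadded base32 strings (as they appear above) and computes a bitwise Hamming distance. `feature_distance` is a hypothetical helper for illustration, not part of the iscc-sct API:

```python
import base64

def feature_distance(a: str, b: str) -> int:
    """Hamming distance (in bits) between two feature strings.

    Assumes features are unpadded RFC 4648 base32, as shown above.
    Illustrative only -- not part of the iscc-sct API.
    """
    # Re-add the padding that base64.b32decode requires.
    pad_a = a + "=" * (-len(a) % 8)
    pad_b = b + "=" * (-len(b) % 8)
    raw_a = base64.b32decode(pad_a)
    raw_b = base64.b32decode(pad_b)
    # XOR byte-wise and count differing bits.
    return sum(bin(x ^ y).count("1") for x, y in zip(raw_a, raw_b))

print(feature_distance("LWMN4SPXOJLC2", "LWMN4SPXOJLC2"))  # identical -> 0
```

Lower distances would indicate more semantically similar chunks.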

control the level of granularity in the semantic representation.
For mo


## How It Works

`iscc-sct` splits the text into chunks and uses a pre-trained deep learning model for text embedding. The model
@@ -75,7 +91,7 @@ the broader ISCC ecosystem. For development, you'll need to install the project
```shell
git clone https://github.com/iscc/iscc-sct.git
cd iscc-sct
poetry install -E cpu
poetry install
```
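The chunk-and-embed pipeline described under "How It Works" can be sketched end to end. The fixed-size splitter and the hash-based pseudo-embedding below are toy stand-ins for the real chunker and the pre-trained model, used only to show the shape of the pipeline (chunk → embed → mean-pool → binarize):

```python
import hashlib

def chunk_text(text: str, size: int = 64) -> list[str]:
    # Toy splitter: fixed-size character windows. The real library uses
    # a smarter, semantically aware chunker.
    return [text[i:i + size] for i in range(0, len(text), size)]

def embed(chunk: str, dims: int = 64) -> list[float]:
    # Stand-in for the pre-trained embedding model: derive a deterministic
    # pseudo-vector from a hash digest. Illustrative only.
    digest = hashlib.shake_256(chunk.encode("utf-8")).digest(dims)
    return [(byte - 128) / 128 for byte in digest]

def text_code_bits(text: str, bits: int = 64) -> str:
    chunks = chunk_text(text)
    vectors = [embed(chunk, dims=bits) for chunk in chunks]
    # Mean-pool the per-chunk vectors into one document vector ...
    pooled = [sum(col) / len(col) for col in zip(*vectors)]
    # ... then binarize by sign to get the compact bit string.
    return "".join("1" if v >= 0 else "0" for v in pooled)

code = text_code_bits("This is some sample text. It can be a longer document.")
print(len(code))  # -> 64
```

Because the binarization keeps only the sign of each pooled dimension, similar documents yield bit strings with small Hamming distance, which is what makes the codes useful for similarity detection.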

## Contributing
13 changes: 11 additions & 2 deletions iscc_sct/models.py
@@ -1,17 +1,26 @@
import json
from typing import List, Optional, Dict, Any
from pydantic import BaseModel

__all__ = ["SctFeature", "SctMeta"]


class SctFeature(BaseModel):
class PrettyBaseModel(BaseModel):
def __repr__(self):
return self.pretty_repr()

def pretty_repr(self):
return self.model_dump_json(exclude_unset=True, exclude_none=True, indent=2)


class SctFeature(PrettyBaseModel):
feature: Optional[str] = None
offset: Optional[int] = None
size: Optional[int] = None
text: Optional[str] = None


class SctMeta(BaseModel):
class SctMeta(PrettyBaseModel):
iscc: str
characters: Optional[int] = None
embedding: Optional[List[float]] = None
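The new `PrettyBaseModel` makes every model's `repr` an indented JSON dump with unset/None fields dropped, which is what lets the README doctests match the pretty output. The same idea can be sketched without pydantic using a dataclass (illustrative only; `PrettyModel` and `SctFeatureSketch` are hypothetical names, not the library's code):

```python
import json
from dataclasses import dataclass, asdict
from typing import Optional

class PrettyModel:
    def __repr__(self) -> str:
        # Mirror PrettyBaseModel: dump fields as indented JSON,
        # skipping None values (like pydantic's exclude_none).
        data = {k: v for k, v in asdict(self).items() if v is not None}
        return json.dumps(data, indent=2)

@dataclass(repr=False)  # repr=False keeps the inherited JSON __repr__
class SctFeatureSketch(PrettyModel):
    feature: Optional[str] = None
    offset: Optional[int] = None
    size: Optional[int] = None
    text: Optional[str] = None

print(SctFeatureSketch(feature="LWMN4SPXOJLC2", offset=0, size=77))
```

Note that `repr=False` matters: a dataclass otherwise generates its own `__repr__`, which would shadow the inherited JSON one.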
8 changes: 4 additions & 4 deletions tests/test_iscc_sct.py
@@ -59,10 +59,10 @@ def test_code_text_semantic_embedding():
def test_code_text_semantic_features():
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, features=True)
assert result['iscc'] == "ISCC:CAA636IXQD736IGJ"
assert result['characters'] == 12076
assert result['features'][:3] == ["44ERPEPRGHRFC", "N5SRLQPXG7BAS", "VL4VLULOW6Z52"]
assert result['features'][-3:] == ["HTNRP5PBTFAEW", "PRMTOM3LXFBES", "JT5ZPM3LCG3E6"]
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert result["characters"] == 12076
assert result["features"][:3] == ["44ERPEPRGHRFC", "N5SRLQPXG7BAS", "VL4VLULOW6Z52"]
assert result["features"][-3:] == ["HTNRP5PBTFAEW", "PRMTOM3LXFBES", "JT5ZPM3LCG3E6"]


def test_code_text_semantic_offsets():
8 changes: 8 additions & 0 deletions tests/test_readme.py
@@ -0,0 +1,8 @@
import doctest
from pathlib import Path

README = Path(__file__).parent.parent / "README.md"


def test_readme_examples():
doctest.testfile(README.as_posix(), module_relative=False, optionflags=doctest.ELLIPSIS)
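The new test feeds README.md straight to `doctest`, so the `>>>` examples above become executable checks. A minimal, self-contained demonstration of how `doctest.testfile` treats a markdown file (a hypothetical temp file stands in for the real README):

```python
import doctest
import tempfile
from pathlib import Path

# A stand-in for README.md: doctest only looks at ">>>" examples,
# so the surrounding markdown prose is ignored.
SAMPLE = """
# Some heading

>>> 21 * 2
42
>>> list(range(100))  # doctest: +ELLIPSIS
[0, 1, 2, ...]
"""

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "readme_sample.md"
    path.write_text(SAMPLE)
    # module_relative=False lets testfile accept an absolute path,
    # exactly as tests/test_readme.py does for the real README.
    result = doctest.testfile(
        path.as_posix(), module_relative=False, optionflags=doctest.ELLIPSIS
    )
    print(result.failed, result.attempted)  # -> 0 2
```

The `ELLIPSIS` flag is what allows `...` in expected output to match arbitrary text, which keeps long outputs in the README readable.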
