Skip to content

Commit

Permalink
Encode granular features with base64
Browse files Browse the repository at this point in the history
  • Loading branch information
titusz committed Aug 12, 2024
1 parent c9e9215 commit 254ca51
Show file tree
Hide file tree
Showing 8 changed files with 152 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## [0.1.2] - Unreleased
- Encode granular features with base64

## [0.1.1] - 2024-06-25
- Handle text decoding errors gracefully
Expand Down
2 changes: 1 addition & 1 deletion iscc_sct/code_semantic_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def gen_text_code_semantic(text, **options):
embeddings = embed_chunks(chunks)
if opts.features:
feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
result["features"] = [sct.encode_base32(digest) for digest in feature_digests]
result["features"] = [sct.encode_base64(digest) for digest in feature_digests]

# Create global document embedding
embedding = mean_pooling(embeddings)
Expand Down
12 changes: 11 additions & 1 deletion iscc_sct/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from base64 import b32encode

from pybase64 import urlsafe_b64encode
from loguru import logger as log
import os
import time
Expand All @@ -19,6 +19,7 @@
"timer",
"get_model",
"encode_base32",
"encode_base64",
"hamming_distance",
"MODEL_PATH",
]
Expand Down Expand Up @@ -93,6 +94,15 @@ def encode_base32(data):
return b32encode(data).decode("ascii").rstrip("=")


def encode_base64(data):
    # type: (bytes) -> str
    """
    Encode bytes as RFC4648 base64url text with the trailing "=" padding removed.

    The input is encoded with the URL-safe alphabet, any padding characters are
    stripped from the end, and the result is returned as an ASCII string.
    """
    padded = urlsafe_b64encode(data)
    # Padding only ever appears at the end, so stripping it on the raw bytes
    # before decoding yields the same text as stripping after decoding.
    return padded.rstrip(b"=").decode("ascii")


def hamming_distance(a, b):
# type: (bytes, bytes) -> int
"""
Expand Down
134 changes: 133 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ tokenizers = "*"
pydantic-settings = "*"
charset-normalizer = "*"
numpy = "<2.0.0"
pybase64 = "^1.4.0"


[tool.poetry.extras]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_cli_generate_sct(sample_text_file):
def test_cli_generate_sct_granular(sample_text_file):
result = subprocess.run([sct, str(sample_text_file), "--granular"], capture_output=True, text=True)
assert result.returncode == 0
assert "iscc" in result.stdout
assert "features" in result.stdout


def test_cli_debug_mode(sample_text_file):
Expand Down
6 changes: 3 additions & 3 deletions tests/test_iscc_sct.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ def test_code_text_semantic_features():
result = sct.code_text_semantic(fp, features=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert result["characters"] == 12076
assert result["features"][:3] == ["44ERPEPRGHRFC", "N5SRLQPXG7BAS", "VL4VLULOW6Z52"]
assert result["features"][-3:] == ["HTNRP5PBTFAEW", "PRMTOM3LXFBES", "JT5ZPM3LCG3E6"]
assert result["features"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
assert result["features"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]


def test_code_text_semantic_offsets():
Expand Down Expand Up @@ -109,7 +109,7 @@ def test_gen_text_code_semantic_granular():
== {
"characters": 726,
"iscc": "ISCC:CAARISHPJHEXQAYL",
"features": ["CVUO2TOJPABQW", "SQEMOSONOABQW"],
"features": ["FWjtTcl4Aws", "lAjHSc1wAws"],
"offsets": [0, 297],
"chunks": [
"\n"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_create_granular():
iscc="ISCC:CAA7GZ4J3DI3XY2R",
characters=11,
embedding=None,
features=[sct.SctFeature(feature="6NTYTWGRXPRVC", offset=0, size=11, text="Hello World")],
features=[sct.SctFeature(feature="82eJ2NG741E", offset=0, size=11, text="Hello World")],
)


Expand Down

0 comments on commit 254ca51

Please sign in to comment.