Skip to content

Commit

Permalink
Merge pull request #48 from iscc/develop
Browse files Browse the repository at this point in the history
batch command csv support
  • Loading branch information
titusz authored May 19, 2020
2 parents e15cf12 + 43cdd64 commit bba5562
Show file tree
Hide file tree
Showing 14 changed files with 86 additions and 45 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ The `sim` command computes estimated similarity of two ISCC Codes:

```console
$ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D
Estimated Similarity of Meta-ID: 78.00 % (8 of 64 bits different)
Estimated Similarity of Meta-ID: 78.00 % (56 of 64 bits match)
```

You may also compare full four-component ISCC Codes.
Expand Down Expand Up @@ -199,6 +199,13 @@ You may also want join our developer chat on Telegram at <https://t.me/iscc_dev>

## Change Log

### [0.9.10] - 2020-05-19
- Fixed issue with mime-type detection
- Changed wording of similarity output
- Added CSV-compatible output for batch command
- Added debug option for batch command
- Updated dependencies

### [0.9.9] - 2020-05-18
- Fixed issue with tika & macOS
- Added macOS ci testing
Expand Down
4 changes: 2 additions & 2 deletions iscc_cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import click


__version__ = "0.9.9"
__version__ = "0.9.10"
APP_NAME = "iscc-cli"
APP_DIR = click.get_app_dir(APP_NAME, roaming=False)
os.makedirs(iscc_cli.APP_DIR, exist_ok=True)
os.environ["TIKA_PATH"] = APP_DIR
os.environ["TIKA_LOG_PATH"] = APP_DIR
os.environ["TIKA_STARTUP_MAX_RETRY"] = "5"
os.environ["TIKA_STARTUP_MAX_RETRY"] = "8"
os.environ["LOGURU_AUTOINIT"] = "False"


Expand Down
54 changes: 33 additions & 21 deletions iscc_cli/commands/batch.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# -*- coding: utf-8 -*-
import os
import shutil
import sys
from os.path import basename, abspath
import click
import mobi
from iscc_cli.tika import detector, parser
import iscc

from iscc_cli import video_id
from iscc_cli.const import SUPPORTED_MIME_TYPES, GMT
from iscc_cli.utils import get_files, mime_to_gmt, get_title, DefaultHelp
from iscc_cli.utils import get_files, mime_to_gmt, get_title, DefaultHelp, clean_mime
from iscc_cli import audio_id, fpcalc
from loguru import logger as log


@click.command(cls=DefaultHelp)
Expand All @@ -22,31 +23,40 @@
is_flag=True,
default=False,
help="Guess title (first line of text).",
show_default=True,
)
@click.option(
"-d",
"--debug",
is_flag=True,
default=False,
help="Show debug output",
show_default=True,
)
def batch(path, recursive, guess):
def batch(path, recursive, guess, debug):
"""Create ISCC Codes for all files in PATH.
Example:
$ iscc batch ~/Documents
"""
if debug:
log.add(sys.stdout)

results = []
for f in get_files(path, recursive=recursive):
filesize = os.path.getsize(f)
if not filesize:
click.echo("Cannot proccess empty file: {}".format(f))
msg = "Cannot proccess empty file: {}".format(f)
log.warning(msg)
continue

media_type = detector.from_file(f)
media_type = clean_mime(detector.from_file(f))
if media_type not in SUPPORTED_MIME_TYPES:
fname = basename(f)
click.echo(
"Unsupported file {} with mime type: {}".format(fname, media_type)
)
click.echo(
"Please request support at https://github.com/iscc/iscc-cli/issues"
)
msg = "Unsupported file {} with mime type: {},,,,".format(fname, media_type)
log.warning(msg)
continue

if media_type == "application/x-mobipocket-ebook":
Expand All @@ -55,7 +65,8 @@ def batch(path, recursive, guess):
tika_result = parser.from_file(epub_filepath)
shutil.rmtree(tempdir)
except Exception as e:
click.echo("Error with mobi extraction %s" % f)
msg = "Error with mobi extraction %s"
log.error(msg)
continue
else:
tika_result = parser.from_file(f)
Expand All @@ -68,12 +79,15 @@ def batch(path, recursive, guess):
try:
cid = iscc.content_id_image(f)
except Exception as e:
click.echo("Clould not proccess image: {} ({})".format(f, e))
msg = "Clould not proccess image: {} ({})".format(f, e)
log.error(msg)
continue

elif gmt == GMT.TEXT:
text = tika_result["content"]
if not text:
click.echo("Could not extract text from {}".format(basename(f)))
msg = "Could not extract text from {}".format(basename(f))
log.warning(msg)
continue
cid = iscc.content_id_text(tika_result["content"])
elif gmt == GMT.AUDIO:
Expand All @@ -85,26 +99,24 @@ def batch(path, recursive, guess):
features = video_id.get_frame_vectors(abspath(f))
cid = video_id.content_id_video(features)
else:
click.echo("Could not generate ISCC")
log.error("Could not generate ISCC")
continue

did = iscc.data_id(f)
iid, tophash = iscc.instance_id(f)

if not norm_title:
iscc_code = "-".join((cid, did, iid))
else:
iscc_code = "-".join((mid, cid, did, iid))
iscc_code_cs = ",".join((mid, cid, did, iid))

click.echo(
"ISCC:{iscc_code},{tophash},{fname},{gmt},{title}".format(
iscc_code=iscc_code,
"{iscc_code},{tophash},{fname},{gmt},{title}".format(
iscc_code=iscc_code_cs,
tophash=tophash,
fname=basename(f),
gmt=gmt,
title=norm_title,
)
)
iscc_code = "-".join((mid, cid, did, iid))
results.append(
dict(
iscc=iscc_code,
Expand Down
4 changes: 2 additions & 2 deletions iscc_cli/commands/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import mobi
from click import UsageError
from iscc_cli.tika import parser, detector
from iscc_cli.utils import DefaultHelp
from iscc_cli.utils import DefaultHelp, clean_mime
from iscc_cli.const import SUPPORTED_MIME_TYPES
import json

Expand All @@ -20,7 +20,7 @@
def dump(path, strip, meta, content):
"""Dump Tika extraction results for PATH (file or url path)."""

media_type = detector.from_file(path)
media_type = clean_mime(detector.from_file(path))

if media_type not in SUPPORTED_MIME_TYPES:
click.echo("Unsupported media type {}.".format(media_type))
Expand Down
4 changes: 2 additions & 2 deletions iscc_cli/commands/gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from iscc_cli import audio_id, video_id, fpcalc
from iscc_cli.const import SUPPORTED_MIME_TYPES, GMT
from iscc_cli.utils import get_title, mime_to_gmt, DefaultHelp
from iscc_cli.utils import get_title, mime_to_gmt, DefaultHelp, clean_mime


@click.command(cls=DefaultHelp)
Expand All @@ -33,7 +33,7 @@ def gen(file, guess, title, extra, verbose):
if not filesize:
raise click.BadParameter("Cannot proccess empty file: {}".format(file.name))

media_type = detector.from_file(file.name)
media_type = clean_mime(detector.from_file(file.name))
if media_type not in SUPPORTED_MIME_TYPES:
click.echo("Unsupported media type {}.".format(media_type))
click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
Expand Down
4 changes: 2 additions & 2 deletions iscc_cli/commands/sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def sim(a, b):
hamming_sim = 64 - hamming_dist
similarity = round(hamming_sim / (2 * 64 - hamming_sim) * 100)
click.echo(
"Estimated Similarity of {}: {:.2f} % ({} of 64 bits different)".format(
type_a, similarity, hamming_dist
"Estimated Similarity of {}: {:.2f} % ({} of 64 bits match)".format(
type_a, similarity, hamming_sim
)
)
if type_a == "Instance-ID" and type_b == "Instance-ID":
Expand Down
3 changes: 2 additions & 1 deletion iscc_cli/commands/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
DefaultHelp,
download_file,
is_youtube_url,
clean_mime,
)
import pytube

Expand Down Expand Up @@ -66,7 +67,7 @@ def web(url, guess, title, extra, verbose):
raise click.BadArgumentUsage(e)

data = BytesIO(resp.content)
media_type = detector.from_buffer(data)
media_type = clean_mime(detector.from_buffer(data))
if media_type not in SUPPORTED_MIME_TYPES:
click.echo("Unsupported media type {}".format(media_type))
click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")
Expand Down
4 changes: 2 additions & 2 deletions iscc_cli/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def iscc_from_file(file, guess=False, title="", extra="") -> Dict:
return gen.callback(file, guess, title, extra, False)


def isccs_from_dir(path, recursive=False, guess=False) -> List[Dict]:
return batch.callback(path, recursive, guess)
def isccs_from_dir(path, recursive=False, guess=False, debug=False) -> List[Dict]:
return batch.callback(path, recursive, guess, debug)


def iscc_from_url(url, guess=False, title="", extra="") -> Dict:
Expand Down
16 changes: 12 additions & 4 deletions iscc_cli/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from typing import Union, List
import hashlib
import io
import os
Expand Down Expand Up @@ -58,7 +59,17 @@ def get_files(path, recursive=False):
return iter_files(path, exts=SUPPORTED_EXTENSIONS, recursive=recursive)


def clean_mime(mime: Union[str, List]) -> str:
    """Return a single, bare mime type string.

    Mime detection may hand back either one string or a list of mime types,
    and either form may carry semicolon separated parameters such as
    ``; charset=windows-1252``.

    :param mime: A mime type string or a list of mime type strings.
    :return: The first mime type with parameters and surrounding whitespace
        removed. Empty input (``""``, ``None`` or an empty list) yields ``""``
        instead of raising ``AttributeError``.
    """
    # Only the first entry of a list is considered.
    if isinstance(mime, list):
        mime = mime[0] if mime else ""
    # Guard against None / empty values before calling string methods.
    if not mime:
        return ""
    # Drop everything after the first semicolon (charset and similar params).
    return mime.split(";")[0].strip()


def mime_to_gmt(mime_type, file_path=None):
mime_type = clean_mime(mime_type)
if mime_type == "image/gif" and file_path:
img = Image.open(file_path)
if img.is_animated:
Expand All @@ -79,10 +90,7 @@ def mime_to_gmt(mime_type, file_path=None):
def get_title(tika_result: dict, guess=False, uri=None):
title = ""
meta = tika_result.get("metadata")

# In contrast to tika.detect this may yield a list of mime-types!!!
mime_type = meta.get("Content-Type")
mime_type = mime_type[0] if mime_type and isinstance(mime_type, list) else mime_type
mime_type = clean_mime(meta.get("Content-Type"))
gmt = mime_to_gmt(mime_type)

if meta:
Expand Down
6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "iscc-cli"
version = "0.9.9"
version = "0.9.10"
description = "ISCC CLI - Creates ISCC Codes from Media Files"
authors = ["Titusz Pan <[email protected]>"]
license = "MIT"
Expand Down
6 changes: 3 additions & 3 deletions tests/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@
def test_batch():
result = r.invoke(cli, ["batch", "./tests/batch"])
assert result.exit_code == 0
assert "CCKzUpp6U5hU7-CTMjk4o5H96BV-CDM6E14HcCZjQ-CR1LUvGDVrWye" in result.output
assert "CCKzUpp6U5hU7,CTMjk4o5H96BV,CDM6E14HcCZjQ,CR1LUvGDVrWye" in result.output


def test_batch_recursive():
result = r.invoke(cli, ["batch", "-r", "./tests/batch"])
assert result.exit_code == 0
assert "CCKzUpp6U5hU7-CTMjk4o5H96BV-CDM6E14HcCZjQ-CR1LUvGDVrWye" in result.output
assert "CCKzUpp6U5hU7,CTMjk4o5H96BV,CDM6E14HcCZjQ,CR1LUvGDVrWye" in result.output


def test_batch_python_call():
from iscc_cli.commands.batch import batch

result = batch.callback("./tests/batch/subdir", False, False)
result = batch.callback("./tests/batch/subdir", False, False, False)
assert isinstance(result, list)
assert len(result) == 1
12 changes: 12 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,15 @@ def test_iscc_split():

i = "CCcdAr6GDoF3p"
assert utils.iscc_split(i) == ["CCcdAr6GDoF3p"]


def test_clean_mime():
    """clean_mime keeps the first mime type, minus charset info and whitespace."""
    # (raw input, expected cleaned mime type)
    cases = (
        ("", ""),
        ("text/html ", "text/html"),
        (["text/html", "audio/mp3"], "text/html"),
        ([" text/html", "audio/mp3"], "text/html"),
        (" text/plain; charset=windows-1252 ", "text/plain"),
        ([" text/plain; charset=windows-1252 ", "audio/mp3"], "text/plain"),
    )
    for raw, expected in cases:
        assert utils.clean_mime(raw) == expected
3 changes: 2 additions & 1 deletion tests/video/build_videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from iscc_cli.tika import detector
from iscc_cli import ffmpeg
from iscc_cli import video_id
from utils import clean_mime

FORMATS = (
"rm",
Expand Down Expand Up @@ -64,7 +65,7 @@ def build_media_types():
else:
cmd = [ffmpeg.exe_path(), "-i", "master.3gp", "-loglevel", "2", outf]
subprocess.run(cmd)
media_type = detector.from_file(abspath(outf))
media_type = clean_mime(detector.from_file(abspath(outf)))
sigs = video_id.get_frame_vectors(abspath(outf))
vid = video_id.content_id_video(sigs)
print("{} -> {} -> {}".format(vid, outf, media_type))
Expand Down

0 comments on commit bba5562

Please sign in to comment.