From b4faecfd92a28366de9951eafa4ff46fc6000e5a Mon Sep 17 00:00:00 2001 From: Jacob Torrey Date: Fri, 27 Oct 2023 17:21:58 +0000 Subject: [PATCH] Ensure the ai-generated.txt is included in the built package Signed-off-by: Jacob Torrey --- README.md | 14 +++++++++--- setup.py | 3 ++- test_zippy_detect.py | 2 +- ai-generated.txt => zippy/ai-generated.txt | 0 zippy/zippy.py | 25 ++++++++++++++-------- 5 files changed, 30 insertions(+), 14 deletions(-) rename ai-generated.txt => zippy/ai-generated.txt (100%) diff --git a/README.md b/README.md index d2dc522..7fb564a 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,17 @@ Here are each of them compared with both the LZMA and zlib detector across the t ### Usage ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it. + +First, build and install the tool: +``` +$ python3 setup.py build && python3 setup.py install +``` + +It will install a new script (`zippy`) that you can use directly: + ``` -$ python3 zippy/zippy.py -h -usage: zippy.py [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...] +$ zippy -h +usage: zippy [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...] positional arguments: sample_files Text file(s) containing the sample to classify @@ -42,7 +50,7 @@ options: -e {zlib,lzma,brotli,ensemble} Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines -s Read from stdin until EOF is reached instead of from a file -$ python3 zippy/zippy.py samples/human-generated/about_me.txt +$ zippy samples/human-generated/about_me.txt samples/human-generated/about_me.txt ('Human', 0.06013429262166636) ``` diff --git a/setup.py b/setup.py index 5a68264..2cf8ca5 100644 --- a/setup.py +++ b/setup.py @@ -2,8 +2,9 @@ setup( name='ZipPy setup file', - version='0.1.2', + version='0.1.1', packages=['zippy'], + package_data={"": ["*.txt"]}, entry_points={ 'console_scripts': [ 'zippy=zippy.zippy:main', diff --git a/test_zippy_detect.py b/test_zippy_detect.py index 2274314..1c098b5 100644 --- a/test_zippy_detect.py +++ b/test_zippy_detect.py @@ -37,7 +37,7 @@ PRELUDE_RATIO = None def test_training_file(record_property): - (classification, score) = zippy.run_on_file_chunked('ai-generated.txt') + (classification, score) = zippy.run_on_file_chunked('zippy/ai-generated.txt') record_property("score", str(score)) assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is' diff --git a/ai-generated.txt b/zippy/ai-generated.txt similarity index 100% rename from ai-generated.txt rename to zippy/ai-generated.txt diff --git a/zippy/zippy.py b/zippy/zippy.py index 4e92fa8..b890e73 100755 --- a/zippy/zippy.py +++ b/zippy/zippy.py @@ -14,6 +14,7 @@ from math import ceil from typing import List, Optional, Tuple, TypeAlias from multiprocessing import Pool, cpu_count +from importlib.resources import files Score : TypeAlias = tuple[str, float] @@ -40,8 +41,7 @@ def clean_text(s : str) -> str: # The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary PRELUDE_FILE : str = 'ai-generated.txt' -with open(PRELUDE_FILE, 'r', encoding='utf-8') as fp: - PRELUDE_STR = clean_text(fp.read()) +PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text()) class AIDetector(ABC): ''' @@ -160,6 +160,7 @@ def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[s #print(prelude_file + ' ratio: ' + str(self.prelude_ratio)) if prelude_str != None: + self.prelude_str = prelude_str if self.prelude_ratio == 0.0: self.prelude_ratio = self._compress(prelude_str) @@ -193,22 +194,28 @@ class Zippy: def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None: self.ENGINE = engine self.PRESET = preset - self.PRELUDE_FILE = prelude_file + if prelude_file == PRELUDE_FILE: + self.PRELUDE_FILE = str(files('zippy').joinpath(PRELUDE_FILE)) + self.PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text()) + else: + self.PRELUDE_FILE = prelude_file + with open(self.PRELUDE_FILE, encoding='utf-8') as fp: + self.PRELUDE_STR = clean_text(fp.read()) if engine == CompressionEngine.LZMA: if self.PRESET: - self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET) + self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET) else: - self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE) + self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR) elif engine == CompressionEngine.BROTLI: if self.PRESET: - self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET) + self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET) else: - self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE) + self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR) elif engine == CompressionEngine.ZLIB: if self.PRESET: - self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET) + self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET) else: - self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE) + self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR) def run_on_file(self, filename : str) -> Optional[Score]: '''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''