Ensure the ai-generated.txt is included in the built package

Signed-off-by: Jacob Torrey <[email protected]>
thinkst · Oct 27, 2023 · b4faecf · b4faecf
1 parent 03cde40
commit b4faecf
Show file tree

Hide file tree

Showing 5 changed files with 30 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -29,9 +29,17 @@ Here are each of them compared with both the LZMA and zlib detector across the t
 ### Usage
 
 ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it. 
+
+First, build and install the tool:
+```
+$ python3 setup.py build && python3 setup.py install
+```
+
+This installs a new script (`zippy`) that you can run directly:
+
 ```
-$ python3 zippy/zippy.py -h
-usage: zippy.py [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
+$ zippy -h
+usage: zippy [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
 
 positional arguments:
   sample_files          Text file(s) containing the sample to classify
@@ -42,7 +50,7 @@ options:
   -e {zlib,lzma,brotli,ensemble}
                         Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines
   -s                    Read from stdin until EOF is reached instead of from a file
-$ python3 zippy/zippy.py samples/human-generated/about_me.txt 
+$ zippy samples/human-generated/about_me.txt 
 samples/human-generated/about_me.txt
 ('Human', 0.06013429262166636)
 ```

diff --git a/setup.py b/setup.py
@@ -2,8 +2,9 @@
 
 setup(
     name='ZipPy setup file',
-    version='0.1.2',
+    version='0.1.1',
     packages=['zippy'],
+    package_data={"": ["*.txt"]},
     entry_points={
         'console_scripts': [
             'zippy=zippy.zippy:main',

diff --git a/test_zippy_detect.py b/test_zippy_detect.py
@@ -37,7 +37,7 @@
     PRELUDE_RATIO = None
 
 def test_training_file(record_property):
-    (classification, score) = zippy.run_on_file_chunked('ai-generated.txt')
+    (classification, score) = zippy.run_on_file_chunked('zippy/ai-generated.txt')
     record_property("score", str(score))
     assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
 

diff --git a/ai-generated.txt → zippy/ai-generated.txt b/ai-generated.txt → zippy/ai-generated.txt
diff --git a/zippy/zippy.py b/zippy/zippy.py
@@ -14,6 +14,7 @@
 from math import ceil
 from typing import List, Optional, Tuple, TypeAlias
 from multiprocessing import Pool, cpu_count
+from importlib.resources import files
 
 Score : TypeAlias = tuple[str, float]
 
@@ -40,8 +41,7 @@ def clean_text(s : str) -> str:
 
 # The prelude file is a text file containing only AI-generated text; it is used to 'seed' the LZMA dictionary
 PRELUDE_FILE : str = 'ai-generated.txt'
-with open(PRELUDE_FILE, 'r', encoding='utf-8') as fp:
-    PRELUDE_STR = clean_text(fp.read())
+PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
 
 class AIDetector(ABC):
     '''
@@ -160,6 +160,7 @@ def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[s
             #print(prelude_file + ' ratio: ' + str(self.prelude_ratio))
 
         if prelude_str != None:
+            self.prelude_str = prelude_str
             if self.prelude_ratio == 0.0:
                 self.prelude_ratio = self._compress(prelude_str)
 
@@ -193,22 +194,28 @@ class Zippy:
     def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None:
         self.ENGINE = engine
         self.PRESET = preset
-        self.PRELUDE_FILE = prelude_file
+        if prelude_file == PRELUDE_FILE:
+            self.PRELUDE_FILE = str(files('zippy').joinpath(PRELUDE_FILE))
+            self.PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
+        else:
+            self.PRELUDE_FILE = prelude_file
+            with open(self.PRELUDE_FILE, encoding='utf-8') as fp:
+                self.PRELUDE_STR = clean_text(fp.read())
         if engine == CompressionEngine.LZMA:
             if self.PRESET:
-                self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
+                self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
             else:
-                self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE)
+                self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR)
         elif engine == CompressionEngine.BROTLI:
             if self.PRESET:
-                self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
+                self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
             else:
-                self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE)
+                self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR)
         elif engine == CompressionEngine.ZLIB:
             if self.PRESET:
-                self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
+                self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
             else:
-                self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE)
+                self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR)
 
     def run_on_file(self, filename : str) -> Optional[Score]:
         '''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''