Skip to content

Commit

Permalink
Ensure the ai-generated.txt is included in the built package
Browse files Browse the repository at this point in the history
Signed-off-by: Jacob Torrey <[email protected]>
  • Loading branch information
ranok committed Oct 27, 2023
1 parent 03cde40 commit b4faecf
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 14 deletions.
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,17 @@ Here are each of them compared with both the LZMA and zlib detector across the t
### Usage

ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it.

First, build and install the tool:
```
$ python3 setup.py build && python3 setup.py install
```

It will install a new script (`zippy`) that you can use directly:

```
$ python3 zippy/zippy.py -h
usage: zippy.py [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
$ zippy -h
usage: zippy [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
positional arguments:
sample_files Text file(s) containing the sample to classify
Expand All @@ -42,7 +50,7 @@ options:
-e {zlib,lzma,brotli,ensemble}
Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines
-s Read from stdin until EOF is reached instead of from a file
$ python3 zippy/zippy.py samples/human-generated/about_me.txt
$ zippy samples/human-generated/about_me.txt
samples/human-generated/about_me.txt
('Human', 0.06013429262166636)
```
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

setup(
name='ZipPy setup file',
version='0.1.2',
version='0.1.1',
packages=['zippy'],
package_data={"": ["*.txt"]},
entry_points={
'console_scripts': [
'zippy=zippy.zippy:main',
Expand Down
2 changes: 1 addition & 1 deletion test_zippy_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
PRELUDE_RATIO = None

def test_training_file(record_property):
(classification, score) = zippy.run_on_file_chunked('ai-generated.txt')
(classification, score) = zippy.run_on_file_chunked('zippy/ai-generated.txt')
record_property("score", str(score))
assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is'

Expand Down
File renamed without changes.
25 changes: 16 additions & 9 deletions zippy/zippy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from math import ceil
from typing import List, Optional, Tuple, TypeAlias
from multiprocessing import Pool, cpu_count
from importlib.resources import files

Score : TypeAlias = tuple[str, float]

Expand All @@ -40,8 +41,7 @@ def clean_text(s : str) -> str:

# The prelude file is a text file containing only AI-generated text; it is used to 'seed' the LZMA dictionary
PRELUDE_FILE : str = 'ai-generated.txt'
with open(PRELUDE_FILE, 'r', encoding='utf-8') as fp:
PRELUDE_STR = clean_text(fp.read())
PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())

class AIDetector(ABC):
'''
Expand Down Expand Up @@ -160,6 +160,7 @@ def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[s
#print(prelude_file + ' ratio: ' + str(self.prelude_ratio))

if prelude_str != None:
self.prelude_str = prelude_str
if self.prelude_ratio == 0.0:
self.prelude_ratio = self._compress(prelude_str)

Expand Down Expand Up @@ -193,22 +194,28 @@ class Zippy:
def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None:
self.ENGINE = engine
self.PRESET = preset
self.PRELUDE_FILE = prelude_file
if prelude_file == PRELUDE_FILE:
self.PRELUDE_FILE = str(files('zippy').joinpath(PRELUDE_FILE))
self.PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
else:
self.PRELUDE_FILE = prelude_file
with open(self.PRELUDE_FILE, encoding='utf-8') as fp:
self.PRELUDE_STR = clean_text(fp.read())
if engine == CompressionEngine.LZMA:
if self.PRESET:
self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
else:
self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE)
self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR)
elif engine == CompressionEngine.BROTLI:
if self.PRESET:
self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
else:
self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE)
self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR)
elif engine == CompressionEngine.ZLIB:
if self.PRESET:
self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
else:
self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE)
self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR)

def run_on_file(self, filename : str) -> Optional[Score]:
'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''
Expand Down

0 comments on commit b4faecf

Please sign in to comment.