Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
hinthornw committed Oct 31, 2024
0 parents commit e05f83a
Show file tree
Hide file tree
Showing 15 changed files with 2,350 additions and 0 deletions.
165 changes: 165 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.pyc
.DS_Store
build/

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 William FH

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# promptimizer

Prompt optimization trainer.

Example:

```shell
uv venv
source .venv/bin/activate
uv pip install -e .
python main.py tweet
```

Script:

```shell
usage: main.py [-h] [--batch-size BATCH_SIZE] [--train-size TRAIN_SIZE] [--epochs EPOCHS]
{scone,tweet,metaprompt,simpleqa}

Optimize prompts for different tasks.

positional arguments:
{scone,tweet,metaprompt,simpleqa}
Task to optimize

options:
-h, --help show this help message and exit
--batch-size BATCH_SIZE
Batch size for optimization
--train-size TRAIN_SIZE
Training size for optimization
--epochs EPOCHS Number of epochs for optimization
```

Currently has ~4 tasks:

- tweet: write tweets
- simpleqa: really hard Q&A
- scone: NLI
- metaprompt: optimize the metaprompt itself over the above 3 tasks

![run](./static/optimizer.gif)
54 changes: 54 additions & 0 deletions design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Design Notes

Something like this:
```python
@dataclass
class Task:
"""Represents a specific task for prompt optimization."""

name: str
train_datasets: str
dev_dataset_name: str
test_dataset_name: str
evaluators: list[Callable[[Run, Example], dict]]
initial_prompt: str
system: Runnable # Prompt + LLM most likely, where the prompt
max_iterations: int = 1
    baseline_experiment: UUID | None = None  # non-default field cannot follow max_iterations
```

Trainer loop:

1. If baseline_experiment is set, fetch metrics for that, otherwise, run current prompt on dev dataset and get baseline scores.
2. Train:
```python
- list examples in train dataset
for x in epochs:
- Shuffle examples list & truncate to train size (default None) & split into batches
- For batch in batches:
- Run aevaluate on batch
- Format feedback
- Use current metaprompt to update the task-specific prompt
```
3. Test
   - Run the initial/baseline prompt on the test dataset and get scores
   - Run the optimized prompt on the test dataset and get scores
   - Print out comparisons



Then for optimizing the metaprompt, this itself can be framed as a task, but the LangSmith dataset would be more of a reference than actual values.

The metaprompt task would be like:
- train dataset: each example's inputs is the name of a sub-task(?) and subset of the train dataset?
- system would be some object that:
initializes with a map of the train dataset to the task objects for the tasks it's trying to optimize on.
prelim - loads a batch of the subtask-specific examples based on the subset in that example. Looks up evaluators, etc. for this task from the map on the object.
the evaluator looks at the run outputs
1. Run initial task prompt over that batch
2. Run task evals on the results
3. Run metaprompt to generate new prompt
4. Run task evals on the new results
5. return a dict of {original: ..., new: ..., original_scores: ..., new_scores: ....}
6. The evaluator takes those outputs and combines them into a single score. Compare the results and assert they are monotonically improving. And/or could run a preference LLM-as-judge evaluator.
so basically this task's evaluator would be 1 if better, 0.5 if same, 0 if worse (or something; this is just an example)
55 changes: 55 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import asyncio
import argparse

from prompt_optimizer.trainer import PromptOptimizer
from langchain_anthropic import ChatAnthropic
import langsmith as ls
from prompt_optimizer.tasks.scone import scone_task
from prompt_optimizer.tasks.tweet_generator import tweet_task
from prompt_optimizer.tasks.metaprompt import metaprompt_task
from prompt_optimizer.tasks.simpleqa import simpleqa_task


# Registry of optimizable tasks, keyed by the CLI task name
# (used both for dispatch in `run` and for argparse `choices`).
tasks = {
    "scone": scone_task,
    "tweet": tweet_task,
    "metaprompt": metaprompt_task,
    "simpleqa": simpleqa_task,
}


# Shared optimizer instance used for every task.
# max_tokens_to_sample=8192 — presumably so a full rewritten prompt fits in a
# single completion; TODO confirm against PromptOptimizer's output handling.
optimizer = PromptOptimizer(
    model=ChatAnthropic(model="claude-3-5-sonnet-20241022", max_tokens_to_sample=8192),
)


async def run(task_name: str, batch_size: int, train_size: int, epochs: int):
    """Optimize the prompt for *task_name* under the 'Optim' tracing project.

    Raises:
        ValueError: if *task_name* is not registered in ``tasks``.
    """
    task = tasks.get(task_name)
    if not task:
        raise ValueError(f"Unknown task: {task_name}")

    with ls.tracing_context(project_name="Optim"):
        result = await optimizer.optimize_prompt(
            task,
            batch_size=batch_size,
            train_size=train_size,
            epochs=epochs,
        )
    return result


if __name__ == "__main__":
    # CLI entry point: choose a task and size the optimization loop.
    cli = argparse.ArgumentParser(description="Optimize prompts for different tasks.")
    cli.add_argument("task", choices=list(tasks), help="Task to optimize")
    cli.add_argument(
        "--batch-size", type=int, default=40, help="Batch size for optimization"
    )
    cli.add_argument(
        "--train-size", type=int, default=40, help="Training size for optimization"
    )
    cli.add_argument(
        "--epochs", type=int, default=2, help="Number of epochs for optimization"
    )
    ns = cli.parse_args()
    # Drive the async optimization loop to completion and report the outcome.
    print(asyncio.run(run(ns.task, ns.batch_size, ns.train_size, ns.epochs)))
11 changes: 11 additions & 0 deletions prompt_optimizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import tqdm


# Deactivate tqdm by replacing it with a no-op function
def tqdm_noop(*args, **kwargs):
    """Stand-in for tqdm that adds no progress bar.

    Returns the wrapped iterable untouched when one is passed as the first
    positional argument; otherwise returns the raw ``args`` tuple.
    """
    first = args[0] if args else None
    if hasattr(first, "__iter__"):
        return first
    return args


# Monkey-patch: shadow the `tqdm.auto` attribute on the tqdm package with the
# no-op so `tqdm.auto(...)` calls skip progress bars.
# NOTE(review): this likely does not affect modules that already did
# `from tqdm.auto import tqdm` before this package was imported — confirm the
# import order actually makes this patch effective.
tqdm.auto = tqdm_noop
Loading

0 comments on commit e05f83a

Please sign in to comment.