Skip to content

Commit

Permalink
Merge pull request Haidra-Org#78 from ceruleandeep/feature/additional…
Browse files Browse the repository at this point in the history
…TextModelFields

Generate additional fields in text models db.json
  • Loading branch information
db0 authored Sep 16, 2024
2 parents 669bd18 + aa75ffb commit 65d9dfa
Show file tree
Hide file tree
Showing 6 changed files with 10,976 additions and 3,947 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,30 @@
# AI-Horde-text-model-reference
A refeference of text models that can be used in the AI Horde

#### Note to maintainers:
## How the `models.csv` file is converted to `db.json` by `convert.py`

All columns in the CSV file are added to the model record, with the following remarks:

- name: processed according to the needs of KoboldCPP and Aphrodite
- parameters_bn: converted to an integer `parameters` field in the model record
- tags: converted from a comma-separated string to a list of tags
- settings: converted from a JSON string to a dictionary
- model_name: assumed not present in CSV, set to the second part of slash-separated `name`
(rationale: may enable front-ends to submit requests for models regardless of the prefix)
- display_name: if not present in CSV, set to a spaced-out version of `model_name`

Conversion will also:

- add the model's style to the tags (so that models with several styles can be filtered by tag)
- add a tag for the number of parameters (may simplify filtering by model size)

Keys and values from defaults.json are always present in the model record.
Values from defaults.json are used to fill in missing fields in the CSV file.

Keys from generation_params.json are used to validate the 'settings' field.
Values from generation_params.json are not used.

## Note to maintainers:

As of a [recent PR](https://github.com/Haidra-Org/AI-Horde-text-model-reference/pull/16), you will have to make sure that:

Expand Down
184 changes: 109 additions & 75 deletions convert.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,109 @@
import csv
import json
import os

input_file = "models.csv"
output_file = "db.json"

data = {}

with open(input_file, newline='') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
print(row)
name = row[0]
model_name = name.split("/")[1] if "/" in name else name
params_str = row[1] # Store the parameter value as a string

# Convert the parameter value to billions
try:
params = int(float(params_str) * 1_000_000_000)
except ValueError as e:
print(e)
print(f"Error converting {params_str} in {name} to an integer")
exit(1)

data[name] = {
"name": name,
"baseline": "opt",
"parameters": int(params),
"description": "Add me",
"version": "1",
"style": "generalist",
"nsfw": False
}
data[f"aphrodite/{name}"] = {
"name": f"aphrodite/{name}",
"baseline": "opt",
"parameters": int(params),
"description": "Add me",
"version": "1",
"style": "generalist",
"nsfw": False
}
data[f"koboldcpp/{model_name}"] = {
"name": f"koboldcpp/{model_name}",
"baseline": "opt",
"parameters": int(params),
"description": "Add me",
"version": "1",
"style": "generalist",
"nsfw": False
}

TESTS_ONGOING = os.getenv("TESTS_ONGOING", False)
if TESTS_ONGOING and os.path.exists(output_file):
# If tests are ongoing, we don't want to overwrite the db.json file
# Instead, we'll write to a new file and make sure the two files are the same
# by comparing them as strings
with open("db_test.json", "w") as f:
json.dump(data, f, indent=4)
f.write("\n")

with open(output_file, "r") as f:
old_data = f.read()

with open("db_test.json", "r") as f:
new_data = f.read()

if old_data != new_data:
print("db.json and db_test.json are different. Did you forget to run `convert.py`?")
exit(1)
else:
with open(output_file, "w") as f:
json.dump(data, f, indent=4)
f.write("\n")
import csv
import json
import os
import re

input_file = "models.csv"
output_file = "db.json"

# All columns in the CSV file are added to the model record
# with the following remarks:
# - name: processed according to the needs of KoboldCPP and Aphrodite
# - parameters_bn: converted to an integer 'parameters' field in the model record
# - tags: converted from a comma-separated string to a list of tags
# - settings: converted from a JSON string to a dictionary
# - model_name: assumed not present in CSV, set to the second part of slash-separated 'name'
# (rationale: may enable front-ends to submit requests for models regardless of the prefix)
# - display_name: if not present in CSV, set to a spaced-out version of model_name

# We also:
# - add the model's style to the tags (so that models with several styles can be filtered by tag)
# - add a tag for the number of parameters (may simplify filtering by model size)

# Keys and values from defaults.json are always present in the model record.
# Values from defaults.json are used to fill in missing fields in the CSV file.
defaults = json.load(open("defaults.json", "r"))

# Keys from generation_params.json are used to validate the 'settings' field.
# Values from generation_params.json are not used.
params = json.load(open("generation_params.json", "r"))

data = {}

with open(input_file, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
row: dict
name = row.pop("name")
model_name = name.split("/")[1] if "/" in name else name

# Convert the parameter value to billions
params_str = row.pop("parameters_bn")
try:
params_f = float(params_str)
row["parameters"] = int(params_f * 1_000_000_000)
except ValueError as e:
print(e)
print(f"Error converting {params_str} in {name} to an integer")
exit(1)

# Convert the tags to a list
# assuming they are comma-separated
tags = set([t.strip() for t in row["tags"].split(",")] if row["tags"] else [])
if style := row.get("style"):
tags.add(style)
tags.add(f"{round(params_f, 0):.0f}B")
row["tags"] = sorted(list(tags))

# Convert the settings to a dictionary
# assuming it's a JSON dictionary
try:
row["settings"] = json.loads(row["settings"]) if row["settings"] else {}
assert isinstance(row["settings"], dict), f"{name}: settings must be a JSON dictionary"
assert all(k in params for k in row["settings"]), f"{name}: settings must be a subset of generation_params.json"
except json.JSONDecodeError as e:
print(e)
print(f"Error decoding settings for {name}")
exit(1)

# Set a display name if it's not already set
# by adding spaces around underscores and hyphens
if not row.get("display_name"):
row["display_name"] = re.sub(r" +", " ", re.sub(r"[-_]", " ", model_name)).strip()

# Remove empty values
row = {k: v for k, v in row.items() if v}

# Add the model record to the data
for key_format in ["{name}", "aphrodite/{name}", "koboldcpp/{model_name}"]:
key = key_format.format(name=name, model_name=model_name)
data[key] = {"name": key, "model_name": model_name, **defaults, **row}

# data[name] = {"name": name, "model_name": model_name, **defaults, **row}
# data[f"aphrodite/{name}"] = {"name": f"aphrodite/{name}", "model_name": model_name, **defaults, **row}
# data[f"koboldcpp/{model_name}"] = {"name": f"koboldcpp/{model_name}", "model_name": model_name, **defaults, **row}

TESTS_ONGOING = os.getenv("TESTS_ONGOING", False)
if TESTS_ONGOING and os.path.exists(output_file):
# If tests are ongoing, we don't want to overwrite the db.json file
# Instead, we'll write to a new file and make sure the two files are the same
# by comparing them as strings
with open("db_test.json", "w") as f:
json.dump(data, f, indent=4)
f.write("\n")

with open(output_file, "r") as f:
old_data = f.read()

with open("db_test.json", "r") as f:
new_data = f.read()

if old_data != new_data:
print(
"db.json and db_test.json are different. Did you forget to run `convert.py`?"
)
exit(1)
else:
with open(output_file, "w") as f:
json.dump(data, f, indent=4)
f.write("\n")
Loading

0 comments on commit 65d9dfa

Please sign in to comment.