2024-01-11 Updates #34

Merged (7 commits, Jan 12, 2024)
30 changes: 30 additions & 0 deletions .github/workflows/test-all.yaml
@@ -0,0 +1,30 @@
name: Test all versions

on:
  workflow_dispatch:
  push:

jobs:
  test-py-3-9:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Test Python 3.9
        run: make test_3_9

  test-py-3-10:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Test Python 3.10
        run: make test_3_10

  test-py-3-11:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Test Python 3.11
        run: make test_3_11
1 change: 0 additions & 1 deletion .gitignore
@@ -6,7 +6,6 @@ venv/

# Pickled models
*.model
*.pkl
*.bin

# Mac OSX
1 change: 0 additions & 1 deletion .ruff.toml

This file was deleted.

9 changes: 6 additions & 3 deletions Makefile
@@ -31,15 +31,18 @@ test: test_3_9 test_3_10 test_3_11 ## Test all container versions

.PHONY: test_3_9
test_3_9: build_3_9 ## Test Python 3.9 pickle
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_9:latest --dataset=v4.1/live.parquet --model ${PWD}/tests/models/model_3_9.pkl
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_9:latest --model ${PWD}/tests/models/model_3_9_legacy.pkl
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_9:latest --model ${PWD}/tests/models/model_3_9.pkl

.PHONY: test_3_10
test_3_10: build_3_10 ## Test Python 3.10 pickle
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_10:latest --dataset=v4.1/live.parquet --model ${PWD}/tests/models/model_3_10.pkl
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_10:latest --model ${PWD}/tests/models/model_3_10_legacy.pkl
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_10:latest --model ${PWD}/tests/models/model_3_10.pkl

.PHONY: test_3_11
test_3_11: build_3_11 ## Test Python 3.11 pickle
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_11:latest --dataset=v4.1/live.parquet --model ${PWD}/tests/models/model_3_11.pkl
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_11:latest --model ${PWD}/tests/models/model_3_11_legacy.pkl
	docker run -i --rm -v ${PWD}:${PWD} -v /tmp:/tmp ${NAME}_py_3_11:latest --model ${PWD}/tests/models/model_3_11.pkl

.PHONY: push_latest
push_latest: push_latest_3_9 push_latest_3_10 push_latest_3_11 ## Push latest docker containers
164 changes: 96 additions & 68 deletions predict.py
@@ -10,6 +10,7 @@
import time
import random
import glob
from inspect import signature

from numerapi import NumerAPI
import pandas as pd
@@ -20,12 +21,13 @@ def parse_args():
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--dataset",
default="v4.2/live_int8.parquet",
default="v4.3/live_int8.parquet",
help="Numerapi dataset path or local file.",
)
group.add_argument(
"--dataset-glob",
help="Glob pattern to match multiple datasets.",
"--benchmarks",
default="v4.3/live_benchmark_models.parquet",
help="Numerapi benchmark model path or local file.",
)
parser.add_argument("--model", required=True, help="Pickled model file or URL")
parser.add_argument("--output_dir", default="/tmp", help="File output dir")
@@ -60,13 +62,15 @@ def parse_args():
return args


def py_version(separator='.'):
return separator.join(sys.version.split('.')[:2])
def py_version(separator="."):
return separator.join(sys.version.split(".")[:2])


def exit_with_help(error):
git_ref = os.getenv('GIT_REF', 'latest')
docker_image_path = f"ghcr.io/numerai/numerai_predict_py_{py_version('_')}:{git_ref}"
git_ref = os.getenv("GIT_REF", "latest")
docker_image_path = (
f"ghcr.io/numerai/numerai_predict_py_{py_version('_')}:{git_ref}"
)
docker_args = "--debug --model $PWD/[PICKLE_FILE]"

logging.root.handlers[0].flush()
@@ -88,6 +92,47 @@ def exit_with_help(error):
sys.exit(error)


def retry_request_with_backoff(
    url: str,
    retries: int = 10,
    delay_base: float = 1.5,
    delay_exp: float = 1.5,
    retry_on_status_codes: list[int] = [503],
):
    delay_base = max(1.1, delay_base)
    delay_exp = max(1.1, delay_exp)
    curr_delay = delay_base
    for i in range(retries):
        response = requests.get(url, stream=True, allow_redirects=True)
        if response.status_code in retry_on_status_codes:
            time.sleep(curr_delay)
            curr_delay **= random.uniform(1, delay_exp)
        elif response.status_code != 200:
            logging.error(f"{response.reason} {response.text}")
            sys.exit(1)
        else:
            return response
    raise RuntimeError(f"Could not complete function call after {retries} retries...")
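For intuition, here is a minimal sketch (not part of this diff) of the delay schedule the helper above produces with its defaults, where each 503 response raises the current delay to a random power between 1 and delay_exp:

import random

delay = 1.5  # delay_base
for attempt in range(10):  # retries
    # On a 503 the helper sleeps, then jitters the next delay upward via
    # curr_delay **= random.uniform(1, delay_exp), so waits grow with jitter.
    print(f"attempt {attempt}: sleep {delay:.2f}s if the server returns 503")
    delay **= random.uniform(1, 1.5)  # delay_exp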


def get_data(dataset, output_dir):
    if os.path.exists(dataset):
        dataset_path = dataset
        logging.info(f"Using local {dataset_path} for live data")
    elif dataset.startswith("/"):
        logging.error(f"Local dataset not found - {dataset} does not exist!")
        exit_with_help(1)
    else:
        dataset_path = os.path.join(output_dir, dataset)
        logging.info(f"Using NumerAPI to download {dataset} for live data")
        napi = NumerAPI()
        napi.download_dataset(dataset, dataset_path)
    logging.info(f"Loading live features {dataset_path}")
    live_features = pd.read_parquet(dataset_path)
    return live_features
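A brief usage sketch of get_data (the local path below is illustrative; the dataset name mirrors the new --dataset default): a NumerAPI dataset name is downloaded into output_dir before loading, while a path that already exists on disk is read directly.

live = get_data("v4.3/live_int8.parquet", "/tmp")       # downloads via NumerAPI, then reads
live = get_data("/tmp/v4.3/live_int8.parquet", "/tmp")  # existing local file, read as-is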


def main(args):
logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)

@@ -96,11 +141,7 @@ def main(args):
if args.model.lower().startswith("http"):
truncated_url = args.model.split("?")[0]
logging.info(f"Downloading model {truncated_url}")
response = requests.get(args.model, stream=True, allow_redirects=True)
if response.status_code != 200:
logging.error(f"{response.reason} {response.text}")
sys.exit(1)

response = retry_request_with_backoff(args.model)
model_name = truncated_url.split("/")[-1]
model_pkl = os.path.join(args.output_dir, model_name)
logging.info(f"Saving model to {model_pkl}")
@@ -128,74 +169,61 @@ def main(args):
exit_with_help(1)
logging.debug(model)

datasets = []
if args.dataset_glob:
datasets = glob.glob(args.dataset_glob)
if len(datasets) == 0:
logging.error(f"No datasets found matching \"{args.dataset_glob}\"")
exit_with_help(1)
else:
datasets = [args.dataset]

all_predictions = []
for dataset in datasets:
if os.path.exists(dataset):
dataset_path = dataset
logging.info(f"Using local {dataset_path} for live data")
elif dataset.startswith("/"):
logging.error(f"Local dataset not found - {dataset} does not exist!")
exit_with_help(1)
else:
dataset_path = os.path.join(args.output_dir, dataset)
logging.info(f"Using NumerAPI to download {dataset} for live data")
napi = NumerAPI()
napi.download_dataset(dataset, dataset_path)
num_args = len(signature(model).parameters)

logging.info(f"Loading live features {dataset_path}")
live_features = pd.read_parquet(dataset_path)
live_features = get_data(args.dataset, args.output_dir)
if num_args > 1:
benchmark_models = get_data(args.benchmarks, args.output_dir)

logging.info(f"Predicting on {len(live_features)} rows of live features")
try:
logging.info(f"Predicting on {len(live_features)} rows of live features")
try:
if num_args == 1:
predictions = model(live_features)
if predictions is None:
logging.error("Pickle function is invalid - returned None")
exit_with_help(1)
elif type(predictions) != pd.DataFrame:
logging.error(
f"Pickle function is invalid - returned {type(predictions)} instead of pd.DataFrame"
)
exit_with_help(1)
elif len(predictions) == 0:
logging.error("Pickle function returned 0 predictions")
exit_with_help(1)
elif predictions.isna().any().any():
logging.error("Pickle function returned at least 1 NaN prediction")
exit_with_help(1)
elif not (predictions.iloc[:, 0].between(0, 1).all().all()):
logging.error(
"Pickle function returned invalid predictions. Ensure values are between 0 and 1."
)
exit_with_help(1)
except TypeError as e:
logging.error(f"Pickle function is invalid - {e}")
if args.debug:
logging.exception(e)
elif num_args == 2:
predictions = model(live_features, benchmark_models)
else:
logging.error(
f"Invalid pickle function - {model_pkl} must have 1 or 2 arguments"
)
exit_with_help(1)
except Exception as e:
logging.exception(e)

if predictions is None:
logging.error("Pickle function is invalid - returned None")
exit_with_help(1)
elif type(predictions) != pd.DataFrame:
logging.error(
f"Pickle function is invalid - returned {type(predictions)} instead of pd.DataFrame"
)
exit_with_help(1)
elif len(predictions) == 0:
logging.error("Pickle function returned 0 predictions")
exit_with_help(1)
elif predictions.isna().any().any():
logging.error("Pickle function returned at least 1 NaN prediction")
exit_with_help(1)
elif not (predictions.iloc[:, 0].between(0, 1).all().all()):
logging.error(
"Pickle function returned invalid predictions. Ensure values are between 0 and 1."
)
exit_with_help(1)
except TypeError as e:
logging.error(f"Pickle function is invalid - {e}")
if args.debug:
logging.exception(e)
exit_with_help(1)
except Exception as e:
logging.exception(e)
exit_with_help(1)

logging.info(f"Generated {len(predictions)} predictions")
logging.debug(predictions)
all_predictions.append(predictions)
logging.info(f"Generated {len(predictions)} predictions")
logging.debug(predictions)

all_predictions = pd.concat(all_predictions)
predictions_csv = os.path.join(
args.output_dir, f"live_predictions-{secrets.token_hex(6)}.csv"
)
logging.info(f"Saving predictions to {predictions_csv}")
with open(predictions_csv, "w") as f:
all_predictions.to_csv(f)
predictions.to_csv(f)

if args.post_url:
logging.info(f"Uploading predictions to {args.post_url}")
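Taken together, the new dispatch and validation in predict.py mean a pickled model must be a callable taking either one argument (live features) or two (live features plus benchmark models) and returning a non-empty pd.DataFrame whose first column lies in [0, 1] with no NaNs. A minimal sketch of a compliant two-argument function follows; the parameter and column names are illustrative, not required by this diff:

import pandas as pd

def predict(live_features: pd.DataFrame, benchmark_models: pd.DataFrame) -> pd.DataFrame:
    # Illustrative only: emit a constant score; a real model would use the
    # feature columns (and, optionally, the benchmark predictions).
    preds = pd.DataFrame(index=live_features.index)
    preds["prediction"] = 0.5  # values must be in [0, 1] and contain no NaNs
    return preds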
11 changes: 7 additions & 4 deletions requirements.txt
@@ -29,7 +29,7 @@ idna==3.4
jax==0.4.11
Jinja2==3.1.2
joblib==1.2.0
keras==2.14.0
keras==2.15.0
kiwisolver==1.4.4
libclang==16.0.0
lightgbm==4.0.0
@@ -38,6 +38,7 @@ Markdown==3.4.3
MarkupSafe==2.1.3
matplotlib==3.7.1
ml-dtypes==0.2.0
mlxtend==0.23.1
mpmath==1.3.0
multidict==6.0.4
networkx==3.1
@@ -66,12 +67,14 @@ rsa==4.9
scikit-learn==1.2.2
scipy==1.10.1
six==1.16.0
statsmodels==0.14.1
sympy==1.12
tenacity==8.2.2
tensorboard==2.14.1
tensorboard==2.15.0
tensorboard-data-server==0.7.2
tensorflow==2.14.0
tensorflow-estimator==2.14.0
tensorflow==2.15.0
tensorflow-decision-forests==1.8.1
tensorflow-estimator==2.15.0
tensorflow-io-gcs-filesystem==0.34.0
termcolor==2.3.0
threadpoolctl==3.1.0
Binary file modified tests/models/model_3_10.pkl
Binary file added tests/models/model_3_10_legacy.pkl
Binary file modified tests/models/model_3_11.pkl
Binary file added tests/models/model_3_11_legacy.pkl
Binary file modified tests/models/model_3_9.pkl
Binary file added tests/models/model_3_9_legacy.pkl