From 34b8cab68cde2e1f2a81a5a6bdb6275dae4c5e62 Mon Sep 17 00:00:00 2001 From: Santiago Somoza <45318759+santi1234567@users.noreply.github.com> Date: Wed, 15 May 2024 22:24:37 -0300 Subject: [PATCH 1/5] Support mlp classifier (#33) * rename knn_classifier to classifier * remove deprecated file * support changing classifier type * add classifier type flag * update readme * linting --- README.md | 15 +++++++---- build_db.py | 4 +-- knn_classifier.py => classifier.py | 41 +++++++++++++++++++++++------- compute_periods.py | 2 +- interactive.ipynb | 4 +-- multi_classifier.py | 2 +- prepare_training_data.py | 4 +-- tests/test_classifier_persister.py | 2 +- 8 files changed, 51 insertions(+), 23 deletions(-) rename knn_classifier.py => classifier.py (88%) diff --git a/README.md b/README.md index 8ec1b58..461e4e9 100644 --- a/README.md +++ b/README.md @@ -36,11 +36,16 @@ pip install -r requirements.txt pip install -r requirements-dev.txt ``` -### k-NN Classifier +### The Classifier -Blockprint's classifier is a k-nearest neighbours classifier in `knn_classifier.py`. +Blockprint's classifier utilizes one of two machine learning algorithms: -See `./knn_classifier.py --help` for command line options including cross +- K-nearest neighbours +- Multi-layer Perceptron + +These can be chosen with the `--classifier-type` flag in `classifier.py`. + +See `./classifier.py --help` for more command line options including cross validation (CV) and manual classification. ### Training the Classifier @@ -81,10 +86,10 @@ testdata_proc └── 0x7fedb0da9699c93ce66966555c6719e1159ae7b3220c7053a08c8f50e2f3f56f.json ``` -You can then use this directory as the datadir argument to `./knn_classifier.py`: +You can then use this directory as the datadir argument to `./classifier.py`: ``` -./knn_classifier.py testdata_proc --classify testdata +./classifier.py testdata_proc --classify testdata ``` If you then want to use the classifier to build an sqlite database: diff --git a/build_db.py b/build_db.py index 0726bc9..002df7b 100755 --- a/build_db.py +++ b/build_db.py @@ -4,7 +4,7 @@ import json import sqlite3 import argparse -from knn_classifier import Classifier +from classifier import Classifier from multi_classifier import MultiClassifier from prepare_training_data import CLIENTS @@ -370,7 +370,7 @@ def main(): if args.multi_classifier: classifier = MultiClassifier(data_dir) else: - print("loading single KNN classifier") + print("loading single classifier") classifier = Classifier(data_dir) print("loaded") diff --git a/knn_classifier.py b/classifier.py similarity index 88% rename from knn_classifier.py rename to classifier.py index d43aa28..9463b0f 100755 --- a/knn_classifier.py +++ b/classifier.py @@ -9,12 +9,16 @@ import pickle from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier from sklearn.model_selection import cross_validate from feature_selection import * # noqa F403 from feature_selection import ALL_FEATURES from prepare_training_data import CLIENTS, classify_reward_by_graffiti K = 9 + +MLP_HIDDEN_LAYER_SIZES = (390, 870) + WEIGHTS = "distance" MIN_GUESS_THRESHOLD = 0.20 @@ -69,6 +73,8 @@ def __init__( graffiti_only_clients=DEFAULT_GRAFFITI_ONLY, features=DEFAULT_FEATURES, enable_cv=False, + classifier_type="knn", + hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES, ): graffiti_only_clients = set(graffiti_only_clients) @@ -82,6 +88,8 @@ def __init__( set(grouped_clients) & graffiti_only_clients == set() ), "clients must not be both graffiti-only and grouped" + assert 
classifier_type in ["knn", "mlp"], "classifier_type must be knn or mlp" + feature_matrix = [] training_labels = [] @@ -118,18 +126,24 @@ def __init__( feature_matrix = np.array(feature_matrix) - knn = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + if classifier_type == "knn": + classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + elif classifier_type == "mlp": + classifier = MLPClassifier( + hidden_layer_sizes=hidden_layer_sizes, max_iter=1000 + ) + # Assert above makes sure that classifier_type is one of the valid types if enable_cv: self.scores = cross_validate( - knn, feature_matrix, training_labels, scoring="balanced_accuracy" + classifier, feature_matrix, training_labels, scoring="balanced_accuracy" ) else: self.scores = None - knn.fit(feature_matrix, training_labels) + classifier.fit(feature_matrix, training_labels) - self.knn = knn + self.classifier = classifier self.enabled_clients = enabled_clients self.graffiti_only_clients = set(graffiti_only_clients) self.features = features @@ -145,7 +159,7 @@ def classify(self, block_reward): return (graffiti_guess, graffiti_guess, prob_by_client, graffiti_guess) row = into_feature_row(block_reward, self.features) - res = self.knn.predict_proba([row]) + res = self.classifier.predict_proba([row]) prob_by_client = { client: res[0][i] for i, client in enumerate(self.enabled_clients) @@ -219,7 +233,7 @@ def compute_best_guess(probability_map) -> str: def parse_args(): - parser = argparse.ArgumentParser("KNN testing and cross validation") + parser = argparse.ArgumentParser("Classifier testing and cross validation") parser.add_argument("data_dir", help="training data directory") parser.add_argument("--classify", help="data to classify") @@ -235,6 +249,12 @@ def parse_args(): parser.add_argument( "--group", default=[], nargs="+", help="clients to group during classification" ) + parser.add_argument( + "--classifier-type", + default="knn", + choices=["knn", "mlp"], + help="the type of classifier to use", + ) parser.add_argument( "--persist", action="store_true", @@ -280,7 +300,7 @@ def main(): grouped_clients = args.group should_persist = args.should_persist graffiti_only = args.graffiti_only - + classifier_type = args.classifier_type disabled_clients = args.disable enabled_clients = [ client @@ -310,6 +330,7 @@ def main(): graffiti_only_clients=graffiti_only, features=feature_vec, enable_cv=True, + classifier_type=classifier_type, ) print(f"enabled clients: {classifier.enabled_clients}") print(f"classifier scores: {classifier.scores['test_score']}") @@ -327,7 +348,9 @@ def main(): assert classify_dir is not None, "classify dir required" print(f"classifying all data in directory {classify_dir}") print(f"grouped clients: {grouped_clients}") - classifier = Classifier(data_dir, grouped_clients=grouped_clients) + classifier = Classifier( + data_dir, grouped_clients=grouped_clients, classifier_type=classifier_type + ) if args.plot is not None: classifier.plot_feature_matrix(args.plot) @@ -354,7 +377,7 @@ def main(): print(f"total blocks processed: {total_blocks}") if should_persist: - persist_classifier(classifier, "knn_classifier") + persist_classifier(classifier, "classifier") for multilabel, num_blocks in sorted(frequency_map.items()): percentage = round(num_blocks / total_blocks, 4) diff --git a/compute_periods.py b/compute_periods.py index f2b708c..192e3ae 100644 --- a/compute_periods.py +++ b/compute_periods.py @@ -5,7 +5,7 @@ import sqlite3 import requests import statistics -from knn_classifier import compute_best_guess +from 
classifier import compute_best_guess from prepare_training_data import CLIENTS from build_db import block_row_to_obj diff --git a/interactive.ipynb b/interactive.ipynb index c6f591b..9facf34 100644 --- a/interactive.ipynb +++ b/interactive.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "from knn_classifier import Classifier, DEFAULT_FEATURES" + "from classifier import Classifier, DEFAULT_FEATURES" ] }, { @@ -19,7 +19,7 @@ "source": [ "datadir = \"data/mainnet/training/slots_3481601_to_3702784_bal2x\"\n", "disabled_clients = []\n", - "features = ['percent_redundant', 'percent_pairwise_ordered', 'norm_reward']\n", + "features = [\"percent_redundant\", \"percent_pairwise_ordered\", \"norm_reward\"]\n", "\n", "classifier = Classifier(datadir, disabled_clients=disabled_clients, features=features)" ] diff --git a/multi_classifier.py b/multi_classifier.py index 4ebfc82..d898734 100644 --- a/multi_classifier.py +++ b/multi_classifier.py @@ -1,6 +1,6 @@ import os -from knn_classifier import Classifier +from classifier import Classifier def start_and_end_slot(sub_dir_name) -> (int, int): diff --git a/prepare_training_data.py b/prepare_training_data.py index c7e5941..a9a2c63 100755 --- a/prepare_training_data.py +++ b/prepare_training_data.py @@ -75,13 +75,13 @@ def process_file( def parse_args(): - parser = argparse.ArgumentParser("create training data for the KNN classifier") + parser = argparse.ArgumentParser("create training data for the classifier") parser.add_argument( "raw_data_dir", help="input containing data to classify using graffiti" ) parser.add_argument( - "proc_data_dir", help="output for processed data, suitable for KNN training" + "proc_data_dir", help="output for processed data, suitable for training" ) parser.add_argument( "--disable", diff --git a/tests/test_classifier_persister.py b/tests/test_classifier_persister.py index 9de8d27..d2c8db4 100644 --- a/tests/test_classifier_persister.py +++ b/tests/test_classifier_persister.py @@ -2,7 +2,7 @@ import json import os from typing import Any, Dict, List -from knn_classifier import Classifier, persist_classifier +from classifier import Classifier, persist_classifier from prepare_training_data import CLIENTS From f6c9509a23bc1326b5842e4180917376a9c0242a Mon Sep 17 00:00:00 2001 From: Tarun Date: Thu, 16 May 2024 16:02:23 +0200 Subject: [PATCH 2/5] First version of a dockerized setup --- .env.sample | 4 ++ Caddyfile.sample | 89 ++++++++++++++++++++++++++++++++++++++++++++ Dockerfile | 8 ++++ docker-compose.yml | 82 ++++++++++++++++++++++++++++++++++++++++ dreamer/example.toml | 16 ++++++++ 5 files changed, 199 insertions(+) create mode 100644 .env.sample create mode 100644 Caddyfile.sample create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 dreamer/example.toml diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..0025912 --- /dev/null +++ b/.env.sample @@ -0,0 +1,4 @@ +BN_URL= +BP_URL= +DATA_DIR= +PROXY_PORT= \ No newline at end of file diff --git a/Caddyfile.sample b/Caddyfile.sample new file mode 100644 index 0000000..ae86a0f --- /dev/null +++ b/Caddyfile.sample @@ -0,0 +1,89 @@ +# Access to everything. +(authorised-super-users) { +} + +# Access to blockprint's private API and nothing else. +(authorised-blockprint-users) { +} + +# Access to the gauge /classify method, but not blockprint's private API. 
+(authorised-blockprint-workers) { +} + +http:// { + # tls /certs/cert.pem /certs/key.pem + log { + output stderr + format filter { + wrap console + fields { + request>headers>Authorization delete + } + } + } + + encode gzip zstd + + @public { + path /blocks_per_client/* + path /sync/status + path /sync/gaps + } + + @private { + path /validator/* + path /blocks/* + path /confusion/* + } + + @gauge-classify { + path /gauge/classify + } + + @gauge-accuracy { + path /confusion + path /accuracy + path /gauge/accuracy + path /gauge/confusion + } + + @eleel { + path /eleel + path /eleel/ + } + @eleel-canonical { + path /eleel/canonical + } + + reverse_proxy @public bp:8000 + reverse_proxy @private bp:8000 + + reverse_proxy @gauge-accuracy blockgauge:8002 { + rewrite /accuracy + } + + reverse_proxy @gauge-classify blockgauge:8002 { + rewrite /classify + } + + reverse_proxy @eleel localhost:8552 { + rewrite / + } + respond @eleel-canonical 400 { + body "{\"error\": \"nice try\"}" + close + } + + handle_errors { + respond "{\"error\": \"{http.error.status_code} {http.error.status_text}\"}" + } + + basicauth @private { + import authorised-super-users + import authorised-blockprint-users + } + basicauth @gauge-classify { + import authorised-super-users + import authorised-blockprint-workers + } +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8da7ef2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.9 +# Or any preferred Python version. +WORKDIR /app +COPY ./*.py . +ADD requirements.txt . +RUN pip install -r requirements.txt +# Or enter the name of your unique directory and parameter set. +CMD ["gunicorn", "--timeout", "1800", "--bind", "0.0.0.0:8000", "api_server:app"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..b02d098 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,82 @@ +networks: + bp_cluster: + +services: + bp: + profiles: + - "" + - "server" + build: + context: . + dockerfile: Dockerfile + environment: + - BLOCK_DB=/app/block_db.sqlite + - BN_URL=${BN_URL:-localhost:5052} + - GUNICORN_CMD_ARGS="--bind=0.0.0.0" + - PYTHONUNBUFFERED=1 + volumes: + - ${DATA_DIR:-./training_data}:/app/data/mainnet/training + - ${BLOCK_DB:-./block_db.sqlite}:/app/block_db.sqlite + networks: + - bp_cluster + entrypoint: ["gunicorn", "--timeout", "1800", "api_server:app"] + bp-bg: + profiles: + - "" + - "server" + build: + context: . 
+ dockerfile: Dockerfile + networks: + - bp_cluster + environment: + - PYTHONUNBUFFERED=1 + - BN_URL=${BN_URL:-http://localhost:5052} + - BP_URL=http://bp:8000 + entrypoint: ["./background_tasks.py"] + + blockgauge: + profiles: + - "" + - "server" + image: ghcr.io/blockprint-collective/blockgauge + command: >- + blockgauge + --lighthouse-url ${BN_URL:-localhost:5052} + --blockprint-url http://bp:8000 + --listen-address 0.0.0.0 + networks: + - bp_cluster + + blockdreamer: + profiles: + - "dreamer" + image: ghcr.io/blockprint-collective/blockdreamer + volumes: + - ./dreamer:/mnt/dreamer + working_dir: "/mnt/dreamer" + command: >- + blockdreamer + --config /mnt/dreamer/config.toml + + caddy: + profiles: + - "" + - "server" + image: caddy:2.7.5 + ports: + - "${PROXY_PORT:-80}:80" + - "443:443" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile:ro + - ./data/caddy/data:/data + - ./data/caddy/config:/config + - ./certs:/certs + restart: unless-stopped + networks: + - bp_cluster + + + + + \ No newline at end of file diff --git a/dreamer/example.toml b/dreamer/example.toml new file mode 100644 index 0000000..c008cde --- /dev/null +++ b/dreamer/example.toml @@ -0,0 +1,16 @@ +network = "mainnet" +canonical_bn = "http://localhost:5052" + +[[post_endpoints]] +name = "blockgauge" +url = "http://localhost:5052/lighthouse/analysis/block_rewards" +extra_data = false +compare_rewards = true + +[[nodes]] +name = "lighthouse-subscribe-none" +label = "Lighthouse" +url = "http://localhost:5052" +v3 = true +ssz = false +skip_randao_verification = true \ No newline at end of file From 354b1d08da72dbeb7f6b797208112d3f1c727ebe Mon Sep 17 00:00:00 2001 From: Tarun Date: Tue, 28 May 2024 10:05:04 +0200 Subject: [PATCH 3/5] Comments to support model through the docker setup --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index b02d098..5acc52f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,9 +14,11 @@ services: - BN_URL=${BN_URL:-localhost:5052} - GUNICORN_CMD_ARGS="--bind=0.0.0.0" - PYTHONUNBUFFERED=1 +# - MODEL_PATH=/app/classifier.pkl volumes: - ${DATA_DIR:-./training_data}:/app/data/mainnet/training - ${BLOCK_DB:-./block_db.sqlite}:/app/block_db.sqlite +# - ./example.pkl:/app/classifier.pkl networks: - bp_cluster entrypoint: ["gunicorn", "--timeout", "1800", "api_server:app"] From 1e57f01caf9db4d6bf1341e44124f36abb0f8afe Mon Sep 17 00:00:00 2001 From: Santiago Somoza <45318759+santi1234567@users.noreply.github.com> Date: Thu, 1 Aug 2024 22:37:52 -0300 Subject: [PATCH 4/5] Support pickle model import on API server (#34) * rename knn_classifier to classifier * remove deprecated file * support changing classifier type * add classifier type flag * update readme * linting * Add method to import persisted model and allow user to set env variable * pickle is not needed anymore in the server file * add workaround for using pickle with gunicorn * linting --------- Co-authored-by: Tarun --- api_server.py | 29 +++++++++++++++++++++++++---- classifier.py | 12 ++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/api_server.py b/api_server.py index 2d9ba21..1bf06d1 100644 --- a/api_server.py +++ b/api_server.py @@ -1,7 +1,6 @@ import os import json import falcon - from multi_classifier import MultiClassifier from build_db import ( open_block_db, @@ -17,12 +16,21 @@ count_false_positives, count_false_negatives, ) +import __main__ +from classifier import ( + Classifier, + import_classifier, +) + +__main__.Classifier = 
Classifier
+
 
 DATA_DIR = "./data/mainnet/training"
 BLOCK_DB = os.environ.get("BLOCK_DB") or "./block_db.sqlite"
 BN_URL = "http://localhost:5052"
 SELF_URL = "http://localhost:8000"
 DISABLE_CLASSIFIER = "DISABLE_CLASSIFIER" in os.environ
+MODEL_PATH = os.environ.get("MODEL_PATH") or ""
 
 
 class Classify:
@@ -202,9 +210,22 @@ def on_get(self, req, resp, client, start_slot, end_slot=None):
 
 classifier = None
 if not DISABLE_CLASSIFIER:
-    print("Initialising classifier, this could take a moment...")
-    classifier = MultiClassifier(DATA_DIR) if not DISABLE_CLASSIFIER else None
-    print("Done")
+    if MODEL_PATH != "":
+        if MODEL_PATH.endswith(".pkl"):
+            classifier = import_classifier(MODEL_PATH)
+
+        else:
+            print("model path must end with .pkl")
+            exit(0)
+
+    else:
+        print("Initialising classifier, this could take a moment...")
+        classifier = MultiClassifier(DATA_DIR) if not DISABLE_CLASSIFIER else None
+        print("Done")
+
+if classifier is None:
+    print("The classifier was not loaded")
+    exit(0)
 
 block_db = open_block_db(BLOCK_DB)
 
diff --git a/classifier.py b/classifier.py
index 9463b0f..072d2a4 100755
--- a/classifier.py
+++ b/classifier.py
@@ -290,6 +290,18 @@ def persist_classifier(classifier: Classifier, name: str) -> None:
         print(f"Failed to persist classifier due to {e}")
 
 
+def import_classifier(model_path: str) -> Classifier:
+    print(f"""Loading classifier from {model_path}""")
+
+    try:
+        classifier = pickle.load(open(model_path, "rb"))
+        print("Loaded classifier into memory")
+        return classifier
+
+    except Exception as e:
+        print(f"Failed to import classifier due to {e}")
+
+
 def main():
     args = parse_args()
     data_dir = args.data_dir

From 4c456438c619d2999a690b4b4e5a82c95a1fc304 Mon Sep 17 00:00:00 2001
From: Michael Sproul
Date: Fri, 2 Aug 2024 11:54:32 +1000
Subject: [PATCH 5/5] Tweak error handling (#35)

---
 api_server.py | 12 ++++++------
 classifier.py | 15 +++++++--------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/api_server.py b/api_server.py
index 1bf06d1..49f30c7 100644
--- a/api_server.py
+++ b/api_server.py
@@ -212,21 +212,21 @@ def on_get(self, req, resp, client, start_slot, end_slot=None):
 if not DISABLE_CLASSIFIER:
     if MODEL_PATH != "":
         if MODEL_PATH.endswith(".pkl"):
-            classifier = import_classifier(MODEL_PATH)
+            try:
+                classifier = import_classifier(MODEL_PATH)
+            except Exception as e:
+                print(f"Failed to import classifier due to {e}")
+                exit(1)
 
         else:
             print("model path must end with .pkl")
-            exit(0)
+            exit(1)
 
     else:
         print("Initialising classifier, this could take a moment...")
         classifier = MultiClassifier(DATA_DIR) if not DISABLE_CLASSIFIER else None
         print("Done")
 
-if classifier is None:
-    print("The classifier was not loaded")
-    exit(0)
-
 block_db = open_block_db(BLOCK_DB)
 
 app.add_route("/classify/no_store", ClassifyNoStore(classifier))
 
diff --git a/classifier.py b/classifier.py
index 072d2a4..4252da9 100755
--- a/classifier.py
+++ b/classifier.py
@@ -291,15 +291,14 @@ def persist_classifier(classifier: Classifier, name: str) -> None:
 
 
 def import_classifier(model_path: str) -> Classifier:
-    print(f"""Loading classifier from {model_path}""")
-
-    try:
-        classifier = pickle.load(open(model_path, "rb"))
-        print("Loaded classifier into memory")
-        return classifier
+    """Load a pickled classifier.
 
-    except Exception as e:
-        print(f"Failed to import classifier due to {e}")
+    This function may throw an exception if the data is corrupt or the file does not exist.
+    """
+    print(f"""Loading classifier from {model_path}""")
+    classifier = pickle.load(open(model_path, "rb"))
+    print("Loaded classifier into memory")
+    return classifier
 
 
 def main():
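
A usage note on the model persistence these patches introduce. The snippet below is a minimal sketch, not part of the series: it assumes a model was previously persisted with the `--persist` flag (e.g. `./classifier.py testdata_proc --classify testdata --persist --classifier-type mlp`) and that the resulting pickle is named `classifier.pkl` (a hypothetical path). `Classifier` and `import_classifier` are the real names from `classifier.py`; everything else is illustrative.

```python
import __main__

from classifier import Classifier, import_classifier

# A model persisted by running ./classifier.py as a script is pickled with its
# class recorded as __main__.Classifier. Any other process that unpickles it
# (e.g. gunicorn importing api_server.py) must bind that name first -- this is
# the workaround patch 4 adds to api_server.py.
__main__.Classifier = Classifier

# Load the model the way api_server.py does when MODEL_PATH is set. After
# patch 5, import_classifier raises on a missing or corrupt file, so the
# caller decides how to handle failure.
try:
    classifier = import_classifier("classifier.pkl")  # hypothetical path
except Exception as e:
    print(f"Failed to import classifier due to {e}")
    raise
print(classifier.enabled_clients)
```

The same model can be served from the docker setup in patch 2 by mounting the pickle into the `bp` container and setting `MODEL_PATH`, as the commented lines added in patch 3's `docker-compose.yml` suggest (e.g. `MODEL_PATH=/app/classifier.pkl`).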