From 34b8cab68cde2e1f2a81a5a6bdb6275dae4c5e62 Mon Sep 17 00:00:00 2001 From: Santiago Somoza <45318759+santi1234567@users.noreply.github.com> Date: Wed, 15 May 2024 22:24:37 -0300 Subject: [PATCH 1/5] Support mlp classifier (#33) * rename knn_classifier to classifier * remove deprecated file * support changing classifier type * add classifier type flag * update readme * linting --- README.md | 15 +++++++---- build_db.py | 4 +-- knn_classifier.py => classifier.py | 41 +++++++++++++++++++++++------- compute_periods.py | 2 +- interactive.ipynb | 4 +-- multi_classifier.py | 2 +- prepare_training_data.py | 4 +-- tests/test_classifier_persister.py | 2 +- 8 files changed, 51 insertions(+), 23 deletions(-) rename knn_classifier.py => classifier.py (88%) diff --git a/README.md b/README.md index 8ec1b58..461e4e9 100644 --- a/README.md +++ b/README.md @@ -36,11 +36,16 @@ pip install -r requirements.txt pip install -r requirements-dev.txt ``` -### k-NN Classifier +### The Classifier -Blockprint's classifier is a k-nearest neighbours classifier in `knn_classifier.py`. +Blockprint's classifier utilizes one of two machine learning algorithms: -See `./knn_classifier.py --help` for command line options including cross +- K-nearest neighbours +- Multi-layer Perceptron + +These can be chosen with the `--classifier-type` flag in `classifier.py`. + +See `./classifier.py --help` for more command line options including cross validation (CV) and manual classification. ### Training the Classifier @@ -81,10 +86,10 @@ testdata_proc └── 0x7fedb0da9699c93ce66966555c6719e1159ae7b3220c7053a08c8f50e2f3f56f.json ``` -You can then use this directory as the datadir argument to `./knn_classifier.py`: +You can then use this directory as the datadir argument to `./classifier.py`: ``` -./knn_classifier.py testdata_proc --classify testdata +./classifier.py testdata_proc --classify testdata ``` If you then want to use the classifier to build an sqlite database: diff --git a/build_db.py b/build_db.py index 0726bc9..002df7b 100755 --- a/build_db.py +++ b/build_db.py @@ -4,7 +4,7 @@ import json import sqlite3 import argparse -from knn_classifier import Classifier +from classifier import Classifier from multi_classifier import MultiClassifier from prepare_training_data import CLIENTS @@ -370,7 +370,7 @@ def main(): if args.multi_classifier: classifier = MultiClassifier(data_dir) else: - print("loading single KNN classifier") + print("loading single classifier") classifier = Classifier(data_dir) print("loaded") diff --git a/knn_classifier.py b/classifier.py similarity index 88% rename from knn_classifier.py rename to classifier.py index d43aa28..9463b0f 100755 --- a/knn_classifier.py +++ b/classifier.py @@ -9,12 +9,16 @@ import pickle from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier from sklearn.model_selection import cross_validate from feature_selection import * # noqa F403 from feature_selection import ALL_FEATURES from prepare_training_data import CLIENTS, classify_reward_by_graffiti K = 9 + +MLP_HIDDEN_LAYER_SIZES = (390, 870) + WEIGHTS = "distance" MIN_GUESS_THRESHOLD = 0.20 @@ -69,6 +73,8 @@ def __init__( graffiti_only_clients=DEFAULT_GRAFFITI_ONLY, features=DEFAULT_FEATURES, enable_cv=False, + classifier_type="knn", + hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES, ): graffiti_only_clients = set(graffiti_only_clients) @@ -82,6 +88,8 @@ def __init__( set(grouped_clients) & graffiti_only_clients == set() ), "clients must not be both graffiti-only and grouped" + assert 
classifier_type in ["knn", "mlp"], "classifier_type must be knn or mlp" + feature_matrix = [] training_labels = [] @@ -118,18 +126,24 @@ def __init__( feature_matrix = np.array(feature_matrix) - knn = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + if classifier_type == "knn": + classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + elif classifier_type == "mlp": + classifier = MLPClassifier( + hidden_layer_sizes=hidden_layer_sizes, max_iter=1000 + ) + # Assert above makes sure that classifier_type is one of the valid types if enable_cv: self.scores = cross_validate( - knn, feature_matrix, training_labels, scoring="balanced_accuracy" + classifier, feature_matrix, training_labels, scoring="balanced_accuracy" ) else: self.scores = None - knn.fit(feature_matrix, training_labels) + classifier.fit(feature_matrix, training_labels) - self.knn = knn + self.classifier = classifier self.enabled_clients = enabled_clients self.graffiti_only_clients = set(graffiti_only_clients) self.features = features @@ -145,7 +159,7 @@ def classify(self, block_reward): return (graffiti_guess, graffiti_guess, prob_by_client, graffiti_guess) row = into_feature_row(block_reward, self.features) - res = self.knn.predict_proba([row]) + res = self.classifier.predict_proba([row]) prob_by_client = { client: res[0][i] for i, client in enumerate(self.enabled_clients) @@ -219,7 +233,7 @@ def compute_best_guess(probability_map) -> str: def parse_args(): - parser = argparse.ArgumentParser("KNN testing and cross validation") + parser = argparse.ArgumentParser("Classifier testing and cross validation") parser.add_argument("data_dir", help="training data directory") parser.add_argument("--classify", help="data to classify") @@ -235,6 +249,12 @@ def parse_args(): parser.add_argument( "--group", default=[], nargs="+", help="clients to group during classification" ) + parser.add_argument( + "--classifier-type", + default="knn", + choices=["knn", "mlp"], + help="the type of classifier to use", + ) parser.add_argument( "--persist", action="store_true", @@ -280,7 +300,7 @@ def main(): grouped_clients = args.group should_persist = args.should_persist graffiti_only = args.graffiti_only - + classifier_type = args.classifier_type disabled_clients = args.disable enabled_clients = [ client @@ -310,6 +330,7 @@ def main(): graffiti_only_clients=graffiti_only, features=feature_vec, enable_cv=True, + classifier_type=classifier_type, ) print(f"enabled clients: {classifier.enabled_clients}") print(f"classifier scores: {classifier.scores['test_score']}") @@ -327,7 +348,9 @@ def main(): assert classify_dir is not None, "classify dir required" print(f"classifying all data in directory {classify_dir}") print(f"grouped clients: {grouped_clients}") - classifier = Classifier(data_dir, grouped_clients=grouped_clients) + classifier = Classifier( + data_dir, grouped_clients=grouped_clients, classifier_type=classifier_type + ) if args.plot is not None: classifier.plot_feature_matrix(args.plot) @@ -354,7 +377,7 @@ def main(): print(f"total blocks processed: {total_blocks}") if should_persist: - persist_classifier(classifier, "knn_classifier") + persist_classifier(classifier, "classifier") for multilabel, num_blocks in sorted(frequency_map.items()): percentage = round(num_blocks / total_blocks, 4) diff --git a/compute_periods.py b/compute_periods.py index f2b708c..192e3ae 100644 --- a/compute_periods.py +++ b/compute_periods.py @@ -5,7 +5,7 @@ import sqlite3 import requests import statistics -from knn_classifier import compute_best_guess +from 
classifier import compute_best_guess from prepare_training_data import CLIENTS from build_db import block_row_to_obj diff --git a/interactive.ipynb b/interactive.ipynb index c6f591b..9facf34 100644 --- a/interactive.ipynb +++ b/interactive.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "from knn_classifier import Classifier, DEFAULT_FEATURES" + "from classifier import Classifier, DEFAULT_FEATURES" ] }, { @@ -19,7 +19,7 @@ "source": [ "datadir = \"data/mainnet/training/slots_3481601_to_3702784_bal2x\"\n", "disabled_clients = []\n", - "features = ['percent_redundant', 'percent_pairwise_ordered', 'norm_reward']\n", + "features = [\"percent_redundant\", \"percent_pairwise_ordered\", \"norm_reward\"]\n", "\n", "classifier = Classifier(datadir, disabled_clients=disabled_clients, features=features)" ] diff --git a/multi_classifier.py b/multi_classifier.py index 4ebfc82..d898734 100644 --- a/multi_classifier.py +++ b/multi_classifier.py @@ -1,6 +1,6 @@ import os -from knn_classifier import Classifier +from classifier import Classifier def start_and_end_slot(sub_dir_name) -> (int, int): diff --git a/prepare_training_data.py b/prepare_training_data.py index c7e5941..a9a2c63 100755 --- a/prepare_training_data.py +++ b/prepare_training_data.py @@ -75,13 +75,13 @@ def process_file( def parse_args(): - parser = argparse.ArgumentParser("create training data for the KNN classifier") + parser = argparse.ArgumentParser("create training data for the classifier") parser.add_argument( "raw_data_dir", help="input containing data to classify using graffiti" ) parser.add_argument( - "proc_data_dir", help="output for processed data, suitable for KNN training" + "proc_data_dir", help="output for processed data, suitable for training" ) parser.add_argument( "--disable", diff --git a/tests/test_classifier_persister.py b/tests/test_classifier_persister.py index 9de8d27..d2c8db4 100644 --- a/tests/test_classifier_persister.py +++ b/tests/test_classifier_persister.py @@ -2,7 +2,7 @@ import json import os from typing import Any, Dict, List -from knn_classifier import Classifier, persist_classifier +from classifier import Classifier, persist_classifier from prepare_training_data import CLIENTS From f6c9509a23bc1326b5842e4180917376a9c0242a Mon Sep 17 00:00:00 2001 From: Tarun Date: Thu, 16 May 2024 16:02:23 +0200 Subject: [PATCH 2/5] First version of a dockerized setup --- .env.sample | 4 ++ Caddyfile.sample | 89 ++++++++++++++++++++++++++++++++++++++++++++ Dockerfile | 8 ++++ docker-compose.yml | 82 ++++++++++++++++++++++++++++++++++++++++ dreamer/example.toml | 16 ++++++++ 5 files changed, 199 insertions(+) create mode 100644 .env.sample create mode 100644 Caddyfile.sample create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 dreamer/example.toml diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..0025912 --- /dev/null +++ b/.env.sample @@ -0,0 +1,4 @@ +BN_URL= +BP_URL= +DATA_DIR= +PROXY_PORT= \ No newline at end of file diff --git a/Caddyfile.sample b/Caddyfile.sample new file mode 100644 index 0000000..ae86a0f --- /dev/null +++ b/Caddyfile.sample @@ -0,0 +1,89 @@ +# Access to everything. +(authorised-super-users) { +} + +# Access to blockprint's private API and nothing else. +(authorised-blockprint-users) { +} + +# Access to the gauge /classify method, but not blockprint's private API. 
+(authorised-blockprint-workers) { +} + +http:// { + # tls /certs/cert.pem /certs/key.pem + log { + output stderr + format filter { + wrap console + fields { + request>headers>Authorization delete + } + } + } + + encode gzip zstd + + @public { + path /blocks_per_client/* + path /sync/status + path /sync/gaps + } + + @private { + path /validator/* + path /blocks/* + path /confusion/* + } + + @gauge-classify { + path /gauge/classify + } + + @gauge-accuracy { + path /confusion + path /accuracy + path /gauge/accuracy + path /gauge/confusion + } + + @eleel { + path /eleel + path /eleel/ + } + @eleel-canonical { + path /eleel/canonical + } + + reverse_proxy @public bp:8000 + reverse_proxy @private bp:8000 + + reverse_proxy @gauge-accuracy blockgauge:8002 { + rewrite /accuracy + } + + reverse_proxy @gauge-classify blockgauge:8002 { + rewrite /classify + } + + reverse_proxy @eleel localhost:8552 { + rewrite / + } + respond @eleel-canonical 400 { + body "{\"error\": \"nice try\"}" + close + } + + handle_errors { + respond "{\"error\": \"{http.error.status_code} {http.error.status_text}\"}" + } + + basicauth @private { + import authorised-super-users + import authorised-blockprint-users + } + basicauth @gauge-classify { + import authorised-super-users + import authorised-blockprint-workers + } +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8da7ef2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.9 +# Or any preferred Python version. +WORKDIR /app +COPY ./*.py . +ADD requirements.txt . +RUN pip install -r requirements.txt +# Or enter the name of your unique directory and parameter set. +CMD ["gunicorn", "--timeout", "1800", "--bind", "0.0.0.0:8000", "api_server:app"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..b02d098 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,82 @@ +networks: + bp_cluster: + +services: + bp: + profiles: + - "" + - "server" + build: + context: . + dockerfile: Dockerfile + environment: + - BLOCK_DB=/app/block_db.sqlite + - BN_URL=${BN_URL:-localhost:5052} + - GUNICORN_CMD_ARGS="--bind=0.0.0.0" + - PYTHONUNBUFFERED=1 + volumes: + - ${DATA_DIR:-./training_data}:/app/data/mainnet/training + - ${BLOCK_DB:-./block_db.sqlite}:/app/block_db.sqlite + networks: + - bp_cluster + entrypoint: ["gunicorn", "--timeout", "1800", "api_server:app"] + bp-bg: + profiles: + - "" + - "server" + build: + context: . 
+ dockerfile: Dockerfile + networks: + - bp_cluster + environment: + - PYTHONUNBUFFERED=1 + - BN_URL=${BN_URL:-http://localhost:5052} + - BP_URL=http://bp:8000 + entrypoint: ["./background_tasks.py"] + + blockgauge: + profiles: + - "" + - "server" + image: ghcr.io/blockprint-collective/blockgauge + command: >- + blockgauge + --lighthouse-url ${BN_URL:-localhost:5052} + --blockprint-url http://bp:8000 + --listen-address 0.0.0.0 + networks: + - bp_cluster + + blockdreamer: + profiles: + - "dreamer" + image: ghcr.io/blockprint-collective/blockdreamer + volumes: + - ./dreamer:/mnt/dreamer + working_dir: "/mnt/dreamer" + command: >- + blockdreamer + --config /mnt/dreamer/config.toml + + caddy: + profiles: + - "" + - "server" + image: caddy:2.7.5 + ports: + - "${PROXY_PORT:-80}:80" + - "443:443" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile:ro + - ./data/caddy/data:/data + - ./data/caddy/config:/config + - ./certs:/certs + restart: unless-stopped + networks: + - bp_cluster + + + + + \ No newline at end of file diff --git a/dreamer/example.toml b/dreamer/example.toml new file mode 100644 index 0000000..c008cde --- /dev/null +++ b/dreamer/example.toml @@ -0,0 +1,16 @@ +network = "mainnet" +canonical_bn = "http://localhost:5052" + +[[post_endpoints]] +name = "blockgauge" +url = "http://localhost:5052/lighthouse/analysis/block_rewards" +extra_data = false +compare_rewards = true + +[[nodes]] +name = "lighthouse-subscribe-none" +label = "Lighthouse" +url = "http://localhost:5052" +v3 = true +ssz = false +skip_randao_verification = true \ No newline at end of file From 354b1d08da72dbeb7f6b797208112d3f1c727ebe Mon Sep 17 00:00:00 2001 From: Tarun Date: Tue, 28 May 2024 10:05:04 +0200 Subject: [PATCH 3/5] Comments to support model through the docker setup --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index b02d098..5acc52f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,9 +14,11 @@ services: - BN_URL=${BN_URL:-localhost:5052} - GUNICORN_CMD_ARGS="--bind=0.0.0.0" - PYTHONUNBUFFERED=1 +# - MODEL_PATH=/app/classifier.pkl volumes: - ${DATA_DIR:-./training_data}:/app/data/mainnet/training - ${BLOCK_DB:-./block_db.sqlite}:/app/block_db.sqlite +# - ./example.pkl:/app/classifier.pkl networks: - bp_cluster entrypoint: ["gunicorn", "--timeout", "1800", "api_server:app"] From 1e57f01caf9db4d6bf1341e44124f36abb0f8afe Mon Sep 17 00:00:00 2001 From: Santiago Somoza <45318759+santi1234567@users.noreply.github.com> Date: Thu, 1 Aug 2024 22:37:52 -0300 Subject: [PATCH 4/5] Support pickle model import on API server (#34) * rename knn_classifier to classifier * remove deprecated file * support changing classifier type * add classifier type flag * update readme * linting * Add method to import persisted model and allow user to set env variable * pickle is not needed anymore in the server file * add workaround for using pickle with gunicorn * linting --------- Co-authored-by: Tarun --- api_server.py | 29 +++++++++++++++++++++++++---- classifier.py | 12 ++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/api_server.py b/api_server.py index 2d9ba21..1bf06d1 100644 --- a/api_server.py +++ b/api_server.py @@ -1,7 +1,6 @@ import os import json import falcon - from multi_classifier import MultiClassifier from build_db import ( open_block_db, @@ -17,12 +16,21 @@ count_false_positives, count_false_negatives, ) +import __main__ +from classifier import ( + Classifier, + import_classifier, +) + +__main__.Classifier = 
Classifier
+
 
 DATA_DIR = "./data/mainnet/training"
 BLOCK_DB = os.environ.get("BLOCK_DB") or "./block_db.sqlite"
 BN_URL = "http://localhost:5052"
 SELF_URL = "http://localhost:8000"
 DISABLE_CLASSIFIER = "DISABLE_CLASSIFIER" in os.environ
+MODEL_PATH = os.environ.get("MODEL_PATH") or ""
 
 
 class Classify:
@@ -202,9 +210,22 @@ def on_get(self, req, resp, client, start_slot, end_slot=None):
 
 classifier = None
 if not DISABLE_CLASSIFIER:
-    print("Initialising classifier, this could take a moment...")
-    classifier = MultiClassifier(DATA_DIR) if not DISABLE_CLASSIFIER else None
-    print("Done")
+    if MODEL_PATH != "":
+        if MODEL_PATH.endswith(".pkl"):
+            classifier = import_classifier(MODEL_PATH)
+
+        else:
+            print("model path must end with .pkl")
+            exit(0)
+
+    else:
+        print("Initialising classifier, this could take a moment...")
+        classifier = MultiClassifier(DATA_DIR) if not DISABLE_CLASSIFIER else None
+        print("Done")
+
+if classifier is None:
+    print("The classifier was not loaded")
+    exit(0)
 
 block_db = open_block_db(BLOCK_DB)
 
diff --git a/classifier.py b/classifier.py
index 9463b0f..072d2a4 100755
--- a/classifier.py
+++ b/classifier.py
@@ -290,6 +290,18 @@ def persist_classifier(classifier: Classifier, name: str) -> None:
         print(f"Failed to persist classifier due to {e}")
 
 
+def import_classifier(model_path: str) -> Classifier:
+    print(f"""Loading classifier from {model_path}""")
+
+    try:
+        classifier = pickle.load(open(model_path, "rb"))
+        print("Loaded classifier into memory")
+        return classifier
+
+    except Exception as e:
+        print(f"Failed to import classifier due to {e}")
+
+
 def main():
     args = parse_args()
     data_dir = args.data_dir

From 4c456438c619d2999a690b4b4e5a82c95a1fc304 Mon Sep 17 00:00:00 2001
From: Michael Sproul
Date: Fri, 2 Aug 2024 11:54:32 +1000
Subject: [PATCH 5/5] Tweak error handling (#35)

---
 api_server.py | 12 ++++++------
 classifier.py | 15 +++++++--------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/api_server.py b/api_server.py
index 1bf06d1..49f30c7 100644
--- a/api_server.py
+++ b/api_server.py
@@ -212,21 +212,21 @@ def on_get(self, req, resp, client, start_slot, end_slot=None):
 if not DISABLE_CLASSIFIER:
     if MODEL_PATH != "":
         if MODEL_PATH.endswith(".pkl"):
-            classifier = import_classifier(MODEL_PATH)
+            try:
+                classifier = import_classifier(MODEL_PATH)
+            except Exception as e:
+                print(f"Failed to import classifier due to {e}")
+                exit(1)
 
         else:
             print("model path must end with .pkl")
-            exit(0)
+            exit(1)
 
     else:
         print("Initialising classifier, this could take a moment...")
         classifier = MultiClassifier(DATA_DIR) if not DISABLE_CLASSIFIER else None
         print("Done")
 
-if classifier is None:
-    print("The classifier was not loaded")
-    exit(0)
-
 block_db = open_block_db(BLOCK_DB)
 
 app.add_route("/classify/no_store", ClassifyNoStore(classifier))
 
diff --git a/classifier.py b/classifier.py
index 072d2a4..4252da9 100755
--- a/classifier.py
+++ b/classifier.py
@@ -291,15 +291,14 @@ def persist_classifier(classifier: Classifier, name: str) -> None:
 
 
 def import_classifier(model_path: str) -> Classifier:
-    print(f"""Loading classifier from {model_path}""")
-
-    try:
-        classifier = pickle.load(open(model_path, "rb"))
-        print("Loaded classifier into memory")
-        return classifier
+    """Load a pickled classifier.
 
-    except Exception as e:
-        print(f"Failed to import classifier due to {e}")
+    This function may throw an exception if the data is corrupt or the file does not exist.
+    """
+    print(f"""Loading classifier from {model_path}""")
+    classifier = pickle.load(open(model_path, "rb"))
+    print("Loaded classifier into memory")
+    return classifier
 
 
 def main():
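
A usage note on the model persistence these patches introduce. The snippet below is a minimal sketch, not part of the series: it assumes a model was previously persisted with the `--persist` flag (e.g. `./classifier.py testdata_proc --classify testdata --persist --classifier-type mlp`) and that the resulting pickle is named `classifier.pkl` (a hypothetical path). `Classifier` and `import_classifier` are the real names from `classifier.py`; everything else is illustrative.

```python
import __main__

from classifier import Classifier, import_classifier

# A model persisted by running ./classifier.py as a script is pickled with its
# class recorded as __main__.Classifier. Any other process that unpickles it
# (e.g. gunicorn importing api_server.py) must bind that name first -- this is
# the workaround patch 4 adds to api_server.py.
__main__.Classifier = Classifier

# Load the model the way api_server.py does when MODEL_PATH is set. After
# patch 5, import_classifier raises on a missing or corrupt file, so the
# caller decides how to handle failure.
try:
    classifier = import_classifier("classifier.pkl")  # hypothetical path
except Exception as e:
    print(f"Failed to import classifier due to {e}")
    raise
print(classifier.enabled_clients)
```

The same model can be served from the docker setup in patch 2 by mounting the pickle into the `bp` container and setting `MODEL_PATH`, as the commented lines added in patch 3's `docker-compose.yml` suggest (e.g. `MODEL_PATH=/app/classifier.pkl`).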