From 84155fe737f2cf95e58cc3727cc6cd610bab1d72 Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Wed, 7 Feb 2024 00:34:28 +0100 Subject: [PATCH] add retry limit --- .vscode/settings.json | 10 +- api/MachineLearning/data.py | 558 ++++++++++++++++++------------------ api/app.py | 416 +++++++++++++-------------- api/cogs/predict.py | 222 +++++++------- api/cogs/requests.py | 303 ++++++++++---------- notes.md | 106 +++---- 6 files changed, 810 insertions(+), 805 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 154b205..3c73b4d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,6 @@ -{ - "[python]": { - "editor.formatOnSave": true, - "editor.defaultFormatter": "charliermarsh.ruff" - } +{ + "[python]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "charliermarsh.ruff" + } } \ No newline at end of file diff --git a/api/MachineLearning/data.py b/api/MachineLearning/data.py index a8b34f6..d71f4a2 100644 --- a/api/MachineLearning/data.py +++ b/api/MachineLearning/data.py @@ -1,279 +1,279 @@ -import logging -from typing import List - -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - -skills = [ - "attack", - "defence", - "strength", - "hitpoints", - "ranged", - "prayer", - "magic", - "cooking", - "woodcutting", - "fletching", - "fishing", - "firemaking", - "crafting", - "smithing", - "mining", - "herblore", - "agility", - "thieving", - "slayer", - "farming", - "runecraft", - "hunter", - "construction", -] -minigames = [ - "league", - "bounty_hunter_hunter", - "bounty_hunter_rogue", - "cs_all", - "cs_beginner", - "cs_easy", - "cs_medium", - "cs_hard", - "cs_elite", - "cs_master", - "lms_rank", - "soul_wars_zeal", -] - - -class hiscoreData: - """ - This class is responsible for cleaning data & creating features. - """ - - def __init__(self, data: List[dict]) -> None: - self.df = pd.DataFrame(data) - self.df_clean = self.df.copy() - - self.skills = skills - self.minigames = minigames - - self.__clean() - self.__skill_ratio() - self.__boss_ratio() - - def __clean(self) -> None: - """ - Cleanup the dataframe. - - This method will: - - drop unnecessary columns - - set the index to the player id - - replace -1 with 0 - - create a list of bosses (not skills or minigames) - - create a total xp column - - create a total boss kc column - - reduces memory of dataframe - - fill na with 0 - - create a dataframe with only low level players (total level < 1_000_000) - """ - col_to_drop = ["id", "timestamp", "ts_date", "name"] - col_to_drop = [c for c in self.df_clean.columns if c in col_to_drop] - logger.info(f"dropping: {col_to_drop}") - self.df_clean.drop(columns=col_to_drop, inplace=True) - - # set index to player id - self.df_clean.set_index(["Player_id"], inplace=True) - - # if not on the hiscores it shows -1, replace with 0 - self.df_clean = self.df_clean.replace(-1, 0) - - # bosses - self.bosses = [ - c for c in self.df_clean.columns if c not in ["total"] + skills + minigames - ] - # total is not always on hiscores, create a total xp column - self.df_clean["total"] = self.df_clean[self.skills].sum(axis=1) - - # create a total boss kc column - self.df_clean["boss_total"] = ( - self.df_clean[self.bosses].sum(axis=1).astype(np.int32) - ) - - # fillna - self.df_clean.fillna(0, inplace=True) - - # apply smaller data types to reduce memory usage - non_total_features = [ - col for col in self.df_clean.columns if "total" not in col - ] - self.df_clean[non_total_features] = self.df_clean[non_total_features].astype( - np.int32 - ) - - # get low lvl players - mask = self.df_clean["total"] < 1_000_000 - self.df_low = self.df_clean[mask].copy() - - def __skill_ratio(self): - """ - Create a dataframe with the ratio of each skill to the total level. - - This method will: - - create a dataframe with the index of the original dataframe - - create a column for each skill with the ratio of the skill to the total level - - fill na with 0 - """ - self.skill_ratio = pd.DataFrame(index=self.df_clean.index) - - total = self.df_clean["total"] - - for skill in self.skills: - self.skill_ratio[f"{skill}/total"] = (self.df_clean[skill] / total).astype( - np.float16 - ) - - self.skill_ratio.fillna(0, inplace=True) - - def __boss_ratio(self): - """ - Create a dataframe with the ratio of each boss to the total boss level. - - This method will: - - create a dataframe with the index of the original dataframe - - create a column for each boss with the ratio of the boss to the total boss level - - fill na with 0 - """ - self.boss_ratio = pd.DataFrame(index=self.df_clean.index) - - total = self.df_clean["boss_total"] - for boss in self.bosses: - self.boss_ratio[f"{boss}/total"] = (self.df_clean[boss] / total).astype( - np.float16 - ) - - self.boss_ratio.fillna(0, inplace=True) - - def features( - self, base: bool = True, skill_ratio: bool = True, boss_ratio: bool = True - ): - """ - Create a dataframe with the features. - - This method will: - - create a dataframe with the index of the original dataframe - - merge the original dataframe, the skill ratio dataframe and the boss ratio dataframe - - Parameters - ---------- - base : bool, optional - Whether to include the original dataframe, by default True - skill_ratio : bool, optional - Whether to include the skill ratio dataframe, by default True - boss_ratio : bool, optional - Whether to include the boss ratio dataframe, by default True - - Returns - ------- - pd.DataFrame - Dataframe containing the features - """ - features = pd.DataFrame(index=self.df_clean.index) - if base: - features = features.merge(self.df_clean, left_index=True, right_index=True) - if skill_ratio: - features = features.merge( - self.skill_ratio, left_index=True, right_index=True - ) - if boss_ratio: - features = features.merge( - self.boss_ratio, left_index=True, right_index=True - ) - return features - - -class playerData: - """ - Class to handle the data from the json files. - """ - - def __init__(self, player_data: List[dict], label_data: List[dict]) -> None: - """ - Initialize the class. - - Parameters - ---------- - player_data : List[dict] - List of dictionaries containing the player data - label_data : List[dict] - List of dictionaries containing the label data - """ - self.df_players = pd.DataFrame(player_data) - self.df_labels = pd.DataFrame(label_data) - self.__clean() - - def __clean(self): - """ - Clean the data. - - This method will: - - set the index of the player dataframe to the player id - - set the index of the label dataframe to the label id - - merge the two dataframes - - create a binary label column - """ - # clean players - self.df_players.set_index("id", inplace=True) - - # reduce memory of player dataframe - small_size_columns = [ - "possible_ban", - "confirmed_ban", - "confirmed_player", - "label_id", - "label_jagex", - ] - self.df_players[small_size_columns] = self.df_players[ - small_size_columns - ].astype(np.int8) - - # clean labels - self.df_labels.set_index("id", inplace=True) - - # merge - self.df_players = self.df_players.merge( - self.df_labels, left_on="label_id", right_index=True - ) - self.df_players.drop(columns=["label_id"], inplace=True) - - # binary label, 1 = bot, 0 = not bot - self.df_players["binary_label"] = np.where( - self.df_players["label_jagex"] == 2, 1, 0 - ) - - def get(self, binary: bool = False): - """ - Get the target data. - - This method will: - - return the binary label or the label column - - Parameters - ---------- - binary : bool, optional - Whether to return the binary label or not, by default False - - Returns - ------- - pd.DataFrame - Dataframe containing the target data - """ - if binary: - out = self.df_players.loc[:, ["binary_label"]].astype(np.int8) - out.rename(columns={"binary_label": "target"}, inplace=True) - else: - out = self.df_players.loc[:, ["label"]].astype("category") - out.rename(columns={"label": "target"}, inplace=True) - - return out +import logging +from typing import List + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +skills = [ + "attack", + "defence", + "strength", + "hitpoints", + "ranged", + "prayer", + "magic", + "cooking", + "woodcutting", + "fletching", + "fishing", + "firemaking", + "crafting", + "smithing", + "mining", + "herblore", + "agility", + "thieving", + "slayer", + "farming", + "runecraft", + "hunter", + "construction", +] +minigames = [ + "league", + "bounty_hunter_hunter", + "bounty_hunter_rogue", + "cs_all", + "cs_beginner", + "cs_easy", + "cs_medium", + "cs_hard", + "cs_elite", + "cs_master", + "lms_rank", + "soul_wars_zeal", +] + + +class hiscoreData: + """ + This class is responsible for cleaning data & creating features. + """ + + def __init__(self, data: List[dict]) -> None: + self.df = pd.DataFrame(data) + self.df_clean = self.df.copy() + + self.skills = skills + self.minigames = minigames + + self.__clean() + self.__skill_ratio() + self.__boss_ratio() + + def __clean(self) -> None: + """ + Cleanup the dataframe. + + This method will: + - drop unnecessary columns + - set the index to the player id + - replace -1 with 0 + - create a list of bosses (not skills or minigames) + - create a total xp column + - create a total boss kc column + - reduces memory of dataframe + - fill na with 0 + - create a dataframe with only low level players (total level < 1_000_000) + """ + col_to_drop = ["id", "timestamp", "ts_date", "name"] + col_to_drop = [c for c in self.df_clean.columns if c in col_to_drop] + logger.info(f"dropping: {col_to_drop}") + self.df_clean.drop(columns=col_to_drop, inplace=True) + + # set index to player id + self.df_clean.set_index(["Player_id"], inplace=True) + + # if not on the hiscores it shows -1, replace with 0 + self.df_clean = self.df_clean.replace(-1, 0) + + # bosses + self.bosses = [ + c for c in self.df_clean.columns if c not in ["total"] + skills + minigames + ] + # total is not always on hiscores, create a total xp column + self.df_clean["total"] = self.df_clean[self.skills].sum(axis=1) + + # create a total boss kc column + self.df_clean["boss_total"] = ( + self.df_clean[self.bosses].sum(axis=1).astype(np.int32) + ) + + # fillna + self.df_clean.fillna(0, inplace=True) + + # apply smaller data types to reduce memory usage + non_total_features = [ + col for col in self.df_clean.columns if "total" not in col + ] + self.df_clean[non_total_features] = self.df_clean[non_total_features].astype( + np.int32 + ) + + # get low lvl players + mask = self.df_clean["total"] < 1_000_000 + self.df_low = self.df_clean[mask].copy() + + def __skill_ratio(self): + """ + Create a dataframe with the ratio of each skill to the total level. + + This method will: + - create a dataframe with the index of the original dataframe + - create a column for each skill with the ratio of the skill to the total level + - fill na with 0 + """ + self.skill_ratio = pd.DataFrame(index=self.df_clean.index) + + total = self.df_clean["total"] + + for skill in self.skills: + self.skill_ratio[f"{skill}/total"] = (self.df_clean[skill] / total).astype( + np.float16 + ) + + self.skill_ratio.fillna(0, inplace=True) + + def __boss_ratio(self): + """ + Create a dataframe with the ratio of each boss to the total boss level. + + This method will: + - create a dataframe with the index of the original dataframe + - create a column for each boss with the ratio of the boss to the total boss level + - fill na with 0 + """ + self.boss_ratio = pd.DataFrame(index=self.df_clean.index) + + total = self.df_clean["boss_total"] + for boss in self.bosses: + self.boss_ratio[f"{boss}/total"] = (self.df_clean[boss] / total).astype( + np.float16 + ) + + self.boss_ratio.fillna(0, inplace=True) + + def features( + self, base: bool = True, skill_ratio: bool = True, boss_ratio: bool = True + ): + """ + Create a dataframe with the features. + + This method will: + - create a dataframe with the index of the original dataframe + - merge the original dataframe, the skill ratio dataframe and the boss ratio dataframe + + Parameters + ---------- + base : bool, optional + Whether to include the original dataframe, by default True + skill_ratio : bool, optional + Whether to include the skill ratio dataframe, by default True + boss_ratio : bool, optional + Whether to include the boss ratio dataframe, by default True + + Returns + ------- + pd.DataFrame + Dataframe containing the features + """ + features = pd.DataFrame(index=self.df_clean.index) + if base: + features = features.merge(self.df_clean, left_index=True, right_index=True) + if skill_ratio: + features = features.merge( + self.skill_ratio, left_index=True, right_index=True + ) + if boss_ratio: + features = features.merge( + self.boss_ratio, left_index=True, right_index=True + ) + return features + + +class playerData: + """ + Class to handle the data from the json files. + """ + + def __init__(self, player_data: List[dict], label_data: List[dict]) -> None: + """ + Initialize the class. + + Parameters + ---------- + player_data : List[dict] + List of dictionaries containing the player data + label_data : List[dict] + List of dictionaries containing the label data + """ + self.df_players = pd.DataFrame(player_data) + self.df_labels = pd.DataFrame(label_data) + self.__clean() + + def __clean(self): + """ + Clean the data. + + This method will: + - set the index of the player dataframe to the player id + - set the index of the label dataframe to the label id + - merge the two dataframes + - create a binary label column + """ + # clean players + self.df_players.set_index("id", inplace=True) + + # reduce memory of player dataframe + small_size_columns = [ + "possible_ban", + "confirmed_ban", + "confirmed_player", + "label_id", + "label_jagex", + ] + self.df_players[small_size_columns] = self.df_players[ + small_size_columns + ].astype(np.int8) + + # clean labels + self.df_labels.set_index("id", inplace=True) + + # merge + self.df_players = self.df_players.merge( + self.df_labels, left_on="label_id", right_index=True + ) + self.df_players.drop(columns=["label_id"], inplace=True) + + # binary label, 1 = bot, 0 = not bot + self.df_players["binary_label"] = np.where( + self.df_players["label_jagex"] == 2, 1, 0 + ) + + def get(self, binary: bool = False): + """ + Get the target data. + + This method will: + - return the binary label or the label column + + Parameters + ---------- + binary : bool, optional + Whether to return the binary label or not, by default False + + Returns + ------- + pd.DataFrame + Dataframe containing the target data + """ + if binary: + out = self.df_players.loc[:, ["binary_label"]].astype(np.int8) + out.rename(columns={"binary_label": "target"}, inplace=True) + else: + out = self.df_players.loc[:, ["label"]].astype("category") + out.rename(columns={"label": "target"}, inplace=True) + + return out diff --git a/api/app.py b/api/app.py index b195147..307cdc3 100644 --- a/api/app.py +++ b/api/app.py @@ -1,208 +1,208 @@ -import asyncio -import logging -from typing import List - -import pandas as pd -from fastapi import HTTPException -from pydantic import BaseModel -from sklearn.model_selection import train_test_split - -from api import config -from api.cogs import predict -from api.cogs import requests as req -from api.MachineLearning import classifier, data -from datetime import date - -app = config.app - -binary_classifier = classifier.classifier("binaryClassifier").load() -multi_classifier = classifier.classifier("multiClassifier").load() - - -class name(BaseModel): - id: int - name: str - - -logger = logging.getLogger(__name__) - - -@app.on_event("startup") -async def initial_task(): - """ - This function is called when the api starts up. - It will load the latest model and start the prediction process. - """ - global binary_classifier, multi_classifier - if binary_classifier is None or multi_classifier is None: - binary_classifier = classifier.classifier("binaryClassifier") - multi_classifier = classifier.classifier("multiClassifier") - await train(config.secret_token) - await manual_startup(config.secret_token) - return - - -@app.get("/") -async def root(): - """ - This endpoint is used to check if the api is running. - """ - return {"detail": "hello worldz"} - - -@app.get("/startup") -async def manual_startup(secret: str): - logger.debug("manual startup") - """ - This endpoint is used to manually start the prediction process. - It is used by the detector api to start the prediction process. - """ - # secret token for api's to talk to eachother - if secret != config.secret_token: - raise HTTPException(status_code=404, detail="insufficient permissions") - - id = 0 - today = date.today() - while True: - if today != date.today(): - logger.info("new day") - id, today = 0, date.today() - - hiscores = await req.get_prediction_data( - player_id=id, limit=config.BATCH_AMOUNT - ) - _highscores = hiscores[-1] - logger.info(_highscores) - id = _highscores.get("Player_id") - hiscores = pd.DataFrame(hiscores) - - if len(hiscores) == 0: - logger.debug("No data: sleeping") - await asyncio.sleep(60) - continue - - names = hiscores[["Player_id", "name"]] - names = names.rename(columns={"Player_id": "id"}) - hiscores = hiscores[[c for c in hiscores.columns if c != "name"]] - - output = predict.predict(hiscores, names, binary_classifier, multi_classifier) - - logger.debug("Sending response") - await req.post_prediction(output) - - if len(hiscores) < config.BATCH_AMOUNT: - sleep = 60 - logger.info(f"{len(hiscores)=} < {config.BATCH_AMOUNT=}, sleeping: {sleep}") - await asyncio.sleep(sleep) - return {"detail": "ok"} - - -@app.get("/load") -async def load(secret: str): - logger.debug("loading model") - global binary_classifier, multi_classifier - """ - load the latest model. - This endpoint is used by the detector api to load the latest model. - """ - if secret != config.secret_token: - raise HTTPException(status_code=404, detail="insufficient permissions") - - binary_classifier = binary_classifier.load() - multi_classifier = multi_classifier.load() - return {"detail": "ok"} - - -@app.get("/predict") -async def predict_player(secret: str, hiscores, name: name) -> List[dict]: - """ - predict one player. - This endpoint is used by the detector api to predict one player. - """ - logger.debug(f"predicting player {name}") - if secret != config.secret_token: - raise HTTPException(status_code=404, detail="insufficient permissions") - name = pd.DataFrame(name.dict()) - output = predict.predict(hiscores, name, binary_classifier, multi_classifier) - return output - - -@app.get("/train") -async def train(secret: str): - """ - train a new model. - This endpoint is used by the detector api to train a new model. - """ - logger.debug("training model") - if secret != config.secret_token: - raise HTTPException(status_code=404, detail="insufficient permissions") - - labels = await req.get_labels() - players = [] - hiscores = [] - - for label in labels: - if label["label"] not in config.LABELS: - continue - - player_data = await req.get_player_data(label_id=label["id"]) - players.extend(player_data) - - hiscore_data = await req.get_hiscore_data(label_id=label["id"]) - hiscores.extend(hiscore_data) - - # parse hiscoreData - hiscoredata = data.hiscoreData(hiscores) - del hiscores - - # get the desired features - features = hiscoredata.features() - del hiscoredata - - # get players with binary target - player_data = data.playerData(players, labels).get(binary=True) - - # merge features with target - features_labeled = features.merge(player_data, left_index=True, right_index=True) - - # create train test data - x, y = features_labeled.iloc[:, :-1], features_labeled.iloc[:, -1] - train_x, test_x, train_y, test_y = train_test_split( - x, y, test_size=0.2, random_state=42, stratify=y - ) - - # train & score the model - binary_classifier.fit(train_x, train_y) - binary_classifier.score(test_y, test_x) - - # save the model - binary_classifier.save() - - # get players with multi target - player_data = data.playerData(players, labels).get(binary=False) - - # merge features with target - features_labeled = features.merge(player_data, left_index=True, right_index=True) - - # we need at least 100 users - to_little_data_labels = ( - pd.DataFrame(features_labeled.iloc[:, -1].value_counts()) - .query("target < 100") - .index - ) - mask = ~(features_labeled["target"].isin(to_little_data_labels)) - features_labeled = features_labeled[mask] - - # create train test data - x, y = features_labeled.iloc[:, :-1], features_labeled.iloc[:, -1] - train_x, test_x, train_y, test_y = train_test_split( - x, y, test_size=0.2, random_state=42, stratify=y - ) - - # train & score the model - multi_classifier.fit(train_x, train_y) - multi_classifier.score(test_y, test_x) - - # save the model - multi_classifier.save() - return {"detail": "ok"} +import asyncio +import logging +from typing import List + +import pandas as pd +from fastapi import HTTPException +from pydantic import BaseModel +from sklearn.model_selection import train_test_split + +from api import config +from api.cogs import predict +from api.cogs import requests as req +from api.MachineLearning import classifier, data +from datetime import date + +app = config.app + +binary_classifier = classifier.classifier("binaryClassifier").load() +multi_classifier = classifier.classifier("multiClassifier").load() + + +class name(BaseModel): + id: int + name: str + + +logger = logging.getLogger(__name__) + + +@app.on_event("startup") +async def initial_task(): + """ + This function is called when the api starts up. + It will load the latest model and start the prediction process. + """ + global binary_classifier, multi_classifier + if binary_classifier is None or multi_classifier is None: + binary_classifier = classifier.classifier("binaryClassifier") + multi_classifier = classifier.classifier("multiClassifier") + await train(config.secret_token) + await manual_startup(config.secret_token) + return + + +@app.get("/") +async def root(): + """ + This endpoint is used to check if the api is running. + """ + return {"detail": "hello worldz"} + + +@app.get("/startup") +async def manual_startup(secret: str): + logger.debug("manual startup") + """ + This endpoint is used to manually start the prediction process. + It is used by the detector api to start the prediction process. + """ + # secret token for api's to talk to eachother + if secret != config.secret_token: + raise HTTPException(status_code=404, detail="insufficient permissions") + + id = 0 + today = date.today() + while True: + if today != date.today(): + logger.info("new day") + id, today = 0, date.today() + + hiscores = await req.get_prediction_data( + player_id=id, limit=config.BATCH_AMOUNT + ) + _highscores = hiscores[-1] + logger.info(_highscores) + id = _highscores.get("Player_id") + hiscores = pd.DataFrame(hiscores) + + if len(hiscores) == 0: + logger.debug("No data: sleeping") + await asyncio.sleep(60) + continue + + names = hiscores[["Player_id", "name"]] + names = names.rename(columns={"Player_id": "id"}) + hiscores = hiscores[[c for c in hiscores.columns if c != "name"]] + + output = predict.predict(hiscores, names, binary_classifier, multi_classifier) + + logger.debug("Sending response") + await req.post_prediction(output) + + if len(hiscores) < config.BATCH_AMOUNT: + sleep = 60 + logger.info(f"{len(hiscores)=} < {config.BATCH_AMOUNT=}, sleeping: {sleep}") + await asyncio.sleep(sleep) + return {"detail": "ok"} + + +@app.get("/load") +async def load(secret: str): + logger.debug("loading model") + global binary_classifier, multi_classifier + """ + load the latest model. + This endpoint is used by the detector api to load the latest model. + """ + if secret != config.secret_token: + raise HTTPException(status_code=404, detail="insufficient permissions") + + binary_classifier = binary_classifier.load() + multi_classifier = multi_classifier.load() + return {"detail": "ok"} + + +@app.get("/predict") +async def predict_player(secret: str, hiscores, name: name) -> List[dict]: + """ + predict one player. + This endpoint is used by the detector api to predict one player. + """ + logger.debug(f"predicting player {name}") + if secret != config.secret_token: + raise HTTPException(status_code=404, detail="insufficient permissions") + name = pd.DataFrame(name.dict()) + output = predict.predict(hiscores, name, binary_classifier, multi_classifier) + return output + + +@app.get("/train") +async def train(secret: str): + """ + train a new model. + This endpoint is used by the detector api to train a new model. + """ + logger.debug("training model") + if secret != config.secret_token: + raise HTTPException(status_code=404, detail="insufficient permissions") + + labels = await req.get_labels() + players = [] + hiscores = [] + + for label in labels: + if label["label"] not in config.LABELS: + continue + + player_data = await req.get_player_data(label_id=label["id"]) + players.extend(player_data) + + hiscore_data = await req.get_hiscore_data(label_id=label["id"]) + hiscores.extend(hiscore_data) + + # parse hiscoreData + hiscoredata = data.hiscoreData(hiscores) + del hiscores + + # get the desired features + features = hiscoredata.features() + del hiscoredata + + # get players with binary target + player_data = data.playerData(players, labels).get(binary=True) + + # merge features with target + features_labeled = features.merge(player_data, left_index=True, right_index=True) + + # create train test data + x, y = features_labeled.iloc[:, :-1], features_labeled.iloc[:, -1] + train_x, test_x, train_y, test_y = train_test_split( + x, y, test_size=0.2, random_state=42, stratify=y + ) + + # train & score the model + binary_classifier.fit(train_x, train_y) + binary_classifier.score(test_y, test_x) + + # save the model + binary_classifier.save() + + # get players with multi target + player_data = data.playerData(players, labels).get(binary=False) + + # merge features with target + features_labeled = features.merge(player_data, left_index=True, right_index=True) + + # we need at least 100 users + to_little_data_labels = ( + pd.DataFrame(features_labeled.iloc[:, -1].value_counts()) + .query("target < 100") + .index + ) + mask = ~(features_labeled["target"].isin(to_little_data_labels)) + features_labeled = features_labeled[mask] + + # create train test data + x, y = features_labeled.iloc[:, :-1], features_labeled.iloc[:, -1] + train_x, test_x, train_y, test_y = train_test_split( + x, y, test_size=0.2, random_state=42, stratify=y + ) + + # train & score the model + multi_classifier.fit(train_x, train_y) + multi_classifier.score(test_y, test_x) + + # save the model + multi_classifier.save() + return {"detail": "ok"} diff --git a/api/cogs/predict.py b/api/cogs/predict.py index 5dc5f56..f7e8117 100644 --- a/api/cogs/predict.py +++ b/api/cogs/predict.py @@ -1,111 +1,111 @@ -import time -from typing import List - -import numpy as np -import pandas as pd -from api import config -from api.MachineLearning import data -from api.MachineLearning.classifier import classifier -import logging - -logger = logging.getLogger(__name__) - - -def predict( - hiscores, - names, - binary_classifier: classifier, - multi_classifier: classifier, -) -> List[dict]: - """ - This function takes in a list of hiscores, a list of names, and two classifiers. - It then predicts the probability of each hiscore being a bot or a real player. - It then returns a list of dictionaries with the predictions. - The predictions are based on the binary classifier, and the multi classifier. - The binary classifier is used to predict if the player is a real player or a bot. - The multi classifier is used to predict the type of bot. - If the binary classifier predicts that the player is a real player, then the multi classifier is not used. - If the binary classifier predicts that the player is a bot, then the multi classifier is used to predict the type of bot. - The output is a list of dictionaries with the predictions. - """ - logger.debug("Predicting hiscores for players") - hiscores = data.hiscoreData(hiscores) - low_level = hiscores.df_low.index - hiscores = hiscores.features() - - logger.debug("Predicting binary for players") - # binary prediction - binary_pred = binary_classifier.predict_proba(hiscores) - binary_pred = pd.DataFrame( - binary_pred, index=hiscores.index, columns=["Real_Player", "Unknown_bot"] - ) - - # multi prediction - logger.debug("Predicting multi for players") - multi_pred = multi_classifier.predict_proba(hiscores) - multi_pred = pd.DataFrame( - multi_pred, index=hiscores.index, columns=np.unique(config.LABELS) - ) - - # remove real players from multi - logger.debug("Removing real players from multi for players") - real_players = binary_pred.query("Real_Player > 0.5").index - mask = ~(multi_pred.index.isin(real_players)) - multi_pred = multi_pred[mask] - - # remove bots from real players - logger.debug("Removing bots from binary for players") - bots = multi_pred.index - mask = ~(binary_pred.index.isin(bots)) - binary_pred = binary_pred[mask] - - # combine binary & player_pred - logger.debug("Combining binary and multi for players") - output = pd.DataFrame(names).set_index("id") - output = output.merge(binary_pred, left_index=True, right_index=True, how="left") - - output = output.merge( - multi_pred, - left_index=True, - right_index=True, - suffixes=["", "_multi"], - how="left", - ) - - # cleanup predictions - logger.debug("Cleaning up predictions for players") - mask = output["Real_Player"].isna() # all multi class predictions - - # cleanup multi suffixes - output.loc[mask, "Unknown_bot"] = output[mask]["Unknown_bot_multi"] - output.loc[mask, "Real_Player"] = output[mask]["Real_Player_multi"] - - output.drop(columns=["Real_Player_multi", "Unknown_bot_multi"], inplace=True) - output.fillna(0, inplace=True) - - # add Predictions, Predicted_confidence, created - logger.debug("Adding Predictions, Predicted_confidence, created for players") - columns = [c for c in output.columns if c != "name"] - output["Predicted_confidence"] = round(output[columns].max(axis=1) * 100, 2) - output["Prediction"] = output[columns].idxmax(axis=1) - output["created"] = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - output.reset_index(inplace=True) - - # low level player predictions are not accurate - logger.debug("Removing low level players for players") - - mask = output["id"].isin(low_level) - output.loc[mask, "Prediction"] = "Stats_Too_Low" - - len_too_low_players = len(output[output["Prediction"] == "Stats_Too_Low"]) - logger.debug(f"Len low level players {len_too_low_players}") - - # cut off name - output["name"] = output["name"].astype(str).str[:12] - - # parsing values - output[columns] = round(output[columns] * 100, 2) - - # convert output to dict - output = output.to_dict(orient="records") - return output +import time +from typing import List + +import numpy as np +import pandas as pd +from api import config +from api.MachineLearning import data +from api.MachineLearning.classifier import classifier +import logging + +logger = logging.getLogger(__name__) + + +def predict( + hiscores, + names, + binary_classifier: classifier, + multi_classifier: classifier, +) -> List[dict]: + """ + This function takes in a list of hiscores, a list of names, and two classifiers. + It then predicts the probability of each hiscore being a bot or a real player. + It then returns a list of dictionaries with the predictions. + The predictions are based on the binary classifier, and the multi classifier. + The binary classifier is used to predict if the player is a real player or a bot. + The multi classifier is used to predict the type of bot. + If the binary classifier predicts that the player is a real player, then the multi classifier is not used. + If the binary classifier predicts that the player is a bot, then the multi classifier is used to predict the type of bot. + The output is a list of dictionaries with the predictions. + """ + logger.debug("Predicting hiscores for players") + hiscores = data.hiscoreData(hiscores) + low_level = hiscores.df_low.index + hiscores = hiscores.features() + + logger.debug("Predicting binary for players") + # binary prediction + binary_pred = binary_classifier.predict_proba(hiscores) + binary_pred = pd.DataFrame( + binary_pred, index=hiscores.index, columns=["Real_Player", "Unknown_bot"] + ) + + # multi prediction + logger.debug("Predicting multi for players") + multi_pred = multi_classifier.predict_proba(hiscores) + multi_pred = pd.DataFrame( + multi_pred, index=hiscores.index, columns=np.unique(config.LABELS) + ) + + # remove real players from multi + logger.debug("Removing real players from multi for players") + real_players = binary_pred.query("Real_Player > 0.5").index + mask = ~(multi_pred.index.isin(real_players)) + multi_pred = multi_pred[mask] + + # remove bots from real players + logger.debug("Removing bots from binary for players") + bots = multi_pred.index + mask = ~(binary_pred.index.isin(bots)) + binary_pred = binary_pred[mask] + + # combine binary & player_pred + logger.debug("Combining binary and multi for players") + output = pd.DataFrame(names).set_index("id") + output = output.merge(binary_pred, left_index=True, right_index=True, how="left") + + output = output.merge( + multi_pred, + left_index=True, + right_index=True, + suffixes=["", "_multi"], + how="left", + ) + + # cleanup predictions + logger.debug("Cleaning up predictions for players") + mask = output["Real_Player"].isna() # all multi class predictions + + # cleanup multi suffixes + output.loc[mask, "Unknown_bot"] = output[mask]["Unknown_bot_multi"] + output.loc[mask, "Real_Player"] = output[mask]["Real_Player_multi"] + + output.drop(columns=["Real_Player_multi", "Unknown_bot_multi"], inplace=True) + output.fillna(0, inplace=True) + + # add Predictions, Predicted_confidence, created + logger.debug("Adding Predictions, Predicted_confidence, created for players") + columns = [c for c in output.columns if c != "name"] + output["Predicted_confidence"] = round(output[columns].max(axis=1) * 100, 2) + output["Prediction"] = output[columns].idxmax(axis=1) + output["created"] = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + output.reset_index(inplace=True) + + # low level player predictions are not accurate + logger.debug("Removing low level players for players") + + mask = output["id"].isin(low_level) + output.loc[mask, "Prediction"] = "Stats_Too_Low" + + len_too_low_players = len(output[output["Prediction"] == "Stats_Too_Low"]) + logger.debug(f"Len low level players {len_too_low_players}") + + # cut off name + output["name"] = output["name"].astype(str).str[:12] + + # parsing values + output[columns] = round(output[columns] * 100, 2) + + # convert output to dict + output = output.to_dict(orient="records") + return output diff --git a/api/cogs/requests.py b/api/cogs/requests.py index 44f8869..b8e7d97 100644 --- a/api/cogs/requests.py +++ b/api/cogs/requests.py @@ -1,149 +1,154 @@ -import logging -import api.config as config -import aiohttp -import asyncio - -logger = logging.getLogger(__name__) - - -# Define an asynchronous function to make a secure HTTP GET request -async def make_request(url: str, params: dict, headers: dict = {}) -> list[dict]: - # Create a secure copy of the parameters by adding a placeholder for the token - _secure_params = params.copy() - _secure_params["token"] = "***" - - # Log the URL and secure parameters for debugging - logger.info({"url": url.split("/v")[-1], "params": _secure_params}) - - # Use aiohttp to make an asynchronous GET request - async with aiohttp.ClientSession() as session: - async with session.get(url=url, params=params, headers=headers) as resp: - # Check if the response status is OK (200) - if not resp.ok: - error_message = ( - f"response status {resp.status} " - f"response body: {await resp.text()}" - ) - # Log the error message and raise a ValueError - logger.error(error_message) - raise ValueError(error_message) - - # Parse the response JSON and return the data - data = await resp.json() - return data - - -# Define an asynchronous function to retry a request until it succeeds or raise an exception on failure -async def retry_request(url: str, params: dict) -> list[dict]: - while True: - try: - # Attempt to make the request - data = await make_request(url, params) - - # If data is received, return it - if data: - return data - except Exception as e: - # Log the error and wait for 15 seconds before retrying - _secure_params = params.copy() - _secure_params["token"] = "***" - logger.error({"url": url, "params": _secure_params, "error": str(e)}) - await asyncio.sleep(15) - - -# Define an asynchronous function to get labels from an API -async def get_labels(): - # Construct the URL and parameters for the request - url = f"{config.detector_api}/v1/label" - params = { - "token": config.token, - } - - # Retry the request until it succeeds and return the data - data = await retry_request(url=url, params=params) - return data - - -async def get_player_data(label_id: int, limit: int = 5000): - url = "http://private-api-svc.bd-prd.svc:5000/v2/player" - - params = { - "player_id": 1, - "label_id": label_id, - "greater_than": 1, - "limit": limit, - } - - # Initialize a list to store player data - players = [] - - # Continue making requests until all data is retrieved - while True: - data = await retry_request(url=url, params=params) - players.extend(data) - - logger.info(f"received: {len(data)}, in total {len(players)}") - - # Check if the received data is less than the row count, indicating the end of data - if len(data) < limit: - break - - # Increment the page parameter for the next request - params["player_id"] = data[-1]["id"] - - return players - - -async def get_hiscore_data(label_id: int, limit: int = 5000): - url = "http://private-api-svc.bd-prd.svc:5000/v2/highscore/latest" # TODO: fix hardcoded - params = {"player_id": 1, "label_id": label_id, "many": 1, "limit": limit} - - # Initialize a list to store hiscore data - hiscores = [] - - # Continue making requests until all data is retrieved - while True: - data = await retry_request(url=url, params=params) - hiscores.extend(data) - - logger.info(f"received: {len(data)}, in total {len(hiscores)}") - - # Check if the received data is less than the row count, indicating the end of data - if len(data) < limit: - break - - # Increment the page parameter for the next request - params["player_id"] = data[-1]["Player_id"] - - return hiscores - - -async def get_prediction_data(player_id: int = 0, limit: int = 0): - url = "http://private-api-svc.bd-prd.svc:5000/v2/highscore/latest" # TODO: fix hardcoded - params = {"player_id": player_id, "many": 1, "limit": limit} - - data = await retry_request(url=url, params=params) - return data - - -async def post_prediction(data: list[dict]): - url = f"{config.detector_api}/v1/prediction" - params = {"token": config.token} - - while True: - try: - async with aiohttp.ClientSession() as session: - async with session.post(url=url, params=params, json=data) as resp: - if not resp.ok: - error_message = ( - f"response status {resp.status} " - f"response body: {await resp.text()}" - ) - # Log the error message and raise a ValueError - logger.error(error_message) - await asyncio.sleep(15) - continue - break - except Exception as e: - logger.error(str(e)) - await asyncio.sleep(60) +import logging +import api.config as config +import aiohttp +import asyncio + +logger = logging.getLogger(__name__) + + +# Define an asynchronous function to make a secure HTTP GET request +async def make_request(url: str, params: dict, headers: dict = {}) -> list[dict]: + # Create a secure copy of the parameters by adding a placeholder for the token + _secure_params = params.copy() + _secure_params["token"] = "***" + + # Log the URL and secure parameters for debugging + logger.info({"url": url.split("/v")[-1], "params": _secure_params}) + + # Use aiohttp to make an asynchronous GET request + async with aiohttp.ClientSession() as session: + async with session.get(url=url, params=params, headers=headers) as resp: + # Check if the response status is OK (200) + if not resp.ok: + error_message = ( + f"response status {resp.status} " + f"response body: {await resp.text()}" + ) + # Log the error message and raise a ValueError + logger.error(error_message) + raise ValueError(error_message) + + # Parse the response JSON and return the data + data = await resp.json() + return data + + +# Define an asynchronous function to retry a request until it succeeds or raise an exception on failure +async def retry_request(url: str, params: dict) -> list[dict]: + max_retry = 3 + retry = 0 + while True: + if max_retry == retry: + break + try: + # Attempt to make the request + data = await make_request(url, params) + + # If data is received, return it + if data: + return data + except Exception as e: + # Log the error and wait for 15 seconds before retrying + _secure_params = params.copy() + _secure_params["token"] = "***" + logger.error({"url": url, "params": _secure_params, "error": str(e)}) + await asyncio.sleep(15) + retry += 1 + + +# Define an asynchronous function to get labels from an API +async def get_labels(): + # Construct the URL and parameters for the request + url = f"{config.detector_api}/v1/label" + params = { + "token": config.token, + } + + # Retry the request until it succeeds and return the data + data = await retry_request(url=url, params=params) + return data + + +async def get_player_data(label_id: int, limit: int = 5000): + url = "http://private-api-svc.bd-prd.svc:5000/v2/player" + + params = { + "player_id": 1, + "label_id": label_id, + "greater_than": 1, + "limit": limit, + } + + # Initialize a list to store player data + players = [] + + # Continue making requests until all data is retrieved + while True: + data = await retry_request(url=url, params=params) + players.extend(data) + + logger.info(f"received: {len(data)}, in total {len(players)}") + + # Check if the received data is less than the row count, indicating the end of data + if len(data) < limit: + break + + # Increment the page parameter for the next request + params["player_id"] = data[-1]["id"] + + return players + + +async def get_hiscore_data(label_id: int, limit: int = 5000): + url = "http://private-api-svc.bd-prd.svc:5000/v2/highscore/latest" # TODO: fix hardcoded + params = {"player_id": 1, "label_id": label_id, "many": 1, "limit": limit} + + # Initialize a list to store hiscore data + hiscores = [] + + # Continue making requests until all data is retrieved + while True: + data = await retry_request(url=url, params=params) + hiscores.extend(data) + + logger.info(f"received: {len(data)}, in total {len(hiscores)}") + + # Check if the received data is less than the row count, indicating the end of data + if len(data) < limit: + break + + # Increment the page parameter for the next request + params["player_id"] = data[-1]["Player_id"] + + return hiscores + + +async def get_prediction_data(player_id: int = 0, limit: int = 0): + url = "http://private-api-svc.bd-prd.svc:5000/v2/highscore/latest" # TODO: fix hardcoded + params = {"player_id": player_id, "many": 1, "limit": limit} + + data = await retry_request(url=url, params=params) + return data + + +async def post_prediction(data: list[dict]): + url = f"{config.detector_api}/v1/prediction" + params = {"token": config.token} + + while True: + try: + async with aiohttp.ClientSession() as session: + async with session.post(url=url, params=params, json=data) as resp: + if not resp.ok: + error_message = ( + f"response status {resp.status} " + f"response body: {await resp.text()}" + ) + # Log the error message and raise a ValueError + logger.error(error_message) + await asyncio.sleep(15) + continue + break + except Exception as e: + logger.error(str(e)) + await asyncio.sleep(60) diff --git a/notes.md b/notes.md index 338539f..819573f 100644 --- a/notes.md +++ b/notes.md @@ -1,54 +1,54 @@ - -# api documentation -```sh -http://127.0.0.1:8000/docs -http://127.0.0.1:8000/redoc -``` -# extra info -```sh -POST: to create data. -GET: to read data. -PUT: to update data. -DELETE: to delete data. -``` -# keeping fork up to date -```sh -git checkout develop -git pull --rebase upstream develop -git push -``` -# setup -## windows -creating a python venv to work in and install the project requirements -```sh -python -m venv .venv -.venv\Scripts\activate -python -m pip install --upgrade pip -pip install -r requirements.txt -``` -## linux -```sh -python3 -m venv .venv -source .venv/bin/activate -python -m pip install --upgrade pip -pip install -r requirements.txt -``` -# for admin purposes saving & upgrading -when you added some dependancies update the requirements -```sh -venv\Scripts\activate -call pip freeze > requirements.txt -``` -when you want to upgrade the dependancies -```sh -venv\Scripts\activate -powershell "(Get-Content requirements.txt) | ForEach-Object { $_ -replace '==', '>=' } | Set-Content requirements.txt" -call pip install -r requirements.txt --upgrade -call pip freeze > requirements.txt -powershell "(Get-Content requirements.txt) | ForEach-Object { $_ -replace '>=', '==' } | Set-Content requirements.txt" -``` -# branch cleanup -if your branch gets out of sync and for some reason you have many pushes and pulls, to become insync without pushing some random changes do this -```sh -git fetch https://github.com/Bot-detector/bot-detector-ML.git + +# api documentation +```sh +http://127.0.0.1:8000/docs +http://127.0.0.1:8000/redoc +``` +# extra info +```sh +POST: to create data. +GET: to read data. +PUT: to update data. +DELETE: to delete data. +``` +# keeping fork up to date +```sh +git checkout develop +git pull --rebase upstream develop +git push +``` +# setup +## windows +creating a python venv to work in and install the project requirements +```sh +python -m venv .venv +.venv\Scripts\activate +python -m pip install --upgrade pip +pip install -r requirements.txt +``` +## linux +```sh +python3 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip +pip install -r requirements.txt +``` +# for admin purposes saving & upgrading +when you added some dependancies update the requirements +```sh +venv\Scripts\activate +call pip freeze > requirements.txt +``` +when you want to upgrade the dependancies +```sh +venv\Scripts\activate +powershell "(Get-Content requirements.txt) | ForEach-Object { $_ -replace '==', '>=' } | Set-Content requirements.txt" +call pip install -r requirements.txt --upgrade +call pip freeze > requirements.txt +powershell "(Get-Content requirements.txt) | ForEach-Object { $_ -replace '>=', '==' } | Set-Content requirements.txt" +``` +# branch cleanup +if your branch gets out of sync and for some reason you have many pushes and pulls, to become insync without pushing some random changes do this +```sh +git fetch https://github.com/Bot-detector/bot-detector-ML.git ``` \ No newline at end of file