From 7767164f017f49d967dee9c172a06b77cfa85394 Mon Sep 17 00:00:00 2001 From: Pietro Tanure Date: Fri, 25 Aug 2023 00:09:40 +0200 Subject: [PATCH 01/43] Merge branch 'ilof-pietro' of https://github.com/pietro-tanure/river into ilof-pietro --- river/anomaly/__init__.py | 2 + river/anomaly/ilof.py | 492 +++++++++++++++++++ river/anomaly/ilof_notebook.ipynb | 779 ++++++++++++++++++++++++++++++ 3 files changed, 1273 insertions(+) create mode 100644 river/anomaly/ilof.py create mode 100644 river/anomaly/ilof_notebook.ipynb diff --git a/river/anomaly/__init__.py b/river/anomaly/__init__.py index 932023896e..60ff843329 100644 --- a/river/anomaly/__init__.py +++ b/river/anomaly/__init__.py @@ -17,6 +17,7 @@ from .filter import QuantileFilter, ThresholdFilter from .gaussian import GaussianScorer from .hst import HalfSpaceTrees +from .ilof import ILOF from .svm import OneClassSVM __all__ = [ @@ -27,4 +28,5 @@ "OneClassSVM", "QuantileFilter", "ThresholdFilter", + "ILOF", ] diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py new file mode 100644 index 0000000000..eca4a1008b --- /dev/null +++ b/river/anomaly/ilof.py @@ -0,0 +1,492 @@ +from __future__ import annotations + +import functools + +import pandas as pd + +from river import anomaly, utils +from river.neighbors.base import DistanceFunc +from river.utils import VectorDict + + +class ILOF(anomaly.base.AnomalyDetector): + """Incremental Local Outlier Factor (ILOF). + + ILOF Algorithm as described in the reference paper + ---------- + + The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF) used to identify outliers based on density of local neighbors. 
+ + We consider: + - NewPoints: new points; + - kNN(p): the neighbors of p (the k closest points to p) + - RkNN(p): the rev-neighbors of p (points that have p as one of their neighbors) + - Set_upd_lrd: Set of points that need to update the local reachability distance + - Set_upd_lof: Set of points that need to update the local outlier factor + + The algorithm implemented here, based on the original one in the paper, is: + 1) Insert NewPoints and calculate their distances to existing points + 2) Update the neighbors and reverse-neighbors of all the points + 3) Define sets of affected points that require an update + 4) Calculate the reachability-distance from new point to neighbors (NewPoints -> kNN(NewPoints)) and from rev-neighbors to new point (RkNN(NewPoints) -> NewPoints) + 5) Update the reachability-distance for affected points: RkNN(RkNN(NewPoints)) -> RkNN(NewPoints) + 6) Update local reachability distance of affected points: lrd(Set_upd_lrd) + 7) Update local outlier factor: lof(Set_upd_lof) + + Parameters + ---------- + n_neighbors : int + The number of nearest neighbors to use for density estimation. + window_size : int + The size of the batch of data to be taken in at once for the model to learn + distance_func : function that takes in dictionaries + A distance function to use. By default, the Euclidean distance is used. + verbose: boolean + Whether or not to print messages + + Attributes + ---------- + X + A list of stored observations. + X_batch + A buffer to hold incoming observations until it's time to update the model. + X_score + A buffer to hold incoming observations until it's time to score them. + dist_dict + A dictionary to hold distances between observations. + neighborhoods + A dictionary to hold neighborhoods for each observation. + rev_neighborhoods + A dictionary to hold reverse neighborhoods for each observation. + k_dist + A dictionary to hold k-distances for each observation. 
+ reach_dist + A dictionary to hold reachability distances for each observation. + lof + A dictionary to hold Local Outlier Factors for each observation. + local_reach + A dictionary to hold local reachability distances for each observation. + skip_first + A boolean value indicating whether to skip the first window of data. + + Example + ---------- + from river import datasets + import pandas as pd + import ilof as ilof + dataset = pd.DataFrame(datasets.CreditCard()) + #Define model + k = 20 #k-neighboors + ilof_river = ilof.ILOF(k, verbose=False) + ilof_river.learn_many(dataset[0:30]) + for i in dataset[0][40:90]: + ilof_river.learn_one(i) + lof_score = [] + for x in dataset[0][100:120]: + lof_score.append(ilof_river.score_one(x)) + + References + ---------- + Pokrajac, David & Lazarevic, Aleksandar & Latecki, Longin Jan. (2007). Incremental Local Outlier Detection for Data Streams. Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining, CIDM 2007. 504-515. 10.1109/CIDM.2007.368917. + """ + + def __init__( + self, + n_neighbors: int = 10, + verbose=True, + distance_func: DistanceFunc = None, + ): + self.n_neighbors = n_neighbors + self.X: list = [] + self.X_batch: list = [] + self.X_score: list = [] + self.dist_dict: dict = {} + self.neighborhoods: dict = {} + self.rev_neighborhoods: dict = {} + self.k_dist: dict = {} + self.reach_dist: dict = {} + self.lof: dict = {} + self.local_reach: dict = {} + self.verbose = verbose + self.distance = ( + distance_func + if distance_func is not None + else functools.partial(utils.math.minkowski_distance, p=2) + ) + + def learn_many(self, X_batch: pd.Series): + """ + Update the model with many incoming observations + + Parameters + ---------- + X_batch + A Panda Series + """ + X_batch = X_batch[0].tolist() + self.learn(X_batch) + + def learn_one(self, x: dict): + """ + Update the model with one incoming observation + + Parameters + ---------- + x + A dictionary of feature values. 
+ """ + self.X_batch.append(x) + if len(self.X) or len(self.X_batch) > 1: + self.learn(self.X_batch) + self.X_batch = [] + + def learn(self, X_batch: list): + X_batch, equal = self.check_equal(X_batch, self.X) + if equal != 0 and self.verbose: + print("%i samples are equal to previous data" % equal) + + if len(X_batch) == 0: + if self.verbose: + print("No new data was added") + else: + # Increase size of objects to acomodate new data + ( + nm, + self.X, + self.neighborhoods, + self.rev_neighborhoods, + self.k_dist, + self.reach_dist, + self.dist_dict, + self.local_reach, + self.lof, + ) = self.expand_objects( + X_batch, + self.X, + self.neighborhoods, + self.rev_neighborhoods, + self.k_dist, + self.reach_dist, + self.dist_dict, + self.local_reach, + self.lof, + ) + + # Calculate neighborhoods, reverse neighborhoods, k-distances and distances between neighboors + ( + self.neighborhoods, + self.rev_neighborhoods, + self.k_dist, + self.dist_dict, + ) = self.initial_calculations( + self.X, nm, self.neighborhoods, self.rev_neighborhoods, self.k_dist, self.dist_dict + ) + + # Define sets of particles + ( + Set_new_points, + Set_neighbors, + Set_rev_neighbors, + Set_upd_lrd, + Set_upd_lof, + ) = self.define_sets(nm, self.neighborhoods, self.rev_neighborhoods) + + # Calculate new reachability distance of all affected points + self.reach_dist = self.calc_reach_dist_newpoints( + Set_new_points, + self.neighborhoods, + self.rev_neighborhoods, + self.reach_dist, + self.dist_dict, + self.k_dist, + ) + self.reach_dist = self.calc_reach_dist_otherpoints( + Set_rev_neighbors, + self.neighborhoods, + self.rev_neighborhoods, + self.reach_dist, + self.dist_dict, + self.k_dist, + ) + + # Calculate new local reachability distance of all affected points + self.local_reach = self.calc_local_reach_dist( + Set_upd_lrd, self.neighborhoods, self.reach_dist, self.local_reach + ) + + # Calculate new Local Outlier Factor of all affected points + self.lof = self.calc_lof(Set_upd_lof, 
self.neighborhoods, self.local_reach, self.lof) + + def score_one(self, x: VectorDict, window_score=1): + """ + Score incoming observations based on model constructed previously. + Perform same calculations as 'learn_one' function but doesn't add the new calculations to the atributes + Data samples that are equal to samples stored by the model are not considered. + + Parameters + ---------- + x + A dictionary of feature values. + window_score + The size of the batch of data to be taken in at once for the model to score + + Returns + ------- + lof : list + List of LOF calculated for incoming data + """ + + self.X_score.append(x) + + if len(self.X_score) >= window_score: + self.X_score, equal = self.check_equal(self.X_score, self.X) + if equal != 0 and self.verbose: + print("%i samples are equal to previous data" % equal) + + if len(self.X_score) == 0: + if self.verbose: + print("No new data was added") + else: + Xs = self.X.copy() + ( + nm, + Xs, + neighborhoods, + rev_neighborhoods, + k_dist, + reach_dist, + dist_dict, + local_reach, + lof, + ) = self.expand_objects( + self.X_score, + Xs, + self.neighborhoods, + self.rev_neighborhoods, + self.k_dist, + self.reach_dist, + self.dist_dict, + self.local_reach, + self.lof, + ) + + neighborhoods, rev_neighborhoods, k_dist, dist_dict = self.initial_calculations( + Xs, nm, neighborhoods, rev_neighborhoods, k_dist, dist_dict + ) + ( + Set_new_points, + Set_neighbors, + Set_rev_neighbors, + Set_upd_lrd, + Set_upd_lof, + ) = self.define_sets(nm, neighborhoods, rev_neighborhoods) + reach_dist = self.calc_reach_dist_newpoints( + Set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist + ) + reach_dist = self.calc_reach_dist_otherpoints( + Set_rev_neighbors, + neighborhoods, + rev_neighborhoods, + reach_dist, + dist_dict, + k_dist, + ) + local_reach = self.calc_local_reach_dist( + Set_upd_lrd, neighborhoods, reach_dist, local_reach + ) + lof = self.calc_lof(Set_upd_lof, neighborhoods, local_reach, lof) + 
self.X_score = [] + + score_keys = list(range(nm[0], nm[0] + nm[1])) + return [lof[i] for i in score_keys] + + def initial_calculations( + self, + X: list, + nm: tuple, + neighborhoods: dict, + rev_neighborhoods: dict, + k_distances: dict, + dist_dict: dict, + ): + """ + Perform initial calculations on the incoming data before applying the ILOF algorithm. + Taking the new data, it updates the neighborhoods, reverse neighborhoods, k-distances and distances between particles. + + Parameters + ---------- + X + A list of stored observations. + nm : tuple of ints, (n, m) + A tuple representing the current size of the dataset. + neighborhoods : dict + A dictionary of particle neighborhoods. + rev_neighborhoods : dict + A dictionary of reverse particle neighborhoods. + k_distances : dict + A dictionary to hold k-distances for each observation. + dist_dict : dict of dicts + A dictionary of dictionaries storing distances between particles + + Returns + ------- + neighborhoods : dict + Updated dictionary of particle neighborhoods + rev_neighborhoods : dict + Updated dictionary of reverse particle neighborhoods + k_distances : dict + Updated dictionary to hold k-distances for each observation + dist_dict : dict of dicts + Updated dictionary of dictionaries storing distances between particles + """ + + n = nm[0] + m = nm[1] + k = self.n_neighbors + + # Calculate distances all particles consdering new and old ones + new_distances = [ + [i, j, self.distance(X[i], X[j])] for i in range(n + m) for j in range(i) if i >= n + ] + # Add new distances to distance dictionary + for i in range(len(new_distances)): + dist_dict[new_distances[i][0]][new_distances[i][1]] = new_distances[i][2] + dist_dict[new_distances[i][1]][new_distances[i][0]] = new_distances[i][2] + + # Calculate new k-dist for each particle + for i, inner_dict in enumerate(dist_dict.values()): + k_distances[i] = sorted(inner_dict.values())[min(k, len(inner_dict.values())) - 1] + + # Only keep particles that are neighbors 
in distance dictionary + dist_dict = { + k: {k2: v2 for k2, v2 in v.items() if v2 <= k_distances[k]} + for k, v in dist_dict.items() + } + + # Define new neighborhoods for particles + for key, value in dist_dict.items(): + neighborhoods[key] = [index for index in value] + + # Define new reverse neighborhoods for particles + for particle_id, neighbor_ids in neighborhoods.items(): + for neighbor_id in neighbor_ids: + rev_neighborhoods[neighbor_id].append(particle_id) + + return neighborhoods, rev_neighborhoods, k_distances, dist_dict + + def check_equal(self, X: list, Y: list): + """Check if new batch X has some data samples equal to previous data recorded Y""" + result = [x for x in X if not any(x == y for y in Y)] + return result, len(X) - len(result) + + def expand_objects( + self, + new_particles: list, + X: list, + neighborhoods: dict, + rev_neighborhoods: dict, + k_dist: dict, + reach_dist: dict, + dist_dict: dict, + local_reach: dict, + lof: dict, + ): + """Expand size of dictionaries and lists to fit new data""" + n = len(X) + m = len(new_particles) + X.extend(new_particles) + neighborhoods.update({i: [] for i in range(n + m)}) + rev_neighborhoods.update({i: [] for i in range(n + m)}) + k_dist.update({i: float("inf") for i in range(n + m)}) + reach_dist.update({i + n: {} for i in range(m)}) + dist_dict.update({i + n: {} for i in range(m)}) + local_reach.update({i + n: [] for i in range(m)}) + lof.update({i + n: [] for i in range(m)}) + return ( + (n, m), + X, + neighborhoods, + rev_neighborhoods, + k_dist, + reach_dist, + dist_dict, + local_reach, + lof, + ) + + def define_sets(self, nm, neighborhoods: dict, rev_neighborhoods: dict): + """Define sets of points for the ILOF algorithm""" + # Define set of new points from batch + Set_new_points = set(range(nm[0], nm[0] + nm[1])) + Set_neighbors: set = set() + Set_rev_neighbors: set = set() + + # Define neighbors and reverse neighbors of new data points + for i in Set_new_points: + Set_neighbors = 
set(Set_neighbors) | set(neighborhoods[i]) + Set_rev_neighbors = set(Set_rev_neighbors) | set(rev_neighborhoods[i]) + + # Define points that need to update their local reachability distance because of new data points + Set_upd_lrd = Set_rev_neighbors + for j in Set_rev_neighbors: + Set_upd_lrd = Set_upd_lrd | set(rev_neighborhoods[j]) + Set_upd_lrd = Set_upd_lrd | Set_new_points + + # Define points that need to update their lof because of new data points + Set_upd_lof = Set_upd_lrd + for m in Set_upd_lrd: + Set_upd_lof = Set_upd_lof | set(rev_neighborhoods[m]) + Set_upd_lof = Set_upd_lof + + return Set_new_points, Set_neighbors, Set_rev_neighbors, Set_upd_lrd, Set_upd_lof + + def calc_reach_dist_newpoints( + self, + Set: set, + neighborhoods: dict, + rev_neighborhoods: dict, + reach_dist: dict, + dist_dict: dict, + k_dist: dict, + ): + """Calculate reachability distance from new points to neighbors and from neighbors to new points""" + for c in Set: + for j in set(neighborhoods[c]): + reach_dist[c][j] = max(dist_dict[c][j], k_dist[j]) + for j in set(rev_neighborhoods[c]): + reach_dist[j][c] = max(dist_dict[j][c], k_dist[c]) + return reach_dist + + def calc_reach_dist_otherpoints( + self, + Set: set, + neighborhoods: dict, + rev_neighborhoods: dict, + reach_dist: dict, + dist_dict: dict, + k_dist: dict, + ): + """Calculate reachability distance from reverse neighbors of reverse neighbors ( RkNN(RkNN(NewPoints)) ) to reverse neighbors ( RkNN(NewPoints) ) + These values change because of the insertion of new points""" + for j in Set: + for i in set(rev_neighborhoods[j]): + reach_dist[i][j] = max(dist_dict[i][j], k_dist[j]) + return reach_dist + + def calc_local_reach_dist( + self, Set: set, neighborhoods: dict, reach_dist: dict, local_reach_dist: dict + ): + """Calculate local reachability distance of affected points""" + for i in Set: + local_reach_dist[i] = len(neighborhoods[i]) / sum( + [reach_dist[i][j] for j in neighborhoods[i]] + ) + return local_reach_dist + + 
def calc_lof(self, Set: set, neighborhoods: dict, local_reach: dict, lof: dict): + """Calculate local outlier factor of affected points""" + for i in Set: + lof[i] = sum([local_reach[j] for j in neighborhoods[i]]) / ( + len(neighborhoods[i]) * local_reach[i] + ) + return lof diff --git a/river/anomaly/ilof_notebook.ipynb b/river/anomaly/ilof_notebook.ipynb new file mode 100644 index 0000000000..7509df27ee --- /dev/null +++ b/river/anomaly/ilof_notebook.ipynb @@ -0,0 +1,779 @@ +{ + "cells": [ + { + "attachments": { + "Screenshot from 2023-06-08 10-01-42.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Incremental Local Outlier factor\n", + "\n", + "##### Created by: Pietro TANURE ONNIS\n", + "##### Final project for the Online Machine Learning course at Telecom Paris 2023\n", + "\n", + "The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF) used to identify outliers based on density of local neighbors.\n", + "\n", + "We consider: \n", + "\n", + " - NewPoints: new points; \n", + " - kNN(p): the neighboors of p (the k closest points to p)\n", + " - RkNN(p): the rev-neighboors of p (points that have p as one of their neighboors)\n", + " - Set_upd_lrd: Set of points that need to update the local reachability distance\n", + " - Set_upd_lof: Set of points that need to update the local outlier factor\n", + "\n", + "The algorithm here implemented based on the original one in the paper is:\n", + "\n", + " 1) Insert NewPoints and calculate its distance to existing points\n", + " 2) Update the neighboors and reverse-neighboors of all the points\n", + " 3) Define sets of affected points that required update\n", + " 4) Calculate the reachability-distance from new point to neighboors (NewPoints -> kNN(NewPoints)) and from rev-neighboors to new point (RkNN(NewPoints) -> NewPoints)\n", + " 5) Update the reachability-distance for affected points: RkNN(RkNN(NewPoints)) -> RkNN(NewPoints)\n", 
+ " 6) Update local reachability distance of affected points: lrd(Set_upd_lrd)\n", + " 7) Update local outlier factor: lof(Set_upd_lof)\n", + "\n", + "Reference: Pokrajac, David & Lazarevic, Aleksandar & Latecki, Longin Jan. (2007). Incremental Local Outlier Detection for Data Streams. Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining, CIDM 2007. 504-515. 10.1109/CIDM.2007.368917. \n", + "\n", + "![Screenshot from 2023-06-08 10-01-42.png]()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate data with some outliers\n", + "\n", + "We create an artificial dataset centered around two clusters and add some data drawn from another distribution (outliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "np.random.seed(42)\n", + "\n", + "# Generate train data\n", + "X_inliers = 0.3 * np.random.randn(100, 2)\n", + "X_inliers = np.r_[X_inliers + 2, X_inliers - 2]\n", + "\n", + "# Generate some outliers\n", + "X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))\n", + "X = np.r_[X_inliers, X_outliers]\n", + "\n", + "n_outliers = len(X_outliers)\n", + "ground_truth = np.ones(len(X), dtype=int)\n", + "ground_truth[-n_outliers:] = -1\n", + "\n", + "#Visualize data\n", + "plt.title(\"Data with Outliers\")\n", + "plt.scatter(X[:, 0], X[:, 1], color=\"k\", s=3.0, label=\"Data points\")\n", + "plt.axis(\"tight\")\n", + "plt.xlim((-5, 5))\n", + "plt.ylim((-5, 5))\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Online Machine Learning with River\n", + "\n", + "Using the built class ILOF we calculte the Local Outlier Factor in an online approach, at each step we send an instance of data to the model, that will learn it using the 'learn_one' function, updating the model incrementally. 
\n", + "\n", + "The class ILOF takes in the following arguments: \n", + "\n", + "ILOF(*k_neighbors = int; batch_size = int; verbose = boolean; distance_func = function*)\n", + "\n", + "\n", + "Function to learn batch of data: \n", + "\n", + "learn_one(*x: dictionary*)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import ilof_ as ilof\n", + "from river import utils\n", + "\n", + "#Convert to dictionary\n", + "Xdicts = tuple({f'feature_{i+1}': x[i] for i in range(2)} for x in X)\n", + "\n", + "#Define model\n", + "k = 20 #k-neighboors\n", + "ilof_river = ilof.ILOF(k, verbose=False)\n", + "\n", + "#Fit model on stream data\n", + "for x in Xdicts:\n", + " ilof_river.learn_one(x)\n", + "\n", + "lof_scores_river = np.array([v for v in ilof_river.lof.values()])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also evaluate data without learning it; this way the program outputs a LOF for the data but doesn't update the parameters of the model (neighborhoods, reverse neighborhoods, k-distances, reachability-distances, local outlier factor).\n", + "\n", + "The learn_one function can take in the following arguments:\n", + "*learn_one(k_neighbors, batch_size, verbose, distance_func)*" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[4.1225951442931335], [4.856989240600968], [6.700737740891463], [4.67398123498832], [10.318743687776967], [1.695085624011412], [3.1578141066216476], [1.6190480666043154], [4.135136784137301], [5.1959975977883746]]\n" + ] + } + ], + "source": [ + "#Evaluate data without updating the model\n", + "X_score = np.random.uniform(low=-4, high=4, size=(10, 2))\n", + "\n", + "X_score_dict = tuple({f'feature_{i+1}': x[i] for i in range(2)} for x in X_score)\n", + "\n", + "lof_score = []\n", + "for x in X_score_dict:\n", 
+ " lof_score.append(ilof_river.score_one(x))\n", + "\n", + "print(lof_score)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Batch Machine Learning with Scikit-learn\n", + "\n", + "To compare our results we fit a model with the same data using scikit-learn\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.neighbors import LocalOutlierFactor\n", + "\n", + "#Define the model\n", + "lof_scikit = LocalOutlierFactor(n_neighbors=k)\n", + "\n", + "#Fit model on data\n", + "lof_scikit.fit_predict(X)\n", + "\n", + "#Get Local Outlier Factor\n", + "lof_scores_scikit = - lof_scikit.negative_outlier_factor_" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot results\n", + "\n", + "We observe that both the online river approach (river) and the batch approach (sklearn) give the same results, but the online has the advantage of being adapted to treat data streams" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We observe that the difference is at machine error level:\n" + ] + }, + { + "data": { + "text/plain": [ + "array([-4.09616785e-12, -6.33715302e-11, 1.09381393e-11, -1.17567955e-10,\n", + " 1.33887346e-11, 1.18958177e-11, -6.57527366e-11, -7.40709716e-11,\n", + " -7.44937445e-12, -6.17876861e-11, -9.02540265e-11, -2.53728150e-11,\n", + " 1.22162280e-11, -1.55155888e-11, 1.49403823e-11, -1.26576083e-10,\n", + " -1.56268332e-11, -4.20663504e-11, -7.77566900e-11, -2.44553267e-11,\n", + " -2.61148880e-11, 1.45454759e-11, -5.68558534e-11, -2.96740410e-11,\n", + " -5.03610487e-11, 6.52056187e-12, 8.45368220e-12, -6.24331697e-11,\n", + " 3.10773629e-12, -1.87143634e-11, 1.69848580e-11, -6.51001475e-11,\n", + " -5.37054845e-11, -7.33879624e-12, -4.45921078e-12, -6.28377350e-11,\n", + " -6.90747459e-11, -3.41847439e-10, 1.80840898e-11, -8.69331274e-11,\n", + " 2.49876786e-11, -1.05403020e-10, -6.49036380e-12, -5.79656323e-11,\n", + " 7.16537940e-12, -7.47046869e-12, 6.71662725e-12, -4.13140633e-11,\n", + " -7.85482790e-13, 1.55593316e-11, -4.18334256e-11, 5.83022519e-12,\n", + " 2.18497442e-11, -1.66783476e-10, 1.46855861e-11, -1.15608412e-10,\n", + " -1.78021597e-10, 2.40570897e-11, -2.40685250e-11, -7.99027511e-11,\n", + " -3.46016549e-11, -1.07650999e-10, -1.20261578e-10, -3.42865736e-11,\n", + " 9.95292737e-12, -3.97579747e-11, -1.71840320e-11, -1.03541176e-10,\n", + " 8.75211015e-12, -4.21567226e-11, -5.32438538e-11, -5.20865573e-11,\n", + " -3.00426350e-13, -9.62636637e-11, -1.02473585e-11, 2.13062901e-12,\n", + " 8.63087379e-12, -3.78064247e-12, -1.60033098e-10, -3.17976756e-11,\n", + " -2.91409119e-11, -9.17790288e-11, -5.40958389e-11, -8.77666828e-11,\n", + " 8.20443713e-12, -4.10795842e-11, 2.28256303e-11, -6.83730850e-12,\n", + " -6.59496902e-11, -2.46087373e-10, -2.80828694e-11, -1.77728943e-11,\n", + " 1.73787651e-11, 
-2.39386289e-12, -5.03110886e-11, 5.85975712e-13,\n", + " -2.67477152e-11, 1.17149623e-11, -5.93547433e-12, -2.51376697e-11,\n", + " -3.91231492e-12, -6.39361897e-11, 1.17769128e-11, -1.36209932e-10,\n", + " 1.12351239e-11, 1.16284760e-11, -6.91506852e-11, -8.15103540e-11,\n", + " -9.22262267e-12, -6.39255315e-11, -9.08870756e-11, -2.77369239e-11,\n", + " 1.16167076e-11, -1.39550593e-11, 1.46895829e-11, -1.41733292e-10,\n", + " -1.54865010e-11, -4.47302195e-11, -7.35935757e-11, -2.80169221e-11,\n", + " -2.83830737e-11, 1.49515955e-11, -6.71342981e-11, -3.07622816e-11,\n", + " -5.31310551e-11, 6.80433487e-12, 5.86319882e-12, -6.18920470e-11,\n", + " 3.35398376e-13, -2.10242934e-11, 1.76045845e-11, -6.86697366e-11,\n", + " -6.14872597e-11, -6.16851015e-12, -3.94029254e-12, -6.70448141e-11,\n", + " -7.16353643e-11, -3.63674646e-10, 1.83564275e-11, -7.49840190e-11,\n", + " 2.55555577e-11, -1.05627285e-10, -1.05502274e-11, -5.96580563e-11,\n", + " 4.42612613e-12, -6.44362341e-12, 6.25999252e-12, -4.48610038e-11,\n", + " -5.12034859e-13, 1.57456270e-11, -4.36486403e-11, 4.71545025e-12,\n", + " 2.25058860e-11, -1.84880333e-10, 1.47774015e-11, -1.19127597e-10,\n", + " -1.85351290e-10, 2.46216381e-11, -2.55948596e-11, -7.39723838e-11,\n", + " -3.54656304e-11, -1.09745102e-10, -1.43158596e-10, -3.35480532e-11,\n", + " 1.04337650e-11, -4.41688908e-11, -1.50348622e-11, -1.16774812e-10,\n", + " 7.96585020e-12, -4.48201476e-11, -5.69460035e-11, -5.68629588e-11,\n", + " 5.56110713e-13, -1.03254738e-10, -1.00581765e-11, 2.49988918e-12,\n", + " 6.66378064e-12, -3.26383365e-12, -1.77152959e-10, -3.82229803e-11,\n", + " -3.23736593e-11, -9.27151689e-11, -5.59789992e-11, -9.92872451e-11,\n", + " 8.13549228e-12, -4.12598844e-11, 2.33668640e-11, -8.91531293e-12,\n", + " -6.88478163e-11, -2.58987498e-10, -2.74111844e-11, -1.59048330e-11,\n", + " 1.83143500e-11, -2.21744845e-12, -6.28099794e-11, -1.59727787e-12,\n", + " -2.83923995e-11, 1.20728982e-11, -7.79643017e-12, 
-2.67832423e-11,\n", + " -4.55934845e-10, -1.07836184e-10, -3.60191432e-10, -1.28838495e-10,\n", + " -7.21577020e-10, -1.42578749e-09, -2.53113752e-10, -9.60296287e-11,\n", + " -2.22227037e-09, -9.43074063e-10, -1.70505743e-09, -1.32515510e-09,\n", + " -1.30334055e-09, -1.82649540e-09, -1.89989269e-09, -1.57570668e-09,\n", + " -1.35358125e-09, -1.53304036e-09, -1.98927808e-09, -1.32804345e-09])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from river.utils import dict2numpy\n", + "X_scores1 = lof_scores_scikit\n", + "X_scores2 = lof_scores_river\n", + "\n", + "fig, axs = plt.subplots(ncols=2, figsize=(10, 5))\n", + "\n", + "# First plot\n", + "radius1 = (X_scores1.max() - X_scores1) / (X_scores1.max() - X_scores1.min()) * (X_scores1 > 1.5)\n", + "axs[0].set_title(\"Local Outlier Factor - Sklearn \")\n", + "axs[0].scatter(X[:, 0], X[:, 1], color=\"k\", s=3.0, label=\"Data points\")\n", + "axs[0].axis(\"tight\")\n", + "axs[0].set_xlim((-5, 5))\n", + "axs[0].set_ylim((-5, 5))\n", + "axs[0].scatter(X[:, 0], X[:, 1], s=1000 * radius1, edgecolors=\"r\", facecolors=\"none\", label=\"Outlier scores\")\n", + "axs[0].legend(loc=\"upper left\")\n", + "\n", + "# Second plot\n", + "radius2 = (max(X_scores2) - X_scores2) / (max(X_scores2) - min(X_scores2)) * (X_scores2 > 1.5)\n", + "axs[1].set_title(\"Incremental Local Outlier Factor - River\")\n", + "axs[1].scatter(X[:, 0], X[:, 1], color=\"k\", s=3.0, label=\"Data points\")\n", + "axs[1].axis(\"tight\")\n", + "axs[1].set_xlim((-5, 5))\n", + "axs[1].set_ylim((-5, 5))\n", + "axs[1].scatter(X[:, 0], X[:, 1], s=1000 * radius2, edgecolors=\"r\", facecolors=\"none\", label=\"Outlier scores\")\n", + "axs[1].legend(loc=\"upper left\")\n", + "\n", + "plt.show()\n", + "\n", + "print('We observe that the difference is at machine error level:')\n", + "X_scores1 - X_scores2" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Time comparison\n", + "\n", + "Although not adapted to a batch approach River ILOF also allows to calculate a mini-batch approach using the function 'learn_many', here we compare the time of execution of ScikitLOF and RiverILOF considering both receiving a *batch_size* number of data points:\n", + "\n", + "ScikitLOF has applies methods like tree search that optimize the search for neighboors so its execution time remains more of less constant" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from river import datasets\n", + "import pandas as pd\n", + "from river.utils import dict2numpy\n", + "import time \n", + "dataset = pd.DataFrame(datasets.CreditCard())\n", + "dataset_np = [dict2numpy(i) for i in dataset[0].to_dict().values()]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We observe again that the error is of machine precision level: (the few errors at the scale of e-05 are because of the river minkowski-distance function error, I submitted a correction request, for details see bottom of this document \n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 1.15640830e-12, 1.39670502e-05, 1.18705046e-12, 1.55497837e-12,\n", + " 4.09583478e-12, 1.33378808e-05, 1.28123493e-05, 1.81729894e-05,\n", + " 2.78910228e-12, 1.16189877e-05, 1.14542134e-05, 9.35752947e-03,\n", + " 1.18571819e-12, 1.33008174e-05, 2.86748403e-12, 1.16448999e-05,\n", + " 1.14404267e-05, 1.10114258e-05, 3.12239123e-12, 1.07421263e-05,\n", + " 5.79758463e-13, 1.48459023e-12, 1.02679237e-05, 1.10608636e-05,\n", + " 9.62921621e-06, 4.63407090e-13, 1.60071956e-12, -1.26565425e-13,\n", + " 1.02962083e-12, -1.02362563e-13, -1.12798659e-13, -8.31557045e-14,\n", + " -1.26343380e-13, -1.26343380e-13, -9.31477118e-14, -9.31477118e-14,\n", + " 5.65991698e-13, -1.12909682e-13, 5.81756865e-14, -6.90558721e-14,\n", + " 
5.17141885e-13, -1.15241150e-13, -3.32289751e-13, -1.55431223e-13,\n", + " -3.33177930e-13, -7.32747196e-14, -1.53210777e-13, -2.20934382e-13,\n", + " 5.41122702e-13, -7.88258347e-14, -1.08357767e-13, 3.73034936e-13,\n", + " -1.61870517e-13, -2.10720330e-13, -1.11577414e-13, 1.06137321e-13,\n", + " 7.18092252e-13, -2.16826557e-13, -1.96176408e-13, -1.94289029e-14,\n", + " 5.29576383e-13, 5.44231327e-13, 7.85815857e-13, 7.86259946e-13,\n", + " 5.29798427e-13, 1.77857729e-13, 1.84563476e-12, 2.95319325e-13,\n", + " 1.70974346e-12, 1.01563202e-12, 9.39026634e-13, 3.55271368e-15,\n", + " 1.93178806e-13, 1.67421632e-13, 4.48752147e-13, 1.11022302e-13,\n", + " 1.40487622e-12, 5.26911847e-13, 6.88338275e-14, 1.00586206e-13,\n", + " 2.87547763e-13, 1.34336986e-13, 7.30526750e-13, 3.91464638e-13,\n", + " 1.56761799e-04, 2.69784195e-13, 2.46469511e-13, 1.66405849e-04,\n", + " 1.58040050e-04, 2.77333712e-13, 3.08864045e-13, 1.92956762e-13,\n", + " 1.82164776e-04, 1.60346254e-04, -3.75195624e-04, 6.79012402e-13,\n", + " 1.57798215e-04, -2.95809327e-04, -3.33431024e-04, 6.43923737e-03,\n", + " 1.54669151e-04, -3.04159366e-04, 1.56511402e-04, 1.89848137e-13,\n", + " 1.55735689e-04, -2.70298576e-04, -5.93150343e-06, -5.30967388e-06,\n", + " 4.09087888e-04, 4.07393126e-04, 1.70752301e-13, -5.79448265e-06,\n", + " 4.07393126e-04, 4.07393126e-04, 4.07393126e-04, 4.07393126e-04,\n", + " 1.30118138e-13, 9.82103288e-13, 1.80522264e-13, -5.32003883e-06,\n", + " 9.96536187e-13, -5.26649721e-06, 2.24709140e-13, 2.70457805e-04,\n", + " 3.94795308e-13, 2.55833616e-02, 2.01616501e-13, -7.19974911e-06,\n", + " -4.16309377e-04, 2.26485497e-13, 1.36335387e-13, -1.56394049e-04,\n", + " -1.60504680e-04, 1.44551038e-13, 3.40172335e-13, 3.41948692e-13,\n", + " -6.06908744e-06, -1.60557147e-04, 9.34519349e-03, 3.09530179e-13,\n", + " 1.63424829e-13, -2.24538171e-04, 5.22248911e-13, -6.35571313e-04,\n", + " 4.31520099e-03, -1.72894609e-04, -1.73668432e-04, 4.70068429e-13,\n", + " 2.10026490e-02, 
7.59392549e-13, 1.70530257e-13, 5.21804822e-13,\n", + " 3.11006829e-04, 3.11648281e-04, 3.11801549e-04, 3.29088527e-04,\n", + " 3.30327983e-04, 9.14823772e-14, 2.27151631e-13, 2.56017429e-13,\n", + " 1.31450406e-13, 1.53210777e-13, 1.17873899e-05, 3.31071287e-04,\n", + " 2.02060590e-12, 6.01740879e-13, -5.12563388e-04, 1.92068583e-13,\n", + " -5.09268541e-04, 1.82076576e-13, 2.81108470e-13, 4.03899136e-13,\n", + " 2.44915199e-13, 2.04725126e-13, 1.65201186e-13, 1.01008091e-12,\n", + " 7.07878201e-13, 4.12336831e-13, -3.49806685e-04, 2.26929586e-13,\n", + " 9.29034627e-13, 4.90496532e-13, -3.37636089e-04, 3.67039732e-13,\n", + " 8.68860539e-13, 1.28319577e-12, 1.28341782e-13, -3.15081632e-04,\n", + " -3.19684248e-04, 2.52020627e-13, 1.59494640e-12, 1.46549439e-13,\n", + " 4.73399098e-13, 1.38620889e-02, -3.05438289e-04, -3.01573389e-04,\n", + " -2.90886979e-04, -3.03033530e-04, -2.91426910e-04, 6.49596881e-05,\n", + " -2.87462398e-04, 1.91446858e-12, 1.39372554e-04, -2.83187516e-04,\n", + " 6.25721697e-13, 1.22379275e-04, 2.41584530e-13, 1.40776280e-13,\n", + " 1.62048153e-12, -2.77279167e-04, 1.76725301e-12, 1.20264974e-04,\n", + " 4.20552482e-13, 1.44328993e-13, 1.19077456e-04, 1.18444812e-04,\n", + " 1.17944021e-04, 2.42117437e-12, -1.16573418e-14, -3.04312131e-13,\n", + " 1.18190262e-04, 1.18190262e-04, 1.18186866e-04, 1.18186866e-04,\n", + " 1.26787469e-13, 1.36677821e-04, -2.38211696e-04, 1.59650071e-13,\n", + " 1.14530607e-12, 8.85291840e-13, 2.47801779e-13, -2.49588575e-04,\n", + " -2.50087996e-04, -2.55130443e-04, 1.33448808e-13, 1.86295424e-13,\n", + " -2.62012944e-04, -2.68561161e-04, -2.62749009e-04, -3.02303230e-04,\n", + " 1.00364161e-13, -3.37650827e-04, 1.50029701e-02, 1.08113518e-12,\n", + " 1.47952174e-02, -2.90580271e-04, 1.00053299e-12, -2.91386478e-04,\n", + " 1.34847689e-12, 5.00932629e-13, -2.95171897e-04, 6.53255228e-13,\n", + " -3.02667832e-04, -2.98851904e-04, 7.90034704e-13, 4.60520511e-13,\n", + " 8.42437231e-13, 5.03597164e-13, 
-3.15720643e-04, 3.31956684e-13,\n", + " 5.34683409e-13, -3.06075277e-04, -3.06075277e-04, 1.41664458e-13,\n", + " 3.04645198e-13, 7.26085858e-14, 5.73319170e-13, 3.71258579e-13,\n", + " 1.19015908e-13, 3.38173933e-13, 1.11022302e-13, 5.12923037e-14,\n", + " 1.06670228e-12, 1.15463195e-14, 9.37694367e-13, -2.98649994e-14,\n", + " -2.78665979e-14, 4.21440660e-13, -2.26485497e-14, 5.71764858e-13,\n", + " -2.43138842e-14, 2.72448730e-13, 6.44595488e-13, 3.44169138e-13,\n", + " 8.35997938e-13, -4.45199433e-14, -6.62803146e-14, 3.33288952e-13,\n", + " 4.50528503e-13, -6.94999613e-14, -5.57331958e-14, -4.11892742e-14,\n", + " 9.54125667e-13, 5.40456568e-13, -8.16013923e-14, -9.27036226e-14,\n", + " 9.61453139e-14, 7.74047493e-13, -1.30562228e-13, -9.54791801e-14,\n", + " 4.21884749e-15, 1.77635684e-14, -2.06501483e-14, -2.59792188e-14,\n", + " -3.81916720e-14, -3.16413562e-14, 6.26165786e-13, 9.61453139e-14,\n", + " 2.50910404e-14, 5.21804822e-15, 1.06559206e-12, -4.18554080e-14,\n", + " 4.15223411e-13, 4.17887946e-13, 2.96873637e-13, 5.50892665e-13,\n", + " 1.03472786e-12, 9.06164033e-13, 4.37649916e-13, 1.73638881e-13,\n", + " 1.11466392e-12, 3.01758618e-13, 2.29816166e-13, -2.22044605e-15,\n", + " -2.88657986e-15, 7.31414929e-13, -2.57571742e-14, -3.37507799e-14,\n", + " -1.86517468e-14, 9.34807787e-14, 2.33146835e-14, 1.27675648e-14,\n", + " 7.54951657e-14, 4.63185046e-13, 9.41469125e-14, 8.59312621e-14,\n", + " 5.26245714e-14, 8.12683254e-14, -5.19584376e-14, -1.96509475e-14,\n", + " -7.96029909e-14, 1.24589228e-12, 3.65263375e-13, 4.01012556e-13,\n", + " 4.09894341e-13, -5.41788836e-14, 9.21485110e-14, -2.18047802e-13,\n", + " -4.78506124e-14, 3.14193116e-13, -1.68753900e-13, 2.09943174e-12,\n", + " -1.72306613e-13, 9.87210313e-13, 3.15747428e-13, -2.51243470e-13,\n", + " -1.96620498e-13, 2.12030393e-12, 7.80930876e-13, -3.35287353e-14,\n", + " -3.73034936e-14, -7.16093851e-14, 1.05560005e-12, -2.33146835e-15,\n", + " 5.35127498e-13, 4.10338430e-13, 
-2.55351296e-14, -1.18016708e-13,\n", + " -1.08912879e-13, -1.32893696e-13, 2.20135021e-12, -2.63122857e-14,\n", + " -3.04201109e-14, 6.73905376e-13, 1.53210777e-12, -3.90798505e-14,\n", + " 1.89404048e-13, -2.84217094e-14, 4.90940621e-13, -1.55431223e-15,\n", + " 5.88196158e-13, -6.69464484e-14, 4.00568467e-13, -3.74145159e-14,\n", + " 1.57096558e-12, -9.04831765e-14, -3.54161145e-14, 3.23518989e-13,\n", + " -9.40358902e-14, -5.65103520e-14, 1.43218770e-14, -5.08482145e-14,\n", + " 4.03010958e-13, 1.71951342e-12, -4.99600361e-14, 7.59392549e-13,\n", + " 3.19522186e-13, 7.03215264e-13, 3.57491814e-14, 2.75557355e-13,\n", + " 1.69819714e-12, 2.54463117e-13, 4.22106794e-13, 2.44693155e-13,\n", + " 1.29674049e-13, 8.52207194e-13, 1.23456800e-13, 3.59934305e-13,\n", + " 1.76103576e-12, -7.46069873e-14, 7.31859018e-13, -4.46309656e-14,\n", + " -4.98490138e-14, -4.76285678e-14, 1.48969725e-12, 5.50670620e-14,\n", + " 1.70974346e-13, -5.25135491e-14, -8.95949981e-14, 1.71196390e-13,\n", + " -8.58202398e-14, 2.91322522e-13, 7.87148124e-13, 1.45439216e-14,\n", + " 3.69038133e-13, -4.09672296e-14, -1.33226763e-14, -3.20854454e-14,\n", + " 2.87769808e-13, 7.26529947e-13, 1.87627691e-14, -2.17603713e-14,\n", + " 6.86339874e-13, 6.21724894e-14, 1.28275168e-12, 5.12923037e-14,\n", + " 2.65121258e-13, -8.32667268e-15, 3.20410365e-13, 3.34177130e-13,\n", + " 1.66533454e-14, 7.10542736e-15, 6.88338275e-15, -1.27675648e-14,\n", + " 1.15396581e-12, 7.19424520e-14, 1.11999299e-12, 9.21485110e-14,\n", + " 2.31370478e-13, 1.75015558e-12, 4.43423076e-13, 1.05648823e-12,\n", + " 1.61270997e-12, 1.94066985e-13, 1.09690035e-13, 3.71036535e-13,\n", + " 9.06830167e-13, 2.65343303e-13, 2.26263452e-13, 1.51434421e-13,\n", + " 2.95097280e-13, 4.04787315e-13, 1.21658239e-12, 1.68753900e-13,\n", + " 4.18776125e-13, 1.00786046e-12, 1.02917674e-12, 4.97379915e-13,\n", + " 1.37667655e-13, -8.93729535e-14, -2.68673972e-14, 6.99440506e-14,\n", + " 9.57900426e-13, -9.35918010e-14, -1.14352972e-14, 
-1.74305015e-14,\n", + " -1.28341782e-13, -5.66213743e-14, 3.70814490e-14, -1.52211577e-13,\n", + " -9.52571355e-14, 8.90176821e-13, -1.35558231e-13, 3.69926312e-13,\n", + " 5.91082738e-13, 8.71080985e-13, -1.44328993e-14, -9.37028233e-14,\n", + " 1.00142117e-13, 1.82587279e-12, -3.41948692e-14, -6.78346268e-14,\n", + " 7.98472399e-13, 1.94422256e-12, 8.75743922e-13, -7.72715225e-14,\n", + " -4.92939023e-14, 1.18238752e-12, -1.66977543e-13, -1.72750703e-13,\n", + " -1.84297022e-13, -1.36113343e-13, 1.28652644e-12, -1.73083770e-13,\n", + " -9.25926003e-14, 4.66293670e-14, -5.28466160e-14, 9.00612918e-13,\n", + " 5.26245714e-14, 4.75175455e-14, 5.24025268e-14, 1.82187598e-12,\n", + " 8.82405260e-13, 8.59312621e-14, 8.12683254e-14, 4.57411886e-14,\n", + " 1.14197540e-12, 2.68673972e-13, 1.59205982e-13, 5.24025268e-14,\n", + " 3.93018951e-14, 2.10942375e-14, 4.72955008e-14, 8.59312621e-14,\n", + " 3.08642001e-14, 6.41708908e-14, 7.94697641e-13, 6.04183370e-13,\n", + " 8.63753513e-13, 1.02851061e-12, 1.69420034e-13, 1.49658064e-13,\n", + " 2.25597319e-13, 9.03721542e-14, 1.03472786e-13, 1.28563826e-13,\n", + " 7.14095449e-13, 1.77191595e-13, 1.33670852e-13, 8.52429238e-13,\n", + " 1.37445610e-13, 3.09086090e-13, 5.19140286e-13, 1.45838897e-12,\n", + " 5.83977311e-14, 3.57491814e-14, 7.65609798e-13, 8.10018719e-13,\n", + " 6.41264819e-13, 4.50750548e-14, 8.82183215e-13, 4.66293670e-15,\n", + " 6.99440506e-13, 6.29274410e-13, 5.93969318e-13, 6.64801547e-13,\n", + " -4.90718577e-14, 3.17967874e-13, -1.48436818e-13, 7.86481991e-13,\n", + " -6.86117829e-14, -1.45994328e-13, -1.23900890e-13, -1.62647673e-13,\n", + " 6.07736084e-13, 5.29354338e-13, -1.01696429e-13, -6.72795153e-14,\n", + " 1.68753900e-14, -5.54001289e-14, -7.28306304e-14, -5.70654635e-14,\n", + " 5.81756865e-14, 7.53619389e-13, -8.97060204e-14, -6.83897383e-14,\n", + " -8.89288643e-14, 6.91890989e-13, 8.86180018e-13, 2.50466314e-13,\n", + " 9.17044218e-13, -5.76205750e-14, 3.83915122e-13, 9.21485110e-14,\n", + 
" -1.42885703e-13, -1.24344979e-14, -5.57331958e-14, -6.57252031e-14,\n", + " 2.10942375e-15, 7.87370169e-13, 4.75175455e-14, 4.58744154e-13,\n", + " 9.64561764e-13, 1.25899291e-13, 1.66977543e-13, 3.88356014e-13,\n", + " 1.23012711e-13, 9.45910017e-14, 2.44249065e-13, 1.00230935e-12,\n", + " 4.90940621e-13, 3.52606833e-13, 1.15907284e-13, 9.35918010e-13,\n", + " 5.37347944e-14, 8.06021916e-14, 8.74855743e-14, 6.37268016e-14,\n", + " 8.99058605e-13, 6.68354261e-14, 2.54907206e-13, 2.37587727e-14,\n", + " 1.73194792e-14, 6.63691324e-13, 1.69708692e-12, -1.33226763e-14,\n", + " -6.40598685e-14, 7.32747196e-15, -4.81836793e-14, 2.70006240e-13,\n", + " 7.53175300e-13, 4.79616347e-14, -2.12052598e-14, 6.06181771e-14,\n", + " 1.61870517e-13, -2.91988655e-14, -2.04281037e-14, -7.54951657e-14,\n", + " 7.10542736e-15, 1.86295424e-13, -7.27196081e-14, -1.99840144e-14,\n", + " 6.23945340e-14, 1.38999923e-13, -6.61692923e-14, 9.08162434e-14,\n", + " -4.79616347e-14, 1.23234756e-12, 8.19566637e-13, -1.23345778e-13,\n", + " 5.12256904e-13, -2.04281037e-14, 3.96349620e-13, -4.66293670e-14,\n", + " 2.58904009e-13, 1.16529009e-12, -1.00697228e-13, 7.03437308e-13,\n", + " -7.00550729e-14, -2.50910404e-14, -1.72084569e-14, 3.75255382e-14,\n", + " 1.65423231e-13, 2.44027021e-13, 8.50430837e-14, 7.50066675e-13,\n", + " -5.49560397e-14, 3.40172335e-13, -3.81916720e-14, -2.00950367e-14,\n", + " -7.38298311e-14, 2.48023824e-13, 3.59046126e-13, -3.70814490e-14,\n", + " 2.84217094e-13, 7.37410133e-13, 6.21724894e-14, 3.55493412e-13,\n", + " 5.50670620e-14, 1.39666056e-13, 9.16156040e-13, 2.96429548e-14,\n", + " 3.26405569e-14, 1.44328993e-13, 4.82280882e-13, 3.41948692e-14,\n", + " 3.35953487e-13, 3.03090886e-13, 2.90878432e-14, 4.26325641e-14,\n", + " 7.34967642e-14, 2.53130850e-13, 3.28181926e-13, 4.13225010e-13,\n", + " 4.92939023e-14, 2.52908805e-13, 7.19424520e-14, 9.05941988e-14,\n", + " 1.37667655e-13, 4.52082816e-13, 3.84359211e-13, -2.89768209e-14,\n", + " 3.67927910e-13, 
3.20188320e-13, 4.28101998e-13, 4.29212221e-13,\n", + " 5.54889468e-13, 1.07247544e-13, 2.40030218e-13, 7.03881398e-14,\n", + " 1.15685239e-13, 1.04583009e-13, 1.22790667e-13, 1.62092562e-13,\n", + " 8.41771097e-13, 1.69420034e-13, 5.42677014e-13, 4.29434266e-13,\n", + " 1.62980740e-13, 6.54143406e-13, 2.06501483e-13, 1.94511074e-13,\n", + " 8.22231172e-13, 5.83311177e-13, 9.64339719e-13, 8.65529870e-13,\n", + " 1.13686838e-13, 1.48547841e-13, 8.86180018e-13, 2.07389661e-13,\n", + " 7.99360578e-15, 6.17284002e-14, -2.02060590e-14, 1.62536651e-13,\n", + " 9.51239087e-13, 1.35313982e-12, 1.22590826e-12, 8.39328607e-14,\n", + " 1.18216548e-12, -2.07611706e-14, -1.81521465e-13, -1.28785871e-13,\n", + " 4.79838391e-13, 9.35029831e-13, -2.08055795e-13, -1.71751502e-13,\n", + " 1.42108547e-13, -1.88848936e-13, 2.00683914e-12, -1.43440815e-13,\n", + " 4.88942220e-13, -2.53019827e-13, -1.59205982e-13, -2.01838546e-13,\n", + " -7.30526750e-14, 3.79696274e-13, -2.36921593e-13, 4.46975790e-13,\n", + " 2.53552734e-12, -1.84297022e-13, -5.07371922e-14, 7.41851025e-13,\n", + " -8.01581024e-14, -7.90478794e-14, 3.17457172e-12, -2.64233080e-14,\n", + " -7.54951657e-15, 3.68594044e-14, 1.44551038e-13, 4.92939023e-14,\n", + " 1.26343380e-13, -2.76445533e-14, 6.29274410e-13, 1.84074977e-13,\n", + " 7.41406936e-13, 4.39648318e-13, 1.61648472e-13, 1.31827882e-12,\n", + " 9.06830167e-13, 3.14637205e-13, 2.80220291e-13, 2.40918396e-13,\n", + " 6.31938946e-13, 3.30846461e-13, 1.13908882e-13, 2.08277839e-13,\n", + " 1.51079149e-12, 2.36033415e-13, 4.70956607e-13, 1.88515870e-13,\n", + " 2.24043006e-13, 2.07345252e-12, 1.56763491e-13, 1.80744308e-13,\n", + " 1.13020704e-13, 7.68274333e-14, 5.41788836e-14, 1.73527859e-12,\n", + " 3.26405569e-14, 6.63913369e-14, 2.86437540e-14, 5.05595565e-13,\n", + " 7.92699240e-14, 9.11715148e-13, 6.88338275e-14, 3.12194715e-13,\n", + " 6.17284002e-14, 4.95159469e-13, 2.02060590e-14, 1.04782849e-12,\n", + " -1.78745907e-14, 1.04360964e-14, 1.04494191e-12, 
1.05138120e-12,\n", + " 1.11999299e-12, 3.80584453e-13, -1.20459198e-13, 4.38316050e-13,\n", + " 8.99280650e-14, 1.38777878e-13, -6.32827124e-14, -9.10382880e-15,\n", + " 1.48991930e-13, 1.48547841e-13, 4.44311254e-13, 1.44328993e-14,\n", + " 2.06279438e-13, 1.79856130e-14, 6.28386232e-14, 1.05693232e-13,\n", + " 4.33653113e-13, 9.89874849e-13, 6.03517236e-13, 4.05897538e-13,\n", + " 6.14397422e-13, 3.30624417e-13, 9.05275854e-13, 3.03090886e-13,\n", + " 7.83817455e-13, 4.95603558e-13, 6.00186567e-13, 3.87911925e-13,\n", + " 8.27782287e-13, 9.02389274e-13, 8.79740725e-13, 3.63487018e-13,\n", + " 8.07354184e-13, 5.88418203e-13, 2.70672373e-13, 6.51922960e-13,\n", + " 7.19424520e-13, 4.84057239e-13, 3.40394379e-13, 1.08801856e-13,\n", + " 6.52811138e-14, -5.98410210e-14, 4.94715380e-13, 3.73923115e-13,\n", + " -1.33337785e-13, 9.99200722e-13, 3.03090886e-14, 4.09006162e-13,\n", + " 1.14308563e-12, -2.00617301e-13, -1.39555034e-13, -5.17363929e-14,\n", + " -1.10356169e-13, -2.79998247e-13, -1.82964754e-13, -1.57762692e-13,\n", + " -1.05027098e-13, -2.20601315e-13, 1.17461596e-13, -1.93511873e-13,\n", + " 7.45847828e-13, 4.78728168e-13, -1.76192394e-13, -1.79856130e-13,\n", + " 4.99822406e-13, -1.25677246e-13, -1.25677246e-13, 8.32667268e-15,\n", + " 5.61772850e-13, 1.22346577e-13, 4.88720175e-13, 2.11164419e-13,\n", + " 7.60724816e-13, 4.28768132e-13, 1.58983937e-13, 1.90514271e-13,\n", + " 3.43947093e-13, 1.44151358e-12, 1.04583009e-13, 6.80344669e-13,\n", + " 2.16493490e-13, 3.59712260e-14, 1.49347201e-12, 2.22488694e-13,\n", + " 1.38178358e-12, 1.08801856e-14, 1.11022302e-15, 3.45501405e-13,\n", + " 2.18269847e-13, 4.09894341e-13, 1.48547841e-13, 3.16857651e-13,\n", + " 1.25144339e-12, 8.00470801e-13, 9.14823772e-14, 7.90478794e-14,\n", + " 4.57411886e-14, 6.92779167e-14, 1.46771484e-13, 2.04725126e-13,\n", + " 5.47339951e-13, 1.23234756e-13, 5.10702591e-13, 6.63913369e-14,\n", + " 2.73114864e-13, 7.13651360e-13, 2.00950367e-14, 1.41420209e-12,\n", + " 
1.36135547e-12, 1.72972747e-13, 1.56896718e-12, 1.88737914e-14,\n", + " 1.96287431e-13, 2.83773005e-13, 2.08943973e-13, 1.38933309e-12,\n", + " 8.34887715e-14, 9.01501096e-14, 2.85105273e-13, 1.82520665e-13,\n", + " 4.42756942e-13, 6.43929354e-14, 5.73097125e-13, 7.37854222e-13,\n", + " 6.11510842e-13, 3.34621220e-13, 4.66293670e-15, 9.99200722e-15,\n", + " 1.15241150e-13, 4.81170659e-13, 2.04281037e-14, 5.53779245e-13,\n", + " -1.13242749e-14, 1.11022302e-13, 3.55271368e-14, 1.39888101e-13,\n", + " 1.73372428e-12, 7.27862215e-13, -4.58522109e-14, 2.66475730e-12,\n", + " -1.11466392e-13, -1.00697228e-13, -1.11133325e-13, -8.84847751e-14,\n", + " 4.67403893e-13, -1.42441614e-13, 2.68740585e-12, -1.99840144e-14,\n", + " 1.22790667e-13, -7.39408534e-14, 1.23900890e-13, 1.26876287e-12,\n", + " 8.65973959e-13, 7.48512363e-13, 7.79376563e-14, -1.61537450e-13,\n", + " 3.44169138e-14, -1.08135723e-13, -3.49831275e-13, -1.39999123e-13,\n", + " 4.22550883e-13, -2.65343303e-13, -2.37920794e-13, -1.22013510e-13,\n", + " -1.39555034e-13, 2.35012010e-12, -1.91957561e-13, 1.01252340e-13,\n", + " 7.72271136e-13, -5.12923037e-14, 1.06137321e-12, 7.55395746e-13,\n", + " 2.77555756e-15, 8.32001135e-13, 1.55520041e-12, 1.22124533e-13,\n", + " 1.03939080e-12, 3.17523785e-14, 2.82662782e-13, 1.03916875e-13,\n", + " 1.14552812e-12, 1.16573418e-12, 6.48148202e-13, 2.19824159e-13,\n", + " 4.03632683e-12, 4.03677092e-13, 4.66826577e-12, 7.95807864e-13,\n", + " 3.24407168e-13, 3.04645198e-13, 3.57047725e-13, 7.15871806e-13,\n", + " 8.36664071e-13, 1.99174011e-12, 5.36015676e-12, 7.07878201e-13])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "k = 20\n", + "batch_sizes = [20,50,100,200,400,1000]\n", + "\n", + "time_river = []\n", + "time_scikit = []\n", + "for batch_size in batch_sizes:\n", + " #River\n", + " start_time_r = time.time()\n", + " ilof_river2 = ilof.ILOF(k, verbose=False)\n", + " 
ilof_river2.learn_many(dataset[0:batch_size])\n", + " time_river.append(time.time() - start_time_r)\n", + " ilof_scores_river2 = np.array([v for v in ilof_river2.lof.values()])\n", + "\n", + " #Scikit\n", + " start_time_s = time.time()\n", + " lof_scikit2 = LocalOutlierFactor(n_neighbors=k)\n", + " lof_scikit2.fit_predict(dataset_np[0:batch_size])\n", + " time_scikit.append(time.time() - start_time_s)\n", + " lof_scores_scikit2 = - lof_scikit2.negative_outlier_factor_\n", + "\n", + "#Compare\n", + "print('We observe again that the error is of machine precision level: \\\n", + " (the few errors at the scale of e-05 are because of the river minkowski-distance function error, \\\n", + " I submitted a correction request, for details see bottom of this document ')\n", + "ilof_scores_river2 - lof_scores_scikit2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "# Plotting the bar graph\n", + "plt.bar(np.array(range(len(time_river)))*1.5+0.25, time_river, width=0.4, label='River ILOF')\n", + "plt.bar(np.array(range(len(time_river)))*1.5-0.25, time_scikit, width=0.4, label='Scikit LOF')\n", + "plt.xticks(np.array(range(len(time_river)))*1.5, batch_sizes)\n", + "plt.xlabel('Batch size')\n", + "plt.ylabel('Execution Time (seconds)')\n", + "plt.title('Mini-batch mode Time Comparison')\n", + "plt.yscale('log')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RiverILOF time: [5.1155877113342285] ; ScikitLOF time: [0.22044944763183594] \n", + " We observe that RiverILOF gets slower as the number of points learned increases, since it calculates the distance of the new point to all other ones each iteration\n" + ] + } + ], + "source": [ + "#Time to add new points on existing model with 1000 points\n", + "#River\n", + "time_r = []\n", + "start_t_r = time.time()\n", + "for x in dataset[0][1200:1300]:\n", + " ilof_river2.learn_one(x)\n", + "time_r.append(time.time() - start_t_r)\n", + "\n", + "#Scikit\n", + "time_s = []\n", + "start_t_s = time.time()\n", + "lof_scikit2.novelty = True\n", + "lof_scores_scikit2 = lof_scikit2.score_samples(dataset_np[1200:1300])\n", + "time_s.append(time.time() - start_t_s)\n", + "\n", + "print('RiverILOF time:', time_r,'; ScikitLOF time:', time_s, '\\n We observe that RiverILOF gets slower as the number of points learned increases,\\\n", + "since it calculates the distance of the new point to all other ones each iteration.')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13.974262055650739 195.28000000000003\n" + ] + } + 
], + "source": [ + "#River Minkowski_distance error\n", + "\n", + "import functools\n", + "from river import utils\n", + "#from river.neighbors.base import DistanceFunc\n", + "from river.utils import VectorDict\n", + "\n", + "distancefunc = functools.partial(utils.math.minkowski_distance, p=2)\n", + "a={1: 1.5, 2: 3.5}\n", + "b={1: -0.7, 2: -10.3}\n", + "print(((1.5+0.7)**2+(3.5+10.3)**2)**(1/2), distancefunc(a,b))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From fc3ee3e777078c36b45ecd4afb0280167c1925b3 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Sat, 26 Aug 2023 12:20:22 +0800 Subject: [PATCH 02/43] Update ilof_notebook.ipynb --- {river/anomaly => docs/examples}/ilof_notebook.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {river/anomaly => docs/examples}/ilof_notebook.ipynb (100%) diff --git a/river/anomaly/ilof_notebook.ipynb b/docs/examples/ilof_notebook.ipynb similarity index 100% rename from river/anomaly/ilof_notebook.ipynb rename to docs/examples/ilof_notebook.ipynb From e4cb46e0247a449271685bc8720549310517263e Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:02:00 +0700 Subject: [PATCH 03/43] Modify name IncrementalLOF in __init__ file of anomaly module --- river/anomaly/__init__.py | 4 ++-- river/anomaly/ilof.py | 44 +++++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 20 
deletions(-) diff --git a/river/anomaly/__init__.py b/river/anomaly/__init__.py index 60ff843329..e1d5be039b 100644 --- a/river/anomaly/__init__.py +++ b/river/anomaly/__init__.py @@ -17,7 +17,7 @@ from .filter import QuantileFilter, ThresholdFilter from .gaussian import GaussianScorer from .hst import HalfSpaceTrees -from .ilof import ILOF +from .ilof import IncrementalLOF from .svm import OneClassSVM __all__ = [ @@ -28,5 +28,5 @@ "OneClassSVM", "QuantileFilter", "ThresholdFilter", - "ILOF", + "IncrementalLOF", ] diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index eca4a1008b..782547ab94 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -9,10 +9,10 @@ from river.utils import VectorDict -class ILOF(anomaly.base.AnomalyDetector): - """Incremental Local Outlier Factor (ILOF). +class IncrementalLOF(anomaly.base.AnomalyDetector): + """Incremental Local Outlier Factor (Incremental LOF). - ILOF Algorithm as described in the reference paper + Incremental LOF Algorithm as described in the reference paper ---------- The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF) used to identify outliers based on density of local neighbors. 
@@ -71,19 +71,27 @@ class ILOF(anomaly.base.AnomalyDetector): Example ---------- - from river import datasets - import pandas as pd - import ilof as ilof - dataset = pd.DataFrame(datasets.CreditCard()) - #Define model - k = 20 #k-neighboors - ilof_river = ilof.ILOF(k, verbose=False) - ilof_river.learn_many(dataset[0:30]) - for i in dataset[0][40:90]: - ilof_river.learn_one(i) - lof_score = [] - for x in dataset[0][100:120]: - lof_score.append(ilof_river.score_one(x)) + + >>> from river import anomaly + >>> from river import datasets + >>> import pandas as pd + + >>> dataset = pd.DataFrame(datasets.CreditCard()) + + >>> k = 20 # Define number of nearest neighbors + >>> incremental_lof = anomaly.IncrementalLOF(k, verbose=False) + + >>> incremental_lof.learn_many(dataset[0:50]) # learn_many for the first 30 observations + + >>> for i in dataset[0][50:100]: + ... incremental_lof.learn_one(i) + + >>> ilof_scores = [] + >>> for x in dataset[0][101:120]: + ... ilof_scores.append(incremental_lof.score_one(x)) + + >>> [[round(ilof_score, 3) for ilof_score in ilof_scores[:5][i]] for i in range(5)] + [[1.207], [1.278], [1.721], [1.271], [1.167]] References ---------- @@ -310,7 +318,7 @@ def initial_calculations( dist_dict: dict, ): """ - Perform initial calculations on the incoming data before applying the ILOF algorithm. + Perform initial calculations on the incoming data before applying the Incremental LOF algorithm. Taking the new data, it updates the neighborhoods, reverse neighborhoods, k-distances and distances between particles. 
Parameters @@ -415,7 +423,7 @@ def expand_objects( ) def define_sets(self, nm, neighborhoods: dict, rev_neighborhoods: dict): - """Define sets of points for the ILOF algorithm""" + """Define sets of points for the incremental LOF algorithm""" # Define set of new points from batch Set_new_points = set(range(nm[0], nm[0] + nm[1])) Set_neighbors: set = set() From 30caddf7956eaccb68e6d6ce72e7fe84c2fc9f96 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:02:52 +0700 Subject: [PATCH 04/43] Refactor code after precommit run --- river/linear_model/test_glm.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/river/linear_model/test_glm.py b/river/linear_model/test_glm.py index 2d95f2ce73..64a3ed3c37 100644 --- a/river/linear_model/test_glm.py +++ b/river/linear_model/test_glm.py @@ -420,10 +420,7 @@ def test_lin_reg_sklearn_l1_non_regression(): def test_log_reg_sklearn_l1_non_regression(): """Checks that the river L1 implementation results are no worse than sklearn L1.""" - ( - X, - y, - ) = make_classification( + (X, y,) = make_classification( n_samples=1000, n_features=20, n_informative=4, From 56c17e8e0dd1c2b599250783d60347d3da2ea2c1 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:57:59 +0700 Subject: [PATCH 05/43] Remove window_score in score_one function of Incremental LOF (since score_one already assumes that one sample is taken into account at a certain time point). 
--- river/anomaly/ilof.py | 137 +++++++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 70 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 782547ab94..a29868a864 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -225,9 +225,9 @@ def learn(self, X_batch: list): # Calculate new Local Outlier Factor of all affected points self.lof = self.calc_lof(Set_upd_lof, self.neighborhoods, self.local_reach, self.lof) - def score_one(self, x: VectorDict, window_score=1): + def score_one(self, x: VectorDict): """ - Score incoming observations based on model constructed previously. + Score an incoming observation based on model constructed previously. Perform same calculations as 'learn_one' function but doesn't add the new calculations to the atributes Data samples that are equal to samples stored by the model are not considered. @@ -235,8 +235,6 @@ def score_one(self, x: VectorDict, window_score=1): ---------- x A dictionary of feature values. 
- window_score - The size of the batch of data to be taken in at once for the model to score Returns ------- @@ -246,67 +244,66 @@ def score_one(self, x: VectorDict, window_score=1): self.X_score.append(x) - if len(self.X_score) >= window_score: - self.X_score, equal = self.check_equal(self.X_score, self.X) - if equal != 0 and self.verbose: - print("%i samples are equal to previous data" % equal) - - if len(self.X_score) == 0: - if self.verbose: - print("No new data was added") - else: - Xs = self.X.copy() - ( - nm, - Xs, - neighborhoods, - rev_neighborhoods, - k_dist, - reach_dist, - dist_dict, - local_reach, - lof, - ) = self.expand_objects( - self.X_score, - Xs, - self.neighborhoods, - self.rev_neighborhoods, - self.k_dist, - self.reach_dist, - self.dist_dict, - self.local_reach, - self.lof, - ) - - neighborhoods, rev_neighborhoods, k_dist, dist_dict = self.initial_calculations( - Xs, nm, neighborhoods, rev_neighborhoods, k_dist, dist_dict - ) - ( - Set_new_points, - Set_neighbors, - Set_rev_neighbors, - Set_upd_lrd, - Set_upd_lof, - ) = self.define_sets(nm, neighborhoods, rev_neighborhoods) - reach_dist = self.calc_reach_dist_newpoints( - Set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist - ) - reach_dist = self.calc_reach_dist_otherpoints( - Set_rev_neighbors, - neighborhoods, - rev_neighborhoods, - reach_dist, - dist_dict, - k_dist, - ) - local_reach = self.calc_local_reach_dist( - Set_upd_lrd, neighborhoods, reach_dist, local_reach - ) - lof = self.calc_lof(Set_upd_lof, neighborhoods, local_reach, lof) - self.X_score = [] - - score_keys = list(range(nm[0], nm[0] + nm[1])) - return [lof[i] for i in score_keys] + self.X_score, equal = self.check_equal(self.X_score, self.X) + if equal != 0 and self.verbose: + print("The new observation is the same to one of the previously observed instances.") + + if len(self.X_score) == 0: + if self.verbose: + print("No new data was added.") + else: + Xs = self.X.copy() + ( + nm, + Xs, + 
neighborhoods, + rev_neighborhoods, + k_dist, + reach_dist, + dist_dict, + local_reach, + lof, + ) = self.expand_objects( + self.X_score, + Xs, + self.neighborhoods, + self.rev_neighborhoods, + self.k_dist, + self.reach_dist, + self.dist_dict, + self.local_reach, + self.lof, + ) + + neighborhoods, rev_neighborhoods, k_dist, dist_dict = self.initial_calculations( + Xs, nm, neighborhoods, rev_neighborhoods, k_dist, dist_dict + ) + ( + Set_new_points, + Set_neighbors, + Set_rev_neighbors, + Set_upd_lrd, + Set_upd_lof, + ) = self.define_sets(nm, neighborhoods, rev_neighborhoods) + reach_dist = self.calc_reach_dist_newpoints( + Set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist + ) + reach_dist = self.calc_reach_dist_otherpoints( + Set_rev_neighbors, + neighborhoods, + rev_neighborhoods, + reach_dist, + dist_dict, + k_dist, + ) + local_reach = self.calc_local_reach_dist( + Set_upd_lrd, neighborhoods, reach_dist, local_reach + ) + lof = self.calc_lof(Set_upd_lof, neighborhoods, local_reach, lof) + self.X_score = [] + + score_keys = list(range(nm[0], nm[0] + nm[1])) + return [lof[i] for i in score_keys] def initial_calculations( self, @@ -325,15 +322,15 @@ def initial_calculations( ---------- X A list of stored observations. - nm : tuple of ints, (n, m) + nm A tuple representing the current size of the dataset. - neighborhoods : dict + neighborhoods A dictionary of particle neighborhoods. - rev_neighborhoods : dict + rev_neighborhoods A dictionary of reverse particle neighborhoods. - k_distances : dict + k_distances A dictionary to hold k-distances for each observation. 
- dist_dict : dict of dicts + dist_dict A dictionary of dictionaries storing distances between particles Returns From cb7e45f2606822598ce27fb38b0dec240a0bf44a Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Mon, 28 Aug 2023 17:54:44 +0700 Subject: [PATCH 06/43] Remove learn_many and refactor Docstring test --- river/anomaly/ilof.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index a29868a864..2e184d8fef 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -76,22 +76,20 @@ class IncrementalLOF(anomaly.base.AnomalyDetector): >>> from river import datasets >>> import pandas as pd - >>> dataset = pd.DataFrame(datasets.CreditCard()) + >>> cc_df = pd.DataFrame(datasets.CreditCard()) >>> k = 20 # Define number of nearest neighbors >>> incremental_lof = anomaly.IncrementalLOF(k, verbose=False) - >>> incremental_lof.learn_many(dataset[0:50]) # learn_many for the first 30 observations - - >>> for i in dataset[0][50:100]: - ... incremental_lof.learn_one(i) + >>> for x, _ in datasets.CreditCard().take(200): + ... incremental_lof.learn_one(x) >>> ilof_scores = [] - >>> for x in dataset[0][101:120]: - ... ilof_scores.append(incremental_lof.score_one(x)) + >>> for x in cc_df[0][201:206]: + ... 
ilof_scores.append(incremental_lof.score_one(x)) - >>> [[round(ilof_score, 3) for ilof_score in ilof_scores[:5][i]] for i in range(5)] - [[1.207], [1.278], [1.721], [1.271], [1.167]] + >>> [[round(ilof_score, 3) for ilof_score in ilof_scores[i]] for i in range(len(ilof_scores))] + [[1.149], [1.098], [1.158], [1.101], [1.092]] References ---------- @@ -122,18 +120,6 @@ def __init__( else functools.partial(utils.math.minkowski_distance, p=2) ) - def learn_many(self, X_batch: pd.Series): - """ - Update the model with many incoming observations - - Parameters - ---------- - X_batch - A Panda Series - """ - X_batch = X_batch[0].tolist() - self.learn(X_batch) - def learn_one(self, x: dict): """ Update the model with one incoming observation From ac4ab6254e6f6daa0f508f37066a79fe748aac50 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Mon, 28 Aug 2023 17:55:39 +0700 Subject: [PATCH 07/43] Remove import pandas since unused. --- river/anomaly/ilof.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 2e184d8fef..fe25c76f93 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -2,8 +2,6 @@ import functools -import pandas as pd - from river import anomaly, utils from river.neighbors.base import DistanceFunc from river.utils import VectorDict From ebec94fffbb11b11e832f89e7325dd729688e1f7 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Tue, 5 Sep 2023 17:59:53 +0700 Subject: [PATCH 08/43] Refactor --- river/anomaly/ilof.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index fe25c76f93..ac336eaf80 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -139,7 +139,7 @@ def learn(self, X_batch: list): if len(X_batch) == 0: if self.verbose: - print("No new data was added") + print("No new data was added.") else: # Increase size of 
objects to acomodate new data ( @@ -209,9 +209,9 @@ def learn(self, X_batch: list): # Calculate new Local Outlier Factor of all affected points self.lof = self.calc_lof(Set_upd_lof, self.neighborhoods, self.local_reach, self.lof) - def score_one(self, x: VectorDict): + def score_one(self, x: dict): """ - Score an incoming observation based on model constructed previously. + Score a new incoming observation based on model constructed previously. Perform same calculations as 'learn_one' function but doesn't add the new calculations to the atributes Data samples that are equal to samples stored by the model are not considered. From a3b4093daa217a9d6d1821a14db42b28ef27525b Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Tue, 5 Sep 2023 18:00:44 +0700 Subject: [PATCH 09/43] Change output of `score_one` to only return one single number, not list (to comply with the implementation of other incremental anomaly detection algorithms) --- river/anomaly/ilof.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index ac336eaf80..0c99070c27 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -286,8 +286,7 @@ def score_one(self, x: dict): lof = self.calc_lof(Set_upd_lof, neighborhoods, local_reach, lof) self.X_score = [] - score_keys = list(range(nm[0], nm[0] + nm[1])) - return [lof[i] for i in score_keys] + return lof[nm[0]] def initial_calculations( self, From bc78169de702b3255d65d7d3f6b5e25eb67e3808 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Tue, 5 Sep 2023 18:07:42 +0700 Subject: [PATCH 10/43] Refactor Docstring test --- river/anomaly/ilof.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 0c99070c27..101c6a297f 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -86,8 +86,8 @@ class 
IncrementalLOF(anomaly.base.AnomalyDetector): >>> for x in cc_df[0][201:206]: ... ilof_scores.append(incremental_lof.score_one(x)) - >>> [[round(ilof_score, 3) for ilof_score in ilof_scores[i]] for i in range(len(ilof_scores))] - [[1.149], [1.098], [1.158], [1.101], [1.092]] + >>> [round(ilof_score, 3) for ilof_score in ilof_scores] + [1.149, 1.098, 1.158, 1.101, 1.092] References ---------- From 52242653e59797360855836055fd3a356ece06c8 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Wed, 6 Sep 2023 07:59:12 +0700 Subject: [PATCH 11/43] Refactor from IncrementalLOF to LocalOutlierFactor --- river/anomaly/__init__.py | 4 ++-- river/anomaly/ilof.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/river/anomaly/__init__.py b/river/anomaly/__init__.py index e1d5be039b..74882f55c6 100644 --- a/river/anomaly/__init__.py +++ b/river/anomaly/__init__.py @@ -17,7 +17,7 @@ from .filter import QuantileFilter, ThresholdFilter from .gaussian import GaussianScorer from .hst import HalfSpaceTrees -from .ilof import IncrementalLOF +from .ilof import LocalOutlierFactor from .svm import OneClassSVM __all__ = [ @@ -28,5 +28,5 @@ "OneClassSVM", "QuantileFilter", "ThresholdFilter", - "IncrementalLOF", + "LocalOutlierFactor", ] diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 101c6a297f..b34a4e4005 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -7,7 +7,7 @@ from river.utils import VectorDict -class IncrementalLOF(anomaly.base.AnomalyDetector): +class LocalOutlierFactor(anomaly.base.AnomalyDetector): """Incremental Local Outlier Factor (Incremental LOF). 
Incremental LOF Algorithm as described in the reference paper @@ -77,7 +77,7 @@ class IncrementalLOF(anomaly.base.AnomalyDetector): >>> cc_df = pd.DataFrame(datasets.CreditCard()) >>> k = 20 # Define number of nearest neighbors - >>> incremental_lof = anomaly.IncrementalLOF(k, verbose=False) + >>> incremental_lof = anomaly.LocalOutlierFactor(k, verbose=False) >>> for x, _ in datasets.CreditCard().take(200): ... incremental_lof.learn_one(x) From c64b803ecd4d46e4ce8bdf9964014d4cbc0423a0 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Wed, 6 Sep 2023 08:19:12 +0700 Subject: [PATCH 12/43] Refactor --- river/anomaly/ilof.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index b34a4e4005..3d8be4dd40 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -11,7 +11,6 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): """Incremental Local Outlier Factor (Incremental LOF). Incremental LOF Algorithm as described in the reference paper - ---------- The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF) used to identify outliers based on density of local neighbors. @@ -33,13 +32,13 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): Parameters ---------- - n_neighbors : int + n_neighbors The number of nearest neighbors to use for density estimation. - window_size : int + window_size The size of the batch of data to be taken in at once for the model to learn - distance_func : function that takes in dictionaries - A distance function to use. By default, the Euclidean distance is used. - verbose: boolean + distance_func + Distance function to be used. By default, the Euclidean distance is used. 
+ verbose Whether or not to print messages Attributes From 05375fc87704571d0947cbde96a34d45459fea4a Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Wed, 6 Sep 2023 11:57:53 +0700 Subject: [PATCH 13/43] Refactor X_batch to x_batch to align with PEP8. --- river/anomaly/ilof.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 3d8be4dd40..5c2fffe883 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -45,7 +45,7 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): ---------- X A list of stored observations. - X_batch + x_batch A buffer to hold incoming observations until it's time to update the model. X_score A buffer to hold incoming observations until it's time to score them. @@ -101,7 +101,7 @@ def __init__( ): self.n_neighbors = n_neighbors self.X: list = [] - self.X_batch: list = [] + self.x_batch: list = [] self.X_score: list = [] self.dist_dict: dict = {} self.neighborhoods: dict = {} @@ -126,17 +126,17 @@ def learn_one(self, x: dict): x A dictionary of feature values. 
""" - self.X_batch.append(x) - if len(self.X) or len(self.X_batch) > 1: - self.learn(self.X_batch) - self.X_batch = [] + self.x_batch.append(x) + if len(self.X) or len(self.x_batch) > 1: + self.learn(self.x_batch) + self.x_batch = [] - def learn(self, X_batch: list): - X_batch, equal = self.check_equal(X_batch, self.X) + def learn(self, x_batch: list): + x_batch, equal = self.check_equal(x_batch, self.X) if equal != 0 and self.verbose: print("%i samples are equal to previous data" % equal) - if len(X_batch) == 0: + if len(x_batch) == 0: if self.verbose: print("No new data was added.") else: @@ -152,7 +152,7 @@ def learn(self, X_batch: list): self.local_reach, self.lof, ) = self.expand_objects( - X_batch, + x_batch, self.X, self.neighborhoods, self.rev_neighborhoods, From 4d209b43a2529c578bbd1e612964af71c9860a03 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Wed, 6 Sep 2023 11:58:19 +0700 Subject: [PATCH 14/43] Verbose re-wording. --- river/anomaly/ilof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 5c2fffe883..ac1790cdc7 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -134,7 +134,7 @@ def learn_one(self, x: dict): def learn(self, x_batch: list): x_batch, equal = self.check_equal(x_batch, self.X) if equal != 0 and self.verbose: - print("%i samples are equal to previous data" % equal) + print("At least one sample is equal to previously observed instances.") if len(x_batch) == 0: if self.verbose: From e25c8002011c453b5e79fe8bd538037a3970b711 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Wed, 6 Sep 2023 17:46:40 +0700 Subject: [PATCH 15/43] Refactor X_score to x_scores --- river/anomaly/ilof.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index ac1790cdc7..27b4c2cded 100644 --- 
a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -47,7 +47,7 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): A list of stored observations. x_batch A buffer to hold incoming observations until it's time to update the model. - X_score + x_scores A buffer to hold incoming observations until it's time to score them. dist_dict A dictionary to hold distances between observations. @@ -102,7 +102,7 @@ def __init__( self.n_neighbors = n_neighbors self.X: list = [] self.x_batch: list = [] - self.X_score: list = [] + self.x_scores: list = [] self.dist_dict: dict = {} self.neighborhoods: dict = {} self.rev_neighborhoods: dict = {} @@ -225,13 +225,13 @@ def score_one(self, x: dict): List of LOF calculated for incoming data """ - self.X_score.append(x) + self.x_scores.append(x) - self.X_score, equal = self.check_equal(self.X_score, self.X) + self.x_scores, equal = self.check_equal(self.x_scores, self.X) if equal != 0 and self.verbose: print("The new observation is the same to one of the previously observed instances.") - if len(self.X_score) == 0: + if len(self.x_scores) == 0: if self.verbose: print("No new data was added.") else: @@ -247,7 +247,7 @@ def score_one(self, x: dict): local_reach, lof, ) = self.expand_objects( - self.X_score, + self.x_scores, Xs, self.neighborhoods, self.rev_neighborhoods, @@ -283,7 +283,7 @@ def score_one(self, x: dict): Set_upd_lrd, neighborhoods, reach_dist, local_reach ) lof = self.calc_lof(Set_upd_lof, neighborhoods, local_reach, lof) - self.X_score = [] + self.x_scores = [] return lof[nm[0]] From 01798d65c88344c985699901793784761256ec03 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:47:06 +0700 Subject: [PATCH 16/43] Remove type of returned output in DocString --- river/anomaly/ilof.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 27b4c2cded..dad288421d 100644 --- 
a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -317,13 +317,13 @@ def initial_calculations( Returns ------- - neighborhoods : dict + neighborhoods Updated dictionary of particle neighborhoods - rev_neighborhoods : dict + rev_neighborhoods Updated dictionary of reverse particle neighborhoods - k_distances : dict + k_distances Updated dictionary to hold k-distances for each observation - dist_dict : dict of dicts + dist_dict Updated dictionary of dictionaries storing distances between particles """ From 38a2b4d5f5881cf79fc3d8b90840d315544eb21d Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:47:40 +0700 Subject: [PATCH 17/43] Make check_equal static method and refactor related variable names according to PEP8 --- river/anomaly/ilof.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index dad288421d..2420158f23 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -361,10 +361,12 @@ def initial_calculations( return neighborhoods, rev_neighborhoods, k_distances, dist_dict - def check_equal(self, X: list, Y: list): - """Check if new batch X has some data samples equal to previous data recorded Y""" - result = [x for x in X if not any(x == y for y in Y)] - return result, len(X) - len(result) + @staticmethod + def check_equal(x_list: list, y_list: list): + """Check if new list of observations (x_list) has any data sample that is equal to + any previous data recorded (y_list).""" + result = [x for x in x_list if not any(x == y for y in y_list)] + return result, len(x_list) - len(result) def expand_objects( self, From 762915c8f13eca884765ea10ce09b8cfca0bb58f Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:55:20 +0700 Subject: [PATCH 18/43] Refactor capital X variable to x_list to represent the stored list of observations. 
--- river/anomaly/ilof.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 2420158f23..dbea7413fc 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -43,7 +43,7 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): Attributes ---------- - X + x_list A list of stored observations. x_batch A buffer to hold incoming observations until it's time to update the model. @@ -100,7 +100,7 @@ def __init__( distance_func: DistanceFunc = None, ): self.n_neighbors = n_neighbors - self.X: list = [] + self.x_list: list = [] self.x_batch: list = [] self.x_scores: list = [] self.dist_dict: dict = {} @@ -127,12 +127,12 @@ def learn_one(self, x: dict): A dictionary of feature values. """ self.x_batch.append(x) - if len(self.X) or len(self.x_batch) > 1: + if len(self.x_list) or len(self.x_batch) > 1: self.learn(self.x_batch) self.x_batch = [] def learn(self, x_batch: list): - x_batch, equal = self.check_equal(x_batch, self.X) + x_batch, equal = self.check_equal(x_batch, self.x_list) if equal != 0 and self.verbose: print("At least one sample is equal to previously observed instances.") @@ -140,10 +140,10 @@ def learn(self, x_batch: list): if self.verbose: print("No new data was added.") else: - # Increase size of objects to acomodate new data + # Increase size of objects to accommodate new data ( nm, - self.X, + self.x_list, self.neighborhoods, self.rev_neighborhoods, self.k_dist, @@ -153,7 +153,7 @@ def learn(self, x_batch: list): self.lof, ) = self.expand_objects( x_batch, - self.X, + self.x_list, self.neighborhoods, self.rev_neighborhoods, self.k_dist, @@ -170,7 +170,7 @@ def learn(self, x_batch: list): self.k_dist, self.dist_dict, ) = self.initial_calculations( - self.X, nm, self.neighborhoods, self.rev_neighborhoods, self.k_dist, self.dist_dict + self.x_list, nm, self.neighborhoods, self.rev_neighborhoods, self.k_dist, self.dist_dict ) # Define sets 
of particles @@ -227,7 +227,7 @@ def score_one(self, x: dict): self.x_scores.append(x) - self.x_scores, equal = self.check_equal(self.x_scores, self.X) + self.x_scores, equal = self.check_equal(self.x_scores, self.x_list) if equal != 0 and self.verbose: print("The new observation is the same to one of the previously observed instances.") @@ -235,10 +235,10 @@ def score_one(self, x: dict): if self.verbose: print("No new data was added.") else: - Xs = self.X.copy() + x_list_copy = self.x_list.copy() ( nm, - Xs, + x_list_copy, neighborhoods, rev_neighborhoods, k_dist, @@ -248,7 +248,7 @@ def score_one(self, x: dict): lof, ) = self.expand_objects( self.x_scores, - Xs, + x_list_copy, self.neighborhoods, self.rev_neighborhoods, self.k_dist, @@ -259,7 +259,7 @@ def score_one(self, x: dict): ) neighborhoods, rev_neighborhoods, k_dist, dist_dict = self.initial_calculations( - Xs, nm, neighborhoods, rev_neighborhoods, k_dist, dist_dict + x_list_copy, nm, neighborhoods, rev_neighborhoods, k_dist, dist_dict ) ( Set_new_points, @@ -289,7 +289,7 @@ def score_one(self, x: dict): def initial_calculations( self, - X: list, + x_list: list, nm: tuple, neighborhoods: dict, rev_neighborhoods: dict, @@ -302,7 +302,7 @@ def initial_calculations( Parameters ---------- - X + x_list A list of stored observations. nm A tuple representing the current size of the dataset. 
@@ -331,9 +331,9 @@ def initial_calculations( m = nm[1] k = self.n_neighbors - # Calculate distances all particles consdering new and old ones + # Calculate distances all particles considering new and old ones new_distances = [ - [i, j, self.distance(X[i], X[j])] for i in range(n + m) for j in range(i) if i >= n + [i, j, self.distance(x_list[i], x_list[j])] for i in range(n + m) for j in range(i) if i >= n ] # Add new distances to distance dictionary for i in range(len(new_distances)): @@ -371,7 +371,7 @@ def check_equal(x_list: list, y_list: list): def expand_objects( self, new_particles: list, - X: list, + x_list: list, neighborhoods: dict, rev_neighborhoods: dict, k_dist: dict, @@ -381,9 +381,9 @@ def expand_objects( lof: dict, ): """Expand size of dictionaries and lists to fit new data""" - n = len(X) + n = len(x_list) m = len(new_particles) - X.extend(new_particles) + x_list.extend(new_particles) neighborhoods.update({i: [] for i in range(n + m)}) rev_neighborhoods.update({i: [] for i in range(n + m)}) k_dist.update({i: float("inf") for i in range(n + m)}) @@ -393,7 +393,7 @@ def expand_objects( lof.update({i + n: [] for i in range(m)}) return ( (n, m), - X, + x_list, neighborhoods, rev_neighborhoods, k_dist, From b6fbdb5a7052dd5164d769707b5829f605a6fc3b Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:56:01 +0700 Subject: [PATCH 19/43] Refactor code with black --- river/anomaly/ilof.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index dbea7413fc..f946868963 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -170,7 +170,12 @@ def learn(self, x_batch: list): self.k_dist, self.dist_dict, ) = self.initial_calculations( - self.x_list, nm, self.neighborhoods, self.rev_neighborhoods, self.k_dist, self.dist_dict + self.x_list, + nm, + self.neighborhoods, + self.rev_neighborhoods, + self.k_dist, + 
self.dist_dict, ) # Define sets of particles @@ -333,7 +338,10 @@ def initial_calculations( # Calculate distances all particles considering new and old ones new_distances = [ - [i, j, self.distance(x_list[i], x_list[j])] for i in range(n + m) for j in range(i) if i >= n + [i, j, self.distance(x_list[i], x_list[j])] + for i in range(n + m) + for j in range(i) + if i >= n ] # Add new distances to distance dictionary for i in range(len(new_distances)): @@ -364,7 +372,7 @@ def initial_calculations( @staticmethod def check_equal(x_list: list, y_list: list): """Check if new list of observations (x_list) has any data sample that is equal to - any previous data recorded (y_list).""" + any previous data recorded (y_list).""" result = [x for x in x_list if not any(x == y for y in y_list)] return result, len(x_list) - len(result) From e9f009605139da0fb1e22ab00f2a193d45892aa9 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:53:15 +0700 Subject: [PATCH 20/43] Remove description for unnecessary variables. --- river/anomaly/ilof.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index f946868963..bf418c7cca 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -34,8 +34,6 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): ---------- n_neighbors The number of nearest neighbors to use for density estimation. - window_size - The size of the batch of data to be taken in at once for the model to learn distance_func Distance function to be used. By default, the Euclidean distance is used. verbose From 1f1a99ffd2c76dd349e5a3587fb21b3b5b85e5e8 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:53:40 +0700 Subject: [PATCH 21/43] Re-wording for variable description. 
--- river/anomaly/ilof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index bf418c7cca..204ca59726 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -37,7 +37,7 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): distance_func Distance function to be used. By default, the Euclidean distance is used. verbose - Whether or not to print messages + Whether to print warning/messages Attributes ---------- From a5afc239a58ee830bada9abf31d5b82590a09df3 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 17:30:43 +0700 Subject: [PATCH 22/43] Refactor. --- river/anomaly/ilof.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 204ca59726..9201e7c67e 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -88,7 +88,9 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): References ---------- - Pokrajac, David & Lazarevic, Aleksandar & Latecki, Longin Jan. (2007). Incremental Local Outlier Detection for Data Streams. Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining, CIDM 2007. 504-515. 10.1109/CIDM.2007.368917. + Pokrajac, David & Lazarevic, Aleksandar & Latecki, Longin Jan. (2007). Incremental Local Outlier Detection for Data Streams. + In: Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining (CIDM 2007). 504-515. + DOI: 10.1109/CIDM.2007.368917. """ def __init__( From 456cb6ec80937494dcb00754d893b8fa2392f7f5 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Thu, 7 Sep 2023 17:40:52 +0700 Subject: [PATCH 23/43] Add learn_many to learn multiple instances at a time and update Docstring test to take into account the newly implemented function. 
--- river/anomaly/ilof.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 9201e7c67e..d9c450c18e 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -2,6 +2,8 @@ import functools +import pandas as pd + from river import anomaly, utils from river.neighbors.base import DistanceFunc from river.utils import VectorDict @@ -79,12 +81,14 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): >>> for x, _ in datasets.CreditCard().take(200): ... incremental_lof.learn_one(x) + >>> incremental_lof.learn_many(cc_df[201:401]) + >>> ilof_scores = [] - >>> for x in cc_df[0][201:206]: + >>> for x in cc_df[0][401:406]: ... ilof_scores.append(incremental_lof.score_one(x)) >>> [round(ilof_score, 3) for ilof_score in ilof_scores] - [1.149, 1.098, 1.158, 1.101, 1.092] + [1.802, 1.937, 1.567, 1.181, 1.28] References ---------- @@ -117,6 +121,19 @@ def __init__( else functools.partial(utils.math.minkowski_distance, p=2) ) + def learn_many(self, x: pd.DataFrame): + """ + Update the model with multiple incoming observations simultaneously. + This function assumes that the observations are stored in the first column of the dataset. + + Parameters + ---------- + x + A Pandas DataFrame including multiple instances to be learned at the same time + """ + x = x[0].tolist() + self.learn(x) + def learn_one(self, x: dict): """ Update the model with one incoming observation From b4605e3c2b9479b202ff9b5ea56c470251127f87 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:00:41 +0700 Subject: [PATCH 24/43] Remove description of unnecessary variables. 
--- river/anomaly/ilof.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index d9c450c18e..6d7a4a58c0 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -63,8 +63,6 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): A dictionary to hold Local Outlier Factors for each observation. local_reach A dictionary to hold local reachability distances for each observation. - skip_first - A boolean value indicating whether to skip the first window of data. Example ---------- From 0e47a2b08af81982a3c5e8ac9e81e3699b679738 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:12:35 +0700 Subject: [PATCH 25/43] Spelling correction. --- river/anomaly/ilof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 6d7a4a58c0..cad128a9d4 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -178,7 +178,7 @@ def learn(self, x_batch: list): self.lof, ) - # Calculate neighborhoods, reverse neighborhoods, k-distances and distances between neighboors + # Calculate neighborhoods, reverse neighborhoods, k-distances and distances between neighbors ( self.neighborhoods, self.rev_neighborhoods, From f3a2cb1770f2170cd5d831adb9c05a9288a64a54 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:15:23 +0700 Subject: [PATCH 26/43] Modify variable names by removing capital letters to align with PEP8. 
--- river/anomaly/ilof.py | 66 +++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index cad128a9d4..c25c94885c 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -195,16 +195,16 @@ def learn(self, x_batch: list): # Define sets of particles ( - Set_new_points, - Set_neighbors, - Set_rev_neighbors, - Set_upd_lrd, - Set_upd_lof, + set_new_points, + set_neighbors, + set_rev_neighbors, + set_upd_lrd, + set_upd_lof, ) = self.define_sets(nm, self.neighborhoods, self.rev_neighborhoods) # Calculate new reachability distance of all affected points self.reach_dist = self.calc_reach_dist_newpoints( - Set_new_points, + set_new_points, self.neighborhoods, self.rev_neighborhoods, self.reach_dist, @@ -212,7 +212,7 @@ def learn(self, x_batch: list): self.k_dist, ) self.reach_dist = self.calc_reach_dist_otherpoints( - Set_rev_neighbors, + set_rev_neighbors, self.neighborhoods, self.rev_neighborhoods, self.reach_dist, @@ -222,11 +222,11 @@ def learn(self, x_batch: list): # Calculate new local reachability distance of all affected points self.local_reach = self.calc_local_reach_dist( - Set_upd_lrd, self.neighborhoods, self.reach_dist, self.local_reach + set_upd_lrd, self.neighborhoods, self.reach_dist, self.local_reach ) # Calculate new Local Outlier Factor of all affected points - self.lof = self.calc_lof(Set_upd_lof, self.neighborhoods, self.local_reach, self.lof) + self.lof = self.calc_lof(set_upd_lof, self.neighborhoods, self.local_reach, self.lof) def score_one(self, x: dict): """ @@ -282,17 +282,17 @@ def score_one(self, x: dict): x_list_copy, nm, neighborhoods, rev_neighborhoods, k_dist, dist_dict ) ( - Set_new_points, - Set_neighbors, - Set_rev_neighbors, - Set_upd_lrd, - Set_upd_lof, + set_new_points, + set_neighbors, + set_rev_neighbors, + set_upd_lrd, + set_upd_lof, ) = self.define_sets(nm, neighborhoods, rev_neighborhoods) reach_dist = 
self.calc_reach_dist_newpoints( - Set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist + set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist ) reach_dist = self.calc_reach_dist_otherpoints( - Set_rev_neighbors, + set_rev_neighbors, neighborhoods, rev_neighborhoods, reach_dist, @@ -300,9 +300,9 @@ def score_one(self, x: dict): k_dist, ) local_reach = self.calc_local_reach_dist( - Set_upd_lrd, neighborhoods, reach_dist, local_reach + set_upd_lrd, neighborhoods, reach_dist, local_reach ) - lof = self.calc_lof(Set_upd_lof, neighborhoods, local_reach, lof) + lof = self.calc_lof(set_upd_lof, neighborhoods, local_reach, lof) self.x_scores = [] return lof[nm[0]] @@ -429,28 +429,28 @@ def expand_objects( def define_sets(self, nm, neighborhoods: dict, rev_neighborhoods: dict): """Define sets of points for the incremental LOF algorithm""" # Define set of new points from batch - Set_new_points = set(range(nm[0], nm[0] + nm[1])) - Set_neighbors: set = set() - Set_rev_neighbors: set = set() + set_new_points = set(range(nm[0], nm[0] + nm[1])) + set_neighbors: set = set() + set_rev_neighbors: set = set() # Define neighbors and reverse neighbors of new data points - for i in Set_new_points: - Set_neighbors = set(Set_neighbors) | set(neighborhoods[i]) - Set_rev_neighbors = set(Set_rev_neighbors) | set(rev_neighborhoods[i]) + for i in set_new_points: + set_neighbors = set(set_neighbors) | set(neighborhoods[i]) + set_rev_neighbors = set(set_rev_neighbors) | set(rev_neighborhoods[i]) # Define points that need to update their local reachability distance because of new data points - Set_upd_lrd = Set_rev_neighbors - for j in Set_rev_neighbors: - Set_upd_lrd = Set_upd_lrd | set(rev_neighborhoods[j]) - Set_upd_lrd = Set_upd_lrd | Set_new_points + set_upd_lrd = set_rev_neighbors + for j in set_rev_neighbors: + set_upd_lrd = set_upd_lrd | set(rev_neighborhoods[j]) + set_upd_lrd = set_upd_lrd | set_new_points # Define points that need to 
update their lof because of new data points - Set_upd_lof = Set_upd_lrd - for m in Set_upd_lrd: - Set_upd_lof = Set_upd_lof | set(rev_neighborhoods[m]) - Set_upd_lof = Set_upd_lof + set_upd_lof = set_upd_lrd + for m in set_upd_lrd: + set_upd_lof = set_upd_lof | set(rev_neighborhoods[m]) + set_upd_lof = set_upd_lof - return Set_new_points, Set_neighbors, Set_rev_neighbors, Set_upd_lrd, Set_upd_lof + return set_new_points, set_neighbors, set_rev_neighbors, set_upd_lrd, set_upd_lof def calc_reach_dist_newpoints( self, From 508373ff4ce3d17fab701522ec76ae65ad417043 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:30:32 +0700 Subject: [PATCH 27/43] Refactor calc_local_reach_dist and cal_lof and change these functions to static methods. --- river/anomaly/ilof.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index c25c94885c..8afb602ffa 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -485,19 +485,21 @@ def calc_reach_dist_otherpoints( reach_dist[i][j] = max(dist_dict[i][j], k_dist[j]) return reach_dist + @staticmethod def calc_local_reach_dist( - self, Set: set, neighborhoods: dict, reach_dist: dict, local_reach_dist: dict + set_index: set, neighborhoods: dict, reach_dist: dict, local_reach_dist: dict ): - """Calculate local reachability distance of affected points""" - for i in Set: + """Calculate local reachability distance of affected points.""" + for i in set_index: local_reach_dist[i] = len(neighborhoods[i]) / sum( [reach_dist[i][j] for j in neighborhoods[i]] ) return local_reach_dist - def calc_lof(self, Set: set, neighborhoods: dict, local_reach: dict, lof: dict): - """Calculate local outlier factor of affected points""" - for i in Set: + @staticmethod + def calc_lof(set_index: set, neighborhoods: dict, local_reach: dict, lof: dict): + """Calculate local outlier factor (LOF) of affected points.""" 
+ for i in set_index: lof[i] = sum([local_reach[j] for j in neighborhoods[i]]) / ( len(neighborhoods[i]) * local_reach[i] ) From 5af882d7fb19458ec676d1c0933c2b9c470d441d Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:34:58 +0700 Subject: [PATCH 28/43] Change expand_objects to static method --- river/anomaly/ilof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 8afb602ffa..0a495ef841 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -391,8 +391,8 @@ def check_equal(x_list: list, y_list: list): result = [x for x in x_list if not any(x == y for y in y_list)] return result, len(x_list) - len(result) + @staticmethod def expand_objects( - self, new_particles: list, x_list: list, neighborhoods: dict, From c7ed8cc3ff90d6df8bee5c00edc1c88fa7bf8a29 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:50:59 +0700 Subject: [PATCH 29/43] Refactor. --- river/anomaly/ilof.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 0a495ef841..16ed09d934 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -386,8 +386,9 @@ def initial_calculations( @staticmethod def check_equal(x_list: list, y_list: list): - """Check if new list of observations (x_list) has any data sample that is equal to - any previous data recorded (y_list).""" + """ + Check if new list of observations (x_list) has any data sample that is equal to any previous data recorded (y_list). 
+ """ result = [x for x in x_list if not any(x == y for y in y_list)] return result, len(x_list) - len(result) From fedf6f733da2af90a1eaca98036d8c8f8d15b3ab Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:51:19 +0700 Subject: [PATCH 30/43] Change define_sets to static method. --- river/anomaly/ilof.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 16ed09d934..b3ae7429e3 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -427,8 +427,11 @@ def expand_objects( lof, ) - def define_sets(self, nm, neighborhoods: dict, rev_neighborhoods: dict): - """Define sets of points for the incremental LOF algorithm""" + @staticmethod + def define_sets(nm, neighborhoods: dict, rev_neighborhoods: dict): + """ + Define sets of points for the incremental LOF algorithm. + """ # Define set of new points from batch set_new_points = set(range(nm[0], nm[0] + nm[1])) set_neighbors: set = set() From 8b7bb8cb374f59e3c23ae0a0f54964a07df0715f Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 00:51:36 +0700 Subject: [PATCH 31/43] Change calc_reach_dist_newpoints to static method. 
--- river/anomaly/ilof.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index b3ae7429e3..a9f61463e9 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -456,17 +456,19 @@ def define_sets(nm, neighborhoods: dict, rev_neighborhoods: dict): return set_new_points, set_neighbors, set_rev_neighbors, set_upd_lrd, set_upd_lof + @staticmethod def calc_reach_dist_newpoints( - self, - Set: set, + set_index: set, neighborhoods: dict, rev_neighborhoods: dict, reach_dist: dict, dist_dict: dict, k_dist: dict, ): - """Calculate reachability distance from new points to neighbors and from neighbors to new points""" - for c in Set: + """ + Calculate reachability distance from new points to neighbors and from neighbors to new points. + """ + for c in set_index: for j in set(neighborhoods[c]): reach_dist[c][j] = max(dist_dict[c][j], k_dist[j]) for j in set(rev_neighborhoods[c]): From cc1864a3f1959af6427f33430b858a739bae370a Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 01:00:06 +0700 Subject: [PATCH 32/43] Change calc_reach_dist_otherpoints to static method and refactor. 
--- river/anomaly/ilof.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index a9f61463e9..df20313d72 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -475,18 +475,20 @@ def calc_reach_dist_newpoints( reach_dist[j][c] = max(dist_dict[j][c], k_dist[c]) return reach_dist + @staticmethod def calc_reach_dist_otherpoints( - self, - Set: set, + set_index: set, neighborhoods: dict, rev_neighborhoods: dict, reach_dist: dict, dist_dict: dict, k_dist: dict, ): - """Calculate reachability distance from reverse neighbors of reverse neighbors ( RkNN(RkNN(NewPoints)) ) to reverse neighbors ( RkNN(NewPoints) ) - These values change because of the insertion of new points""" - for j in Set: + """ + Calculate reachability distance from reverse neighbors of reverse neighbors ( RkNN(RkNN(NewPoints)) ) + to reverse neighbors ( RkNN(NewPoints) ). These values change due to the insertion of new points. + """ + for j in set_index: for i in set(rev_neighborhoods[j]): reach_dist[i][j] = max(dist_dict[i][j], k_dist[j]) return reach_dist From 74d814afcee76ec0288d3cc19c767176f2f1da1e Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 01:14:05 +0700 Subject: [PATCH 33/43] Refactor. --- river/anomaly/ilof.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index df20313d72..b7745b3688 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -90,8 +90,8 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): References ---------- - Pokrajac, David & Lazarevic, Aleksandar & Latecki, Longin Jan. (2007). Incremental Local Outlier Detection for Data Streams. - In: Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining (CIDM 2007). 504-515. + David Pokrajac, Aleksandar Lazarevic, and Longin Jan Latecki (2007). 
Incremental Local Outlier Detection for Data + Streams. In: Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining (CIDM 2007). 504-515. DOI: 10.1109/CIDM.2007.368917. """ @@ -231,7 +231,7 @@ def learn(self, x_batch: list): def score_one(self, x: dict): """ Score a new incoming observation based on model constructed previously. - Perform same calculations as 'learn_one' function but doesn't add the new calculations to the atributes + Perform same calculations as 'learn_one' function but doesn't add the new calculations to the attributes Data samples that are equal to samples stored by the model are not considered. Parameters From fca09a080dfc070a72bfb9d5d80a6c4962d7ef77 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 01:14:29 +0700 Subject: [PATCH 34/43] Refactor calc_reach_dist_new_points --- river/anomaly/ilof.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index b7745b3688..a59f6b7eaf 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -203,7 +203,7 @@ def learn(self, x_batch: list): ) = self.define_sets(nm, self.neighborhoods, self.rev_neighborhoods) # Calculate new reachability distance of all affected points - self.reach_dist = self.calc_reach_dist_newpoints( + self.reach_dist = self.calc_reach_dist_new_points( set_new_points, self.neighborhoods, self.rev_neighborhoods, @@ -288,7 +288,7 @@ def score_one(self, x: dict): set_upd_lrd, set_upd_lof, ) = self.define_sets(nm, neighborhoods, rev_neighborhoods) - reach_dist = self.calc_reach_dist_newpoints( + reach_dist = self.calc_reach_dist_new_points( set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist ) reach_dist = self.calc_reach_dist_otherpoints( @@ -457,7 +457,7 @@ def define_sets(nm, neighborhoods: dict, rev_neighborhoods: dict): return set_new_points, set_neighbors, set_rev_neighbors, set_upd_lrd, 
set_upd_lof @staticmethod - def calc_reach_dist_newpoints( + def calc_reach_dist_new_points( set_index: set, neighborhoods: dict, rev_neighborhoods: dict, From 385fc89986cf64b513a1bed2422275801492e59e Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 01:15:17 +0700 Subject: [PATCH 35/43] Refactor calc_reach_dist_other_points. --- river/anomaly/ilof.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index a59f6b7eaf..653190bf8e 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -211,7 +211,7 @@ def learn(self, x_batch: list): self.dist_dict, self.k_dist, ) - self.reach_dist = self.calc_reach_dist_otherpoints( + self.reach_dist = self.calc_reach_dist_other_points( set_rev_neighbors, self.neighborhoods, self.rev_neighborhoods, @@ -291,7 +291,7 @@ def score_one(self, x: dict): reach_dist = self.calc_reach_dist_new_points( set_new_points, neighborhoods, rev_neighborhoods, reach_dist, dist_dict, k_dist ) - reach_dist = self.calc_reach_dist_otherpoints( + reach_dist = self.calc_reach_dist_other_points( set_rev_neighbors, neighborhoods, rev_neighborhoods, @@ -476,7 +476,7 @@ def calc_reach_dist_new_points( return reach_dist @staticmethod - def calc_reach_dist_otherpoints( + def calc_reach_dist_other_points( set_index: set, neighborhoods: dict, rev_neighborhoods: dict, From 2d18fe6a9b4bae24adf4934ac74f0872758829e5 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 01:17:19 +0700 Subject: [PATCH 36/43] Remove import --- river/anomaly/ilof.py | 1 - 1 file changed, 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 653190bf8e..701e6967cb 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -6,7 +6,6 @@ from river import anomaly, utils from river.neighbors.base import DistanceFunc -from river.utils import VectorDict class 
LocalOutlierFactor(anomaly.base.AnomalyDetector): From 7ef647e3ac9bd6f925fc394dcc6fff8d6baa0a73 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 01:18:08 +0700 Subject: [PATCH 37/43] Refactor and add description in docstring regarding expected performance of ILOF. --- river/anomaly/ilof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 701e6967cb..05e3ecf822 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -17,7 +17,7 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): We consider: - NewPoints: new points; - - kNN(p): the neighboors of p (the k closest points to p) + - kNN(p): the neighboors of p (the k-closest points to p) - RkNN(p): the rev-neighboors of p (points that have p as one of their neighboors) - Set_upd_lrd: Set of points that need to update the local reachability distance - Set_upd_lof: Set of points that need to update the local outlier factor From f22393cc88148add26230ac8a80ea541451cc0ed Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 07:47:37 +0700 Subject: [PATCH 38/43] Refactor. --- river/anomaly/ilof.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 05e3ecf822..966247295e 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -403,7 +403,9 @@ def expand_objects( local_reach: dict, lof: dict, ): - """Expand size of dictionaries and lists to fit new data""" + """ + Expand size of dictionaries and lists to take into account new data points. 
+ """ n = len(x_list) m = len(new_particles) x_list.extend(new_particles) @@ -496,7 +498,9 @@ def calc_reach_dist_other_points( def calc_local_reach_dist( set_index: set, neighborhoods: dict, reach_dist: dict, local_reach_dist: dict ): - """Calculate local reachability distance of affected points.""" + """ + Calculate local reachability distance of affected points. + """ for i in set_index: local_reach_dist[i] = len(neighborhoods[i]) / sum( [reach_dist[i][j] for j in neighborhoods[i]] @@ -505,7 +509,9 @@ def calc_local_reach_dist( @staticmethod def calc_lof(set_index: set, neighborhoods: dict, local_reach: dict, lof: dict): - """Calculate local outlier factor (LOF) of affected points.""" + """ + Calculate local outlier factor (LOF) of affected points. + """ for i in set_index: lof[i] = sum([local_reach[j] for j in neighborhoods[i]]) / ( len(neighborhoods[i]) * local_reach[i] From 1eb210dd3b6286c7aa6abc2a3a4bfcb890b5ccb7 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 08:23:42 +0700 Subject: [PATCH 39/43] Remove one unnecessary variable of calc_reach_dist_other_points. 
--- river/anomaly/ilof.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 966247295e..92ac687a9d 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -212,7 +212,6 @@ def learn(self, x_batch: list): ) self.reach_dist = self.calc_reach_dist_other_points( set_rev_neighbors, - self.neighborhoods, self.rev_neighborhoods, self.reach_dist, self.dist_dict, @@ -292,7 +291,6 @@ def score_one(self, x: dict): ) reach_dist = self.calc_reach_dist_other_points( set_rev_neighbors, - neighborhoods, rev_neighborhoods, reach_dist, dist_dict, @@ -479,7 +477,6 @@ def calc_reach_dist_new_points( @staticmethod def calc_reach_dist_other_points( set_index: set, - neighborhoods: dict, rev_neighborhoods: dict, reach_dist: dict, dist_dict: dict, From 7ae1ab8592478ac51b551d3c8a18253d5ec0d219 Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Fri, 8 Sep 2023 08:24:00 +0700 Subject: [PATCH 40/43] Modify docstring description of the algorithm --- river/anomaly/ilof.py | 44 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 92ac687a9d..56b2cde678 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -11,25 +11,31 @@ class LocalOutlierFactor(anomaly.base.AnomalyDetector): """Incremental Local Outlier Factor (Incremental LOF). - Incremental LOF Algorithm as described in the reference paper - - The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF) used to identify outliers based on density of local neighbors. 
- - We consider: - - NewPoints: new points; - - kNN(p): the neighboors of p (the k-closest points to p) - - RkNN(p): the rev-neighboors of p (points that have p as one of their neighboors) - - Set_upd_lrd: Set of points that need to update the local reachability distance - - Set_upd_lof: Set of points that need to update the local outlier factor - - The algorithm here implemented based on the original one in the paper is: - 1) Insert NewPoints and calculate its distance to existing points - 2) Update the neighboors and reverse-neighboors of all the points - 3) Define sets of affected points that required update - 4) Calculate the reachability-distance from new point to neighboors (NewPoints -> kNN(NewPoints)) and from rev-neighboors to new point (RkNN(NewPoints) -> NewPoints) - 5) Update the reachability-distance for affected points: RkNN(RkNN(NewPoints)) -> RkNN(NewPoints) - 6) Update local reachability distance of affected points: lrd(Set_upd_lrd) - 7) Update local outlier factor: lof(Set_upd_lof) + The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF), proposed by + Pokrajac et al. (2007), and is used to identify outliers based on density of local neighbors. + + The algorithm takes into account the following elements: + - `NewPoints`: new points; + - `kNN(p)`: the k-nearest neighbors of `p` (the k-closest points to `p`); + - `RkNN(p)`: the reverse-k-nearest neighbors of `p` (points that have `p` as one of their neighbors); + - `set_upd_lrd`: Set of points that need to have the local reachability distance updated; + - `set_upd_lof`: Set of points that need to have the local outlier factor updated. 
+ + This current implementation within `River`, based on the original one in the paper, follows the following steps: + 1) Insert new data points (`NewPoints`) and calculate its distance to existing points; + 2) Update the nearest neighbors and reverse nearest neighbors of all the points; + 3) Define sets of affected points that require updates; + 4) Calculate the reachability-distance from new point to neighbors (`NewPoints` -> `kNN(NewPoints)`) + and from rev-neighbors to new point (`RkNN(NewPoints)` -> `NewPoints`); + 5) Update the reachability-distance for affected points: `RkNN(RkNN(NewPoints))` -> `RkNN(NewPoints)`; + 6) Update local reachability distance of affected points: `lrd(set_upd_lrd)`; + 7) Update local outlier factor: `lof(set_upd_lof)`. + + The incremental LOF algorithm is expected to provide detection performance equivalent to the iterated static + LOF algorithm (applied after insertion of each data record), while requiring significantly less computational time. + Moreover, the insertion of a new data point as well as deletion of an old data point influence only a limited number + of their closest neighbors, which means that the number of updates per such insertion/deletion does not depend + on the total number of instances learned/in the data set. Parameters ---------- From 3b5f851ea030cf77d5794018701c6f6676245adc Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Sat, 9 Sep 2023 23:53:04 +0700 Subject: [PATCH 41/43] Add comments to justify the returning results of score_one function. 
--- river/anomaly/ilof.py | 1 + 1 file changed, 1 insertion(+) diff --git a/river/anomaly/ilof.py b/river/anomaly/ilof.py index 56b2cde678..e66104c4e5 100644 --- a/river/anomaly/ilof.py +++ b/river/anomaly/ilof.py @@ -308,6 +308,7 @@ def score_one(self, x: dict): lof = self.calc_lof(set_upd_lof, neighborhoods, local_reach, lof) self.x_scores = [] + # Use nm[0] as index since upon this configuration nm[1] is expected to be 1. return lof[nm[0]] def initial_calculations( From 8dae3dcb0291e9c2ebe2422387c315943979905d Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Sat, 9 Sep 2023 23:53:26 +0700 Subject: [PATCH 42/43] Add tests for the newly implemented iLOF algorithm. --- river/anomaly/test_ilof.py | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 river/anomaly/test_ilof.py diff --git a/river/anomaly/test_ilof.py b/river/anomaly/test_ilof.py new file mode 100644 index 0000000000..23fb7375e1 --- /dev/null +++ b/river/anomaly/test_ilof.py @@ -0,0 +1,73 @@ +import numpy as np +import pandas as pd + +from river import anomaly, datasets +from river.utils import dict2numpy +from sklearn import neighbors + +np.random.seed(42) + + +def test_incremental_lof_scores(): + """ + Test that the incremental LOF algorithm returns similar LOF scores for each observation + compared with the original static LOF algorithm implemented in scikit-learn. 
+ """ + norm_dist = 0.5 * np.random.rand(100, 2) + x_inliers = np.concatenate((norm_dist - 2, norm_dist, norm_dist + 2), axis=0) + x_outliers = np.concatenate( + ( + np.random.uniform(low=-4, high=4, size=(20, 2)), + np.random.uniform(low=-10, high=-5, size=(10, 2)), + np.random.uniform(low=5, high=10, size=(10, 2)), + ), + axis=0, + ) + x_train = np.concatenate((x_inliers, x_outliers), axis=0) + x_train_dict = [{f"feature_{i + 1}": elem[i] for i in range(2)} for elem in x_train] + ground_truth = np.ones(len(x_train), dtype=int) + ground_truth[-len(x_outliers) :] = -1 + df_train = pd.DataFrame({"observations": x_train_dict, "ground_truth": ground_truth}) + x_pred = np.random.uniform(low=-5, high=5, size=(30, 2)) + x_pred_dict = [{f"feature_{i + 1}": elem[i] for i in range(2)} for elem in x_pred] + incremental_lof = anomaly.LocalOutlierFactor(n_neighbors=20, verbose=False) + + for x in df_train["observations"]: + incremental_lof.learn_one(x) + + ilof_scores_train = np.array([ilof_score for ilof_score in incremental_lof.lof.values()]) + + ilof_scores_pred = [] + for x in x_pred_dict: + ilof_scores_pred.append(incremental_lof.score_one(x)) + + lof_sklearn = neighbors.LocalOutlierFactor(n_neighbors=20) + lof_sklearn.fit_predict(x_train) + lof_sklearn_scores_train = -lof_sklearn.negative_outlier_factor_ + + assert np.allclose(ilof_scores_train, lof_sklearn_scores_train, rtol=1e-08, atol=1e-08) + + +def test_batch_lof_scores(): + """ + Test that the incremental LOF algorithm returns similar LOF scores for each batch + with `learn_many` compared with the original static LOF algorithm implemented in scikit-learn, + under different batch sizes. 
+ """ + cc_df = pd.DataFrame(datasets.CreditCard()) + cc_df_np = [dict2numpy(i) for i in cc_df[0].to_dict().values()] + + batch_sizes = [20, 50, 100] + + for batch_size in batch_sizes: + ilof_river_batch = anomaly.LocalOutlierFactor(n_neighbors=20, verbose=False) + ilof_river_batch.learn_many(cc_df[0:batch_size]) + ilof_scores_river_batch = np.array([v for v in ilof_river_batch.lof.values()]) + + lof_sklearn_batch = neighbors.LocalOutlierFactor(n_neighbors=20) + lof_sklearn_batch.fit_predict(cc_df_np[0:batch_size]) + lof_scores_sklearn_batch = -lof_sklearn_batch.negative_outlier_factor_ + + assert np.allclose( + ilof_scores_river_batch, lof_scores_sklearn_batch, rtol=1e-02, atol=1e-02 + ) From 7ce72be9b320dbfac68e48ebfe05f6911c6d569a Mon Sep 17 00:00:00 2001 From: Hoang-Anh Ngo <50743576+hoanganhngo610@users.noreply.github.com> Date: Sat, 9 Sep 2023 23:57:00 +0700 Subject: [PATCH 43/43] Remove iLOF notebook (since the content has been covered by the test file within the same module). --- docs/examples/ilof_notebook.ipynb | 779 ------------------------------ 1 file changed, 779 deletions(-) delete mode 100644 docs/examples/ilof_notebook.ipynb diff --git a/docs/examples/ilof_notebook.ipynb b/docs/examples/ilof_notebook.ipynb deleted file mode 100644 index 7509df27ee..0000000000 --- a/docs/examples/ilof_notebook.ipynb +++ /dev/null @@ -1,779 +0,0 @@ -{ - "cells": [ - { - "attachments": { - "Screenshot from 2023-06-08 10-01-42.png": { - "image/png": "" - } - }, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Incremental Local Outlier factor\n", - "\n", - "##### Created by: Pietro TANURE ONNIS\n", - "##### Final project for the Online Machine Learning course at Telecom Paris 2023\n", - "\n", - "The Incremental Local Outlier Factor (ILOF) is an online version of the Local Outlier Factor (LOF) used to identify outliers based on density of local neighbors.\n", - "\n", - "We consider: \n", - "\n", - " - NewPoints: new points; \n", - " - kNN(p): the 
neighboors of p (the k closest points to p)\n", - " - RkNN(p): the rev-neighboors of p (points that have p as one of their neighboors)\n", - " - Set_upd_lrd: Set of points that need to update the local reachability distance\n", - " - Set_upd_lof: Set of points that need to update the local outlier factor\n", - "\n", - "The algorithm here implemented based on the original one in the paper is:\n", - "\n", - " 1) Insert NewPoints and calculate its distance to existing points\n", - " 2) Update the neighboors and reverse-neighboors of all the points\n", - " 3) Define sets of affected points that required update\n", - " 4) Calculate the reachability-distance from new point to neighboors (NewPoints -> kNN(NewPoints)) and from rev-neighboors to new point (RkNN(NewPoints) -> NewPoints)\n", - " 5) Update the reachability-distance for affected points: RkNN(RkNN(NewPoints)) -> RkNN(NewPoints)\n", - " 6) Update local reachability distance of affected points: lrd(Set_upd_lrd)\n", - " 7) Update local outlier factor: lof(Set_upd_lof)\n", - "\n", - "Reference: Pokrajac, David & Lazarevic, Aleksandar & Latecki, Longin Jan. (2007). Incremental Local Outlier Detection for Data Streams. Proceedings of the 2007 IEEE Symposium on Computational Intelligence and Data Mining, CIDM 2007. 504-515. 10.1109/CIDM.2007.368917. \n", - "\n", - "![Screenshot from 2023-06-08 10-01-42.png]()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generate data with some outliers\n", - "\n", - "We create an artifical data centered around two cluster and add some data made from another distribuition (outliers)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "\n", - "np.random.seed(42)\n", - "\n", - "# Generate train data\n", - "X_inliers = 0.3 * np.random.randn(100, 2)\n", - "X_inliers = np.r_[X_inliers + 2, X_inliers - 2]\n", - "\n", - "# Generate some outliers\n", - "X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))\n", - "X = np.r_[X_inliers, X_outliers]\n", - "\n", - "n_outliers = len(X_outliers)\n", - "ground_truth = np.ones(len(X), dtype=int)\n", - "ground_truth[-n_outliers:] = -1\n", - "\n", - "#Visualize data\n", - "plt.title(\"Data with Outliers\")\n", - "plt.scatter(X[:, 0], X[:, 1], color=\"k\", s=3.0, label=\"Data points\")\n", - "plt.axis(\"tight\")\n", - "plt.xlim((-5, 5))\n", - "plt.ylim((-5, 5))\n", - "plt.show()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Online Machine Learning with River\n", - "\n", - "Using the built class ILOF we calculte the Local Outlier Factor in an online approach, at each step we send an instance of data to the model, that will learn it using the 'learn_one' function, updating the model incrementally. 
\n", - "\n", - "The class ILOF takes in the following argumetns: \n", - "\n", - "ILOF(*k_neighbors = int; batch_size = int; verbose = boolean; distance_func = function*)\n", - "\n", - "\n", - "Function to learn batch of data: \n", - "\n", - "learn_one(*x: dictionary*)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import ilof_ as ilof\n", - "from river import utils\n", - "\n", - "#Convert to dictionary\n", - "Xdicts = tuple({f'feature_{i+1}': x[i] for i in range(2)} for x in X)\n", - "\n", - "#Define model\n", - "k = 20 #k-neighboors\n", - "ilof_river = ilof.ILOF(k, verbose=False)\n", - "\n", - "#Fit model on stream data\n", - "for x in Xdicts:\n", - " ilof_river.learn_one(x)\n", - "\n", - "lof_scores_river = np.array([v for v in ilof_river.lof.values()])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also evaluate data without adding learning it, this way the programme outputs a LOF for the data but doesn't update the parameters of the model (neighborhoods, reverse neighborhoods, k-distances, reachability-distances, local outlier factor).\n", - "\n", - "The learn_one function can take in the following arguments:\n", - "*learn_one(k_neighbors, batch_size, verbose, distance_func)*" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[4.1225951442931335], [4.856989240600968], [6.700737740891463], [4.67398123498832], [10.318743687776967], [1.695085624011412], [3.1578141066216476], [1.6190480666043154], [4.135136784137301], [5.1959975977883746]]\n" - ] - } - ], - "source": [ - "#Evaluate data without updating the model\n", - "X_score = np.random.uniform(low=-4, high=4, size=(10, 2))\n", - "\n", - "X_score_dict = tuple({f'feature_{i+1}': x[i] for i in range(2)} for x in X_score)\n", - "\n", - "lof_score = []\n", - "for x in X_score_dict:\n", 
- " lof_score.append(ilof_river.score_one(x))\n", - "\n", - "print(lof_score)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Batch Machine Learning with Scikit-learn\n", - "\n", - "To compare our results we fit a model with the same data using scikit-learn\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "from sklearn.neighbors import LocalOutlierFactor\n", - "\n", - "#Define the model\n", - "lof_scikit = LocalOutlierFactor(n_neighbors=k)\n", - "\n", - "#Fit model on data\n", - "lof_scikit.fit_predict(X)\n", - "\n", - "#Get Local Outlier Factor\n", - "lof_scores_scikit = - lof_scikit.negative_outlier_factor_" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Plot results\n", - "\n", - "We observe that both the online river approach (river) and the batch approach (sklearn) give the same results, but the online has the advantage of being adapted to treat data streams" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We observe that the difference is at machine error level:\n" - ] - }, - { - "data": { - "text/plain": [ - "array([-4.09616785e-12, -6.33715302e-11, 1.09381393e-11, -1.17567955e-10,\n", - " 1.33887346e-11, 1.18958177e-11, -6.57527366e-11, -7.40709716e-11,\n", - " -7.44937445e-12, -6.17876861e-11, -9.02540265e-11, -2.53728150e-11,\n", - " 1.22162280e-11, -1.55155888e-11, 1.49403823e-11, -1.26576083e-10,\n", - " -1.56268332e-11, -4.20663504e-11, -7.77566900e-11, -2.44553267e-11,\n", - " -2.61148880e-11, 1.45454759e-11, -5.68558534e-11, -2.96740410e-11,\n", - " -5.03610487e-11, 6.52056187e-12, 8.45368220e-12, -6.24331697e-11,\n", - " 3.10773629e-12, -1.87143634e-11, 1.69848580e-11, -6.51001475e-11,\n", - " -5.37054845e-11, -7.33879624e-12, -4.45921078e-12, -6.28377350e-11,\n", - " -6.90747459e-11, -3.41847439e-10, 1.80840898e-11, -8.69331274e-11,\n", - " 2.49876786e-11, -1.05403020e-10, -6.49036380e-12, -5.79656323e-11,\n", - " 7.16537940e-12, -7.47046869e-12, 6.71662725e-12, -4.13140633e-11,\n", - " -7.85482790e-13, 1.55593316e-11, -4.18334256e-11, 5.83022519e-12,\n", - " 2.18497442e-11, -1.66783476e-10, 1.46855861e-11, -1.15608412e-10,\n", - " -1.78021597e-10, 2.40570897e-11, -2.40685250e-11, -7.99027511e-11,\n", - " -3.46016549e-11, -1.07650999e-10, -1.20261578e-10, -3.42865736e-11,\n", - " 9.95292737e-12, -3.97579747e-11, -1.71840320e-11, -1.03541176e-10,\n", - " 8.75211015e-12, -4.21567226e-11, -5.32438538e-11, -5.20865573e-11,\n", - " -3.00426350e-13, -9.62636637e-11, -1.02473585e-11, 2.13062901e-12,\n", - " 8.63087379e-12, -3.78064247e-12, -1.60033098e-10, -3.17976756e-11,\n", - " -2.91409119e-11, -9.17790288e-11, -5.40958389e-11, -8.77666828e-11,\n", - " 8.20443713e-12, -4.10795842e-11, 2.28256303e-11, -6.83730850e-12,\n", - " -6.59496902e-11, -2.46087373e-10, -2.80828694e-11, -1.77728943e-11,\n", - " 1.73787651e-11, 
-2.39386289e-12, -5.03110886e-11, 5.85975712e-13,\n", - " -2.67477152e-11, 1.17149623e-11, -5.93547433e-12, -2.51376697e-11,\n", - " -3.91231492e-12, -6.39361897e-11, 1.17769128e-11, -1.36209932e-10,\n", - " 1.12351239e-11, 1.16284760e-11, -6.91506852e-11, -8.15103540e-11,\n", - " -9.22262267e-12, -6.39255315e-11, -9.08870756e-11, -2.77369239e-11,\n", - " 1.16167076e-11, -1.39550593e-11, 1.46895829e-11, -1.41733292e-10,\n", - " -1.54865010e-11, -4.47302195e-11, -7.35935757e-11, -2.80169221e-11,\n", - " -2.83830737e-11, 1.49515955e-11, -6.71342981e-11, -3.07622816e-11,\n", - " -5.31310551e-11, 6.80433487e-12, 5.86319882e-12, -6.18920470e-11,\n", - " 3.35398376e-13, -2.10242934e-11, 1.76045845e-11, -6.86697366e-11,\n", - " -6.14872597e-11, -6.16851015e-12, -3.94029254e-12, -6.70448141e-11,\n", - " -7.16353643e-11, -3.63674646e-10, 1.83564275e-11, -7.49840190e-11,\n", - " 2.55555577e-11, -1.05627285e-10, -1.05502274e-11, -5.96580563e-11,\n", - " 4.42612613e-12, -6.44362341e-12, 6.25999252e-12, -4.48610038e-11,\n", - " -5.12034859e-13, 1.57456270e-11, -4.36486403e-11, 4.71545025e-12,\n", - " 2.25058860e-11, -1.84880333e-10, 1.47774015e-11, -1.19127597e-10,\n", - " -1.85351290e-10, 2.46216381e-11, -2.55948596e-11, -7.39723838e-11,\n", - " -3.54656304e-11, -1.09745102e-10, -1.43158596e-10, -3.35480532e-11,\n", - " 1.04337650e-11, -4.41688908e-11, -1.50348622e-11, -1.16774812e-10,\n", - " 7.96585020e-12, -4.48201476e-11, -5.69460035e-11, -5.68629588e-11,\n", - " 5.56110713e-13, -1.03254738e-10, -1.00581765e-11, 2.49988918e-12,\n", - " 6.66378064e-12, -3.26383365e-12, -1.77152959e-10, -3.82229803e-11,\n", - " -3.23736593e-11, -9.27151689e-11, -5.59789992e-11, -9.92872451e-11,\n", - " 8.13549228e-12, -4.12598844e-11, 2.33668640e-11, -8.91531293e-12,\n", - " -6.88478163e-11, -2.58987498e-10, -2.74111844e-11, -1.59048330e-11,\n", - " 1.83143500e-11, -2.21744845e-12, -6.28099794e-11, -1.59727787e-12,\n", - " -2.83923995e-11, 1.20728982e-11, -7.79643017e-12, 
-2.67832423e-11,\n", - " -4.55934845e-10, -1.07836184e-10, -3.60191432e-10, -1.28838495e-10,\n", - " -7.21577020e-10, -1.42578749e-09, -2.53113752e-10, -9.60296287e-11,\n", - " -2.22227037e-09, -9.43074063e-10, -1.70505743e-09, -1.32515510e-09,\n", - " -1.30334055e-09, -1.82649540e-09, -1.89989269e-09, -1.57570668e-09,\n", - " -1.35358125e-09, -1.53304036e-09, -1.98927808e-09, -1.32804345e-09])" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from river.utils import dict2numpy\n", - "X_scores1 = lof_scores_scikit\n", - "X_scores2 = lof_scores_river\n", - "\n", - "fig, axs = plt.subplots(ncols=2, figsize=(10, 5))\n", - "\n", - "# First plot\n", - "radius1 = (X_scores1.max() - X_scores1) / (X_scores1.max() - X_scores1.min()) * (X_scores1 > 1.5)\n", - "axs[0].set_title(\"Local Outlier Factor - Sklearn \")\n", - "axs[0].scatter(X[:, 0], X[:, 1], color=\"k\", s=3.0, label=\"Data points\")\n", - "axs[0].axis(\"tight\")\n", - "axs[0].set_xlim((-5, 5))\n", - "axs[0].set_ylim((-5, 5))\n", - "axs[0].scatter(X[:, 0], X[:, 1], s=1000 * radius1, edgecolors=\"r\", facecolors=\"none\", label=\"Outlier scores\")\n", - "axs[0].legend(loc=\"upper left\")\n", - "\n", - "# Second plot\n", - "radius2 = (max(X_scores2) - X_scores2) / (max(X_scores2) - min(X_scores2)) * (X_scores2 > 1.5)\n", - "axs[1].set_title(\"Incremental Local Outlier Factor - River\")\n", - "axs[1].scatter(X[:, 0], X[:, 1], color=\"k\", s=3.0, label=\"Data points\")\n", - "axs[1].axis(\"tight\")\n", - "axs[1].set_xlim((-5, 5))\n", - "axs[1].set_ylim((-5, 5))\n", - "axs[1].scatter(X[:, 0], X[:, 1], s=1000 * radius2, edgecolors=\"r\", facecolors=\"none\", label=\"Outlier scores\")\n", - "axs[1].legend(loc=\"upper left\")\n", - "\n", - "plt.show()\n", - "\n", - "print('We observe that the difference is at machine error level:')\n", - "X_scores1 - X_scores2" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Time comparison\n", - "\n", - "Although not adapted to a batch approach River ILOF also allows to calculate a mini-batch approach using the function 'learn_many', here we compare the time of execution of ScikitLOF and RiverILOF considering both receiving a *batch_size* number of data points:\n", - "\n", - "ScikitLOF has applies methods like tree search that optimize the search for neighboors so its execution time remains more of less constant" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from river import datasets\n", - "import pandas as pd\n", - "from river.utils import dict2numpy\n", - "import time \n", - "dataset = pd.DataFrame(datasets.CreditCard())\n", - "dataset_np = [dict2numpy(i) for i in dataset[0].to_dict().values()]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We observe again that the error is of machine precision level: (the few errors at the scale of e-05 are because of the river minkowski-distance function error, I submitted a correction request, for details see bottom of this document \n" - ] - }, - { - "data": { - "text/plain": [ - "array([ 1.15640830e-12, 1.39670502e-05, 1.18705046e-12, 1.55497837e-12,\n", - " 4.09583478e-12, 1.33378808e-05, 1.28123493e-05, 1.81729894e-05,\n", - " 2.78910228e-12, 1.16189877e-05, 1.14542134e-05, 9.35752947e-03,\n", - " 1.18571819e-12, 1.33008174e-05, 2.86748403e-12, 1.16448999e-05,\n", - " 1.14404267e-05, 1.10114258e-05, 3.12239123e-12, 1.07421263e-05,\n", - " 5.79758463e-13, 1.48459023e-12, 1.02679237e-05, 1.10608636e-05,\n", - " 9.62921621e-06, 4.63407090e-13, 1.60071956e-12, -1.26565425e-13,\n", - " 1.02962083e-12, -1.02362563e-13, -1.12798659e-13, -8.31557045e-14,\n", - " -1.26343380e-13, -1.26343380e-13, -9.31477118e-14, -9.31477118e-14,\n", - " 5.65991698e-13, -1.12909682e-13, 5.81756865e-14, -6.90558721e-14,\n", - " 
5.17141885e-13, -1.15241150e-13, -3.32289751e-13, -1.55431223e-13,\n", - " -3.33177930e-13, -7.32747196e-14, -1.53210777e-13, -2.20934382e-13,\n", - " 5.41122702e-13, -7.88258347e-14, -1.08357767e-13, 3.73034936e-13,\n", - " -1.61870517e-13, -2.10720330e-13, -1.11577414e-13, 1.06137321e-13,\n", - " 7.18092252e-13, -2.16826557e-13, -1.96176408e-13, -1.94289029e-14,\n", - " 5.29576383e-13, 5.44231327e-13, 7.85815857e-13, 7.86259946e-13,\n", - " 5.29798427e-13, 1.77857729e-13, 1.84563476e-12, 2.95319325e-13,\n", - " 1.70974346e-12, 1.01563202e-12, 9.39026634e-13, 3.55271368e-15,\n", - " 1.93178806e-13, 1.67421632e-13, 4.48752147e-13, 1.11022302e-13,\n", - " 1.40487622e-12, 5.26911847e-13, 6.88338275e-14, 1.00586206e-13,\n", - " 2.87547763e-13, 1.34336986e-13, 7.30526750e-13, 3.91464638e-13,\n", - " 1.56761799e-04, 2.69784195e-13, 2.46469511e-13, 1.66405849e-04,\n", - " 1.58040050e-04, 2.77333712e-13, 3.08864045e-13, 1.92956762e-13,\n", - " 1.82164776e-04, 1.60346254e-04, -3.75195624e-04, 6.79012402e-13,\n", - " 1.57798215e-04, -2.95809327e-04, -3.33431024e-04, 6.43923737e-03,\n", - " 1.54669151e-04, -3.04159366e-04, 1.56511402e-04, 1.89848137e-13,\n", - " 1.55735689e-04, -2.70298576e-04, -5.93150343e-06, -5.30967388e-06,\n", - " 4.09087888e-04, 4.07393126e-04, 1.70752301e-13, -5.79448265e-06,\n", - " 4.07393126e-04, 4.07393126e-04, 4.07393126e-04, 4.07393126e-04,\n", - " 1.30118138e-13, 9.82103288e-13, 1.80522264e-13, -5.32003883e-06,\n", - " 9.96536187e-13, -5.26649721e-06, 2.24709140e-13, 2.70457805e-04,\n", - " 3.94795308e-13, 2.55833616e-02, 2.01616501e-13, -7.19974911e-06,\n", - " -4.16309377e-04, 2.26485497e-13, 1.36335387e-13, -1.56394049e-04,\n", - " -1.60504680e-04, 1.44551038e-13, 3.40172335e-13, 3.41948692e-13,\n", - " -6.06908744e-06, -1.60557147e-04, 9.34519349e-03, 3.09530179e-13,\n", - " 1.63424829e-13, -2.24538171e-04, 5.22248911e-13, -6.35571313e-04,\n", - " 4.31520099e-03, -1.72894609e-04, -1.73668432e-04, 4.70068429e-13,\n", - " 2.10026490e-02, 
7.59392549e-13, 1.70530257e-13, 5.21804822e-13,\n", - " 3.11006829e-04, 3.11648281e-04, 3.11801549e-04, 3.29088527e-04,\n", - " 3.30327983e-04, 9.14823772e-14, 2.27151631e-13, 2.56017429e-13,\n", - " 1.31450406e-13, 1.53210777e-13, 1.17873899e-05, 3.31071287e-04,\n", - " 2.02060590e-12, 6.01740879e-13, -5.12563388e-04, 1.92068583e-13,\n", - " -5.09268541e-04, 1.82076576e-13, 2.81108470e-13, 4.03899136e-13,\n", - " 2.44915199e-13, 2.04725126e-13, 1.65201186e-13, 1.01008091e-12,\n", - " 7.07878201e-13, 4.12336831e-13, -3.49806685e-04, 2.26929586e-13,\n", - " 9.29034627e-13, 4.90496532e-13, -3.37636089e-04, 3.67039732e-13,\n", - " 8.68860539e-13, 1.28319577e-12, 1.28341782e-13, -3.15081632e-04,\n", - " -3.19684248e-04, 2.52020627e-13, 1.59494640e-12, 1.46549439e-13,\n", - " 4.73399098e-13, 1.38620889e-02, -3.05438289e-04, -3.01573389e-04,\n", - " -2.90886979e-04, -3.03033530e-04, -2.91426910e-04, 6.49596881e-05,\n", - " -2.87462398e-04, 1.91446858e-12, 1.39372554e-04, -2.83187516e-04,\n", - " 6.25721697e-13, 1.22379275e-04, 2.41584530e-13, 1.40776280e-13,\n", - " 1.62048153e-12, -2.77279167e-04, 1.76725301e-12, 1.20264974e-04,\n", - " 4.20552482e-13, 1.44328993e-13, 1.19077456e-04, 1.18444812e-04,\n", - " 1.17944021e-04, 2.42117437e-12, -1.16573418e-14, -3.04312131e-13,\n", - " 1.18190262e-04, 1.18190262e-04, 1.18186866e-04, 1.18186866e-04,\n", - " 1.26787469e-13, 1.36677821e-04, -2.38211696e-04, 1.59650071e-13,\n", - " 1.14530607e-12, 8.85291840e-13, 2.47801779e-13, -2.49588575e-04,\n", - " -2.50087996e-04, -2.55130443e-04, 1.33448808e-13, 1.86295424e-13,\n", - " -2.62012944e-04, -2.68561161e-04, -2.62749009e-04, -3.02303230e-04,\n", - " 1.00364161e-13, -3.37650827e-04, 1.50029701e-02, 1.08113518e-12,\n", - " 1.47952174e-02, -2.90580271e-04, 1.00053299e-12, -2.91386478e-04,\n", - " 1.34847689e-12, 5.00932629e-13, -2.95171897e-04, 6.53255228e-13,\n", - " -3.02667832e-04, -2.98851904e-04, 7.90034704e-13, 4.60520511e-13,\n", - " 8.42437231e-13, 5.03597164e-13, 
-3.15720643e-04, 3.31956684e-13,\n", - " 5.34683409e-13, -3.06075277e-04, -3.06075277e-04, 1.41664458e-13,\n", - " 3.04645198e-13, 7.26085858e-14, 5.73319170e-13, 3.71258579e-13,\n", - " 1.19015908e-13, 3.38173933e-13, 1.11022302e-13, 5.12923037e-14,\n", - " 1.06670228e-12, 1.15463195e-14, 9.37694367e-13, -2.98649994e-14,\n", - " -2.78665979e-14, 4.21440660e-13, -2.26485497e-14, 5.71764858e-13,\n", - " -2.43138842e-14, 2.72448730e-13, 6.44595488e-13, 3.44169138e-13,\n", - " 8.35997938e-13, -4.45199433e-14, -6.62803146e-14, 3.33288952e-13,\n", - " 4.50528503e-13, -6.94999613e-14, -5.57331958e-14, -4.11892742e-14,\n", - " 9.54125667e-13, 5.40456568e-13, -8.16013923e-14, -9.27036226e-14,\n", - " 9.61453139e-14, 7.74047493e-13, -1.30562228e-13, -9.54791801e-14,\n", - " 4.21884749e-15, 1.77635684e-14, -2.06501483e-14, -2.59792188e-14,\n", - " -3.81916720e-14, -3.16413562e-14, 6.26165786e-13, 9.61453139e-14,\n", - " 2.50910404e-14, 5.21804822e-15, 1.06559206e-12, -4.18554080e-14,\n", - " 4.15223411e-13, 4.17887946e-13, 2.96873637e-13, 5.50892665e-13,\n", - " 1.03472786e-12, 9.06164033e-13, 4.37649916e-13, 1.73638881e-13,\n", - " 1.11466392e-12, 3.01758618e-13, 2.29816166e-13, -2.22044605e-15,\n", - " -2.88657986e-15, 7.31414929e-13, -2.57571742e-14, -3.37507799e-14,\n", - " -1.86517468e-14, 9.34807787e-14, 2.33146835e-14, 1.27675648e-14,\n", - " 7.54951657e-14, 4.63185046e-13, 9.41469125e-14, 8.59312621e-14,\n", - " 5.26245714e-14, 8.12683254e-14, -5.19584376e-14, -1.96509475e-14,\n", - " -7.96029909e-14, 1.24589228e-12, 3.65263375e-13, 4.01012556e-13,\n", - " 4.09894341e-13, -5.41788836e-14, 9.21485110e-14, -2.18047802e-13,\n", - " -4.78506124e-14, 3.14193116e-13, -1.68753900e-13, 2.09943174e-12,\n", - " -1.72306613e-13, 9.87210313e-13, 3.15747428e-13, -2.51243470e-13,\n", - " -1.96620498e-13, 2.12030393e-12, 7.80930876e-13, -3.35287353e-14,\n", - " -3.73034936e-14, -7.16093851e-14, 1.05560005e-12, -2.33146835e-15,\n", - " 5.35127498e-13, 4.10338430e-13, 
-2.55351296e-14, -1.18016708e-13,\n", - " -1.08912879e-13, -1.32893696e-13, 2.20135021e-12, -2.63122857e-14,\n", - " -3.04201109e-14, 6.73905376e-13, 1.53210777e-12, -3.90798505e-14,\n", - " 1.89404048e-13, -2.84217094e-14, 4.90940621e-13, -1.55431223e-15,\n", - " 5.88196158e-13, -6.69464484e-14, 4.00568467e-13, -3.74145159e-14,\n", - " 1.57096558e-12, -9.04831765e-14, -3.54161145e-14, 3.23518989e-13,\n", - " -9.40358902e-14, -5.65103520e-14, 1.43218770e-14, -5.08482145e-14,\n", - " 4.03010958e-13, 1.71951342e-12, -4.99600361e-14, 7.59392549e-13,\n", - " 3.19522186e-13, 7.03215264e-13, 3.57491814e-14, 2.75557355e-13,\n", - " 1.69819714e-12, 2.54463117e-13, 4.22106794e-13, 2.44693155e-13,\n", - " 1.29674049e-13, 8.52207194e-13, 1.23456800e-13, 3.59934305e-13,\n", - " 1.76103576e-12, -7.46069873e-14, 7.31859018e-13, -4.46309656e-14,\n", - " -4.98490138e-14, -4.76285678e-14, 1.48969725e-12, 5.50670620e-14,\n", - " 1.70974346e-13, -5.25135491e-14, -8.95949981e-14, 1.71196390e-13,\n", - " -8.58202398e-14, 2.91322522e-13, 7.87148124e-13, 1.45439216e-14,\n", - " 3.69038133e-13, -4.09672296e-14, -1.33226763e-14, -3.20854454e-14,\n", - " 2.87769808e-13, 7.26529947e-13, 1.87627691e-14, -2.17603713e-14,\n", - " 6.86339874e-13, 6.21724894e-14, 1.28275168e-12, 5.12923037e-14,\n", - " 2.65121258e-13, -8.32667268e-15, 3.20410365e-13, 3.34177130e-13,\n", - " 1.66533454e-14, 7.10542736e-15, 6.88338275e-15, -1.27675648e-14,\n", - " 1.15396581e-12, 7.19424520e-14, 1.11999299e-12, 9.21485110e-14,\n", - " 2.31370478e-13, 1.75015558e-12, 4.43423076e-13, 1.05648823e-12,\n", - " 1.61270997e-12, 1.94066985e-13, 1.09690035e-13, 3.71036535e-13,\n", - " 9.06830167e-13, 2.65343303e-13, 2.26263452e-13, 1.51434421e-13,\n", - " 2.95097280e-13, 4.04787315e-13, 1.21658239e-12, 1.68753900e-13,\n", - " 4.18776125e-13, 1.00786046e-12, 1.02917674e-12, 4.97379915e-13,\n", - " 1.37667655e-13, -8.93729535e-14, -2.68673972e-14, 6.99440506e-14,\n", - " 9.57900426e-13, -9.35918010e-14, -1.14352972e-14, 
-1.74305015e-14,\n", - " -1.28341782e-13, -5.66213743e-14, 3.70814490e-14, -1.52211577e-13,\n", - " -9.52571355e-14, 8.90176821e-13, -1.35558231e-13, 3.69926312e-13,\n", - " 5.91082738e-13, 8.71080985e-13, -1.44328993e-14, -9.37028233e-14,\n", - " 1.00142117e-13, 1.82587279e-12, -3.41948692e-14, -6.78346268e-14,\n", - " 7.98472399e-13, 1.94422256e-12, 8.75743922e-13, -7.72715225e-14,\n", - " -4.92939023e-14, 1.18238752e-12, -1.66977543e-13, -1.72750703e-13,\n", - " -1.84297022e-13, -1.36113343e-13, 1.28652644e-12, -1.73083770e-13,\n", - " -9.25926003e-14, 4.66293670e-14, -5.28466160e-14, 9.00612918e-13,\n", - " 5.26245714e-14, 4.75175455e-14, 5.24025268e-14, 1.82187598e-12,\n", - " 8.82405260e-13, 8.59312621e-14, 8.12683254e-14, 4.57411886e-14,\n", - " 1.14197540e-12, 2.68673972e-13, 1.59205982e-13, 5.24025268e-14,\n", - " 3.93018951e-14, 2.10942375e-14, 4.72955008e-14, 8.59312621e-14,\n", - " 3.08642001e-14, 6.41708908e-14, 7.94697641e-13, 6.04183370e-13,\n", - " 8.63753513e-13, 1.02851061e-12, 1.69420034e-13, 1.49658064e-13,\n", - " 2.25597319e-13, 9.03721542e-14, 1.03472786e-13, 1.28563826e-13,\n", - " 7.14095449e-13, 1.77191595e-13, 1.33670852e-13, 8.52429238e-13,\n", - " 1.37445610e-13, 3.09086090e-13, 5.19140286e-13, 1.45838897e-12,\n", - " 5.83977311e-14, 3.57491814e-14, 7.65609798e-13, 8.10018719e-13,\n", - " 6.41264819e-13, 4.50750548e-14, 8.82183215e-13, 4.66293670e-15,\n", - " 6.99440506e-13, 6.29274410e-13, 5.93969318e-13, 6.64801547e-13,\n", - " -4.90718577e-14, 3.17967874e-13, -1.48436818e-13, 7.86481991e-13,\n", - " -6.86117829e-14, -1.45994328e-13, -1.23900890e-13, -1.62647673e-13,\n", - " 6.07736084e-13, 5.29354338e-13, -1.01696429e-13, -6.72795153e-14,\n", - " 1.68753900e-14, -5.54001289e-14, -7.28306304e-14, -5.70654635e-14,\n", - " 5.81756865e-14, 7.53619389e-13, -8.97060204e-14, -6.83897383e-14,\n", - " -8.89288643e-14, 6.91890989e-13, 8.86180018e-13, 2.50466314e-13,\n", - " 9.17044218e-13, -5.76205750e-14, 3.83915122e-13, 9.21485110e-14,\n", - 
" -1.42885703e-13, -1.24344979e-14, -5.57331958e-14, -6.57252031e-14,\n", - " 2.10942375e-15, 7.87370169e-13, 4.75175455e-14, 4.58744154e-13,\n", - " 9.64561764e-13, 1.25899291e-13, 1.66977543e-13, 3.88356014e-13,\n", - " 1.23012711e-13, 9.45910017e-14, 2.44249065e-13, 1.00230935e-12,\n", - " 4.90940621e-13, 3.52606833e-13, 1.15907284e-13, 9.35918010e-13,\n", - " 5.37347944e-14, 8.06021916e-14, 8.74855743e-14, 6.37268016e-14,\n", - " 8.99058605e-13, 6.68354261e-14, 2.54907206e-13, 2.37587727e-14,\n", - " 1.73194792e-14, 6.63691324e-13, 1.69708692e-12, -1.33226763e-14,\n", - " -6.40598685e-14, 7.32747196e-15, -4.81836793e-14, 2.70006240e-13,\n", - " 7.53175300e-13, 4.79616347e-14, -2.12052598e-14, 6.06181771e-14,\n", - " 1.61870517e-13, -2.91988655e-14, -2.04281037e-14, -7.54951657e-14,\n", - " 7.10542736e-15, 1.86295424e-13, -7.27196081e-14, -1.99840144e-14,\n", - " 6.23945340e-14, 1.38999923e-13, -6.61692923e-14, 9.08162434e-14,\n", - " -4.79616347e-14, 1.23234756e-12, 8.19566637e-13, -1.23345778e-13,\n", - " 5.12256904e-13, -2.04281037e-14, 3.96349620e-13, -4.66293670e-14,\n", - " 2.58904009e-13, 1.16529009e-12, -1.00697228e-13, 7.03437308e-13,\n", - " -7.00550729e-14, -2.50910404e-14, -1.72084569e-14, 3.75255382e-14,\n", - " 1.65423231e-13, 2.44027021e-13, 8.50430837e-14, 7.50066675e-13,\n", - " -5.49560397e-14, 3.40172335e-13, -3.81916720e-14, -2.00950367e-14,\n", - " -7.38298311e-14, 2.48023824e-13, 3.59046126e-13, -3.70814490e-14,\n", - " 2.84217094e-13, 7.37410133e-13, 6.21724894e-14, 3.55493412e-13,\n", - " 5.50670620e-14, 1.39666056e-13, 9.16156040e-13, 2.96429548e-14,\n", - " 3.26405569e-14, 1.44328993e-13, 4.82280882e-13, 3.41948692e-14,\n", - " 3.35953487e-13, 3.03090886e-13, 2.90878432e-14, 4.26325641e-14,\n", - " 7.34967642e-14, 2.53130850e-13, 3.28181926e-13, 4.13225010e-13,\n", - " 4.92939023e-14, 2.52908805e-13, 7.19424520e-14, 9.05941988e-14,\n", - " 1.37667655e-13, 4.52082816e-13, 3.84359211e-13, -2.89768209e-14,\n", - " 3.67927910e-13, 
3.20188320e-13, 4.28101998e-13, 4.29212221e-13,\n", - " 5.54889468e-13, 1.07247544e-13, 2.40030218e-13, 7.03881398e-14,\n", - " 1.15685239e-13, 1.04583009e-13, 1.22790667e-13, 1.62092562e-13,\n", - " 8.41771097e-13, 1.69420034e-13, 5.42677014e-13, 4.29434266e-13,\n", - " 1.62980740e-13, 6.54143406e-13, 2.06501483e-13, 1.94511074e-13,\n", - " 8.22231172e-13, 5.83311177e-13, 9.64339719e-13, 8.65529870e-13,\n", - " 1.13686838e-13, 1.48547841e-13, 8.86180018e-13, 2.07389661e-13,\n", - " 7.99360578e-15, 6.17284002e-14, -2.02060590e-14, 1.62536651e-13,\n", - " 9.51239087e-13, 1.35313982e-12, 1.22590826e-12, 8.39328607e-14,\n", - " 1.18216548e-12, -2.07611706e-14, -1.81521465e-13, -1.28785871e-13,\n", - " 4.79838391e-13, 9.35029831e-13, -2.08055795e-13, -1.71751502e-13,\n", - " 1.42108547e-13, -1.88848936e-13, 2.00683914e-12, -1.43440815e-13,\n", - " 4.88942220e-13, -2.53019827e-13, -1.59205982e-13, -2.01838546e-13,\n", - " -7.30526750e-14, 3.79696274e-13, -2.36921593e-13, 4.46975790e-13,\n", - " 2.53552734e-12, -1.84297022e-13, -5.07371922e-14, 7.41851025e-13,\n", - " -8.01581024e-14, -7.90478794e-14, 3.17457172e-12, -2.64233080e-14,\n", - " -7.54951657e-15, 3.68594044e-14, 1.44551038e-13, 4.92939023e-14,\n", - " 1.26343380e-13, -2.76445533e-14, 6.29274410e-13, 1.84074977e-13,\n", - " 7.41406936e-13, 4.39648318e-13, 1.61648472e-13, 1.31827882e-12,\n", - " 9.06830167e-13, 3.14637205e-13, 2.80220291e-13, 2.40918396e-13,\n", - " 6.31938946e-13, 3.30846461e-13, 1.13908882e-13, 2.08277839e-13,\n", - " 1.51079149e-12, 2.36033415e-13, 4.70956607e-13, 1.88515870e-13,\n", - " 2.24043006e-13, 2.07345252e-12, 1.56763491e-13, 1.80744308e-13,\n", - " 1.13020704e-13, 7.68274333e-14, 5.41788836e-14, 1.73527859e-12,\n", - " 3.26405569e-14, 6.63913369e-14, 2.86437540e-14, 5.05595565e-13,\n", - " 7.92699240e-14, 9.11715148e-13, 6.88338275e-14, 3.12194715e-13,\n", - " 6.17284002e-14, 4.95159469e-13, 2.02060590e-14, 1.04782849e-12,\n", - " -1.78745907e-14, 1.04360964e-14, 1.04494191e-12, 
1.05138120e-12,\n", - " 1.11999299e-12, 3.80584453e-13, -1.20459198e-13, 4.38316050e-13,\n", - " 8.99280650e-14, 1.38777878e-13, -6.32827124e-14, -9.10382880e-15,\n", - " 1.48991930e-13, 1.48547841e-13, 4.44311254e-13, 1.44328993e-14,\n", - " 2.06279438e-13, 1.79856130e-14, 6.28386232e-14, 1.05693232e-13,\n", - " 4.33653113e-13, 9.89874849e-13, 6.03517236e-13, 4.05897538e-13,\n", - " 6.14397422e-13, 3.30624417e-13, 9.05275854e-13, 3.03090886e-13,\n", - " 7.83817455e-13, 4.95603558e-13, 6.00186567e-13, 3.87911925e-13,\n", - " 8.27782287e-13, 9.02389274e-13, 8.79740725e-13, 3.63487018e-13,\n", - " 8.07354184e-13, 5.88418203e-13, 2.70672373e-13, 6.51922960e-13,\n", - " 7.19424520e-13, 4.84057239e-13, 3.40394379e-13, 1.08801856e-13,\n", - " 6.52811138e-14, -5.98410210e-14, 4.94715380e-13, 3.73923115e-13,\n", - " -1.33337785e-13, 9.99200722e-13, 3.03090886e-14, 4.09006162e-13,\n", - " 1.14308563e-12, -2.00617301e-13, -1.39555034e-13, -5.17363929e-14,\n", - " -1.10356169e-13, -2.79998247e-13, -1.82964754e-13, -1.57762692e-13,\n", - " -1.05027098e-13, -2.20601315e-13, 1.17461596e-13, -1.93511873e-13,\n", - " 7.45847828e-13, 4.78728168e-13, -1.76192394e-13, -1.79856130e-13,\n", - " 4.99822406e-13, -1.25677246e-13, -1.25677246e-13, 8.32667268e-15,\n", - " 5.61772850e-13, 1.22346577e-13, 4.88720175e-13, 2.11164419e-13,\n", - " 7.60724816e-13, 4.28768132e-13, 1.58983937e-13, 1.90514271e-13,\n", - " 3.43947093e-13, 1.44151358e-12, 1.04583009e-13, 6.80344669e-13,\n", - " 2.16493490e-13, 3.59712260e-14, 1.49347201e-12, 2.22488694e-13,\n", - " 1.38178358e-12, 1.08801856e-14, 1.11022302e-15, 3.45501405e-13,\n", - " 2.18269847e-13, 4.09894341e-13, 1.48547841e-13, 3.16857651e-13,\n", - " 1.25144339e-12, 8.00470801e-13, 9.14823772e-14, 7.90478794e-14,\n", - " 4.57411886e-14, 6.92779167e-14, 1.46771484e-13, 2.04725126e-13,\n", - " 5.47339951e-13, 1.23234756e-13, 5.10702591e-13, 6.63913369e-14,\n", - " 2.73114864e-13, 7.13651360e-13, 2.00950367e-14, 1.41420209e-12,\n", - " 
1.36135547e-12, 1.72972747e-13, 1.56896718e-12, 1.88737914e-14,\n", - " 1.96287431e-13, 2.83773005e-13, 2.08943973e-13, 1.38933309e-12,\n", - " 8.34887715e-14, 9.01501096e-14, 2.85105273e-13, 1.82520665e-13,\n", - " 4.42756942e-13, 6.43929354e-14, 5.73097125e-13, 7.37854222e-13,\n", - " 6.11510842e-13, 3.34621220e-13, 4.66293670e-15, 9.99200722e-15,\n", - " 1.15241150e-13, 4.81170659e-13, 2.04281037e-14, 5.53779245e-13,\n", - " -1.13242749e-14, 1.11022302e-13, 3.55271368e-14, 1.39888101e-13,\n", - " 1.73372428e-12, 7.27862215e-13, -4.58522109e-14, 2.66475730e-12,\n", - " -1.11466392e-13, -1.00697228e-13, -1.11133325e-13, -8.84847751e-14,\n", - " 4.67403893e-13, -1.42441614e-13, 2.68740585e-12, -1.99840144e-14,\n", - " 1.22790667e-13, -7.39408534e-14, 1.23900890e-13, 1.26876287e-12,\n", - " 8.65973959e-13, 7.48512363e-13, 7.79376563e-14, -1.61537450e-13,\n", - " 3.44169138e-14, -1.08135723e-13, -3.49831275e-13, -1.39999123e-13,\n", - " 4.22550883e-13, -2.65343303e-13, -2.37920794e-13, -1.22013510e-13,\n", - " -1.39555034e-13, 2.35012010e-12, -1.91957561e-13, 1.01252340e-13,\n", - " 7.72271136e-13, -5.12923037e-14, 1.06137321e-12, 7.55395746e-13,\n", - " 2.77555756e-15, 8.32001135e-13, 1.55520041e-12, 1.22124533e-13,\n", - " 1.03939080e-12, 3.17523785e-14, 2.82662782e-13, 1.03916875e-13,\n", - " 1.14552812e-12, 1.16573418e-12, 6.48148202e-13, 2.19824159e-13,\n", - " 4.03632683e-12, 4.03677092e-13, 4.66826577e-12, 7.95807864e-13,\n", - " 3.24407168e-13, 3.04645198e-13, 3.57047725e-13, 7.15871806e-13,\n", - " 8.36664071e-13, 1.99174011e-12, 5.36015676e-12, 7.07878201e-13])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "k = 20\n", - "batch_sizes = [20,50,100,200,400,1000]\n", - "\n", - "time_river = []\n", - "time_scikit = []\n", - "for batch_size in batch_sizes:\n", - " #River\n", - " start_time_r = time.time()\n", - " ilof_river2 = ilof.ILOF(k, verbose=False)\n", - " 
ilof_river2.learn_many(dataset[0:batch_size])\n", - " time_river.append(time.time() - start_time_r)\n", - " ilof_scores_river2 = np.array([v for v in ilof_river2.lof.values()])\n", - "\n", - " #Scikit\n", - " start_time_s = time.time()\n", - " lof_scikit2 = LocalOutlierFactor(n_neighbors=k)\n", - " lof_scikit2.fit_predict(dataset_np[0:batch_size])\n", - " time_scikit.append(time.time() - start_time_s)\n", - " lof_scores_scikit2 = - lof_scikit2.negative_outlier_factor_\n", - "\n", - "#Compare\n", - "print('We observe again that the error is of machine precision level: \\\n", - " (the few errors at the scale of e-05 are because of the river minkowski-distance function error, \\\n", - " I submitted a correction request, for details see bottom of this document ')\n", - "ilof_scores_river2 - lof_scores_scikit2" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "# Plotting the bar graph\n", - "plt.bar(np.array(range(len(time_river)))*1.5+0.25, time_river, width=0.4, label='River ILOF')\n", - "plt.bar(np.array(range(len(time_river)))*1.5-0.25, time_scikit, width=0.4, label='Scikit LOF')\n", - "plt.xticks(np.array(range(len(time_river)))*1.5, batch_sizes)\n", - "plt.xlabel('Batch size')\n", - "plt.ylabel('Execution Time (seconds)')\n", - "plt.title('Mini-batch mode Time Comparison')\n", - "plt.yscale('log')\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RiverILOF time: [5.1155877113342285] ; ScikitLOF time: [0.22044944763183594] \n", - " We observe that RiverILOF gets slower as the number of points learned increases, since it calculates the distance of the new point to all other ones each iteration\n" - ] - } - ], - "source": [ - "#Time to add new points on existing model with 1000 points\n", - "#River\n", - "time_r = []\n", - "start_t_r = time.time()\n", - "for x in dataset[0][1200:1300]:\n", - " ilof_river2.learn_one(x)\n", - "time_r.append(time.time() - start_t_r)\n", - "\n", - "#Scikit\n", - "time_s = []\n", - "start_t_s = time.time()\n", - "lof_scikit2.novelty = True\n", - "lof_scores_scikit2 = lof_scikit2.score_samples(dataset_np[1200:1300])\n", - "time_s.append(time.time() - start_t_s)\n", - "\n", - "print('RiverILOF time:', time_r,'; ScikitLOF time:', time_s, '\\n We observe that RiverILOF gets slower as the number of points learned increases,\\\n", - "since it calculates the distance of the new point to all other ones each iteration.')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13.974262055650739 195.28000000000003\n" - ] - } - 
], - "source": [ - "#River Minkowski_distance error\n", - "\n", - "import functools\n", - "from river import utils\n", - "#from river.neighbors.base import DistanceFunc\n", - "from river.utils import VectorDict\n", - "\n", - "distancefunc = functools.partial(utils.math.minkowski_distance, p=2)\n", - "a={1: 1.5, 2: 3.5}\n", - "b={1: -0.7, 2: -10.3}\n", - "print(((1.5+0.7)**2+(3.5+10.3)**2)**(1/2), distancefunc(a,b))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}