Commit: test
Yaxuan Wang authored and Yaxuan Wang committed Jan 12, 2025
0 parents commit 3e9d15c
Showing 94 changed files with 7,788 additions and 0 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -0,0 +1,20 @@
# [KDD 25] NRdetector: Noise-Resilient Point-wise Anomaly Detection in Time Series using Weak Segment Labels

This repository provides the implementation of NRdetector (Noise-Resilient Point-wise Anomaly Detection in Time Series using Weak Segment Labels).

## Abstract
Detecting anomalies in temporal data has gained significant attention across various real-world applications, aiming to identify unusual events and mitigate potential hazards.
In practice, the available supervision is often a mix of segment-level labels (detected abnormal events spanning segments of time points) and unlabeled data (undetected events), while the ideal algorithmic output is point-level predictions. This large gap in label information between the training data and the target makes the task challenging.
In this study, we formulate this imperfect supervision as noisy labels and propose NRdetector, a noise-resilient framework that combines confidence-based sample selection, robust segment-level learning, and data-centric point-level detection for multivariate time series anomaly detection.
In particular, to bridge the gap between noisy segment-level labels and missing point-level labels, we develop a novel loss function that effectively mitigates label noise while exploiting temporal structure: it encourages the smoothness of consecutive points and the separability of points drawn from segments with different labels.
Extensive experiments on real-world multivariate time series datasets, evaluated with 11 different metrics, demonstrate that NRdetector consistently achieves robust results, outperforming various baselines adapted to operate in our setting.
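
For intuition, the sketch below shows the two properties the loss encourages, in simplified form; the exact formulation is given in the paper, and every name here (`smoothness_penalty`, `separability_penalty`, the margin value) is illustrative rather than the repository's actual API:

```python
import torch
import torch.nn.functional as F

def smoothness_penalty(scores):
    # Encourage consecutive point-level anomaly scores within a segment
    # to change slowly (temporal smoothness).
    return torch.mean((scores[:, 1:] - scores[:, :-1]) ** 2)

def separability_penalty(pos_scores, neg_scores, margin=1.0):
    # Push point scores from positively labeled segments above those from
    # negatively labeled segments by at least a margin (separability).
    return F.relu(margin - (pos_scores.mean() - neg_scores.mean()))
```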

## Quick Start

Train and evaluate the model. You can reproduce the experimental results as follows:

```bash
python main.py
```
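
The commit also ships the EMG dataset and a pretrained checkpoint; based on the file listing below, the expected layout is:

```
data/EMG/
├── train/  data.npy, label.npy
├── valid/  data.npy, label.npy
└── test/   data.npy, label.npy
checkpints/EMG_best_model.pth
```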

Binary file added checkpints/EMG_best_model.pth
Binary file not shown.
Binary file added data/EMG/test/data.npy
Binary file not shown.
Binary file added data/EMG/test/label.npy
Binary file not shown.
Binary file added data/EMG/train/data.npy
Binary file not shown.
Binary file added data/EMG/train/label.npy
Binary file not shown.
Binary file added data/EMG/valid/data.npy
Binary file not shown.
Binary file added data/EMG/valid/label.npy
Binary file not shown.
170 changes: 170 additions & 0 deletions data_loader.py
@@ -0,0 +1,170 @@

import os

import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import ConcatDataset, DataLoader, Dataset

from sklearn.preprocessing import StandardScaler


class SegLoader(Dataset):
    """Loads one data split and cuts it into fixed-size segments."""

    def __init__(self, data_dir, split_size, split, **kwargs):
        # split: 'train', 'valid', or 'test'
        super().__init__()
        self.data_dir = data_dir
        self.split_size = split_size
        self._load_data(data_dir, split)

    def _load_data(self, data_dir, split):
        data, dlabel, wlabel = [], [], []
        data_ = np.load(os.path.join(data_dir, split, "data.npy"))
        label_ = np.load(os.path.join(data_dir, split, "label.npy"))

        # PSM stores its labels with an extra dimension.
        if self.data_dir.split('/')[-1] == 'PSM':
            label_ = label_.squeeze()
        data_, dlabel_, wlabel_ = self._preprocess(data_, label_, self.split_size)

        data.append(data_)
        dlabel.append(dlabel_)
        wlabel.append(wlabel_)
        self.data = torch.cat(data, dim=0)
        self.dlabel = torch.cat(dlabel, dim=0)  # point-level (dense) labels
        self.wlabel = torch.cat(wlabel, dim=0)  # segment-level (weak) labels

    def _preprocess(self, data, label, split_size):
        # Normalize each feature to zero mean and unit variance.
        scaler = StandardScaler()
        scaler.fit(data)
        data = scaler.transform(data)

        # Left-pad with zeros so the length is a multiple of split_size
        # (no padding when it already is), then cut the series into
        # non-overlapping segments of shape (split_size, n_features).
        pad_len = (-data.shape[0]) % split_size
        data = torch.Tensor(data)
        data = F.pad(data, (0, 0, pad_len, 0), 'constant', 0)
        data = torch.unsqueeze(data, dim=0)
        data = torch.cat(torch.split(data, split_size, dim=1), dim=0)

        label = torch.Tensor(label)
        label = F.pad(label, (pad_len, 0), 'constant', 0)
        label = torch.unsqueeze(label, dim=0)
        label = torch.cat(torch.split(label, split_size, dim=1), dim=0)

        # A segment is anomalous (weak label 1) if any of its points is.
        dlabel = label
        wlabel = torch.max(label, dim=1)[0]

        return data, dlabel, wlabel

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'data': self.data[idx],
            'dlabel': self.dlabel[idx],
            'wlabel': self.wlabel[idx]
        }

class PULoader(Dataset):
    """Wraps pre-built (sample, label) pairs, e.g. for PU learning."""

    def __init__(self, samples, labels):
        self.samples = samples
        self.labels = labels

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        return self.samples[index], self.labels[index]



def get_segment(args):
    train_dataset = SegLoader(args.data_dir, args.win_size, 'train')
    valid_dataset = SegLoader(args.data_dir, args.win_size, 'valid')
    test_dataset = SegLoader(args.data_dir, args.win_size, 'test')
    # The whole dataset, used later for point-level detection.
    combined_dataset = ConcatDataset([train_dataset, valid_dataset, test_dataset])

    # batch_size=10000 exceeds the number of segments, so each of these
    # loaders yields its entire split in a single batch.
    train_loader = DataLoader(dataset=train_dataset, batch_size=10000, shuffle=False, num_workers=1)
    valid_loader = DataLoader(dataset=valid_dataset, batch_size=10000, shuffle=False, num_workers=1)
    test_loader = DataLoader(dataset=test_dataset, batch_size=10000, shuffle=False, num_workers=1)

    dataloader = DataLoader(dataset=combined_dataset, batch_size=args.batch_size, shuffle=False, num_workers=1)

    print("The information of the dataset:")
    for train_batch in train_loader:
        train_data = train_batch['data']
        train_wlabel = train_batch['wlabel']
        train_dlabel = train_batch['dlabel']
        print("The size of train_data:", train_data.size())
        print("The num of anomaly instances:", torch.sum(train_wlabel))
    for valid_batch in valid_loader:
        valid_data = valid_batch['data']
        valid_wlabel = valid_batch['wlabel']
        valid_dlabel = valid_batch['dlabel']
        print("The size of valid_data:", valid_data.size())
        print("The num of anomaly instances:", torch.sum(valid_wlabel))
    for test_batch in test_loader:
        test_data = test_batch['data']
        test_wlabel = test_batch['wlabel']
        test_dlabel = test_batch['dlabel']
        print("The size of test_data:", test_data.size())
        print("The num of anomaly instances:", torch.sum(test_wlabel))
        print("The num of anomaly points:", torch.sum(test_dlabel))
    data_ = torch.cat((train_data, valid_data, test_data), dim=0)
    wlabel_ = torch.cat((train_wlabel, valid_wlabel, test_wlabel), dim=0)
    dlabel_ = torch.cat((train_dlabel, valid_dlabel, test_dlabel), dim=0)
    print("The whole data size:", data_.size())  # e.g. torch.Size([1452, 100, 8])

    data = data_.tolist()
    wlabel = wlabel_.tolist()
    dlabel = dlabel_.tolist()

    # Split the 1-based segment indices of each split into anomalous (*_i)
    # and normal (*_n) groups according to the weak labels.
    train_i, train_n = [], []
    valid_i, valid_n = [], []
    test_i, test_n = [], []
    for i, w in enumerate(train_wlabel.tolist()):
        (train_i if w > 0 else train_n).append(i + 1)
    for i, w in enumerate(valid_wlabel.tolist()):
        (valid_i if w > 0 else valid_n).append(i + 1)
    for i, w in enumerate(test_wlabel.tolist()):
        (test_i if w > 0 else test_n).append(i + 1)

    return train_i, train_n, valid_i, valid_n, test_i, test_n, data, wlabel, dataloader, dlabel
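
For orientation, here is a minimal sketch of how these pieces fit together; the `args` namespace is hypothetical (it only carries the three fields `get_segment` actually reads), and the `PULoader` usage is illustrative rather than the pipeline's exact selection step:

```python
from types import SimpleNamespace
from torch.utils.data import DataLoader
from data_loader import PULoader, get_segment

# Hypothetical arguments; main.py presumably builds an equivalent object.
args = SimpleNamespace(data_dir='data/EMG', win_size=100, batch_size=64)

# 1-based segment indices split into anomalous (*_i) and normal (*_n) per
# split, plus flattened data/labels and a loader over all segments.
(train_i, train_n, valid_i, valid_n, test_i, test_n,
 data, wlabel, dataloader, dlabel) = get_segment(args)

# PULoader can then wrap any (samples, labels) subset selected from these.
pu_loader = DataLoader(PULoader(data, wlabel), batch_size=64, shuffle=True)
```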


194 changes: 194 additions & 0 deletions evaluation.py
@@ -0,0 +1,194 @@
import numpy as np
import pandas as pd
from itertools import groupby
from operator import itemgetter

from tadpak import pak
from tadpak import evaluate
from metrics.metrics import *

from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


def detection_adjustment(results, labels):
    """Point adjustment: once any point inside a ground-truth anomaly
    segment is predicted, the whole segment counts as detected."""
    pred = list(results)
    gt = list(labels)

    anomaly_state = False
    for i in range(len(gt)):
        if gt[i] == 1 and pred[i] == 1 and not anomaly_state:
            anomaly_state = True
            # Propagate the detection backwards to the start of the segment.
            for j in range(i, 0, -1):
                if gt[j] == 0:
                    break
                if pred[j] == 0:
                    pred[j] = 1
            # Propagate the detection forwards to the end of the segment.
            for j in range(i, len(gt)):
                if gt[j] == 0:
                    break
                if pred[j] == 0:
                    pred[j] = 1
        elif gt[i] == 0:
            anomaly_state = False
        if anomaly_state:
            pred[i] = 1
    return pred, gt


def evaluation(results, labels):
    # Confusion matrix (rows: true class, columns: predicted class).
    matriz_confusao = pd.DataFrame(0, columns=['pred_pos', 'pred_neg'], index=['classe_pos', 'classe_neg'])
    for l in range(len(results)):
        rotulo_original = 'classe_pos' if labels[l] == 1 else 'classe_neg'
        predito = 'pred_pos' if results[l] == 1 else 'pred_neg'
        matriz_confusao.loc[rotulo_original, predito] += 1

    print(matriz_confusao)

    positive_total = matriz_confusao['pred_pos'].sum()  # total predicted positive
    true_positive_news = matriz_confusao.loc['classe_pos', 'pred_pos']
    print("======================Evaluation===========================")
    f = open('avaliados.csv', 'a')
    TP = matriz_confusao.loc['classe_pos', 'pred_pos']
    FP = matriz_confusao.loc['classe_neg', 'pred_pos']
    TN = matriz_confusao.loc['classe_neg', 'pred_neg']
    FN = matriz_confusao.loc['classe_pos', 'pred_neg']
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = (2 * precision * recall) / (precision + recall)
    f.write("-------------------------\n")
    f.write("\t".join([str(f1_score(labels, results, average='macro')), str(f1_score(labels, results, average='micro')),
                       str(accuracy_score(labels, results)), str(precision_score(labels, results, average='macro')),
                       str(precision_score(labels, results, average='micro')), str(recall_score(labels, results, average='macro')),
                       str(recall_score(labels, results, average='micro')), str(true_positive_news / positive_total),
                       str(precision), str(recall), str(f1)]) + '\n')
    f.close()

    print('Macro f1:', f1_score(labels, results, average='macro'))
    print('Micro f1:', f1_score(labels, results, average='micro'))
    print('accuracy:', accuracy_score(labels, results))
    print('Macro Precision:', precision_score(labels, results, average='macro'))
    print('Micro Precision:', precision_score(labels, results, average='micro'))
    print('Macro Recall:', recall_score(labels, results, average='macro'))
    print('Micro Recall:', recall_score(labels, results, average='micro'))
    print('True Positive:', true_positive_news / positive_total)
    print('Interest-class precision:', precision)
    print('Interest-class recall:', recall)
    print('Interest-class f1:', f1)
    return f1, precision, recall


def pak_auc(results, labels):
    """Average best F1 over the PA%K adjustment for K in {0, 0.1, ..., 1}."""
    scores = np.array(results)
    targets = np.array(labels)
    thres = 0.5

    candidate = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    f1 = []
    for k in candidate:
        adjusted_preds = pak.pak(scores, targets, thres, k)
        # Map the boolean adjusted predictions to 0/1.
        for item in range(len(adjusted_preds)):
            adjusted_preds[item] = 1 if adjusted_preds[item] else 0
        # Best F1 along the precision-recall curve.
        precision, recall, threshold = metrics.precision_recall_curve(targets, adjusted_preds)
        f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
        f1.append(np.max(f1_curve))
    auc = sum(f1) / len(f1)
    return auc


def f1_prec_recall_PA(preds, gts, k=0):
    # Find the indices of the anomalies.
    gt_idx = np.where(gts == 1)[0]
    anomalies = []
    new_preds = np.array(preds)
    # Group consecutive indices into anomaly segments.
    for _, g in groupby(enumerate(gt_idx), lambda x: x[0] - x[1]):
        anomalies.append(list(map(itemgetter(1), g)))
    # For each anomaly (point or sequence) in the test dataset:
    for a in anomalies:
        # Predictions covering this anomaly.
        pred_labels = new_preds[a]
        # If more than K% of the anomaly's points are predicted,
        # count the whole range as correctly predicted.
        if len(np.where(pred_labels == 1)[0]) / len(a) > (k / 100):
            new_preds[a] = 1
    f1_pa = f1_score(gts, new_preds)
    prec_pa = precision_score(gts, new_preds)
    recall_pa = recall_score(gts, new_preds)
    return f1_pa, prec_pa, recall_pa


def f1_prec_recall_K(preds, gts):
    f1_pa_k = []
    prec_pa_k = []
    recall_pa_k = []
    for k in range(0, 101):
        f1_pa, prec_pa, recall_pa = f1_prec_recall_PA(preds, gts, k)
        f1_pa_k.append(f1_pa)
        prec_pa_k.append(prec_pa)
        recall_pa_k.append(recall_pa)
    # Area under each metric-vs-K curve, normalized to [0, 1].
    f1_pak_auc = np.trapz(f1_pa_k)
    prec_pak_auc = np.trapz(prec_pa_k)
    recall_pak_auc = np.trapz(recall_pa_k)
    return f1_pak_auc / 100, prec_pak_auc / 100, recall_pak_auc / 100


def get_all_res(results, labels):
    f_score, precision, recall = evaluation(results, labels)
    print("Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(precision, recall, f_score))

    f1_pak_auc, prec_pak_auc, recall_pak_auc = f1_prec_recall_K(results, labels)
    print("Precision pak : {:0.4f}, Recall pak : {:0.4f}, F-score pak : {:0.4f} ".format(prec_pak_auc, recall_pak_auc, f1_pak_auc))

    matrix = [137]
    scores_simple = combine_all_evaluation_scores(np.array(results), np.array(labels))
    for key, value in scores_simple.items():
        matrix.append(value)
        print('{0:21} : {1:0.4f}'.format(key, value))

    pred, gt = detection_adjustment(results, labels)
    from sklearn.metrics import precision_recall_fscore_support

    accuracy = accuracy_score(gt, pred)
    pa_precision, pa_recall, pa_f_score, support = precision_recall_fscore_support(gt, pred, average='binary')
    print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(accuracy, pa_precision, pa_recall, pa_f_score))
    return f_score, precision, recall, f1_pak_auc, prec_pak_auc, recall_pak_auc, pa_f_score, pa_precision, pa_recall


def cal_mean_matrix(results_list):
    res = np.mean(np.array(results_list), axis=0)
    print("===================== Average Results =====================")
    print("Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(res[1], res[2], res[0]))
    print("Precision Pak : {:0.4f}, Recall Pak : {:0.4f}, F-score Pak : {:0.4f} ".format(res[4], res[5], res[3]))
    print("Precision Pa : {:0.4f}, Recall Pa : {:0.4f}, F-score Pa : {:0.4f} ".format(res[7], res[8], res[6]))
    print("===========================================================")


