Commit 3e9d15c by Yaxuan Wang, committed Jan 12, 2025 (0 parents)
Showing 94 changed files with 7,788 additions and 0 deletions.
@@ -0,0 +1,20 @@
# [KDD 25] NRdetector: Noise-Resilient Point-wise Anomaly Detection in Time Series using Weak Segment Labels

This repository provides the implementation of NRdetector: Noise-Resilient Point-wise Anomaly Detection in Time Series using Weak Segment Labels.
## Abstract

Detecting anomalies in temporal data has gained significant attention across various real-world applications, aiming to identify unusual events and mitigate potential hazards. In practice, the available supervision is often a mix of segment-level labels (detected abnormal events covering segments of time points) and unlabeled data (undetected events), while the ideal algorithmic output is point-level predictions; this large gap between the label information in the training data and the targets makes the task challenging. In this study, we formulate this imperfect information as noisy labels and propose NRdetector, a noise-resilient framework that incorporates confidence-based sample selection, robust segment-level learning, and data-centric point-level detection for multivariate time series anomaly detection. In particular, to bridge the information gap between noisy segment-level labels and missing point-level labels, we develop a novel loss function that effectively mitigates label noise and accounts for temporal structure: it encourages smoothness between consecutive points and separability between points from segments with different labels. Extensive experiments on real-world multivariate time series datasets with 11 different evaluation metrics demonstrate that NRdetector consistently achieves robust results, outperforming various baselines adapted to operate in our setting.
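As a rough illustration of the two ingredients the abstract names (smoothness of consecutive point scores, and separability between segments with different labels), a minimal sketch could look like the following. This is not the actual NRdetector loss; the tensor shapes and the margin value are assumptions for illustration only.

```
import torch
import torch.nn.functional as F

def smoothness_separability_loss(point_scores, seg_labels, margin=1.0):
    # point_scores: (B, T) anomaly scores for each point in each segment
    # seg_labels:   (B,) weak segment labels (1 = contains an anomaly)

    # Smoothness: penalize large jumps between consecutive point scores.
    smooth = ((point_scores[:, 1:] - point_scores[:, :-1]) ** 2).mean()

    # Separability: push the mean score of positive segments above that
    # of negative segments by a margin.
    seg_scores = point_scores.mean(dim=1)
    pos, neg = seg_scores[seg_labels == 1], seg_scores[seg_labels == 0]
    sep = point_scores.new_zeros(())
    if len(pos) > 0 and len(neg) > 0:
        sep = F.relu(margin - (pos.mean() - neg.mean()))
    return smooth + sep
```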
## Quick Start

Train and evaluate. You can reproduce the experimental results as follows:

```
python main.py
```
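Judging from the data loader below (`SegLoader._load_data`), each dataset directory is expected to contain per-split `data.npy` and `label.npy` files; this layout is an inference from the code, not documented usage:

```
<data_dir>/              # e.g. data/PSM
├── train/
│   ├── data.npy         # (n_points, n_features)
│   └── label.npy        # (n_points,) 0/1 point labels
├── valid/
└── test/
```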
(7 binary files not shown)
@@ -0,0 +1,170 @@
import os
import numpy as np

import torch
import torch.nn.functional as f
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset

from sklearn.preprocessing import StandardScaler


class SegLoader(Dataset):
    def __init__(self, data_dir, split_size, split, **kwargs):
        # split: 'train', 'valid', or 'test'
        super().__init__()
        self.data_dir = data_dir
        self.split_size = split_size
        self._load_data(data_dir, split)

    def _load_data(self, data_dir, split):
        data, dlabel, wlabel = [], [], []
        data_ = np.load(os.path.join(data_dir, split) + "/data.npy")
        label_ = np.load(os.path.join(data_dir, split) + "/label.npy")

        # PSM stores labels with an extra trailing dimension; flatten it.
        if self.data_dir.split('/')[-1] == 'PSM':
            label_ = label_.squeeze()
        data_, dlabel_, wlabel_ = self._preprocess(data_, label_, self.split_size)

        data.append(data_)
        dlabel.append(dlabel_)
        wlabel.append(wlabel_)
        self.data = torch.cat(data, dim=0)      # (n_windows, split_size, n_features)
        self.dlabel = torch.cat(dlabel, dim=0)  # point-level (dense) labels
        self.wlabel = torch.cat(wlabel, dim=0)  # segment-level (weak) labels

    def _preprocess(self, data, label, split_size):
        # Normalize each feature to zero mean and unit variance.
        scaler = StandardScaler()
        scaler.fit(data)
        data = scaler.transform(data)

        # Left-pad with zeros so the length is a multiple of split_size
        # (the outer modulo avoids padding a full extra window when the
        # length already divides evenly), then cut into non-overlapping
        # windows of shape (split_size, n_features).
        pad_len = (split_size - data.shape[0] % split_size) % split_size
        data = torch.Tensor(data)
        data = f.pad(data, (0, 0, pad_len, 0), 'constant', 0)
        data = torch.unsqueeze(data, dim=0)
        data = torch.cat(torch.split(data, split_size, dim=1), dim=0)

        label = torch.Tensor(label)
        label = f.pad(label, (pad_len, 0), 'constant', 0)
        label = torch.unsqueeze(label, dim=0)
        label = torch.cat(torch.split(label, split_size, dim=1), dim=0)

        # dlabel keeps the per-point labels; wlabel is 1 iff any point in
        # the window is anomalous (the weak segment label).
        dlabel = label
        wlabel = torch.max(label, dim=1)[0]

        return data, dlabel, wlabel

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'data': self.data[idx],
            'dlabel': self.dlabel[idx],
            'wlabel': self.wlabel[idx]
        }
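A quick way to sanity-check the windowing above, using synthetic data written to a temporary directory. The file layout mirrors `_load_data`; the shapes and the window size of 100 are arbitrary illustrative choices, not values from this repository:

```
import os, tempfile
import numpy as np

with tempfile.TemporaryDirectory() as root:
    os.makedirs(os.path.join(root, 'train'))
    np.save(os.path.join(root, 'train', 'data.npy'), np.random.randn(250, 8))
    np.save(os.path.join(root, 'train', 'label.npy'), np.random.randint(0, 2, 250))
    ds = SegLoader(root, split_size=100, split='train')
    item = ds[0]
    # 250 points are left-padded to 300 and cut into 3 windows of 100.
    print(len(ds), item['data'].shape)  # -> 3 torch.Size([100, 8])
```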
class PULoader(Dataset):
    """Thin Dataset wrapper pairing fixed samples with their labels."""

    def __init__(self, samples, labels):
        self.samples = samples
        self.labels = labels

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        sample = self.samples[index]
        label = self.labels[index]
        return sample, label
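For instance, PULoader can feed fixed sample/label pairs to a DataLoader; the embedding size here is an arbitrary illustrative choice:

```
import torch
from torch.utils.data import DataLoader

feats = torch.randn(32, 64)             # e.g. per-segment representations
pu_labels = torch.randint(0, 2, (32,))  # 1 = labeled positive, 0 = unlabeled
loader = DataLoader(PULoader(feats, pu_labels), batch_size=8, shuffle=True)
for sample, label in loader:
    pass  # feed into the segment-level training objective
```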
def get_segment(args):
    train_dataset = SegLoader(args.data_dir, args.win_size, 'train')
    valid_dataset = SegLoader(args.data_dir, args.win_size, 'valid')
    test_dataset = SegLoader(args.data_dir, args.win_size, 'test')
    # The whole dataset, concatenated across splits.
    combined_dataset = ConcatDataset([train_dataset, valid_dataset, test_dataset])

    # batch_size=10000 is meant to exceed the number of windows per split,
    # so each of these loaders yields its full split in a single batch.
    train_loader = DataLoader(dataset=train_dataset, batch_size=10000, shuffle=False, num_workers=1)
    valid_loader = DataLoader(dataset=valid_dataset, batch_size=10000, shuffle=False, num_workers=1)
    test_loader = DataLoader(dataset=test_dataset, batch_size=10000, shuffle=False, num_workers=1)

    dataloader = DataLoader(dataset=combined_dataset, batch_size=args.batch_size, shuffle=False, num_workers=1)

    print("The information of the dataset:")
    for _, train_batch in enumerate(train_loader):
        train_data = train_batch['data']
        train_wlabel = train_batch['wlabel']
        train_dlabel = train_batch['dlabel']
        print("The size of train_data:", train_data.size())
        print("The num of anomaly instances:", torch.sum(train_wlabel))
    for _, valid_batch in enumerate(valid_loader):
        valid_data = valid_batch['data']
        valid_wlabel = valid_batch['wlabel']
        valid_dlabel = valid_batch['dlabel']
        print("The size of valid_data:", valid_data.size())
        print("The num of anomaly instances:", torch.sum(valid_wlabel))
    for _, test_batch in enumerate(test_loader):
        test_data = test_batch['data']
        test_wlabel = test_batch['wlabel']
        test_dlabel = test_batch['dlabel']
        print("The size of test_data:", test_data.size())
        print("The num of anomaly instances:", torch.sum(test_wlabel))
        print("The num of anomaly points:", torch.sum(test_dlabel))
    data_ = torch.cat((train_data, valid_data, test_data), dim=0)
    wlabel_ = torch.cat((train_wlabel, valid_wlabel, test_wlabel), dim=0)
    dlabel_ = torch.cat((train_dlabel, valid_dlabel, test_dlabel), dim=0)
    print("The whole data size:", data_.size())  # e.g. torch.Size([1452, 100, 8])

    data = data_.tolist()
    wlabel = wlabel_.tolist()
    dlabel = dlabel_.tolist()
    train_wlabel_ = train_wlabel.tolist()
    valid_wlabel_ = valid_wlabel.tolist()
    test_wlabel_ = test_wlabel.tolist()

    # Collect 1-based indices of anomalous (*_i) and normal (*_n) windows per split.
    train_i, train_n = [], []
    valid_i, valid_n = [], []
    test_i, test_n = [], []
    for i in range(len(train_wlabel_)):
        if train_wlabel_[i] > 0:
            train_i.append(i + 1)
        else:
            train_n.append(i + 1)
    for i in range(len(valid_wlabel_)):
        if valid_wlabel_[i] > 0:
            valid_i.append(i + 1)
        else:
            valid_n.append(i + 1)
    for i in range(len(test_wlabel_)):
        if test_wlabel_[i] > 0:
            test_i.append(i + 1)
        else:
            test_n.append(i + 1)

    return train_i, train_n, valid_i, valid_n, test_i, test_n, data, wlabel, dataloader, dlabel
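get_segment expects an argument object exposing `data_dir`, `win_size`, and `batch_size` (the attribute names used above). A hypothetical invocation, with placeholder values rather than the repository's actual defaults:

```
from types import SimpleNamespace

args = SimpleNamespace(data_dir='data/PSM', win_size=100, batch_size=64)
(train_i, train_n, valid_i, valid_n, test_i, test_n,
 data, wlabel, dataloader, dlabel) = get_segment(args)
print(len(train_i), 'anomalous and', len(train_n), 'normal training windows')
```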
@@ -0,0 +1,194 @@
from tadpak import pak
from tadpak import evaluate
from metrics.metrics import *

import numpy as np
from itertools import groupby
from operator import itemgetter
from sklearn import metrics  # used by pak_auc's precision_recall_curve
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
def detection_adjustment(results, labels):
    """Standard point adjustment: once any point inside a ground-truth
    anomaly segment is predicted anomalous, mark the whole segment as
    detected (expanding backwards and forwards from the hit)."""
    pred = list(results)
    gt = list(labels)

    anomaly_state = False
    for i in range(len(gt)):
        if gt[i] == 1 and pred[i] == 1 and not anomaly_state:
            anomaly_state = True
            # Expand the detection backwards to the start of the segment...
            for j in range(i, 0, -1):
                if gt[j] == 0:
                    break
                else:
                    if pred[j] == 0:
                        pred[j] = 1
            # ...and forwards to the end of the segment.
            for j in range(i, len(gt)):
                if gt[j] == 0:
                    break
                else:
                    if pred[j] == 0:
                        pred[j] = 1
        elif gt[i] == 0:
            anomaly_state = False
        if anomaly_state:
            pred[i] = 1
    return pred, gt
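Point adjustment credits an entire ground-truth anomaly segment once any point inside it is flagged. A toy example:

```
results = [0, 0, 1, 0, 0, 1, 0]
labels  = [0, 1, 1, 1, 0, 0, 0]
pred, gt = detection_adjustment(results, labels)
print(pred)  # [0, 1, 1, 1, 0, 1, 0] -- the whole first segment is credited
```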
def evaluation(results, labels):
    import pandas as pd

    # Build a 2x2 confusion matrix (rows: true class, columns: prediction).
    matriz_confusao = pd.DataFrame(0, columns=['pred_pos', 'pred_neg'],
                                   index=['classe_pos', 'classe_neg'])
    for l in range(len(results)):
        rotulo_original = 'classe_pos' if labels[l] == 1 else 'classe_neg'
        predito = 'pred_pos' if results[l] == 1 else 'pred_neg'
        matriz_confusao.loc[rotulo_original, predito] += 1

    print(matriz_confusao)

    positive_total = matriz_confusao['pred_pos'].sum()  # total items predicted positive
    true_positive_news = matriz_confusao['pred_pos'].loc['classe_pos'].sum()
    print("======================Evaluation===========================")
    f = open('avaliados.csv', 'a')
    TP = matriz_confusao['pred_pos'].loc['classe_pos'].sum()
    FP = matriz_confusao['pred_pos'].loc['classe_neg'].sum()
    TN = matriz_confusao['pred_neg'].loc['classe_neg'].sum()
    FN = matriz_confusao['pred_neg'].loc['classe_pos'].sum()
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = (2 * precision * recall) / (precision + recall)
    f.write("-------------------------\n")
    f.write("\t".join([str(f1_score(labels, results, average='macro')), str(f1_score(labels, results, average='micro')),
                       str(accuracy_score(labels, results)), str(precision_score(labels, results, average='macro')),
                       str(precision_score(labels, results, average='micro')), str(recall_score(labels, results, average='macro')),
                       str(recall_score(labels, results, average='micro')), str(true_positive_news / positive_total),
                       str(precision), str(recall), str(f1)]) + '\n')
    f.close()

    print('Macro f1:', f1_score(labels, results, average='macro'))
    print('Micro f1:', f1_score(labels, results, average='micro'))
    print('accuracy:', accuracy_score(labels, results))
    print('Macro Precision:', precision_score(labels, results, average='macro'))
    print('Micro Precision:', precision_score(labels, results, average='micro'))
    print('Macro Recall:', recall_score(labels, results, average='macro'))
    print('Micro Recall:', recall_score(labels, results, average='micro'))
    print('True Positive:', true_positive_news / positive_total)
    print('Interest-class precision:', precision)
    print('Interest-class recall:', recall)
    print('Interest-class f1:', f1)
    return f1, precision, recall
def pak_auc(results, labels):
    """Average best F1 over the PA%K adjustment for K in {0, 0.1, ..., 1}."""
    scores = np.array(results)
    targets = np.array(labels)
    thres = 0.5

    candidate = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    f1 = []
    for k in candidate:
        adjusted_preds = pak.pak(scores, targets, thres, k)
        # Convert boolean adjusted predictions to 0/1.
        for item in range(len(adjusted_preds)):
            if adjusted_preds[item] == False:
                adjusted_preds[item] = 0
            else:
                adjusted_preds[item] = 1
        # Compute the best F1 along the precision-recall curve.
        precision, recall, threshold = metrics.precision_recall_curve(targets, adjusted_preds)
        f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
        f1.append(np.max(f1_curve))
    auc = sum(f1) / len(f1)
    return auc
def f1_prec_recall_PA(preds, gts, k=0):
    # Find the indices of the ground-truth anomalies.
    gt_idx = np.where(gts == 1)[0]
    anomalies = []
    new_preds = np.array(preds)
    # Group consecutive indices into anomaly segments.
    for _, g in groupby(enumerate(gt_idx), lambda x: x[0] - x[1]):
        anomalies.append(list(map(itemgetter(1), g)))
    # For each anomaly (point or segment) in the test dataset:
    for a in anomalies:
        # Predictions falling inside the segment.
        pred_labels = new_preds[a]
        # If more than k% of the segment is flagged, credit the whole segment.
        if len(np.where(pred_labels == 1)[0]) / len(a) > (k / 100):
            new_preds[a] = 1
    f1_pa = f1_score(gts, new_preds)
    prec_pa = precision_score(gts, new_preds)
    recall_pa = recall_score(gts, new_preds)
    return f1_pa, prec_pa, recall_pa
def f1_prec_recall_K(preds, gts):
    f1_pa_k = []
    prec_pa_k = []
    recall_pa_k = []
    for k in range(0, 101):
        f1_pa, prec_pa, recall_pa = f1_prec_recall_PA(preds, gts, k)
        f1_pa_k.append(f1_pa)
        prec_pa_k.append(prec_pa)
        recall_pa_k.append(recall_pa)
    # Area under the metric-vs-K curve, normalized back to [0, 1].
    f1_pak_auc = np.trapz(f1_pa_k)
    prec_pak_auc = np.trapz(prec_pa_k)
    recall_pak_auc = np.trapz(recall_pa_k)
    return f1_pak_auc / 100, prec_pak_auc / 100, recall_pak_auc / 100
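For a single K, f1_prec_recall_PA only credits a segment when more than K% of its points are flagged; f1_prec_recall_K then averages over K = 0..100. For instance:

```
import numpy as np

preds = np.array([0, 1, 0, 0, 0, 1])
gts   = np.array([0, 1, 1, 1, 0, 1])
print(f1_prec_recall_PA(preds, gts, k=0))   # (1.0, 1.0, 1.0): 1/3 > 0, segment credited
print(f1_prec_recall_PA(preds, gts, k=50))  # (~0.667, 1.0, 0.5): 1/3 <= 0.5, no credit
```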
def get_all_res(results, labels):
    f_score, precision, recall = evaluation(results, labels)
    print("Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(precision, recall, f_score))

    f1_pak_auc, prec_pak_auc, recall_pak_auc = f1_prec_recall_K(results, labels)
    print("Precision pak : {:0.4f}, Recall pak : {:0.4f}, F-score pak : {:0.4f} ".format(prec_pak_auc, recall_pak_auc, f1_pak_auc))

    matrix = [137]
    scores_simple = combine_all_evaluation_scores(np.array(results), np.array(labels))
    for key, value in scores_simple.items():
        matrix.append(value)
        print('{0:21} : {1:0.4f}'.format(key, value))

    pred, gt = detection_adjustment(results, labels)
    from sklearn.metrics import precision_recall_fscore_support

    accuracy = accuracy_score(gt, pred)
    pa_precision, pa_recall, pa_f_score, support = precision_recall_fscore_support(gt, pred, average='binary')
    print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(accuracy, pa_precision, pa_recall, pa_f_score))
    return f_score, precision, recall, f1_pak_auc, prec_pak_auc, recall_pak_auc, pa_f_score, pa_precision, pa_recall


def cal_mean_matrix(results_list):
    # Average each metric column across runs; the column order matches
    # the tuple returned by get_all_res.
    res = np.mean(np.array(results_list), axis=0)
    print("===================== Average Results =====================")
    print("Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(res[1], res[2], res[0]))
    print("Precision Pak : {:0.4f}, Recall Pak : {:0.4f}, F-score Pak : {:0.4f} ".format(res[4], res[5], res[3]))
    print("Precision Pa : {:0.4f}, Recall Pa : {:0.4f}, F-score Pa : {:0.4f} ".format(res[7], res[8], res[6]))
    print("===========================================================")
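The intended aggregation flow, inferred from the return order of get_all_res: collect one row of metrics per run, then average. `run_experiment` below is a hypothetical stand-in for the repository's training and inference entry point:

```
results_list = []
for seed in range(5):
    results, labels = run_experiment(seed)  # hypothetical; returns point predictions and ground truth
    results_list.append(list(get_all_res(results, labels)))
cal_mean_matrix(results_list)
```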