Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Single feature add #31

Open
wants to merge 55 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
07ed3c4
Initial commit
ymirsky Jul 1, 2018
abe5a94
Added foundation code
ymirsky Jul 3, 2018
d2e5e66
updates readme
ymirsky Jul 3, 2018
dc48693
AfterImage/FeatureExtractor.py: Set limit when using scapy
willnewton Oct 5, 2018
f6278a3
AfterImage/FeatureExtractor.py: Find tshark in PATH
willnewton Oct 5, 2018
7cca8a8
Merge pull request #3 from willnewton/load-pcap
ymirsky Oct 7, 2018
83a8f01
Fixed error in figure of PDF
ymirsky Nov 11, 2018
a49419a
Added test environment details
ymirsky Nov 22, 2018
8a844a4
Fixed import AfterImage error
ymirsky Feb 7, 2019
6c2cf39
Fixed "AfterImage.py" not found error
ymirsky Dec 30, 2019
92b29b3
Added NDSS'18 version of AfterImage without Extrapolation
ymirsky Dec 30, 2019
45a159c
Updated readme
ymirsky Dec 30, 2019
c8aa0d3
Fixed bug in vbias and hbias updates
ymirsky Apr 5, 2020
13cd6a3
Fixed get-next index bug in queue for extrapolation
ymirsky Apr 17, 2020
9d5b1af
Reverted back to NDSS paper
ymirsky May 15, 2020
fb0aac2
Updated plot for NDSS version
ymirsky Jun 17, 2020
cd468b8
Fixed Radius bug: now squaring the dimensions
ymirsky Aug 28, 2020
aad5870
Adapted Kitsune for DI
Aug 30, 2023
d856e2e
Remove pickles
Aug 30, 2023
c45e20d
Merge unrelated histories
Aug 30, 2023
7a67dc1
Solve merge conflicts
Aug 30, 2023
013a11b
Solve merge conflict
Aug 30, 2023
de59ad3
merge conflict
Aug 30, 2023
0540706
fix print
Aug 30, 2023
8b33986
Add hyperparameter optimization
Aug 31, 2023
976ddaf
Add EER calculation
Sep 5, 2023
01f456e
Fix hyperparam opt
Sep 5, 2023
b22d412
Fix conflict
Sep 5, 2023
fe71564
Clean up
Sep 6, 2023
db86c3a
Change hyperparam optimization
Sep 11, 2023
908c616
Intermediate commit
Sep 11, 2023
a55cf12
Fix merge conflict
Sep 11, 2023
f932695
Fix double-running
Sep 12, 2023
48be16c
Add conversation sampling
Sep 14, 2023
f04a7e8
Parameterised calling of Kitsune
Sep 14, 2023
7c6faad
Merge branch 'master' of https://github.com/GuyPuts/Kitsune-adaptation
Sep 14, 2023
d36c170
Change conversation sampling
Sep 15, 2023
5f2f043
Intermediate commit
Sep 15, 2023
ea4e8ec
intermediate commit
Sep 18, 2023
24b8689
Sampling features from complete file
Sep 19, 2023
1dfa670
Sampling on conversation basis
Sep 20, 2023
9723399
Hashes
Sep 20, 2023
e254877
hashes don't seem to work
Sep 20, 2023
04f01c9
Added label ID
Sep 21, 2023
1234e5f
Set up hyperparam optimization
Sep 22, 2023
f7495ac
SHAP value calculation by directly calling KitNET
Sep 26, 2023
d430052
Fix Excel Export
Sep 26, 2023
500e153
metadata bug
Sep 26, 2023
320d225
shap
Oct 4, 2023
4783099
hyperparam
Oct 4, 2023
438f341
conf
Oct 4, 2023
a2748f6
getting tired of the hyperparam opt
Nov 9, 2023
d9a73e4
hyperopt
Nov 9, 2023
aa5df81
getting tired of the hyperparam opt
Nov 9, 2023
3f9b18d
Most significant packets
Dec 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
input_data/*
output_data/*
pickles/*
__pycache__/*
.idea/*
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 32 additions & 10 deletions AfterImage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


class incStat:
def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False): # timestamp is creation time
def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False, tcpFlags=False): # timestamp is creation time
self.ID = ID
self.CF1 = 0 # linear sum
self.CF2 = 0 # sum of squares
Expand All @@ -15,8 +15,28 @@ def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False): # timestamp is c
self.cur_var = np.nan
self.cur_std = np.nan
self.covs = [] # a list of incStat_covs (references) with relate to this incStat
self.tcpPkts = 0
self.flag_counts = {
"FIN": 0,
"SYN": 0,
"RST": 0,
"PSH": 0,
"ACK": 0,
"URG": 0,
"ECE": 0,
"CWR": 0
}

def insert(self, v, t=0, tcpFlags=False):  # v is a scalar, t is v's arrival timestamp
    """Tally TCP flag occurrences for this stream.

    NOTE(review): this added overload only counts flags and returns True;
    it never folds v/t into the incremental statistics (CF1/CF2/w), and a
    second ``insert`` definition follows it in this class, which shadows
    this one under normal attribute lookup — confirm the two definitions
    were meant to be merged into one method.
    """
    if tcpFlags:
        # tcpFlags is expected to be a hex string (tshark's tcp.flags
        # field) — TODO confirm against FeatureExtractor's TSV parsing.
        self.tcpPkts += 1
        flag_int = int(tcpFlags, 16)  # Convert hex string to integer
        flags = ["FIN", "SYN", "RST", "PSH", "ACK", "URG", "ECE", "CWR"]
        for i, flag in enumerate(flags):
            if flag_int & (1 << i):  # Check if the flag bit is set
                self.flag_counts[flag] += 1
        return True

def insert(self, v, t=0): # v is a scalar, t is v's arrival the timestamp
if self.isTypeDiff:
dif = t - self.lastTimestamp
if dif > 0:
Expand Down Expand Up @@ -98,9 +118,13 @@ def magnitude(self, other_incStats): # the magnitude of a set of incStats
return math.sqrt(A)

#calculates and pulls all stats on this stream
def allstats_1D(self, tcpFlags=False):
    """Calculate and pull all 1D stats for this stream.

    Returns [weight, mean, variance] by default. When ``tcpFlags`` is
    truthy, instead returns the per-flag frequency (flag count divided by
    TCP packets seen) for FIN, SYN, RST, PSH, ACK, URG, ECE, CWR, in
    dict insertion order.
    """
    self.cur_mean = self.CF1 / self.w
    self.cur_var = abs(self.CF2 / self.w - math.pow(self.cur_mean, 2))
    if tcpFlags:
        # Guard: no TCP packets recorded yet -> all-zero frequencies
        # (previously this divided by zero).
        if self.tcpPkts == 0:
            return [0.0] * len(self.flag_counts)
        return [count / self.tcpPkts for count in self.flag_counts.values()]
    return [self.w, self.cur_mean, self.cur_var]

#calculates and pulls all stats on this stream, and stats shared with the indicated stream
Expand Down Expand Up @@ -264,7 +288,6 @@ def get_lambda(self,Lambda):
def register(self,ID,Lambda=1,init_time=0,isTypeDiff=False):
#Default Lambda?
Lambda = self.get_lambda(Lambda)

#Retrieve incStat
key = ID+"_"+str(Lambda)
incS = self.HT.get(key)
Expand Down Expand Up @@ -298,9 +321,9 @@ def register_cov(self,ID1,ID2,Lambda=1,init_time=0,isTypeDiff=False):
return inc_cov

# updates/registers stream
def update(self, ID, t, v, Lambda=1, isTypeDiff=False, tcpFlags=False):
    """Update/register stream ``ID``: insert value ``v`` at time ``t``.

    ``tcpFlags`` is forwarded to incStat.insert so TCP-flag counts can be
    tracked alongside the incremental statistics. Returns the (possibly
    newly registered) incStat. (The duplicated signature line left over
    from the merge has been removed.)
    """
    incS = self.register(ID, Lambda, t, isTypeDiff)
    incS.insert(v, t, tcpFlags=tcpFlags)
    return incS

# Pulls current stats from the given ID
Expand Down Expand Up @@ -369,9 +392,9 @@ def get_nD_Stats(self,IDs,Lambda=1): #radius, magnitude (IDs is a list)
return [np.sqrt(rad),np.sqrt(mag)]

# Updates and then pulls current 1D stats from the given ID. Automatically registers previously unknown stream IDs
def update_get_1D_Stats(self, ID, t, v, Lambda=1, isTypeDiff=False, tcpFlags=False):
    """Update stream ``ID`` with (v, t) and pull its current 1D stats.

    Returns [weight, mean, variance] by default, or per-flag frequencies
    when ``tcpFlags`` is supplied (see incStat.allstats_1D). Automatically
    registers previously unknown stream IDs. (The stale pre-merge
    signature lines have been removed.)
    """
    incS = self.update(ID, t, v, Lambda, isTypeDiff, tcpFlags=tcpFlags)
    return incS.allstats_1D(tcpFlags)


# Updates and then pulls current correlative stats between the given IDs. Automatically registers previously unknown stream IDs, and cov tracking
Expand Down Expand Up @@ -439,4 +462,3 @@ def cleanOutOldRecords(self,cutoffWeight,curTime):
elif W > cutoffWeight:
break
return n

45 changes: 37 additions & 8 deletions FeatureExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import os.path
import platform
import subprocess
import csv


#Extracts Kitsune features from given pcap file one packet at a time using "get_next_vector()"
Expand Down Expand Up @@ -63,7 +64,6 @@ def __prep__(self):
##If file is TSV (pre-parsed by wireshark script)
if type == "tsv":
self.parse_type = "tsv"

##If file is pcap
elif type == "pcap" or type == 'pcapng':
# Try parsing via tshark dll of wireshark (faster)
Expand Down Expand Up @@ -106,7 +106,7 @@ def __prep__(self):
self.limit = len(self.scapyin)
print("Loaded " + str(len(self.scapyin)) + " Packets.")

def get_next_vector(self):
def get_next_vector(self, single=False):
if self.curPacketIndx == self.limit:
if self.parse_type == 'tsv':
self.tsvinf.close()
Expand All @@ -120,6 +120,10 @@ def get_next_vector(self):
framelen = row[1]
srcIP = ''
dstIP = ''
tcpFlags = ''
tcpFlags = row[19]
payload = ''
#payload = int(row[20])+int(row[21])
if row[4] != '': # IPv4
srcIP = row[4]
dstIP = row[5]
Expand All @@ -128,8 +132,7 @@ def get_next_vector(self):
srcIP = row[17]
dstIP = row[18]
IPtype = 1
srcproto = row[6] + row[
8] # UDP or TCP port: the concatenation of the two port strings will will results in an OR "[tcp|udp]"
srcproto = row[6] + row[8] # UDP or TCP port: the concatenation of the two port strings will will results in an OR "[tcp|udp]"
dstproto = row[7] + row[9] # UDP or TCP port
srcMAC = row[2]
dstMAC = row[3]
Expand All @@ -147,7 +150,6 @@ def get_next_vector(self):
elif srcIP + srcproto + dstIP + dstproto == '': # some other protocol
srcIP = row[2] # src MAC
dstIP = row[3] # dst MAC

elif self.parse_type == "scapy":
packet = self.scapyin[self.curPacketIndx]
IPtype = np.nan
Expand Down Expand Up @@ -195,24 +197,51 @@ def get_next_vector(self):
return []

self.curPacketIndx = self.curPacketIndx + 1

if not single:
tcpFlags = False

### Extract Features
try:
return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto,
int(framelen),
float(timestamp))
float(timestamp), tcpFlags, payload)
except Exception as e:
print(e)
return []


def pcap2tsv_with_tshark(self):
    """Convert self.path (pcap/pcapng) to a TSV of selected fields via tshark.

    Output is written next to the input as "<path>.tsv". The field list
    includes the TCP flags / length columns consumed by get_next_vector.
    (The dead duplicate ``fields`` assignment left over from the merge has
    been removed.)
    """
    print('Parsing with tshark...')
    fields = "-e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e icmp.type -e icmp.code -e arp.opcode -e arp.src.hw_mac -e arp.src.proto_ipv4 -e arp.dst.hw_mac -e arp.dst.proto_ipv4 -e ipv6.src -e ipv6.dst -e tcp.flags -e tcp.len -e udp.length -e http.response.code"
    # NOTE(review): shell=True with a string-built command line; the shell
    # is required here for the output redirection (">"), so self.path must
    # be a trusted local file path — do not feed untrusted paths in.
    cmd = '"' + self._tshark + '" -r ' + self.path + ' -T fields ' + fields + ' -E header=y -E occurrence=f > ' + self.path + ".tsv"
    subprocess.call(cmd, shell=True)
    print("tshark parsing complete. File saved as: " + self.path + ".tsv")

def get_num_features(self):
    """Return the dimensionality of the feature vector produced per packet."""
    headers = self.nstat.getNetStatHeaders()
    return len(headers)

def get_all_vectors(self, csv_path=False, single=False):
    """Extract feature vectors for every remaining packet.

    csv_path: if truthy, stream each vector as a row to this CSV file and
              return the path; otherwise accumulate vectors in memory and
              return the list.
    single:   forwarded to get_next_vector() (enables the per-packet
              TCP-flag handling). Bug fix: the original dropped ``single``
              in the in-memory branch, silently disabling it.

    Resets self.curPacketIndx to 0 when the capture is exhausted.
    """
    if csv_path:
        with open(csv_path, mode='w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            while True:
                if self.curPacketIndx % 100000 == 0:
                    print(self.curPacketIndx)  # progress indicator
                vector = self.get_next_vector(single)
                if len(vector) == 0 or self.curPacketIndx > self.limit:
                    self.curPacketIndx = 0
                    return csv_path
                csv_writer.writerow(vector)
    vectorList = []
    while True:
        if self.curPacketIndx % 1000 == 0:
            print(self.curPacketIndx)  # progress indicator
        vector = self.get_next_vector(single)  # was get_next_vector() — `single` now honored
        if len(vector) == 0 or self.curPacketIndx > self.limit:
            self.curPacketIndx = 0
            return vectorList
        vectorList.append(vector)

15 changes: 13 additions & 2 deletions KitNET/KitNET.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class KitNET:
#feature_map: One may optionally provide a feature map instead of learning one. The map must be a list,
# where the i-th entry contains a list of the feature indices to be assingned to the i-th autoencoder in the ensemble.
# For example, [[2,5,3],[4,0,1],[6,7]]
def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75, feature_map = None):
def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75, feature_map=None):
# Parameters:
self.AD_grace_period = AD_grace_period
if FM_grace_period is None:
Expand Down Expand Up @@ -50,7 +50,8 @@ def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period
#Note: KitNET automatically performs 0-1 normalization on all attributes.
def process(self, x):
    """Process one instance: learn while in a grace period, then score it.

    Returns 0.0 during the feature-mapping and anomaly-detector grace
    periods (the instance is used for training); afterwards returns the
    RMSE anomaly score from execute(). KitNET performs 0-1 normalization
    on all attributes internally.
    """
    # Still within the combined grace period -> train on x, no score yet.
    if self.n_trained <= self.FM_grace_period + self.AD_grace_period:
        self.train(x)
        return 0.0
    return self.execute(x)
Expand Down Expand Up @@ -104,6 +105,16 @@ def __createAD__(self):
params = AE.dA_params(len(self.v), n_hidden=0, lr=self.lr, corruption_level=0, gracePeriod=0, hiddenRatio=self.hr)
self.outputLayer = AE.dA(params)

def process_batch(self, data):
    """Run process() over every instance in data, returning the scores.

    Prints a progress line every 1000 instances; returns a numpy array of
    the per-instance results in input order.
    """
    scores = []
    for idx, instance in enumerate(data):
        if idx % 1000 == 0:
            print("processing packet ", idx, " / ", len(data))
        scores.append(self.process(instance))
    return np.array(scores)

# Copyright (c) 2017 Yisroel Mirsky
#
# MIT License
Expand Down
Binary file added KitNET/__pycache__/KitNET.cpython-39.pyc
Binary file not shown.
Binary file added KitNET/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file added KitNET/__pycache__/corClust.cpython-39.pyc
Binary file not shown.
Binary file added KitNET/__pycache__/dA.cpython-39.pyc
Binary file not shown.
Binary file added KitNET/__pycache__/utils.cpython-39.pyc
Binary file not shown.
Loading