diff --git a/README.md b/README.md index 0431d2e..b686a22 100644 --- a/README.md +++ b/README.md @@ -90,12 +90,12 @@ Possible features to pass to `flows2features` include: * `IAT`: A flow is represented as a timeseries of inter-arrival times between packets, *i.e.*, elapsed time in seconds between any two packets in the flow. +[comment]: <> (I think this is a better way to describe the statistics as it corresponds to how the data appears in the code) * `STATS`: A flow is represented as a set of statistical quantities. We choose 12 of the most common such statistics in the literature: flow duration, number of - packets sent per second, number of bytes per second, and various statistics on - packet sizes within each flow: mean, standard deviation, inter-quartile range, - minimum, and maximum. Finally, the total number of packets and total number - of bytes for each flow. + packets sent per second, number of bytes per second, mean, standard deviation, + first quartile, median, third quartile, minimum, maximum, total number of packets, + and total number of bytes for each flow. * `SIZE`: A flow is represented as a timeseries of packet sizes in bytes, with one sample per packet. diff --git a/src/netml/pparser/parser.py b/src/netml/pparser/parser.py index 663cca2..eb3244f 100644 --- a/src/netml/pparser/parser.py +++ b/src/netml/pparser/parser.py @@ -428,8 +428,8 @@ def _get_IAT_SIZE(flows): def _get_STATS(flows): - """get basic stats features, which includes duration, pkts_rate, bytes_rate, mean, - median, std, q1, q2, q3, min, and max. + """get basic stats features, which are: [duration, pkts_rate, bytes_rate, mean, + std, q1, median, q3, min, max, num_pkts, and num_bytes]. 
Parameters ---------- @@ -443,7 +443,9 @@ def _get_STATS(flows): each value is five-tuple """ - features = [] + features = [] # could potentially initialize the list to: + # ["duration", "pkts_rate", "bytes_rate", "mean", "std", "q1", "median", "q3", "min", "max", "num_pkts", "num_bytes"] + # so that it is clear to users what each term means fids = [] for fid, pkts in flows: sizes = [len(pkt) for pkt in pkts]