-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalysis.py
122 lines (95 loc) · 4.15 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from wrangle import *
import seaborn as sns
sns.set(rc={'figure.facecolor':'lightgray'})
import matplotlib.pyplot as plt
# Size pandas' console output to the current terminal and show every column.
# pandas no longer ships pd.util.terminal, so the width is taken from the
# standard library instead (it falls back to 80 columns when no terminal is
# attached).
import shutil
pd.set_option('display.width', shutil.get_terminal_size().columns)
pd.set_option('display.max_columns', None)
# Strip plot of ptime against pdate.
# NOTE(review): the original call was cut off after `data`; it is completed
# with `bt` to match the identical plot further down the script - confirm.
sns.stripplot(x='pdate', y='ptime', data=bt)
#def barplot(count):
#plot = sns.barplot(x = count.columns[0], y = count.columns[1], data = count)
#plot.set_xticklabels(rotation = 45)
#return(plot)
# Try a hexplot: joint distribution of day-of-month vs. hour-of-day taken
# from the created_at timestamps.
sns.set(style='ticks')
sns.set_context('talk')
x = df['created_at'].dt.day
y = df['created_at'].dt.hour
# x and y must be passed by keyword: recent seaborn releases treat the first
# positional argument as `data` and reject further positional arguments.
g = sns.jointplot(x=x, y=y, kind='hex')
# Bar plot of the number of rows per vendor, restricted to vendors that occur
# more than twice.
cntmap = bt.vendor.value_counts()
# Boolean indexing replaces the `.ix` indexer, which was removed from pandas,
# and reuses cntmap instead of recomputing value_counts().
btsub = cntmap[cntmap > 2]
g = sns.barplot(x=btsub.index, y=btsub.values, order=btsub.index)
g.set_title('Count of Devices by Vendor [freq > 2]')
g.set_ylabel('Count of Devices')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# Address counts restricted to the frequent vendors held in btsub.
# NOTE(review): btsub is a Series of vendor counts, so the original
# `btsub.address` raised AttributeError. This assumes the intent was "rows of
# bt whose vendor is in btsub" - confirm.
sub_addr_counts = bt.loc[bt.vendor.isin(btsub.index), 'address'].value_counts()
g = sns.barplot(x=sub_addr_counts.index, y=sub_addr_counts)
# Graph of per-address counts over the whole of bt, drawn in reverse
# value_counts() order.
addr_counts = bt.address.value_counts()
g = sns.barplot(x=addr_counts.index, y=addr_counts, order=addr_counts.index[::-1])
g.get_xaxis().set_visible(True)
g.set_title("Device Observation Frequency")
g.set_ylabel('Number of Days a Device is Observed')
g.set_xlabel('Unique Device Addresses')
# Blank out the tick labels on the x axis.
g.set_xticklabels('')
# Heat maps keyed by uap_lap (rows) against day (columns).
def _distinct_non_null(values):
    # Number of distinct entries once missing values are discarded.
    return len(values.dropna().unique())


# Row count per uap_lap for each day of the month of created_at.
hm = bt.pivot_table(
    index='uap_lap',
    values='uap_lap',
    columns=bt['created_at'].dt.day,
    aggfunc=len,
)
g = sns.heatmap(hm, fmt='g')
# Same layout counting the 'name' column; the result is not kept.
bt.pivot_table(
    index='uap_lap',
    values='name',
    columns=bt['created_at'].dt.day,
    aggfunc=len,
)
# Distinct non-null uuid values per uap_lap and 'day'.
hm = bt.pivot_table(
    index=['uap_lap'],
    values='uuid',
    columns='day',
    aggfunc=_distinct_non_null,
)
g = sns.heatmap(hm, fmt='g')
x_labels = g.get_xticklabels()
g.set_xticklabels(x_labels, rotation=90)
y_labels = g.get_yticklabels()
g.set_yticklabels(y_labels, rotation=0)
# Violin plot: distribution of 'tsincefirst' for each 'date'.
per_day = bt['date']
since_first = bt['tsincefirst']
g = sns.violinplot(x=per_day, y=since_first)
# Heatmap of when a device is observed over time.
# Only rows of bt whose 'freq' exceeds min_freq are kept; the same value is
# reused in the plot title so the filter and the title cannot disagree.
min_freq = 8
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of bt.
multi = bt[bt['freq'] > min_freq].copy()
# remove_unused_categories() returns a new Series; the result has to be
# assigned back for the unused categories to actually disappear.
multi['vendor'] = multi['vendor'].cat.remove_unused_categories()
multi['address'] = multi['address'].cat.remove_unused_categories()
multi = multi.sort_values(by='freq')
# Assign the filled column back instead of relying on a chained inplace fill.
multi['name'] = multi['name'].fillna('Undetected')
# Alternative pivots of the same data; each assignment replaces the previous
# one, so only the last table is the one plotted below.
hm = pd.pivot_table(multi, index='address', values='uuid',
                    columns='pdate', aggfunc=len)
hm = pd.pivot_table(multi, index='namegen', values='uuid',
                    columns='pdate', aggfunc=len)
# Using the name column in the code below can cause double counting of
# unique addresses. This happens because an address can be recorded with
# a name and without one. If sniffing captures a device both ways, then
# both will be counted as unique devices. The vendor column doesn't have the
# same problem.
hm = pd.pivot_table(multi, index='pdate', values='uuid',
                    columns=['address', 'vendor'], aggfunc=len)
# Bare expressions: they only display something in an interactive session.
hm.size
hm.shape
sns.set_context('talk')
plt.figure(figsize=(8, 6))
ax = plt.axes()
# Two renderings onto the same axes: first with gray cell borders, then
# without. The second call draws over the first.
g = sns.heatmap(hm, fmt='g', cmap='RdBu_r', cbar=False,
                linewidths=1, linecolor='gray', vmin=0, vmax=1,
                square=False)
g = sns.heatmap(hm, fmt='g', cmap='RdBu_r', cbar=False, vmin=0, vmax=1)
g.set_title('Observation of Devices Over Time (freq > {})'.format(min_freq))
g.set_yticklabels(g.get_yticklabels(), rotation=0)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# Dot (strip) plots and a violin plot of observation times.
jittered = dict(data=bt, jitter=True)
sns.stripplot(x='vendor', y='strtime', **jittered)
sns.stripplot(x='pdate', y='ptime', **jittered)
sns.violinplot(x='pdate', y='ptime', data=bt)
# The same idea driven directly from the created_at timestamps.
created = bt['created_at']
ax = sns.stripplot(x=created)
ax = sns.stripplot(x=bt['pdate'], y=created.dt.time)
# Build a dict mapping each value in bt['vendor'] to the number of times it
# occurs.
counts = dict()
for vendor in bt['vendor']:
    # .get(..., 0) starts the tally at zero the first time a value is seen.
    counts[vendor] = counts.get(vendor, 0) + 1
# How to subset by masking: build boolean Series from created_at and use them
# to select rows of df.
start_date = '20161108'
end_date = '20161110'
# Rows after start_date, up to and including end_date.
maskdate = (df['created_at'] > start_date) & (df['created_at'] <= end_date)
# NOTE(review): the original threshold was a run of stray keystrokes ending
# in "15"; 15 is assumed to be the intended hour - confirm.
masktime = (df['created_at'].dt.hour > 15)
# Alternative date mask (any day in November); this replaces the range mask
# built above.
maskdate = (df['created_at'].dt.month == 11)
# NOTE(review): `mask` was never defined in the original; combining the two
# masks above is an assumption - confirm the intended selection.
mask = maskdate & masktime
sub = df.loc[mask]
# Violin plot of ptime (parsed as HH:MM) for each vendor.
# sns.factorplot was renamed to catplot, and the plot type is selected with
# `kind`, not `type`.
# NOTE(review): `test` is not defined anywhere in the visible script - confirm
# which frame was meant.
sns.catplot(x='vendor', y=pd.to_datetime(test.ptime, format='%H:%M'), data=test, kind='violin')
#It took forever to figure this out. This is the syntax to access parts of a
#datetime object.
#df['created_at'].dt.hour
#df['created_at'].dt.day