-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze.py
324 lines (285 loc) · 12 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/python3
"""
Identify the heaviest email senders/offenders in a Gmail mailbox,
so the owner can decide what to delete, which filters to create, etc.
"""
from apiclient.discovery import build
from apiclient import errors
from httplib2 import Http
from oauth2client import file, client, tools
import operator
import sys
import time
from typing import Optional
import config
# OAuth scope: read-only access to Gmail (no modify/delete).
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
MIN_FREQ_TO_DISPLAY = 15  # TODO could be a threshold based on the overall count
HOUR_SECONDS = 60 * 60  # seconds per hour, for elapsed-time formatting
# Module-level Gmail API service handle, populated by init().
_service = None
# Module-level tally: raw "From:" header string -> message count.
_emailSenders = {}
def init():
    """Run the Gmail API OAuth dance and build the service handle.

    Caches credentials in credentials.json; falls back to the interactive
    flow (client_secret.json) when they are missing or stale.
    NOTE: to change SCOPES, delete the credentials.json file and rerun.
    """
    global _service
    token_store = file.Storage('credentials.json')
    credentials = token_store.get()
    if not credentials or credentials.invalid:
        oauth_flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
        credentials = tools.run_flow(oauth_flow, token_store)
    _service = build('gmail', 'v1', http=credentials.authorize(Http()))
    return _service
def fetch_and_count_messages(*, service, user_id: str, query: str, token: Optional[str]):
    """Fetch one page of matching message ids and batch-request their headers.

    Each batch sub-response is handled by the parseEmailHeader callback,
    which tallies senders into the module-level map.

    Args:
        service: Authorized Gmail API service instance.
        user_id: Mailbox owner; the special value "me" = authenticated user.
        query: Gmail search query string.
        token: Page token from a previous call, or None for the first page.

    Returns:
        (count, next_page_token): number of messages on this page, and the
        token for the next page (None when this page had no messages).
    """
    response = service.users().messages().list(
        userId=user_id, q=query, pageToken=token).execute()
    ids = [message_info['id'] for message_info in response.get('messages', [])]
    count = len(ids)
    if not ids:
        return count, None
    batch = service.new_batch_http_request(callback=parseEmailHeader)
    # msg_id: avoid shadowing the builtin `id`; also use the caller-supplied
    # user_id here (the original hard-coded 'me' inconsistently).
    for msg_id in ids:
        batch.add(service.users().messages().get(userId=user_id, id=msg_id,
                  format="metadata", fields="labelIds,payload/headers"))
    batch.execute()
    return count, response.get('nextPageToken')
def parseEmailHeader(request_id, response, exception):
    """Batch-request callback: tally the From: sender of one message.

    Args:
        request_id: Batch sub-request id.
        response: Parsed message metadata (labelIds + payload headers),
            or None when the sub-request failed.
        exception: HttpError for this sub-request, or None on success.
    """
    global _emailSenders
    # BUG FIX: the original ignored `exception`, so a single failed
    # sub-request handed response=None to the parser and crashed the batch.
    if exception is not None:
        print("Error fetching message %s: %s" % (request_id, exception))
        return
    sender = _parseEmailHeader(response)
    if sender:
        _emailSenders[sender] = _emailSenders.get(sender, 0) + 1
def _parseEmailHeader(response)-> Optional[str]:
headers = response['payload']['headers']
for h in headers:
if h['name'].lower() == "from": # From, FROM
return h['value']
# payload['headers']['Date'] #{'name': 'Date', 'value': 'Wed, 18 Apr 2018 03:43:58 +0000 (UTC)'}
# payload['headers']['To']
print("Error parsing Sender", headers, id)
return None
def CountMessageSendersForQuery(service, user_id, query=''):
    """Count senders among all Messages in the user's mailbox matching the query.

    Pages through the full result set, tallying senders into the module-level
    _emailSenders map via the batch callback, then prints the normalized
    summary for this query.

    Args:
        service: Authorized Gmail API service instance.
        user_id: User's email address. The special value "me" = authenticated user.
        query: String used to filter messages returned.
            Eg.- 'from:user@some_domain.com' for Messages from a particular sender.
    """
    global _emailSenders  # this sucks.
    show_status_every_n = config.NUM_EMAILS_PER_PROGRESS_UPDATE
    # REFACTOR: the original duplicated the fetch/progress body verbatim
    # before and inside the while loop; folded into a single loop.
    count = 0
    page_token = None
    first_page = True
    try:
        while first_page or page_token:
            first_page = False
            page_count, page_token = fetch_and_count_messages(
                service=service, user_id=user_id, query=query, token=page_token)
            count += page_count
            # progress heartbeat (fires only when count lands on a multiple)
            if count % show_status_every_n == 0:
                print(count, end=".")
                sys.stdout.flush()
        if count % show_status_every_n != 0:
            print(count, "emails analyzed", end="")
    except errors.HttpError as error:
        print('An error occurred: %s' % error)
    if _emailSenders:
        print()
        normalizeSenders(_emailSenders, query)
    else:
        print("None for %s" % query)
def sort_by_value(map, reverse=True):
    """Return the mapping's (key, value) pairs ordered by value.

    Descending by default; pass reverse=False for ascending.
    """
    pairs = list(map.items())
    pairs.sort(key=lambda kv: kv[1], reverse=reverse)
    return pairs
def find_dominant_field(senderc, emailc, namec, domainc, sender, email, name, domain):
    """Given the match counts for this sender/email/name/domain, pick the most
    representative sample: whichever occurs most frequently, preferring the
    broadest match as the example.
    e.g. yahoo.com is returned if its count >= the count for sender@yahoo.com

    Args:
        senderc, emailc, namec, domainc: occurrence counts (None treated as 0).
        sender, email, name, domain: the corresponding sample strings.

    Returns:
        The sample string whose count equals the maximum, broadest-first.
    """
    # normalize Nones to 0, just in case they occur
    senderc = senderc if senderc is not None else 0
    emailc = emailc if emailc is not None else 0
    namec = namec if namec is not None else 0
    domainc = domainc if domainc is not None else 0
    dominant = max(senderc, emailc, namec, domainc)
    # BUG FIX: the original compared `dominant == domain` — an int count
    # against the domain *string* — so the broadest branch never matched.
    if dominant == domainc:  # show broadest to most narrow specificity
        return domain
    elif dominant == namec:
        return name if name else email
    elif dominant == emailc:
        return email
    else:
        return sender
def normalizeSenders(senderMap, query):
    """Score, roll up, and print the highest-volume sender identities.

    Creates a score associated with the overall sender identity — score
    increases with the raw number of messages received per email address,
    per domain, and per sender name — so the highest-volume senders, as well
    as senders who mask the same email behind many unique names, rise to
    the top.

    MANY NAMES CAN BE ASSOCIATED WITH THE SAME EMAIL:
        ('"Bob Barker (LinkedIn Invitations)" <[email protected]>', 1)
        ('Jane Doe via LinkedIn <[email protected]>', 1)
        ('John Doe <[email protected]>', 1)
    MANY EMAILS CAN BE ASSOCIATED WITH THE SAME SENDER NAME AND/OR DOMAIN:
        ('"AT&T Online Services" <[email protected]>', 1)
        ('"AT&T Online Services" <[email protected]>', 1)
        ('Pacific Gas and Electric Company <[email protected]>', 1)
        ('"Facebook" <[email protected]>', 278)
        ('Facebook <[email protected]>', 253)

    Args:
        senderMap: dict of raw "From:" header string -> message count.
        query: Gmail query string (used only in the printed banner).
    """
    num_examples_per_line = config.NUM_EXAMPLES_PER_LINE
    email_map = {}     # email address -> count of distinct headers using it
    name_map = {}      # display name -> count of distinct headers using it
    domain_map = {}    # domain -> count (SAFE_DOMAINS pinned to weight 1)
    domain_sender = {}  # domain -> list of raw sender headers seen
    for sender, count in senderMap.items():
        email, name, domain = parseSender(sender)
        # count per sender email
        weight = email_map.get(email, 0) + 1
        email_map[email] = weight
        # count per name
        name_map[name] = name_map.get(name, 0) + 1
        # count domain if not a whitelisted one, in which case give it a lower score
        weight = 1 if domain in config.SAFE_DOMAINS else domain_map.get(
            domain, 0) + 1
        domain_map[domain] = weight
        senderlist = domain_sender.setdefault(domain, [])
        senderlist.append(sender)
    # order the results by most frequent sender (the entire header string)
    sorted_senders = sort_by_value(senderMap)
    rows = []
    printed = {}  # emails/domains already emitted, to suppress duplicate rows
    # build an overall score per combo of sender / email / name / domain; then resort by that score descending.
    # -> safe domains will have a lower score than all other domains
    # this will bubble up the biggest senders & spammers (rolling up by reuse of domain or sender name or sender email)
    for sender, count in sorted_senders:
        email, name, domain = parseSender(sender)
        email_count = email_map[email]
        name_count = name_map[name]
        domain_count = domain_map[domain]
        best_example = find_dominant_field(
            count, email_count, name_count, domain_count, sender, email, name, domain)
        score = count + email_count + name_count + domain_count
        # output only highest-ranking line for this email (even if variants within sender header)
        if printed.get(email):  # once we've printed an email, don't print it again
            continue
        if printed.get(domain):  # output only highest-ranking line for this domain
            continue
        if len(domain_sender[domain]) > 0:
            # skip showing duplicate if already shown as best example
            min_index = 1 if best_example == sender else 0
            max_index = min_index + num_examples_per_line
            # pick up to N examples of sender strings to show for this sender (don't need to show a million)
            instances = domain_sender[domain][min_index:min(
                max_index, len(domain_sender[domain]))]
        else:
            instances = ""
        rows.append((score, best_example, count, email, email_count,
                     name, name_count, domain, domain_count, instances))
        printed[email] = 1
        printed[domain] = 1
    # print rows (desc by score) above a certain frequency (ignore the long tail)
    sorted_rows = sorted(rows, key=operator.itemgetter(0), reverse=True)
    print()
    print("=" * 10, query, "=" * 10)
    print(", ".join(("score","best_example", "sender_count", "email", "email_count", "name", "name_count",
                     "domain", "domain_count", "more_top_examples")))
    for tup in sorted_rows:
        if tup[0] >= MIN_FREQ_TO_DISPLAY:
            print(tup)
    print("=" * 55)
def stripQuotes(thething):
    """Strip one matching pair of surrounding quotes (single or double).

    Returns the input unchanged when the quotes don't match, and None for
    falsy input (preserving the original bare-return behavior).
    """
    if not thething:
        return
    first, last = thething[0], thething[-1]
    if first == last and first in ('"', "'"):
        return thething[1:-1]
    return thething
def parseSender(sender):
    """
    Given an email header for a sender, parse into email, name, and domain.

    Handles the formats seen in my Gmail history, e.g.:
        Jennifer <jennifer@example.com>
        Amazon Associates <'associates@example.com'>
        bare@address.com

    :param sender: raw "From:" header value.
    :return: (email, name, domain) tuple; each element may be None when
        unparseable. The email is lowercased with surrounding quotes removed.
    """
    name = None
    email = None
    domain = None
    args = sender.split("<")
    if len(args) == 1:  # no reply-to name if no angle brackets
        email = args[0].strip()
        name = email
    elif len(args) > 1:
        name = args[0].strip()
        email = args[1].strip()
        # BUG FIX: guard against an empty address ("Name <") before indexing
        if email and email[-1] == ">":
            email = email[:-1]
    # normalize by removing quotes
    name = stripQuotes(name)
    if email:
        email = stripQuotes(email).lower()
        parts = email.split("@")
        # BUG FIX: the original used `and`, which suppressed the warning for
        # addresses with no "@" at all; any malformed split should warn
        if len(parts) != 2 or not parts[0]:
            print("Warning: Invalid domain format", email)
        if len(parts) > 1:
            domain = parts[1]
    return email, name, domain
def get_ignore_labels_for_query():
    """Build a Gmail query clause excluding the configured labels.

    Example output: -{label:exclude1 label:exclude2}
    Returns "" when no labels are configured.
    """
    if not config.IGNORE_LABELS:
        return ""
    clauses = [f"label:{label}" for label in config.IGNORE_LABELS]
    return " ".join(["-{"] + clauses + ["}"])
def get_ignore_senders_for_query():
    """Build a Gmail query clause excluding the configured sender addresses.

    Example output: -{from:addr1 from:addr2}
    Returns "" when no addresses are configured.
    """
    if not config.IGNORE_EMAILS:
        return ""
    clauses = ["-{from:me"]  # anything from me to me is an ignore
    clauses.extend(f"from:{addr}" for addr in config.IGNORE_EMAILS)
    clauses.append("}")
    return " ".join(clauses)
def elapsed_pretty(elapsed_sec: float):
    """Format a duration in seconds as a human-readable string.

    Args:
        elapsed_sec: elapsed wall-clock seconds (the caller passes a float
            time.time() difference).

    Returns:
        "H:MM:SS hr" above one hour, "M:SS min" above one minute,
        otherwise "S.SS sec".
    """
    if elapsed_sec > HOUR_SECONDS:
        hours, remainder = divmod(elapsed_sec, HOUR_SECONDS)
        minutes, seconds = divmod(remainder, 60)
        return "%d:%02d:%02d hr" % (hours, minutes, seconds)
    elif elapsed_sec > 60:
        minutes, seconds = divmod(elapsed_sec, 60)
        return "%d:%02d min" % (minutes, seconds)
    else:
        # BUG FIX: the original "%d.02 sec" appended a literal ".02" to every
        # value (e.g. 5 -> "5.02 sec"); it was a typo for a %0.2f format.
        return "%0.2f sec" % elapsed_sec
if __name__ == '__main__':
    service = init()
    starttime = time.time()
    # EARLIEST_YEAR is mandatory; `or sys.exit(...)` aborts with a message when unset
    start = config.EARLIEST_YEAR or sys.exit("config.EARLIEST_YEAR is required")
    thisyear = int(time.strftime("%Y"))
    # scan through LATEST_YEAR inclusive (defaults to the current year)
    end = int(config.LATEST_YEAR or thisyear) + 1
    # window size in years per query batch; 50 effectively means "one batch"
    interval = config.NUM_YEARS_PER_BATCH or 50
    # never scan beyond next year's boundary
    end = min(end, (thisyear + 1))
    ignore_labels = get_ignore_labels_for_query()
    ignore_senders = get_ignore_senders_for_query()
    while start < end:
        before = min(start + interval, end)
        # example filter: after:2004 before:2005 = 1 full year
        CountMessageSendersForQuery(service, 'me', query=f"after:{start} before:{before} {ignore_senders} {ignore_labels}")
        start += interval
    endtime = time.time()
    elapsed = endtime - starttime
    print("Elapsed time:", elapsed_pretty(elapsed_sec=elapsed))