load_gmane_mails.py
"""Download Gmane mailing-list archives, normalize them to mboxo format,
and extract message headers and NNTP overview data for later analysis."""
import mailbox
import nntplib
import os
import pickle

import pandas as pd
import requests


def fetch_mails_range(base_url, list_id, start_id, end_id, storage_path):
    """Download one batch of messages and store it as an mbox file.

    Gmane serves raw mbox data at <base_url>/<list_id>/<start_id>/<end_id>.
    """
    project_messages_url = '{base_url}/{list_id}/{start_id}/{end_id}'.format(
        base_url=base_url,
        list_id=list_id,
        start_id=start_id,
        end_id=end_id)
    print('fetching {}...'.format(project_messages_url))
    mails = requests.get(project_messages_url)
    project_messages_filename = os.path.join(
        storage_path,
        "{}_{}-{}.batch.mbox".format(list_id, start_id, end_id))
    with open(project_messages_filename, 'w') as pm:
        pm.write(mails.text)


def fetch_mails(projects_data_filename, base_url, storage_path, batchsize=3000):
    """Download every project's archive in batches of `batchsize` messages."""
    projects_data = pd.read_csv(projects_data_filename, skipfooter=1, engine='python')
    for row_id, project_data in projects_data.iterrows():
        if project_data.finished_downloading:
            print("skipping {} as it's already completely downloaded.".format(
                project_data.project))
            continue
        batch_start_id = project_data.start_id
        batch_end_id = batch_start_id + batchsize
        while batch_start_id < project_data.end_id:
            fetch_mails_range(
                base_url=base_url,
                list_id=project_data.list_id,
                start_id=batch_start_id,
                # do not request past the last known message id
                end_id=min(batch_end_id, project_data.end_id),
                storage_path=storage_path)
            batch_start_id += batchsize
            batch_end_id += batchsize
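
# The projects CSV is expected to provide at least the columns accessed
# above: project, list_id, start_id, end_id, finished_downloading (the
# skipfooter=1 drops a trailing footer row). A hypothetical row, for
# illustration only:
#   project,list_id,start_id,end_id,finished_downloading
#   example,gmane.comp.example.devel,1,35000,False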


def transform_to_mboxo(projects_data_filename, storage_path, from_line):
    """Quote spurious "From " lines so the file becomes valid mboxo.

    Gmane's raw download uses one constant envelope line; every other line
    starting with "From " is message body and gets the mboxo ">" prefix.
    """
    projects_data = pd.read_csv(projects_data_filename, skipfooter=1, engine='python')
    for row_id, project_data in projects_data.iterrows():
        raw_project_messages_filename = os.path.join(
            storage_path,
            'raw',
            project_data.list_id + '.mbox')
        mboxo_project_messages_filename = os.path.join(
            storage_path,
            'mboxo',
            project_data.list_id + '.mbox')
        with open(raw_project_messages_filename, 'r') as raw_msgs:
            with open(mboxo_project_messages_filename, 'w') as mboxo_msgs:
                for line in raw_msgs:
                    # keep the known envelope separator, quote everything else
                    if line.startswith('From ') and line[5:].strip() != from_line:
                        mboxo_msgs.write('>' + line)
                    else:
                        mboxo_msgs.write(line)
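
# Illustration (hypothetical body line, not from the source): with the
# envelope above set to from_line, a body line such as
#   "From my point of view, ..."
# would be rewritten to
#   ">From my point of view, ..."
# so that mailbox.mbox no longer mistakes it for a message separator.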


def transform_to_mboxo_generic(projects_data_filename, storage_path, from_line):
    """Like transform_to_mboxo, but without assuming a fixed envelope line.

    A "From " line is treated as a real message separator only when the next
    line is a "From:" header; otherwise it is quoted. (`from_line` is kept
    for signature compatibility but is not used here.)
    """
    projects_data = pd.read_csv(projects_data_filename, skipfooter=1, engine='python')
    for row_id, project_data in projects_data.iterrows():
        raw_project_messages_filename = os.path.join(
            storage_path,
            'raw',
            project_data.list_id + '.mbox')
        mboxo_project_messages_filename = os.path.join(
            storage_path,
            'mboxo',
            project_data.list_id + '.mbox')
        with open(raw_project_messages_filename, 'r') as raw_msgs:
            with open(mboxo_project_messages_filename, 'w') as mboxo_msgs:
                previous_line = None
                for line in raw_msgs:
                    if previous_line is not None:
                        if previous_line.startswith('From '):
                            if line.startswith('From:'):
                                # valid separator: followed by a From: header
                                mboxo_msgs.write(previous_line)
                            else:
                                # body line masquerading as a separator
                                mboxo_msgs.write('>' + previous_line)
                                print(previous_line)
                        else:
                            # ordinary line, copy through
                            mboxo_msgs.write(previous_line)
                    previous_line = line
                # the look-ahead loop leaves the final line unwritten
                if previous_line is not None:
                    mboxo_msgs.write(previous_line)


def extract_headers_only(projects_data_filename, storage_path):
    """Parse each mboxo file and pickle a list of per-message header dicts."""
    projects_data = pd.read_csv(projects_data_filename, skipfooter=1, engine='python')
    for row_id, project_data in projects_data.iterrows():
        mboxo_project_messages_filename = os.path.join(
            storage_path,
            'mboxo',
            project_data.list_id + '.mbox')
        headers_project_messages_filename = os.path.join(
            storage_path,
            'header',
            project_data.list_id + '.pkl')
        mbox = mailbox.mbox(mboxo_project_messages_filename)
        # dict(message) keeps only the headers, dropping the payload
        headers = [dict(m) for m in mbox]
        with open(headers_project_messages_filename, 'wb') as headers_dump:
            pickle.dump(headers, headers_dump)
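
# Sketch (not part of the original pipeline): the pickled headers can be
# loaded back into a DataFrame for analysis, e.g.
#   with open(headers_project_messages_filename, 'rb') as headers_dump:
#       headers_df = pd.DataFrame(pickle.load(headers_dump))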


def count_mails(projects_data_filename, storage_path, from_line):
    """Count messages per list and verify every envelope line matches."""
    projects_data = pd.read_csv(projects_data_filename, skipfooter=1, engine='python')
    for row_id, project_data in projects_data.iterrows():
        project_messages_filename = os.path.join(
            storage_path,
            'mboxo',
            project_data.list_id + '.mbox')
        mbox = mailbox.mbox(project_messages_filename)
        i = 0
        for m in mbox:
            i += 1
            # every message should carry the constant Gmane envelope line
            assert m.get_from() == from_line
        print("{} contains {} mails".format(project_data.list_id, i))


def fetch_gmane_ids(projects_data_filename, gmane_news_url, storage_path):
    """Fetch NNTP overview data (subject, author, date, ...) for each list."""
    news_server = nntplib.NNTP(gmane_news_url)
    projects_data = pd.read_csv(projects_data_filename, skipfooter=1, engine='python')
    for _, project_data in projects_data.iterrows():
        project_overview_filename = os.path.join(
            storage_path,
            'nntp_overview',
            project_data.list_id + '.pkl')
        print('connecting to {}'.format(project_data.list_id))
        # select the newsgroup that mirrors this mailing list
        resp, _, _, _, _ = news_server.group(project_data.list_id)
        print(resp)
        print('fetching overviews...')
        resp, overviews = news_server.over((project_data.start_id, project_data.end_id))
        print(resp)
        with open(project_overview_filename, 'wb') as overviews_dump:
            pickle.dump(overviews, overviews_dump)
    news_server.quit()
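
# Sketch (assumes Python 3's nntplib): each entry in `overviews` is an
# (article_number, fields) pair, so a dump can be inspected with e.g.
#   for art_num, fields in overviews:
#       print(art_num, fields.get('subject'))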


if __name__ == '__main__':
    storage_path = '/home/zormit/bigdata/innovation-thesis/'
    projects_data_filename = '/home/zormit/ownCloud/Uni/msemester5/innovation-thesis/data/projects.csv'
    gmane_base_url = 'http://download.gmane.org'
    gmane_news_url = 'news.gmane.org'
    gmane_from_line = '[email protected] Tue Mar 04 03:33:20 2003'
    # full pipeline: download, sanity-check, normalize, extract metadata
    fetch_mails(projects_data_filename, gmane_base_url, storage_path)
    count_mails(projects_data_filename, storage_path, gmane_from_line)
    transform_to_mboxo(projects_data_filename, storage_path, gmane_from_line)
    transform_to_mboxo_generic(projects_data_filename, storage_path, gmane_from_line)
    extract_headers_only(projects_data_filename, storage_path)
    fetch_gmane_ids(projects_data_filename, gmane_news_url, storage_path)