-
Notifications
You must be signed in to change notification settings - Fork 0
/
collectData.py
executable file
·133 lines (111 loc) · 4.23 KB
/
collectData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
import re
import pycurl
import sqlite3
import string
import StringIO
import sys
#Reports whether a message carried an attachment other than PGP signatures
def messageHasAttachment(contents):
    """Return "yes" or "no" depending on the scrubbed-attachment markers.

    contents is the text of one archived message page.  The answer is "no"
    when the page has no "non-text attachment was scrubbed..." notice, or
    when the number of scrubbed-attachment notices equals the number of
    "pgp-signature" occurrences (every scrubbed part is then taken to be a
    signature).  Otherwise the answer is "yes".
    """
    scrubNotice = "non-text attachment was scrubbed..."
    scrubPhrase = "non-text attachment was scrubbed"
    signatureTag = "pgp-signature"

    if scrubNotice not in contents:
        return "no"

    scrubbedTotal = contents.count(scrubPhrase)
    signatureTotal = contents.count(signatureTag)
    return "yes" if scrubbedTotal != signatureTotal else "no"
#Extracts metadata from a single message
def parseURL(URL):
    """Download one archived message page and return metadata about it.

    Args:
        URL: address of the message page to fetch with cURL.

    Returns:
        A dict with these keys:
          "length"     -- len() of the downloaded page
          "attachment" -- "yes"/"no" as reported by messageHasAttachment
          "day"        -- the run of letters right after the first "<I>"
                          that is followed, on the same line, by an
                          8-character run of digits/colons
          "time"       -- that 8-character run of digits/colons
        "day" and "time" are both None when the page has no such line.

    Any exception raised by the cURL transfer propagates to the caller;
    the cURL handle is closed either way.
    """
    messageData = {}

    pageBuffer = StringIO.StringIO()
    curl = pycurl.Curl()
    try:
        curl.setopt(pycurl.URL, "%s" % URL)
        curl.setopt(pycurl.WRITEFUNCTION, pageBuffer.write)
        curl.perform()
    finally:
        # release the handle even when the transfer fails
        curl.close()
    contents = pageBuffer.getvalue()

    messageData["length"] = len(contents)
    messageData["attachment"] = messageHasAttachment(contents)

    dateTimeRE = re.compile(r"<I>([a-zA-Z]+).*?([0-9:]{8})")
    match = dateTimeRE.search(contents)
    if match is None:
        # page without a recognisable date line: report the fields as
        # missing instead of failing on match.group()
        messageData["day"] = None
        messageData["time"] = None
    else:
        messageData["day"] = match.group(1)
        messageData["time"] = match.group(2)
    return messageData
if __name__ == "__main__":
    # Script entry point: scrape the August 2011 Insight-users thread index,
    # collect per-thread metadata, store it in MailingListData.db and print
    # the stored rows.

    #create an empty table with the appropriate columns
    #note that this discards the data of any previous run, so move the old
    #database file first if you want to keep it
    try:
        conn = sqlite3.connect('MailingListData.db')
        sqlite = conn.cursor()
        # IF EXISTS: a fresh database file has no table to drop yet
        sqlite.execute("DROP TABLE IF EXISTS MailingListData")
        sqlite.execute("CREATE TABLE MailingListData(Message_Subject TEXT, Author TEXT, Received_Reply TEXT, Time_of_Day TEXT, Day_of_Week TEXT, Message_Length INTEGER, Any_Attachments TEXT, Archive_URL TEXT)")
    except sqlite3.Error as e:
        print("Error 1 occurred: %s" % e.args[0])
        sys.exit(1)

    #initialize our data structure.
    #This maps each subject to the data parsed from the first message
    #seen with that subject.
    subjects = {}

    #regular expression to get the message link and subject
    subjectRE = re.compile(r"HREF=\"(.*?)\">\[Insight-users\]\s(.*)$")

    #regular expression to get author
    authorRE = re.compile(r"<I>(.*)$")

    baseURL = "http://www.itk.org/pipermail/insight-users/2011-August/"

    #use cURL to get a list of messages to parse
    listURL = baseURL + "thread.html"
    listBuffer = StringIO.StringIO()
    curl = pycurl.Curl()
    try:
        curl.setopt(pycurl.URL, "%s" % listURL)
        curl.setopt(pycurl.WRITEFUNCTION, listBuffer.write)
        curl.perform()
    finally:
        # release the handle even when the transfer fails
        curl.close()

    #parse the list of messages
    contents = listBuffer.getvalue().split('\n')
    for i, line in enumerate(contents):
        #skip lines that don't contain data about messages sent to the mailing list
        if "[Insight-users]" not in line:
            continue

        #get the subject of this message
        subjectMatch = subjectRE.search(line)

        #get the author of this message (two lines later)
        authorMatch = None
        if i + 2 < len(contents):
            authorMatch = authorRE.search(contents[i + 2])

        #skip entries whose markup does not look as expected rather than
        #aborting the whole run
        if subjectMatch is None or authorMatch is None:
            sys.stderr.write("Skipping unparseable entry: %s\n" % line)
            continue

        URL = baseURL + subjectMatch.group(1)
        subject = subjectMatch.group(2).replace("\t", " ")
        author = authorMatch.group(1)

        #is this a reply to a message that we've already seen?
        if subject in subjects:
            messageData = subjects[subject]
            #is it still waiting for a proper reply?
            if messageData["reply"] != "yes":
                #is the reply from the original author?
                if messageData["author"] == author:
                    messageData["reply"] = "self only"
                else:
                    messageData["reply"] = "yes"
        else:
            messageData = parseURL(URL)
            messageData["author"] = author
            messageData["URL"] = URL
            messageData["reply"] = "no"
            subjects[subject] = messageData

    try:
        for subject, data in subjects.items():
            #reorganize the data into the list that SQLite expects
            row = [subject, data["author"], data["reply"], data["time"], data["day"], data["length"], data["attachment"], data["URL"]]
            print(row)
            sqlite.execute("INSERT INTO MailingListData(Message_Subject, Author, Received_Reply, Time_of_Day, Day_of_Week, Message_Length, Any_Attachments, Archive_URL) VALUES(?,?,?,?,?,?,?,?)", row)
        conn.commit()
    except sqlite3.Error as e:
        print("Error 2 occurred: %s" % e.args[0])
        sys.exit(1)

    #read everything back and print it
    try:
        sqlite.execute("SELECT * FROM MailingListData")
    except sqlite3.Error as e:
        # distinct label so this failure is not confused with table creation
        print("Error 3 occurred: %s" % e.args[0])
        sys.exit(1)

    for row in sqlite:
        print(row)

    sqlite.close()
    conn.close()