-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathTweetReporting.py
242 lines (153 loc) · 5.02 KB
/
TweetReporting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import csv
import datetime
import json
import operator
import random
import sys
from datetime import timedelta  # was "import timedelta", which raises ModuleNotFoundError
# Accepts two strings a and b
# Returns a float of the Jaccard distance between the two word sets, as computed
# from the formula given in the assignment description:
#   1 - |intersection| / |union|
def jaccard_distance(a, b):
    # Normalize both texts: strip twitter tags, then punctuation and case.
    left = process_text(remove_tags(a))
    right = process_text(remove_tags(b))
    combined = union(left, right)
    shared = intersect(left, right)
    # Two empty word sets are treated as maximally distant.
    if not combined:
        return 1.0
    return 1.0 - float(len(shared)) / float(len(combined))
# Accepts a string text
# Returns the string without any "RT" or "@handle" tags or links (words beginning
# with "http"). Hashtag characters are removed but the text after the # is kept.
# Every kept word is followed by a single space, so a non-empty result ends in " ".
def remove_tags(text):
    kept = []
    for word in text.split():
        if word.startswith('#'):
            # Drop the hashtag marker but keep the tag text itself.
            kept.append(word[1:])
        elif not word.startswith(("RT", "@", "http")):
            # startswith with a tuple replaces the original bitwise "|" chain,
            # which did not short-circuit and relied on fragile precedence.
            kept.append(word)
    # join instead of repeated "+" keeps this linear; one trailing space per
    # word matches the original output format exactly.
    return "".join(w + " " for w in kept)
# Accepts a string s
# Returns the string with all punctuation removed and with all characters
# converted to lower case; only alphanumeric characters and spaces are kept.
def process_text(s):
    # "or" replaces the original bitwise "|" on booleans: it short-circuits
    # and reads as the boolean condition it actually is.
    return ''.join(ch.lower() for ch in s if ch.isalnum() or ch == " ")
# Accepts two strings a and b
# Returns the union of the two word sets: a set holding every word that
# appears in either string (whitespace-delimited).
def union(a, b):
    return set(a.split()) | set(b.split())
# Accepts two strings a and b
# Returns the intersection of the two word sets: a set holding every word
# that appears in both strings (whitespace-delimited).
def intersect(a, b):
    return set(a.split()) & set(b.split())
# Accepts an iterable of lines f (e.g. an open file)
# Returns each line of f as an int, excluding trailing whitespace and any
# trailing commas; blank lines are skipped.
# Used for reading in the list of twitter IDs in InitialSeeds.txt
def get_ids(f):
    nums = []
    for line in f:
        # rstrip(",") strips the trailing comma without manual slicing.
        line = line.rstrip().rstrip(",")
        # Guard against blank lines: line[-1] previously raised IndexError
        # on an empty line (e.g. a trailing newline at end of file).
        if line:
            nums.append(int(line))
    return nums
# Accepts the name of a CSV file of tweets
# Returns a list of the tweet-text column (index 2) of every row after the
# header row.
def read_tweets(filename):
    with open(filename, 'r') as file:
        csvreader = csv.reader(file, delimiter=",", quotechar='"')
        # Comprehension replaces the append loop; the unused counter "i = 1"
        # in the original has been removed.
        tweets = [row[2] for row in csvreader]
    # Drop the header row.
    return tweets[1:]
# Goes through the given file line by line and returns a list of tuples: the
# first element is the date as a datetime object, the second is the sentiment
# score as a float. The first line of the file (the header) is skipped.
def parse_predictions(filename):
    results = []
    with open(filename, 'r') as file:
        next(file, None)  # skip the header line
        for line in file:
            fields = line.split()
            # The date field looks like "YYYY-MM-DD"; strip any stray
            # non-alphanumeric characters (quotes, BOM, ...) before parsing.
            pieces = fields[0].split("-")
            year = int("".join(ch for ch in pieces[0] if ch.isalnum()))
            month = int("".join(ch for ch in pieces[1] if ch.isalnum()))
            day = int("".join(ch for ch in pieces[2] if ch.isalnum()))
            results.append((datetime.datetime(year, month, day), float(fields[1])))
    return results
# Accepts the name of the company and a threshold for magnitude of movement
# Returns all (date, movement) pairs for which the predicted stock price
# movement magnitude for that company exceeds the threshold
def get_dates(company, threshold):
    predictions = parse_predictions(company + "-predict.txt")
    return [(day, move) for day, move in predictions if abs(move) > threshold]
# Accepts a list of tweet strings
# Returns a dict mapping each distinct tweet text to the number of times it
# occurs in the list
def get_tweets_dict(tweets_text):
    counts = {}
    for tweet in tweets_text:
        # dict.get avoids the original's separate membership test followed by
        # a second lookup on the same key.
        counts[tweet] = counts.get(tweet, 0) + 1
    return counts
# Accepts the name of the company, a list of (date, movement) tuples, and the
# number of top tweets to report per date
# For each date, appends to the output the text of the most frequent tweets
# from that day's CSV file
# Returns the output as a string
def get_output(company, dates, num_tweets):
    parts = []
    for date, movement in dates:
        day_str = str(date.year) + "-" + str(date.month) + "-" + str(date.day)
        # NOTE(review): directory is hard-coded to "Tesla_csv/" even for other
        # companies -- confirm whether each company has its own folder.
        filename = "Tesla_csv/" + company + "-" + day_str + ".csv"
        tweets_text = []
        for t in read_tweets(filename):
            r = remove_tags(t)
            if len(r) > 0:
                tweets_text.append(r)
        tweets_dict = get_tweets_dict(tweets_text)
        sorted_tweets = sorted(tweets_dict.items(), key=operator.itemgetter(1), reverse=True)
        parts.append(company + " on " + day_str + "\n")
        parts.append("Predicted movement of " + str(movement) + "\n")
        # Header reflects num_tweets instead of the hard-coded "15", which
        # contradicted the parameter when a different count was passed.
        parts.append(str(num_tweets) + " Most Common Topics of Conversation\n")
        # Slicing instead of range(num_tweets) avoids an IndexError when a day
        # has fewer than num_tweets distinct tweets.
        for text, count in sorted_tweets[:num_tweets]:
            parts.append(str(count) + "x: " + text + "\n")
        parts.append("\n\n\n")
    # join keeps accumulation linear instead of quadratic string "+".
    return "".join(parts)
# Entry point: reads optional threshold and tweet-count command-line arguments
# (defaults 0.1 and 15), gathers the significant dates and most common tweets
# for each company, and writes the report to SignificantEvents.txt
def main(argv):
    companies = ["Tesla"] #["Apple", "Google", "Tesla", "Facebook", "Intel", "Tmobile", "Amazon"]
    if len(argv) == 3:
        threshold = float(argv[1])
        num_tweets = int(argv[2])
    else:
        # Defaults used when the two arguments are not supplied.
        threshold = 0.1
        num_tweets = 15
    sections = [get_output(company, get_dates(company, threshold), num_tweets)
                for company in companies]
    with open("SignificantEvents.txt", 'w') as file:
        file.write("".join(sections))

if __name__ == "__main__":
    main(sys.argv)