-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyse_hdfs_results.py
178 lines (142 loc) · 8.77 KB
/
analyse_hdfs_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/python
# SCRIPT 10: Analyse HDFS output, provide visualisations and insights.
# DES: Having completed 2 sentiment analyses using hadoop, this script
# reads in the results of the analysis to highlight insights, and to visualise results.
# BY: Tiernan Barry, x19141840 - NCI.
# Libraries:
import pandas as pd
import functions_tweet_mapreduce as fns
import matplotlib.pyplot as plt
from tabulate import tabulate
from matplotlib.pyplot import ion
ion() # enables interactive mode
# Installations (if needed):
# import missingno as msno
##########################################################################
# Extract: Read in data locally (after being called from HDFS in script 7)
##########################################################################
# Both result files are read from the current working directory.
hdfs_date_results = pd.read_csv("./date_apr9.csv")
hdfs_acc_results = pd.read_csv("./account_apr9.csv")
##########################################################################
#. Transform:
##########################################################################
# -- Fix headings: replace whatever header read_csv picked up with the column
# -- names used by the rest of this script (12 columns per file).
hdfs_date_results.columns = ['DATE', 'SOURCE', 'MEAN_SENT', 'STND_DEV_SENT', 'MEDIAN_SENT',
                             'MIN_SENT', 'MAX_SENT', 'FAVS_PER_TWEETS', 'RT_PER_TWEET',
                             'CORR_FAV_SENT', 'CORR_RT_SENT', 'TWEETS_PER_DATE']
hdfs_acc_results.columns = ['SOURCE', 'MEAN_SENT', 'STND_DEV_SENT', 'MEDIAN_SENT', 'MIN_SENT',
                            'MAX_SENT', 'MAX_FLWR', 'FAV_TO_FLWR', 'RT_TO_FLWR',
                            'CORR_FAV_SENT', 'CORR_RT_SENT', 'TWEETS_PER_ACC']
# -- Ensure no duplicates:
# drop_duplicates() returns a new frame rather than modifying in place, so the
# result has to be assigned back or the duplicates are silently kept.
# reset_index(drop=True) keeps the integer row labels contiguous after rows are
# removed, so label-based and positional lookups further down still agree.
hdfs_date_results = hdfs_date_results.drop_duplicates().reset_index(drop=True)
hdfs_acc_results = hdfs_acc_results.drop_duplicates().reset_index(drop=True)
##########################################################
#. Analysis 1: Dataset reduced by date: date_apr9.csv
#. 1. Plot number of tweets by date.
#. 2. Based on above, plot summary statistics by date.
#. 3. Plot standard deviation of sentiment last X days:
#. 4. Plot Daily Favourite:Followers ratio vs RT:Followers ratio by Date:
#. 5. Plot Correlation between sentiment and Favourites/RT for last X days:
##########################################################
# NOTE(review): plot_x_last_x_days is defined in functions_tweet_mapreduce. From
# the call sites it takes (frame, column, number of days, title, y-label[, pct_ch]);
# confirm its exact behaviour against that module.
#. 1. Plot number of tweets by TWEETS_PER_DATE
fns.plot_x_last_x_days(hdfs_date_results, "TWEETS_PER_DATE", len(hdfs_date_results), "Number of Tweets by Day", "Number of Tweets")
# -- Notes from above graph:
# -- Total days = 1337, using 20 days means there are around 2k+ tweets per day.
# -- Total days = 1337, using 30 days means there are around 1300+ tweets per day.
# -- Total days = 1337, using 40 days means there are around 500 tweets per day.
# -- Total days = 1337, using 70 days means there are around 200+ tweets per day.
# -- Total days = 1337, using 100 days means there are around 50-100+ tweets per day.
# -- Total days = 1337, between 1280 and 1250 not much difference - between 250-50+ tweets per day.
#. 2. Plot Mean Sentiment for last X days:
# The title is built from the window size so the label can never drift away
# from the number of days that is actually plotted.
for mean_window_days in (100, 60, 30, 20):
    fns.plot_x_last_x_days(hdfs_date_results, "MEAN_SENT", mean_window_days,
                           "Daily Mean Sentiment {} days".format(mean_window_days), "Mean Sentiment")
# Percentage-change view: the window stays at 45 as before; the title previously
# claimed "40 days" and now reports the window that is really used.
pct_change_days = 45
fns.plot_x_last_x_days(hdfs_date_results, "MEAN_SENT", pct_change_days,
                       "Daily % Change Mean Sentiment {} days".format(pct_change_days),
                       "% Change Mean Sentiment", pct_ch="YES")
#. 3. Plot standard deviation of sentiment last X days:
# Window stays at 100 as before; the title previously claimed "30 days".
std_dev_days = 100
fns.plot_x_last_x_days(hdfs_date_results, "STND_DEV_SENT", std_dev_days,
                       "Standard Deviation of Sentiment {} days".format(std_dev_days), "Standard Deviation")
#. 4. Plot Daily Favourite:Followers ratio vs RT:Followers ratio by Date:
# Take the most recent rows with tail() instead of slicing from len - 60:
# with fewer than 60 rows that start index goes negative, selects the wrong
# rows, and no longer matches a hard-coded 60-point x axis.
# NOTE(review): the plotted columns are FAVS_PER_TWEETS / RT_PER_TWEET while the
# title talks about follower ratios - confirm which wording is intended.
engagement_window = hdfs_date_results[["FAVS_PER_TWEETS", "RT_PER_TWEET"]].tail(60)
# x axis is simply 0..n-1 over however many rows are available (at most 60).
engagement_offsets = list(range(len(engagement_window)))
plt.figure()
plot_dt_fav, = plt.plot(engagement_offsets, engagement_window["FAVS_PER_TWEETS"].values)
plot_dt_rt, = plt.plot(engagement_offsets, engagement_window["RT_PER_TWEET"].values)
plt.xlabel("Number of Days")
plt.ylabel("Ratio per Day")
plt.title("Daily Favourite:Follower vs RT:Follower Ratio")
plt.legend([plot_dt_fav, plot_dt_rt], ["Mean Favourite", "Mean RT"])
#. 5. Plot Correlation between sentiment and Favourites/RT for last X days:
# tail() caps the window at the rows that exist, so the x axis and the data
# always have the same length even when there are fewer than 100 rows.
corr_window = hdfs_date_results[['CORR_FAV_SENT', 'CORR_RT_SENT']].tail(100)
corr_offsets = list(range(len(corr_window)))
plt.figure()
plot_dt_corr1, = plt.plot(corr_offsets, corr_window['CORR_FAV_SENT'].values)
plot_dt_corr2, = plt.plot(corr_offsets, corr_window['CORR_RT_SENT'].values)
plt.xlabel("Number of Days")
# These series are correlations, not ratios; the old label was carried over
# from the ratio plot.
plt.ylabel("Correlation per Day")
plt.title("Daily Correlation: Sentiment:Favourite vs Sentiment:RT")
plt.legend([plot_dt_corr1, plot_dt_corr2], ["Sentiment:Favourite", "Sentiment:RT"])
#-- Table of days where > 250 tweets, ordered by sentiment/RT correlation:
# Filter before sorting so the boolean mask and the frame share the same index;
# masking the already-sorted frame makes pandas reindex the mask and warn.
busy_days = hdfs_date_results[hdfs_date_results['TWEETS_PER_DATE'] > 250]
hdfs_date_results_dt_corr = busy_days.sort_values(['CORR_RT_SENT'])
hdfs_date_results_dt_corr = hdfs_date_results_dt_corr[['DATE', 'CORR_RT_SENT', 'TWEETS_PER_DATE']]
fig_corr_dts, ax_corr_dt = plt.subplots()
ax_corr_dt.table(cellText=hdfs_date_results_dt_corr.values, colLabels=hdfs_date_results_dt_corr.columns, loc='center')
# The previous title stopped at "Correlation between"; spell out what the table shows.
ax_corr_dt.set_title("Correlation between Sentiment and RT (days with > 250 tweets)", size=15)
# Hide the axes so only the table is drawn.
ax_corr_dt.axis('off')
##########################################################
#. Analysis 2: Dataset reduced by account: account_apr9.csv
#. -- Tweets by account = 2400
# 1. Plotting average sentiment
# 2. Plot Tables of Top X accounts and Bottom X accounts by sentiment:
# 3. Analyse engagements by account:
##########################################################
# 1. Plotting average sentiment
# NOTE(review): plot_x_last_x_days_acc comes from functions_tweet_mapreduce; it is
# handed the frame, the column name, one position per account, the account count,
# a title and a y-label - confirm its exact behaviour against that module.
account_count = len(hdfs_acc_results)
account_positions = list(range(account_count))
fns.plot_x_last_x_days_acc(
    hdfs_acc_results,
    'MEAN_SENT',
    account_positions,
    account_count,
    "Mean Sentiment per Account",
    "Mean Sentiment",
)
# Dump the full per-account table to stdout, using the frame's column names as headers.
account_table_text = tabulate(hdfs_acc_results, headers=hdfs_acc_results.columns)
print(account_table_text)
# 2. Plot Tables of Top X accounts and Bottom X accounts by sentiment:
# -- Sort by sentiment (ascending, so lowest mean sentiment comes first):
hdfs_acc_results_mean = hdfs_acc_results.sort_values(by='MEAN_SENT')
sentiment_ranking = hdfs_acc_results_mean[['SOURCE', 'MEAN_SENT']]
# head()/tail() return at most 10 rows and cope with fewer than 10 accounts;
# slicing from len - 10 produced a negative start index in that case and
# silently returned too few rows.
# The bottom slice is already in ascending order, so no second sort is needed.
bottom_x_sentiment = sentiment_ranking.head(10)
# Re-sort the top slice so the highest mean sentiment is listed first.
top_x_sentiment = sentiment_ranking.tail(10).sort_values(by=['MEAN_SENT'], ascending=False)
# -- Plot bottom X accounts:
fig_bottom, ax_bottom = plt.subplots()
ax_bottom.table(cellText=bottom_x_sentiment.values, colLabels=bottom_x_sentiment.columns, loc='center')
ax_bottom.set_title("Bottom 10 Ranked Twitter Accounts by Mean Sentiment", size=15)
ax_bottom.axis('off')  # table only, no axes
# Index by account name so the bar chart labels its bars with SOURCE.
bottom_x_sentiment = bottom_x_sentiment.set_index('SOURCE')
bottom_x_sentiment.plot(kind='bar')
# -- Plot top X accounts:
fig_top, ax_top = plt.subplots()
ax_top.table(cellText=top_x_sentiment.values, colLabels=top_x_sentiment.columns, loc='center')
ax_top.set_title("Top 10 Ranked Twitter Accounts by Mean Sentiment")
ax_top.axis('off')  # table only, no axes
top_x_sentiment = top_x_sentiment.set_index('SOURCE')
top_x_sentiment.plot(kind='bar')
# -- Close-up bar charts of the three highest and three lowest accounts:
top_x_sentiment[0:3].plot(kind='bar')
bottom_x_sentiment[0:3].plot(kind='bar')
# 3. Analyse engagements by account:
# -- Sort by CORR_RT_SENT (ascending):
hdfs_acc_results_corr = hdfs_acc_results.sort_values(['CORR_RT_SENT'])
# Middle element of the sorted column (upper middle for an even row count).
# .iloc is positional; plain [...] on this Series looks the integer up as a row
# LABEL, which after sorting returns the value of original row n/2, not the
# middle of the sorted values.
median_corr = hdfs_acc_results_corr['CORR_RT_SENT'].iloc[len(hdfs_acc_results_corr) // 2]
rt_corr_ranking = hdfs_acc_results_corr[['SOURCE', 'CORR_RT_SENT']]
# head()/tail() return at most 10 rows and stay correct with fewer than 10
# accounts, where slicing from len - 10 would start at a negative index.
# The bottom slice is already ascending, so it needs no further sorting.
bottom_x_rt_corr = rt_corr_ranking.head(10)
# Highest correlation first for the top table.
top_x_rt_corr = rt_corr_ranking.tail(10).sort_values(['CORR_RT_SENT'], ascending=False)
# -- Plot bottom X accounts:
fig_corr_botm, ax_corr_botm = plt.subplots()
ax_corr_botm.table(cellText=bottom_x_rt_corr.values, colLabels=bottom_x_rt_corr.columns, loc='center')
ax_corr_botm.set_title("Lowest Correlation between Sentiment and Engagement (RT)", size=15)
ax_corr_botm.axis('off')  # table only, no axes
# -- Plot top X accounts:
fig_corr_top, ax_corr_top = plt.subplots()
ax_corr_top.table(cellText=top_x_rt_corr.values, colLabels=top_x_rt_corr.columns, loc='center')
ax_corr_top.set_title("Highest Correlation between Sentiment and Engagement (RT)")
ax_corr_top.axis('off')  # table only, no axes
# Index by account name so the bar chart labels its bars with SOURCE.
top_x_rt_corr = top_x_rt_corr.set_index('SOURCE')
top_x_rt_corr.plot(kind='bar')
# -- Sort by CORR_FAV_SENT (ascending):
hdfs_acc_results_fav = hdfs_acc_results.sort_values(['CORR_FAV_SENT'])
# Middle element of the sorted column (upper middle for an even row count).
# Uses positional .iloc and this frame's own length; plain [...] would look the
# integer up as a row label and return an arbitrary row's value after sorting.
median_fav = hdfs_acc_results_fav['CORR_FAV_SENT'].iloc[len(hdfs_acc_results_fav) // 2]
fav_corr_ranking = hdfs_acc_results_fav[['SOURCE', 'CORR_FAV_SENT']]
# head()/tail() return at most 10 rows and stay correct with fewer than 10 accounts.
# The bottom slice is already ascending, so it needs no further sorting.
bottom_x_fav = fav_corr_ranking.head(10)
# Highest correlation first.
top_x_rt_fav = fav_corr_ranking.tail(10).sort_values(['CORR_FAV_SENT'], ascending=False)
# -- Plot bottom X accounts:
# Render the favourite-correlation ranking computed above; this table used to
# show the RT ranking (and an RT title) again.
fig_corr_botm, ax_corr_botm = plt.subplots()
ax_corr_botm.table(cellText=bottom_x_fav.values, colLabels=bottom_x_fav.columns, loc='center')
ax_corr_botm.set_title("Lowest Correlation between Sentiment and Engagement (Favourite)", size=15)
ax_corr_botm.axis('off')  # table only, no axes