# dataAnalysisRun.r
# Colin Robertson
# AI-Tweets Project
#
# Winter 2016
#***********************************************************************************
#library(twitteR)
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
library(ggplot2) # needed for the model-selection plot below
setwd("X")
options(stringsAsFactors = FALSE)
x <- read.csv("../../../../nsercCREATE/Scraping/tweets/test2.csv")
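# test2.csv is expected to provide at least the columns `text`, `created_at`
# and `location`, all of which are referenced later in this script.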
tweets <- xA1$text # NOTE: xA1 (tweets from alarm days) is created in the alarm-detection section below; run that section first, or substitute x$text to model the full corpus
# Pre-process the text in some standard ways; each step is explained in the comments below
tweets <- iconv(tweets, to = "ASCII", sub = " ") # Convert to basic ASCII to drop problem characters
tweets <- tolower(tweets) # Make everything consistently lower case
tweets <- gsub("\\brt\\b", " ", tweets) # Remove the standalone "RT" (retweet) marker so retweets become duplicates
tweets <- gsub("@\\w+", " ", tweets) # Remove user names
tweets <- gsub("http.+ |http.+$", " ", tweets) # Remove links
tweets <- gsub("[[:punct:]]", " ", tweets) # Remove punctuation
tweets <- gsub("[ |\t]{2,}", " ", tweets) # Collapse runs of spaces/tabs
tweets <- gsub("\\bamp\\b", " ", tweets) # "&" appears as "&amp;" in the raw text; punctuation removal leaves "amp" behind
tweets <- gsub("^ ", "", tweets) # Leading blanks
tweets <- gsub(" $", "", tweets) # Trailing blanks
tweets <- gsub(" +", " ", tweets) # Collapse repeated spaces
tweets <- unique(tweets) # Now get rid of duplicates
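# The same cleaning pipeline is repeated below for the alarm subsets xA1 and xA2.
# A small helper (a hypothetical refactor, not part of the original script) would
# keep the copies in sync:
clean_tweets <- function(txt) {
  txt <- iconv(txt, to = "ASCII", sub = " ")   # strip non-ASCII characters
  txt <- tolower(txt)                          # lower case
  txt <- gsub("\\brt\\b", " ", txt)            # drop the retweet marker
  txt <- gsub("@\\w+", " ", txt)               # drop user names
  txt <- gsub("http.+ |http.+$", " ", txt)     # drop links
  txt <- gsub("[[:punct:]]", " ", txt)         # drop punctuation
  txt <- gsub("[ |\t]{2,}", " ", txt)          # collapse tabs/multiple blanks
  txt <- gsub("\\bamp\\b", " ", txt)           # leftover "amp" from "&amp;"
  txt <- gsub("^ | $", "", txt)                # trim leading/trailing blanks
  gsub(" +", " ", txt)                         # collapse repeated spaces
}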
# Convert to tm corpus and use its API for some additional fun
corpus <- Corpus(VectorSource(tweets)) # Create corpus object
# Remove English stop words. This list could be greatly expanded.
# (mc.cores = 1 keeps tm_map single-threaded, which avoids parallel-backend problems on some platforms.)
corpus <- tm_map(corpus, removeWords, stopwords("en"), mc.cores=1)
# Remove numbers. This could have been done earlier, of course.
corpus <- tm_map(corpus, removeNumbers, mc.cores=1)
# Stem the words, i.e. reduce each word to its root form
corpus <- tm_map(corpus, stemDocument, mc.cores=1)
# Remove the stems associated with our search terms!
corpus <- tm_map(corpus, removeWords, c("bird", "flu", "avian", "influenza", "poultry"), mc.cores=1)
#pal <- brewer.pal(8, "Dark2")
#wordcloud(corpus, min.freq=2, max.words = 150, random.order = TRUE, col = pal)
# Now for Topic Modeling
# Get the lengths and make sure we only create a DTM for tweets with
# some actual content
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(corpus)))
dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])
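# Optional sanity check (not part of the original script): number of kept tweets
# and vocabulary size of the document-term matrix.
dim(dtm)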
# model <- LDA(dtm, 10) # Go ahead and test a simple model if you want
# Now for some topics
SEED <- sample(1:1000000, 1) # Draw a random seed; record its value if the run needs to be reproduced
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  library("Rmpfr") # high-precision arithmetic, needed when exponentiating large negative log-likelihoods
  llMed <- median(logLikelihoods)
  as.double(llMed - log(mean(exp(-mpfr(logLikelihoods, prec = precision) + llMed))))
}
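# The harmonic mean of the post-burn-in log-likelihoods approximates the marginal
# likelihood P(data | k) (Griffiths & Steyvers 2004); below, the k that maximises
# this quantity is taken as the number of topics.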
k <- 15       # 31 for the A1 alarm set; 15 (I think) for A2
burnin <- 1000
iter <- 1000
keep <- 50
maxK <- 34*4  # upper limit on the number of topics to try
# generate one topic model for each candidate number of topics
sequ <- seq(2, maxK, 1) # candidate values of k, from 2 to maxK in steps of one
fitted_many <- lapply(sequ, function(k) LDA(dtm, k = k, method = "Gibbs",
                                            control = list(burnin = burnin, iter = iter, keep = keep)))
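# Fitting one Gibbs-sampled LDA model per candidate k is slow. On Unix-alikes,
# parallel::mclapply() is a drop-in replacement for lapply() if run time is a
# concern, e.g.:
# fitted_many <- parallel::mclapply(sequ, function(k)
#   LDA(dtm, k = k, method = "Gibbs",
#       control = list(burnin = burnin, iter = iter, keep = keep)),
#   mc.cores = 2)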
# extract the post-burn-in log-likelihoods from each fitted model
logLiks_many <- lapply(fitted_many, function(L) L@logLiks[-c(1:(burnin/keep))])
# compute harmonic means
hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))
# inspect
dx <- data.frame(sequ, hm_many)
png("Figxx_TopicModelsA1.png", width=12, height=8, units="in", res=300)
ggplot(dx, aes(x=sequ, y=hm_many)) + geom_line() + theme_bw() + labs(x='Number of Topics', y = 'Log Likelihood')
dev.off()
# compute optimum number of topics
sequ[which.max(hm_many)]
##
k = sequ[which.max(hm_many)]
Gibbs = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000))
terms(Gibbs, 20)
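# terms(Gibbs, 20) above lists the 20 highest-probability terms per topic;
# topics(Gibbs) below gives each document's most likely topic, and
# posterior(Gibbs) would return the full document-topic and topic-term
# probability matrices if they are needed.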
top <- topics(Gibbs)
top.df <- as.data.frame(top)
top.df$tx <- tweets[which(!doc.lengths == 0)]
png("Figxx_TopicModelsA2_xx.png", width=12, height=8, units="in", res=300)
par(mfrow=c(4,4)) # 4 x 4 grid of word clouds; assumes k = 15 topics
for(i in 1:15) {  # NOTE: adjust the range (and the grid above) if k differs
  corpus <- Corpus(VectorSource(top.df$tx[which(top.df$top == i)])) # per-topic corpus (overwrites the full-corpus object above)
  corpus <- tm_map(corpus, removeWords, stopwords("en"), mc.cores=1)
  corpus <- tm_map(corpus, removeWords, c("bird", "flu", "avian", "influenza", "poultry"), mc.cores=1)
  pal <- brewer.pal(8, "Dark2")
  wordcloud(corpus, min.freq=2, max.words = 150, random.order = TRUE, col = pal)
}
dev.off()
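# NOTE: `models` is never defined in the original script; the diagnostics below
# expect a list of fitted models in the order CTM, VEM, VEM (fixed alpha), Gibbs.
# A minimal sketch of how such a list could be built (an assumption, following
# the topicmodels vignette, not recovered from the original code):
models <- list(
  CTM       = CTM(dtm, k = k, control = list(seed = SEED)),
  VEM       = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_fixed = LDA(dtm, k = k, control = list(seed = SEED, estimate.alpha = FALSE)),
  Gibbs     = Gibbs # reuse the Gibbs-sampled model fitted above
)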
sapply(models[2:4], slot, "alpha")
sapply(models, function(x) mean(apply(posterior(x)$topics,1, function(z) - sum(z * log(z)))))
par(mfrow=c(1,4))
hist(sapply(models[1], slot, "gamma"), xlab="P(x)", main="CTM")
hist(sapply(models[2], slot, "gamma"), xlab="P(x)", main="VEM")
hist(sapply(models[3], slot, "gamma"), xlab="P(x)", main="VEM_Fixed")
hist(sapply(models[4], slot, "gamma"), xlab="P(x)", main="Gibbs")
########################################################
#section 1 - scan statistic or tscount model
########################################################
library(plyr)
library(ggplot2)
library(zoo)
x$D <- as.POSIXct(strptime(as.character(x$created_at), "%a %b %d %H:%M:%S %z %Y"))
x$day <- as.Date(x$D)
#x$textC <- removeURL(x$text)
#x <- x[!duplicated(x$textC), ]
daily <- ddply(x, .(day), summarise, numTweets = length(day)) # count tweets per day
#figure 1****************************************************************************************************
daily$dt <- strptime(daily$day, "%Y-%m-%d")
temp.zoo <- zoo(daily$numTweets, daily$dt) # make a zoo object of the daily counts
# Calculate moving averages, padding the ends with NA so the vectors keep the same length
m.av <- rollmean(temp.zoo, 7, fill = list(NA, NULL, NA)) # 7-day window
daily$numTweets7 <- coredata(m.av)                       # add to the data frame
m.av <- rollmean(temp.zoo, 3, fill = list(NA, NULL, NA)) # 3-day window
daily$numTweets3 <- coredata(m.av)
#Add additional line for moving average in red
require(grid)
png("Fig1_timeSeries.png", width=12, height=8, units="in", res=300)
ggplot(daily, aes(dt)) +
  geom_line(aes(y=numTweets, colour="Daily"), size=1) +
  stat_smooth(aes(y=numTweets)) +
  geom_line(aes(y=numTweets3, colour="3 Day Moving \nAverage"), size=1) +
  geom_line(aes(y=numTweets7, colour="7 Day Moving \nAverage"), size=1) +
  xlab("Day") + ylab("Daily AI-related Tweets") +
  scale_colour_manual("Lines", values=c("Daily"="black",
                                        "3 Day Moving \nAverage"="red",
                                        "7 Day Moving \nAverage"="grey")) +
  theme_bw() +
  theme(legend.title = element_blank(), legend.key.size = unit(1.5, "cm"))
dev.off()
png("Fig1_timeSeries_noSmooth.png", width=12, height=8, units="in", res=300)
ggplot(daily, aes(dt)) +
  geom_line(aes(y=numTweets, colour="Daily"), size=1) +
  geom_line(aes(y=numTweets3, colour="3 Day Moving \nAverage"), size=1) +
  geom_line(aes(y=numTweets7, colour="7 Day Moving \nAverage"), size=1) +
  xlab("Day") + ylab("Daily AI-related Tweets") +
  scale_colour_manual("Lines", values=c("Daily"="black",
                                        "3 Day Moving \nAverage"="red",
                                        "7 Day Moving \nAverage"="grey")) +
  theme_bw() +
  theme(legend.title = element_blank(), legend.key.size = unit(1.5, "cm"))
dev.off()
# NOTE: oieS (the subset of OIE reports) is built in the OIE section below; run
# that block before producing this figure.
png("Fig1x_timeSeries_noSmooth_OIE.png", width=12, height=8, units="in", res=300)
ggplot(daily, aes(dt)) +
  geom_line(aes(y=numTweets, colour="Daily"), size=1) +
  geom_line(aes(y=numTweets3, colour="3 Day Moving \nAverage"), size=1) +
  geom_line(aes(y=numTweets7, colour="7 Day Moving \nAverage"), size=1) +
  geom_point(data=oieS, aes(x=D, y=rep(2000, 171))) + # one point per OIE report
  xlab("Day") + ylab("Daily AI-related Tweets") +
  scale_colour_manual("Lines", values=c("Daily"="black",
                                        "3 Day Moving \nAverage"="red",
                                        "7 Day Moving \nAverage"="grey")) +
  theme_bw() +
  theme(legend.title = element_blank(), legend.key.size = unit(1.5, "cm"))
dev.off()
#figure 1****************************************************************************************************
oie <- read.csv("OIEOutbreaks.csv")
library(stringr)
oie$Yr <- as.numeric(paste('20', str_sub(oie$Day, -2), sep=""))
oie$Mo <- as.numeric(str_sub(str_extract(oie$Day, "/.*/"), 2, -2))
oie$D <- as.numeric(str_sub(oie$Day, 1, str_locate(oie$Day, "/")[, 1] - 1)) # day of month precedes the first slash
oie$day <- paste(oie$D, oie$Mo, oie$Yr, sep="/")
oie$D <- strptime(as.character(oie$day), "%d/%m/%Y")
oie$Day <- as.Date(oie$D)
oieS <- subset(oie, (Mo >= 10 & Virus.Type != "") | (Yr == 2016 & Virus.Type != ""))
oieS$ReportType[which(str_trim(oieS$ReportType) == "Folow-up")] <- "Follow-up" #spelling mistake
oieS$Mo <- months(oieS$D)
oieM <- ddply(oieS[,1:9], c("Mo", "Virus.Type"), summarise, numReports = length(Link)) # count reports per month and virus type
oieM$Month <- factor(oieM$Mo, levels=c("October", "November", "December", "January", "February", "March"))
#png("Fig2x_timeSeries_OIE.png", width=12, height=8, units="in", res=300)
# ggplot(oieS, aes(D)) + geom_point(aes(y=factor(trim(ReportType)), colour=factor(Virus.Type)), size=5) + theme_bw() + labs(x="Date of Report", y="Report Type") + scale_colour_grey(name = "Virus Type")
#dev.off()
png("Fig2x_timeReports_OIE.png", width=12, height=8, units="in", res=300)
ggplot(oieM, aes(x = Month, y = numReports, fill=Virus.Type)) + geom_bar(stat = "identity") + theme_bw() + labs(y = "Number of OIE Reports") + guides(fill=guide_legend(title="Virus Type"))
dev.off()
x$MM <- months(x$day)
oiM <- ddply(oieS[,1:9], c("Mo"), summarise, numReports = length(Link)) # count OIE reports per month
TwM <- ddply(x, c("MM"), summarise, numReports = length(day)) # count tweets per month
options(scipen = 999)
library(tscount)
err_ts = ts(as.vector(daily$numTweets), frequency = 1)
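# Fit count time-series (INGARCH-type) regressions of today's tweet count on the
# previous 5 days, once with a negative binomial and once with a Poisson
# response, so the two error distributions can be compared.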
err_fit = tsglm(err_ts, model = list(past_obs = 1:5, past_mean = NULL, external = FALSE), distr = 'nbinom')
err_fitP = tsglm(err_ts, model = list(past_obs = 1:5, past_mean = NULL, external = FALSE), distr = 'poisson')
pit(err_fit)
scoring(err_fit)
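# pit() draws a probability integral transform histogram for checking the
# predictive distribution, and scoring() returns proper scoring rules for the
# fitted model. A side-by-side comparison of the two fits (illustrative addition):
rbind(nbinom = scoring(err_fit), poisson = scoring(err_fitP))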
#check for autocorrelation structure
png("Figxx_acf_TW.png", width=8, height=5, units="in", res=300)
acf(err_ts, xlab="Lag (Days)", main="")
dev.off()
# plot the observed series with a simple threshold (mean + 2 standard errors; 147 = number of days) and mark exceedances
png("Figxx_ts_TW.png", width=8, height=5, units="in", res=300)
ts.plot(err_ts, gpars=list(col=c('black'), lwd=c(2)), ylab='AI-Related Tweets', xlab='Day')
abline(h=mean(err_ts) + (2 * (sd(err_ts)/sqrt(147))), col="blue", lwd=2)
points(which(err_ts > (mean(err_ts) + (2 * (sd(err_ts)/sqrt(147))))),
       as.numeric(err_ts[which(err_ts > (mean(err_ts) + (2 * (sd(err_ts)/sqrt(147)))))]), pch=20)
dev.off()
#use model estimates to find alarms using the cusum algorithm
library(surveillance)
disProgObj <- create.disProg(week=1:147, observed = daily$numTweets, state=rep(0,147))
kh <- find.kh(ARLa = 365, ARLr = 7, sided = "one", method = "BFGS", verbose=FALSE) # reference value k and decision limit h for the CUSUM
res <- algo.cusum(disProgObj, control = list(range = 1:147, trans="pearsonNegBin", m=fitted(err_fit), alpha = 0.433, k=0.56937, h=3.668)) # k and h hard-coded (presumably taken from an earlier find.kh run)
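# res$alarm flags the days on which the CUSUM statistic crosses the decision
# limit h, and res$upperbound gives the corresponding alarm threshold; both are
# used in the plots below.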
png("Figxx_cusum_OIE.png", width=8, height=5, units="in", res=300)
plot(res, xlab="Day", xaxis.years=FALSE, ylab='AI-Related Tweets', main='')
dev.off()
png("Figxx_cusum.png", width=8, height=5, units="in", res=300)
ts.plot(err_ts, res$upperbound, gpars=list(col=c('black', 'blue'), lwd=c(2,2)), ylab='AI-Related Tweets', xlab='Day')
points(which(res$alarm == 1), as.numeric(err_ts[which(res$alarm == 1)]), pch=20)
dev.off()
#simple alarms
xA1 <- x[x$day %in% daily[which(err_ts > (mean(err_ts) + (2 * (sd(err_ts)/sqrt(147))))), 1], ]
#model alarms
#daily[which(res$alarm==1), ]
xA2 <- x[x$day %in% daily[which(res$alarm==1), 1], ]
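# xA1 holds tweets from days flagged by the simple mean + 2 SE threshold
# (labelled 'Static' in the figures below); xA2 holds tweets from CUSUM alarm
# days (labelled 'Dynamic').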
#########^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^########################################
#########^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^########################################
######### Comparing OIE and Tweets ########################################
######### ########################################
#########^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^########################################
#########^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^########################################
oieF <- read.csv("OIEOutbreaks_final2.csv") #read.csv("OIEOutbreaks_final.csv")
oieF$Yr <- as.numeric(paste('20', str_sub(oieF$ReportDay, -2), sep=""))
oieF$Mo <- as.numeric(str_sub(str_extract(oieF$ReportDay, "/.*/"), 2, -2))
oieF$D <- as.numeric(str_sub(oieF$ReportDay, 1, str_locate(oieF$ReportDay, "/")[, 1] - 1)) # day of month precedes the first slash
oieF$newDate <- paste(oieF$D, oieF$Mo, oieF$Yr, sep="/")
oieF$newDate2 <- strptime(as.character(oieF$newDate), "%d/%m/%Y")
oieF$Day <- as.Date(oieF$newDate2)
oieF$Month <- months(oieF$Day)
oieFs <- subset(oieF, Mo >= 10 | Yr == 2016 )
oieFs$startDate <- strptime(as.character(oieFs$Start.of.Outbreak), "%d/%m/%Y")
oieFs$SDay <- as.Date(oieFs$startDate)
ggplot(oieFs, aes(startDate, reorder(Location, as.POSIXct(startDate)), colour = Location)) +
  geom_errorbarh(aes(xmin = startDate, xmax = newDate2, height = 0.2)) +
  scale_colour_discrete(guide = FALSE) +
  labs(x="Start of Outbreak to the Report Date", y = "Location of Outbreak")
dailyA1 <- ddply(xA1, .(day), summarise, numTweets = length(day)) # count tweets per static-alarm day
dailyA2 <- ddply(xA2, .(day), summarise, numTweets = length(day)) # count tweets per dynamic-alarm day
png("Figxx_AI_offical2.png", width=9, height=6, units="in", res=300)
ggplot(oieFs, aes(SDay, reorder(Location, as.POSIXct(SDay)), colour = Location)) +
  geom_vline(data=dailyA1, aes(xintercept=as.numeric(day)), colour='green') +
  geom_vline(data=dailyA2, aes(xintercept=as.numeric(day)), colour='blue') +
  geom_errorbarh(aes(xmin = SDay, xmax = Day, height = 0.2)) +
  scale_colour_discrete(guide = FALSE) +
  labs(x="Start of Outbreak to the Report Date", y = "Location of Outbreak") +
  scale_x_date()
dev.off()
write.csv(xA2, "XA2.csv")
# Clean the text of the static-alarm tweets (same steps as the main cleaning block above;
# clean_tweets() defined earlier is an equivalent helper)
xA1$tweets <- iconv(xA1$text, to = "ASCII", sub = " ") # Convert to basic ASCII to drop problem characters
xA1$tweets <- tolower(xA1$tweets) # Make everything consistently lower case
xA1$tweets <- gsub("\\brt\\b", " ", xA1$tweets) # Remove the standalone "RT" (retweet) marker
xA1$tweets <- gsub("@\\w+", " ", xA1$tweets) # Remove user names
xA1$tweets <- gsub("http.+ |http.+$", " ", xA1$tweets) # Remove links
xA1$tweets <- gsub("[[:punct:]]", " ", xA1$tweets) # Remove punctuation
xA1$tweets <- gsub("[ |\t]{2,}", " ", xA1$tweets) # Collapse runs of spaces/tabs
xA1$tweets <- gsub("\\bamp\\b", " ", xA1$tweets) # leftover "amp" from "&amp;"
xA1$tweets <- gsub("^ ", "", xA1$tweets) # Leading blanks
xA1$tweets <- gsub(" $", "", xA1$tweets) # Trailing blanks
xA1$tweets <- gsub(" +", " ", xA1$tweets) # Collapse repeated spaces
write.csv(xA1[!duplicated(xA1$tweets),], "XA1.csv")
xA1.d <- xA1[!duplicated(xA1$tweets),]
locations <- data.frame(countA1 = sort(table(xA1$location), TRUE)[1:25])
locations$LocationA1 <- row.names(locations)
locations2 <- data.frame(countA2 = sort(table(xA2$location), TRUE)[1:25])
locations2$Location <- row.names(locations2)
locations$countA2 <- locations2$countA2
locations$LocationA2 <- locations2$Location
rm(locations2)
write.csv(locations, 'locations.csv')
dailyA1$Method = 'Static'
dailyA2$Method = 'Dynamic'
dailyA <- rbind(dailyA1, dailyA2)
png("Figxx_OIE_TW2.png", width=10, height=8, units="in", res=300)
ggplot(dailyA, aes(x = day, y = numTweets)) +
  geom_point(aes(colour=Method), size=3) +
  geom_vline(data=oieFs, aes(xintercept=as.numeric(SDay)), colour='green') +
  scale_x_date(limits = as.Date(c('2015-10-1','2016-04-01'))) +
  geom_text(data = oieFs, aes(x=SDay, y=rep(1500, 35), label=Location), angle=90) +
  labs(x = 'Date', y='AI-Related Tweets')
dev.off()
# Clean the text of the dynamic-alarm tweets (same steps as above)
xA2$tweets <- iconv(xA2$text, to = "ASCII", sub = " ") # Convert to basic ASCII to drop problem characters
xA2$tweets <- tolower(xA2$tweets) # Make everything consistently lower case
xA2$tweets <- gsub("\\brt\\b", " ", xA2$tweets) # Remove the standalone "RT" (retweet) marker
xA2$tweets <- gsub("@\\w+", " ", xA2$tweets) # Remove user names
xA2$tweets <- gsub("http.+ |http.+$", " ", xA2$tweets) # Remove links
xA2$tweets <- gsub("[[:punct:]]", " ", xA2$tweets) # Remove punctuation
xA2$tweets <- gsub("[ |\t]{2,}", " ", xA2$tweets) # Collapse runs of spaces/tabs
xA2$tweets <- gsub("\\bamp\\b", " ", xA2$tweets) # leftover "amp" from "&amp;"
xA2$tweets <- gsub("^ ", "", xA2$tweets) # Leading blanks
xA2$tweets <- gsub(" $", "", xA2$tweets) # Trailing blanks
xA2$tweets <- gsub(" +", " ", xA2$tweets) # Collapse repeated spaces
write.csv(xA2[!duplicated(xA2$tweets),], "XA2.csv")
xA2.d <- xA2[!duplicated(xA2$tweets),]
xA2.d.1 <- subset(xA2.d, day == '2015-10-26')
write.csv(xA2.d.1, 'dynclust1.csv')
#paper revisions
library(stringr)
x <- read.csv("../../../../nsercCREATE/Scraping/tweets/test2.csv")
x$ct1 <- str_count(x$text, regex('bird flu', ignore_case = TRUE)) # case-insensitive counts of each search phrase per tweet
x$ct2 <- str_count(x$text, regex('avian influenza', ignore_case = TRUE))
x$ct3 <- str_count(x$text, regex('avian flu', ignore_case = TRUE))
x$ct4 <- str_count(x$text, regex('poultry disease', ignore_case = TRUE))
x$ct <- x$ct1 + x$ct2 + x$ct3 + x$ct4
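# Illustrative follow-up (not part of the original revisions): how many tweets
# mention at least one of the four search phrases?
table(x$ct > 0)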