index.Rmd

---
title: "StubbTweets"
author: Tuomo Nieminen
date: 01.02.2016
output: 
  html_document:
    theme: cosmo
    fig_caption: true
    fig_width: 8
    fig_height: 6
    code_folding: hide
    df_print: kable
---

```{r setup, include=FALSE}
# Global chunk options: echo the code, render figures 600px wide and print
# output without a comment prefix.
knitr::opts_chunk$set(
  echo = TRUE,
  out.width = "600px",
  comment = NA
)

# Project helper functions (presumably where access_libraries() lives).
source("utility/functions.R")
access_libraries(install_all = FALSE)

# Load the stored tweets of the analysed account. load() returns the name of
# the restored object, which get() then fetches.
tweeter <- "alexstubb"
filepath <- file.path("data", paste0(tweeter, "_tweets.Rda"))
tw <- get(load(filepath))

# Drop the suosio column from the raw tweet table.
tw <- dplyr::select(tw, -suosio)
```

![](figure/stubb_hyppy.png)

*Graphic by Heikki Ritaluoma* 

# Welcome

This GitHub page includes explorations of the tweets of Alexander Stubb, the former prime minister of Finland. The tweets were retrieved from the Twitter API in January 2016 and analyzed using the R programming language for statistical analysis. The code for the analysis is available at the [StubbTweets repository](https://github.com/TuomoNieminen/StubbTweets). An article including the graphics displayed here can be found [here](http://tyyppiarvo.com/2016/03/kannanottoja-ja-kokoomus-tsemppia-tassa-kaikki-mita-pitaa-tietaa-stubbin-twitterista/) (in Finnish).


## The data

```{r, include = FALSE}
# First and last tweet date plus the number of tweets. The chunk is evaluated
# with include = FALSE, so none of this is rendered.
ntweets <- nrow(tw)
mindate <- format(min(tw$created), format = "%d.%m.%Y")
maxdate <- format(max(tw$created), format = "%d.%m.%Y")
data.frame(mindate, maxdate, ntweets)
```

```{r}
# Compact overview of the columns in the tweet table.
dplyr::glimpse(tw)
```

## Monthly tweets

```{r monthly_tweets}
# Wide margins leave room for the rotated axis labels and the y-axis title.
par(mar = c(8, 9, 6, 4))

# Histogram of tweet counts with one bin per calendar month; the count is
# printed on top of every bar (labels = TRUE).
h <- hist(
  tw$created,
  breaks = "month",
  freq = TRUE,
  format = "%Y-%m",
  main = paste(tweeter, "tweets by month"),
  xlab = "",
  ylab = "",
  ylim = c(0, 400),
  labels = TRUE,
  las = 2,
  cex.axis = 1.3,
  cex.lab = 1.5,
  col = "cadetblue3"
)

# The y-axis title is drawn separately so it clears the tick labels.
mtext("tweets", side = 2, line = 5, cex = 1.5)
```

## Daily tweets

```{r daily_tweets}
# Wide margins leave room for the axis titles added with mtext() below.
par(mar = c(7, 9, 5, 3))

# One bin per calendar day. The default x axis is suppressed (xaxt = "n") so
# that only the busiest days get a tick and a label.
h <- hist(tw$created, main = paste(tweeter, "tweets by day"),
          breaks = "days", freq = TRUE,
          xlab = "", ylab = "", col = "grey55", lty = 0,
          cex.axis = 1.3, cex.lab = 1.5, tck = 0.05, xaxt = "n")

# Number of busiest days to mark on the x axis.
peaks <- 2

# Bins with the highest counts; the tick is placed at the start of each bin.
peak_bins <- order(h$counts, decreasing = TRUE)[seq_len(peaks)]
tickpos <- h$breaks[peak_bins]

# Build every label from the very bin it annotates. Sorting a separate table
# of formatted dates breaks ties alphabetically by "dd.mm.YYYY", which can
# pair a label with the wrong tick. The bin midpoint is used so the date is
# unambiguous.
# NOTE(review): assumes tw$created is POSIXct, so bin positions are seconds
# since the epoch -- confirm.
peak_days <- .POSIXct(h$mids[peak_bins], tz = attr(tw$created, "tzone"))
labels <- format(peak_days, "%d.%m.%Y")

axis(1, at = tickpos, labels = labels, cex.axis = 1.5)
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)
mtext(side = 1, text = "day", line = 3, cex = 1.5)
```

## Hourly tweets

```{r tweets_hour, out.width = "700px"}
# Tweet counts per hour of the day and each hour's share of all tweets.
hourly <- table(format(tw$created, "%H"))
hourly_share <- round(100 * hourly / sum(hourly), 1)
hourly_prc <- paste(hourly_share, "%")

par(mar = c(8, 10, 5, 3))

# barplot() returns the bar midpoints, which position the percentage labels.
bp <- barplot(
  hourly,
  main = paste(tweeter, "tweets by hour"),
  xlab = "",
  ylab = "",
  ylim = c(0, 250),
  space = 0.5,
  las = 2,
  col = "deepskyblue3",
  cex.lab = 1,
  cex.names = 1.2,
  cex.axis = 1.2
)

# Percentage above every bar, then the axis titles.
text(x = bp + 0.1, y = hourly, labels = hourly_prc, pos = 3, cex = 0.7)
mtext("tweets", side = 2, line = 5, cex = 1.3)
mtext("time", side = 1, line = 5, cex = 1.3)
```

# Most active tweet days

```{r active_days}
# Tweet counts per calendar day, busiest first; keep the five largest.
tweets_per_day <- table(Date = format(tw$created, "%Y-%m-%d"))
top5tweetdays <- sort(tweets_per_day, decreasing = TRUE)[1:5]
data.frame(top5tweetdays)
```


## Tweet times during the most active days

```{r top_days}
# Dates of the most active days, busiest first.
top5days <- names(top5tweetdays)

# Tweets of each of the four busiest days (a 2 x 2 grid is drawn below).
topdaydata <- lapply(top5days[1:4], function(day) get_datedata(tw, day))

# par() hands back the settings that were in effect before the change.
old_par <- par(mfrow = c(2, 2))

# One histogram of tweet times per day.
for (i in seq_along(topdaydata)) {
  tweet_times <- topdaydata[[i]]$created
  hist(
    tweet_times,
    breaks = 100,
    freq = TRUE,
    border = NULL,
    main = paste0(tweeter, " ", top5days[i]),
    xlab = "",
    ylab = "Tweets",
    ylim = c(0, 8),
    tck = 0,
    cex.axis = 0.8,
    cex.lab = 0.8,
    cex.main = 1
  )
}
```

## Most used hashtags

```{r hashfreq}
# Hashtag counts over all tweet texts; show the first five entries.
hashcounts <- extract_hashes(tw$text)
hash <- head(hashcounts, n = 5)
knitr::kable(hash)
```

```{r hashfreq_plot}
# A wide left margin fits the hashtag names next to the horizontal bars.
par(las = 2, mar = c(5, 14, 4, 2))

barplot(
  height = hash$freq,
  names.arg = hash$tag,
  horiz = TRUE,
  col = "skyblue",
  border = NA,
  cex.names = 1.3,
  cex.axis = 1.5
)
```

## Most used words

```{r wordclouds, fig.height=15, fig.width = 9}
# Document-term matrices, one per language.
DTM <- get(load(file.path("data", paste0(tweeter, "_DTM.Rda"))))

# Word frequencies per language, named like the matching DTM entries.
FREQ <- setNames(lapply(DTM, twitter_wordfreqs), names(DTM))

# One word cloud per language, stacked vertically. Warnings raised by
# wordcloud() are silenced.
par(mfrow = c(3, 1))
for (lang in FREQ) {
  suppressWarnings(
    wordcloud(
      words = lang$word,
      freq = lang$freq,
      scale = c(5, 1),
      random.order = FALSE,
      colors = brewer.pal(8, "Dark2")
    )
  )
}

```

# Topic model  

```{r}
# Stored topic-model data. load() returns the name of the restored object,
# which get() then fetches.
TW <- get(load(paste0("data/", tweeter, "_topicdata.Rda")))

# NOTE(review): the second element is presumably the Finnish-language tweets
# (hence tw_fi) -- confirm against the script that writes the .Rda file.
tw_fi <- TW[[2]]

# Human-readable topic names shown in the tables and plots below.
# "KokoomusTsemppi" was previously misspelled "KookoomusTsemppi".
topic_labels <- c("Kannanottoja", "SuomiNousuun", "KokoomusTsemppi",
                  "Kansanviestit")
tw_fi[["aihe"]] <- factor(tw_fi$topic, labels = topic_labels)

# Keep only the tweets that have a topic.
tw_fi <- tw_fi[!is.na(tw_fi$aihe), ]

```

## Average popularity by topic  

```{r}
library(dplyr)

# Mean and median of suosio per topic, rounded to whole numbers.
tw_fi %>%
  group_by(aihe) %>%
  summarise(
    keskisuosio = round(mean(suosio)),
    mediaanisuosio = round(quantile(suosio, probs = 0.5))
  )

```


## Number of tweets by month and popularity

```{r}

# Monthly summary built by the project helper get_summary(); the plot below
# uses its aika, tweets and keski_suosio columns.
df_summary <- get_summary(tw_fi, "kuukausi")

# One point per month: height is the number of tweets, point size follows
# keski_suosio.
q <- ggplot(df_summary, aes(x = aika, y = tweets, size = keski_suosio)) +
  geom_point() +
  ylab("tweettejä") +
  scale_x_discrete() +
  xlab("") +
  theme(
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 10)
  )
q
```


## Popularity by topics (log scale)

```{r, warning = F}
# Label factory for scale_y_log10(): returns a formatter that prints the axis
# breaks in plain, non-scientific notation.
fmt <- function() {
  function(x) {
    format(x, nsmall = 0L, scientific = FALSE)
  }
}

# Box plot of suosio per topic on a log10 y axis.
q <- ggplot(tw_fi, aes(x = aihe, y = suosio)) +
  geom_boxplot(outlier.size = 3) +
  scale_y_log10(labels = fmt())
q + theme(text = element_text(size = 15))
```

## Tweets by topic, hour and popularity

```{r}

# Jittered scatter of tunti against topic, coloured by topic name and sized
# by suosio.
q <- ggplot(tw_fi, aes(x = topic, y = tunti, colour = aihe, size = suosio)) +
  geom_point(shape = 19, position = "jitter", alpha = 0.5) +
  xlab("") +
  scale_size(range = c(1, 10)) +
  theme(
    text = element_text(size = 20),
    legend.key.size = unit(1, "cm"),
    legend.text = element_text(size = 10)
  )

# Draw the colour legend keys at a fixed size, independent of the size scale.
q + guides(colour = guide_legend(override.aes = list(size = 5)))
```

# Regression analysis

```{r}

# A tweet is flagged as a reply when its replyToSN field is not missing.
tw_fi$vastaus <- with(tw_fi, !is.na(replyToSN))

# Linear model of suosio on topic, hour and the reply flag.
my_lm <- lm(suosio ~ aihe + tunti + vastaus, data = tw_fi)
summary(my_lm)
```

# A comment from Stubb

The former prime minister commented on the analysis on Twitter.  

![](figure/stubb_response.png)