5_trump_tweet_analysis.Rmd

---
title: "Trump tweet analysis by dplyr"
output:
  html_notebook:
    number_sections: yes
    toc: yes
    toc_float:
      collapsed: no
      smooth_scroll: no
  html_document:
    toc: yes
editor_options: 
  chunk_output_type: inline
---


# Source
* Author: This case study was written by David Robinson, co-author of the book "Text Mining with R" and of the tidytext package, and a data scientist at Stack Overflow.
* GitHub source: https://github.com/dgrtwo/dgrtwo.github.com/blob/master/_R/2016-08-09-trump-tweets.Rmd
* Original article: http://varianceexplained.org/r/trump-tweets/


# Load and clean data

## Loading data

```{r}
library(dplyr)
library(tidyverse)
library(ggplot2)
# Fetch the serialized data set from the article's site. The lines below
# expect load() to have created `trump_tweets_df` in the current environment.
load(url("http://varianceexplained.org/files/trump_tweets_df.rda"))

# Quick sanity check: dimensions first, then the column names.
dim(trump_tweets_df)
colnames(trump_tweets_df)
```


## Cleaning data
```{r}
library(tidyr) # tidyr::extract()
library(stringr) # stringr::str_replace
# Peek at one raw `statusSource` value to see the markup that the regular
# expression below has to strip.
trump_tweets_df$statusSource[1]

# Keep the four columns used later, derive the posting device from
# `statusSource`, and retain only tweets sent from an iPhone or an Android
# phone. stringr::str_replace() is used here in place of
#   tidyr::extract(statusSource, "source", "Twitter for (.*?)<")
# When the pattern does not match, str_replace() returns the input unchanged,
# so such rows are removed by the filter() step.
tweets <- trump_tweets_df %>%
  dplyr::select(id, statusSource, text, created) %>%
  mutate(
    source = str_replace(statusSource, ".*Twitter for (.*?)<.*", "\\1")
  ) %>%
  filter(source %in% c("iPhone", "Android"))

str(tweets)
```


## Analyzing data
```{r}
library(lubridate)
library(scales)

# Share of each device's tweets posted in each hour of the day.
# Hours are expressed in the fixed "EST" zone (no daylight saving shift).
tweets %>%
  mutate(hour = hour(with_tz(created, "EST"))) %>%
  count(source, hour) %>%
  # Normalise within each device so the two lines are comparable. count() on
  # ungrouped input returns an ungrouped result in current dplyr, so without
  # the explicit grouping the denominator would be the tweets of both
  # devices combined.
  group_by(source) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot() +
  aes(hour, percent, color = source) +
  geom_line() +
  scale_y_continuous(labels = percent_format())


## With Pictures or Not
```{r}
library(stringr)
# Number of tweets with and without a shortened link (picture or URL),
# per device. Tweets that start with a double quote are dropped first.
tweets %>%
  filter(!str_detect(text, '^"')) %>%
  # The dot is escaped and the trailing slash required: the bare pattern
  # "t.co" treats "." as a wildcard and therefore also matches ordinary text
  # such as "great country".
  mutate(picture = ifelse(str_detect(text, "t\\.co/"),
                          "With pic/link", "No pic/link")) %>%
  count(source, picture) %>%
  ggplot() +
  aes(source, n, fill = picture) +
  geom_col(position = "dodge")
```


## Comparison of words
```{r}
library(tidytext)	# unnest_tokens()
library(stringr)	# str_detect(), str_replace_all()


# One row per word per tweet.
#  1. drop tweets that start with a double quote;
#  2. strip shortened links and the HTML entity "&amp;" from the text
#     (the dot in "t.co" is escaped so that it only matches a literal dot);
#  3. split the text into lower-cased word tokens;
#  4. remove stop words and tokens that contain no letter.
tweet_words <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t\\.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text) %>%
  # Alternative tokenizer that keeps hashtags and @mentions intact:
  # unnest_tokens(word, text, token = "regex", pattern = "[^A-Za-z\\d#@']") %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

# View() needs an interactive session; show the first rows instead when the
# document is rendered non-interactively.
if (interactive()) {
  View(tweet_words)
} else {
  head(tweet_words)
}
```

```{r test: stop_word}
# Print the stop word vector that is used to filter the tokens.
stop_words[["word"]]
```


```{r}
# Horizontal bar chart of the 20 most frequent words, ordered by count.
tweet_words %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot() +
  aes(x = word, y = n) +
  geom_col() +
  coord_flip() +
  labs(y = "Occurrences")
```


## Word frequency by device
```{r}

# Word counts per device in wide form (one row per word, one column per
# device).
# The rarity threshold is applied to the word's total across both devices.
# Filtering each (word, device) count separately would silently replace a
# device's count of 1-4 by 0 after spread(), exaggerating the log ratios.
word_by_source <- tweet_words %>%
  count(word, source) %>%
  group_by(word) %>%
  filter(sum(n) >= 5) %>%
  ungroup() %>%
  spread(source, n, fill = 0)

# Total retained word occurrences per device.
sum(word_by_source$iPhone)
sum(word_by_source$Android)

# Add-one smoothed relative frequencies per device and their log2 ratio.
# Positive values mean the word is relatively more frequent on Android.
android_iphone_ratios <- word_by_source %>%
  mutate(
    iPhone = (iPhone + 1) / sum(iPhone + 1),
    Android = (Android + 1) / sum(Android + 1),
    logratio = log2(Android / iPhone)
  ) %>%
  arrange(desc(logratio))
```

### Visualization
```{r}
# Variant kept for reference: plot every word instead of the top 15 per side.
# android_iphone_ratios %>%
#   mutate(word = reorder(word, logratio)) %>%
#   ggplot() +
#   aes(word, logratio, fill = logratio < 0) +
#   geom_col() +
#   coord_flip()

# The 15 words with the largest absolute log ratio on each side of zero,
# drawn as horizontal bars. The fill aesthetic is `logratio < 0`, so the
# first label/colour (FALSE) is Android and the second (TRUE) is iPhone.
android_iphone_ratios %>%
  group_by(logratio > 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot() +
  aes(x = word, y = logratio, fill = logratio < 0) +
  geom_col() +
  coord_flip() +
  labs(y = "Android / iPhone log ratio") +
  scale_fill_manual(
    values = c("red", "lightblue"),
    labels = c("Android", "iPhone"),
    name = ""
  )

# Check the type of the `word` column in the stored table.
class(android_iphone_ratios$word)
```


## Sentiment Analysis
```{r}
library(tidytext)
# Word-to-sentiment pairs from the NRC lexicon.
# get_sentiments() is used instead of filtering the `sentiments` data set on
# a `lexicon` column: current tidytext releases ship `sentiments` without
# that column, so the filter fails there.
# NOTE(review): recent tidytext versions obtain NRC through the textdata
# package and may prompt for a one-time download - confirm in your setup.
nrc <- get_sentiments("nrc") %>%
  dplyr::select(word, sentiment)
nrc
```

```{r test for sentiments}
# Look at the lexicons shipped with / reachable through tidytext.
sentiments
# AFINN: one integer score per word running from negative to positive
# (the lexicon is documented with a range of -5 to 5).
get_sentiments("afinn")
# Bing: a plain negative/positive label per word.
get_sentiments("bing")
# NRC: store the lexicon, then count how many words carry each sentiment.
nrc <- get_sentiments("nrc")
table(nrc$sentiment)
```


```{r}
# One row per tweet id with its device and the total number of words
# (after stop word removal) tweeted from that device.
sources <- tweet_words %>%
  group_by(source) %>%
  mutate(total_words = n()) %>%
  ungroup() %>%
  distinct(id, source, total_words)

# Number of words per device that carry each NRC sentiment.
#  - count() gives the words per (sentiment, tweet);
#  - complete() adds explicit zero rows for absent (sentiment, tweet) pairs;
#  - the join attaches device and device total to each tweet. The key is
#    spelled out (`id` is the only shared column) to match the other joins
#    and to avoid the "Joining, by = ..." message;
#  - the final summary adds the counts up per device and sentiment.
by_source_sentiment <- tweet_words %>%
  inner_join(nrc, by = "word") %>%
  count(sentiment, id) %>%
  ungroup() %>%
  complete(sentiment, id, fill = list(n = 0)) %>%
  inner_join(sources, by = "id") %>%
  group_by(source, sentiment, total_words) %>%
  summarize(words = sum(n)) %>%
  ungroup()

head(by_source_sentiment)
```

```{r}
# Intermediate results of the sentiment pipeline, kept as separate objects
# for inspection. Each stage is built from the previous one instead of
# re-running the whole pipeline from `tweet_words` every time.

# Stage 1: words that appear in the NRC lexicon, one row per word/sentiment.
joined <- tweet_words %>%
  inner_join(nrc, by = "word")

# Stage 2: number of matching words per sentiment and tweet.
sentiment_joined1 <- joined %>%
  count(sentiment, id) %>%
  ungroup()

# Stage 3: the same table with explicit zero rows for absent combinations.
sentiment_joined2 <- sentiment_joined1 %>%
  complete(sentiment, id, fill = list(n = 0))

# Every word row annotated with the total word count of its device.
test <- tweet_words %>%
  group_by(source) %>%
  mutate(total_words = n()) %>%
  ungroup()

# Stage 4: attach device and device total to each tweet; `id` is the only
# column shared with `sources`, so it is named explicitly as the join key.
join_source <- sentiment_joined2 %>%
  inner_join(sources, by = "id")


```


```{r Testing code for complete}
# Small toy table for trying out tidyr::complete().
df <- tibble(
  group     = c(1, 2, 1),
  item_id   = c(1, 2, 2),
  item_name = c("a", "b", "b"),
  value1    = c(1L, 2L, 3L),
  value2    = c(4L, 5L, 6L)
)
df

# Expand to every combination of `group` with the (item_id, item_name)
# pairs that actually occur in the data.
complete(df, group, nesting(item_id, item_name))
```


```{r}
library(broom)

# One Poisson rate comparison per sentiment: word counts against the total
# word counts of the rows in that group, tidied into a one-row data frame.
# NOTE(review): the direction of the estimated ratio depends on the row
# order within each sentiment (presumably Android first, then iPhone, as
# the later axis label suggests) - confirm against `by_source_sentiment`.
sentiment_differences <- by_source_sentiment %>%
  group_by(sentiment) %>%
  do(
    tidy(
      poisson.test(x = .$words, T = .$total_words)
    )
  )

sentiment_differences
```
```{r testing broom:tidy)() and poisson.test()}

# Run the comparison by hand for a single sentiment ("anger") to see the
# untidied poisson.test() output.
df3 <- by_source_sentiment %>%
  filter(sentiment %in% "anger") %>%
  mutate(words = as.numeric(words))

poisson.test(x = df3$words, T = df3$total_words)
```


```{r}
library(scales)

# Point estimate and confidence interval of the rate ratio per sentiment,
# shifted by -1 so that the axis reads as a relative increase (0 = no
# difference). Sentiments are ordered by their estimate.
# A plain mutate() replaces mutate_at(..., funs(. - 1)): funs() is
# deprecated in dplyr, and spelling out the three columns works on every
# dplyr version.
sentiment_differences %>%
  ungroup() %>%
  mutate(
    sentiment = reorder(sentiment, estimate),
    estimate = estimate - 1,
    conf.low = conf.low - 1,
    conf.high = conf.high - 1
  ) %>%
  ggplot(aes(estimate, sentiment)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  scale_x_continuous(labels = percent_format()) +
  labs(x = "% increase in Android relative to iPhone",
       y = "Sentiment")
```
```{r}
# For every NRC emotion (the generic positive/negative labels are left
# out), show the 10 words with the largest absolute log ratio, one panel
# per emotion. The fill aesthetic is `logratio < 0`, so the first
# label/colour (FALSE) is Android and the second (TRUE) is iPhone.
android_iphone_ratios %>%
  inner_join(nrc, by = "word") %>%
  filter(!(sentiment %in% c("positive", "negative"))) %>%
  mutate(
    sentiment = reorder(sentiment, -logratio),
    word = reorder(word, -logratio)
  ) %>%
  group_by(sentiment) %>%
  top_n(10, abs(logratio)) %>%
  ungroup() %>%
  ggplot() +
  aes(x = word, y = logratio, fill = logratio < 0) +
  geom_col() +
  facet_wrap(~ sentiment, scales = "free", nrow = 2) +
  labs(x = "", y = "Android / iPhone log ratio") +
  scale_fill_manual(
    values = c("red", "lightblue"),
    labels = c("Android", "iPhone"),
    name = ""
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```