ken.Rmd

---
title: "Ken's Proj"
author: "Ken Chen"
date: ""
output: 
  html_document:
    fig_height: 7
    fig_width: 10
---
<!-- Don't edit in between this line and the one below -->
```{r include=FALSE}
# Don't delete this chunk if you are using the DataComputing package
library(DataComputing)
```
*Source file* 
```{r, results='asis', echo=FALSE}
includeSourceDocuments()
```
<!-- Don't edit the material above this line -->

```{r}
library(XML)
library(igraph)
library(network)

# Load the post and comment samples from CSV.
# NOTE(review): `filepath` is an absolute path on the author's machine; it must
# keep its trailing slash because the file names are appended with paste0().
filepath <- "/Users/kenchen/MDST/visual_reddit/"
Posts <- read.file(paste0(filepath, "PostSample_10k.csv"))
Comments <- read.file(paste0(filepath, "CommentSample_10k.csv"))


# List the columns available in the raw Posts table.
variable.names(Posts)

# Presumably the output of the call above, pasted here for reference:
# [1] "domain"                 "subreddit"              "selftext"               "saved"                 
# [5] "id"                     "from_kind"              "gilded"                 "from"                  
# [9] "stickied"               "title"                  "num_comments"           "score"                 
# [13] "retrieved_on"           "over_18"                "thumbnail"              "subreddit_id"          
# [17] "hide_score"             "link_flair_css_class"   "author_flair_css_class" "downs"                 
# [21] "archived"               "is_self"                "from_id"                "permalink"             
# [25] "name"                   "created"                "url"                    "author_flair_text"     
# [29] "quarantine"             "author"                 "created_utc"            "link_flair_text"       
# [33] "ups"                    "distinguished"    


# Cleaning up Posts:
#   1. convert the numeric timestamp columns (treated as seconds since
#      1970-01-01) to POSIXct,
#   2. keep only the columns used by the analysis below,
#   3. derive karma and the time-of-post features used by the plots.
#
# tz = "UTC" is passed explicitly. Without it the timestamps carry the
# session's local time zone, so `hour`, `day_of_year` and `day_of_week` would
# depend on the machine that knits this document rather than on the data.
# `created` is not converted here because select() drops it straight away.
Posts <-
  Posts %>%
  mutate(
    retrieved_on = as.POSIXct(retrieved_on, origin = "1970-01-01", tz = "UTC"),
    created_utc = as.POSIXct(created_utc, origin = "1970-01-01", tz = "UTC")
  ) %>%
  select(created_utc, author, title, subreddit, subreddit_id,
         id, name, gilded, ups, downs, num_comments, retrieved_on, over_18) %>%
  mutate(
    karma = ups - downs,
    hour = lubridate::hour(created_utc),
    day_of_year = lubridate::yday(created_utc),
    day_of_week = lubridate::wday(created_utc, label = TRUE),
    morning = hour < 12,                  # hours 0-11
    working_hours = hour > 8 & hour < 18, # hours 9-17
    gilded = as.factor(gilded)            # discrete colour scale in the plots
  )


# Karma cut-offs used to bucket posts in the analysis below:
#   controversial   karma <  0
#   standard        karma >= 0
#   good            karma >= 100
#   viral           karma >= 500
#   front page      karma >= 3000

standard   <- 0
good       <- 100
viral      <- 500
front_page <- 3000


# Mean karma of all posts, grouped by the hour at which they were created.
KarmaByHour <- summarize(group_by(Posts, hour), karma = mean(karma))

# One bar per hour; bar height and fill both encode the mean karma.
ggplot(KarmaByHour, aes(x = hour, y = karma, fill = karma)) +
  geom_bar(stat = "identity") +
  ggtitle("Average karma of posts by hour of day")

# Same per-hour mean, restricted to posts at or above the `viral` threshold
# (>= 500 karma).
ViralKarmaByHour <- Posts %>%
  filter(karma >= viral) %>%
  group_by(hour) %>%
  summarize(karma = mean(karma))

ggplot(
  data = ViralKarmaByHour,
  mapping = aes(x = hour, y = karma, fill = karma)
) +
  geom_bar(stat = "identity") +
  ggtitle("Average viral karma of posts by hour of day")


# Title length (character count from nchar) against karma, one point per post,
# coloured by the gilded factor.
PostTitles <- mutate(Posts, len_title = nchar(title))

ggplot(PostTitles, aes(x = len_title, y = karma)) +
  geom_point(aes(colour = gilded)) +
  ggtitle("Karma earned vs. length of post title")

# Same scatter, keeping only posts with karma >= `good` (100) to reduce noise.
# Note: despite the "TenPlus" name, the cut-off applied here is `good`, not 10.
PostTitlesTenPlus <- filter(PostTitles, karma >= good)

ggplot(PostTitlesTenPlus, aes(x = len_title, y = karma)) +
  geom_point(aes(colour = gilded)) +
  ggtitle("Karma earned vs. length of post title (for 100+ karma posts)")


# Which subreddits give the most karma?

## One metric: posts usually become 'viral' or hit the front page of reddit
## once they've reached 3000 karma. We take all posts at or above that
## threshold and scale their karma relative to it (e.g. 6000 karma becomes
## 6000 / 3000 = 2 points), then sum the points per subreddit (subforum) and
## keep the 25 subreddits with the highest totals.

karmaThreshold <- front_page
ViralSubreddits <-
  Posts %>%
  filter(karma >= karmaThreshold) %>%
  mutate(points = karma / karmaThreshold) %>%
  group_by(subreddit) %>%
  summarize(total_points = sum(points)) %>%
  arrange(desc(total_points)) %>%
  head(25)

# ggplot orders a discrete x axis by factor level (alphabetically for
# character data), which discards the ranking produced by arrange() above.
# reorder() puts the bars in descending order of total_points; xlab() keeps
# the axis label readable instead of showing the reorder() expression.
ViralSubreddits %>%
  ggplot(aes(x = reorder(subreddit, -total_points), y = total_points,
             fill = total_points)) +
  geom_bar(stat = "identity") +
  xlab("subreddit") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5,
                                   size = 12)) +
  ggtitle("Subreddits with the most viral posts")

## Another metric: simply the average karma per post, keeping the 25
## subreddits with the highest averages (current data is not representative
## because of small sample size.)

AvgKarmaSubreddits <-
  Posts %>%
  group_by(subreddit) %>%
  summarize(average_karma = mean(karma)) %>%
  arrange(desc(average_karma)) %>%
  head(25)

# A discrete x axis is drawn in factor-level (alphabetical) order, so the
# ranking from arrange() would be lost; reorder() sorts the bars by
# descending average_karma and xlab() restores a plain axis label.
AvgKarmaSubreddits %>%
  ggplot(aes(x = reorder(subreddit, -average_karma), y = average_karma,
             fill = average_karma)) +
  geom_bar(stat = "identity") +
  xlab("subreddit") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5,
                                   size = 12)) +
  ggtitle("Average karma of posts in the top 25 subreddits")


# Comment count against karma, coloured by whether the post was created
# during working hours.
KarmaComments <- Posts %>% select(karma, num_comments, working_hours)

ggplot(
  KarmaComments,
  aes(x = karma, y = num_comments, colour = working_hours)
) +
  geom_point() +
  ggtitle("Number of comments vs. karma")


# Time and day vs. karma earned

## What time on what particular weekday gives the best karma?
## Disabled: a per-weekday mean posting hour that was meant to be overlaid
## on the plot below with geom_smooth().
# HelperHourTable <-
#   Posts %>%
#   filter(karma >= good) %>%
#   group_by(weekday = lubridate::wday(created_utc)) %>%
#   summarize(avg_hour = mean(hour))
# ... + geom_smooth(data = HelperHourTable, aes(x = weekday, y = avg_hour))

# Jittered scatter of posts at or above the `viral` threshold: weekday on x,
# hour on y, point size and colour both encode karma.
ggplot(filter(Posts, karma >= viral), aes(x = day_of_week, y = hour)) +
  geom_jitter(aes(size = karma, colour = karma), alpha = 0.8) +
  ggtitle("What time on what particular weekday gives the best karma?")


# Total karma per day of year, for posts created in 2014.
# The year is hard-coded; change the literal below to analyse another year.
# year() is namespace-qualified like the other lubridate calls in this file,
# so the block does not depend on lubridate being attached.
KarmaByDay <-
  Posts %>%
  filter(lubridate::year(created_utc) == 2014) %>%
  group_by(day_of_year) %>%
  summarize(total_karma = sum(karma)) %>%
  arrange(day_of_year)

KarmaByDay %>%
  ggplot(aes(x = day_of_year, y = total_karma)) +
  geom_point()


# Network graph of subreddits and their posts

# Vertex table, part 1: one row per post, tagged type = "post".
NetworkPosts <- Posts %>%
  transmute(id, subreddit, subreddit_id, title, karma, gilded, type = "post")

# Vertex table, part 2: one row per subreddit. The vertex id is the first
# subreddit_id seen for that subreddit; the post-only columns are filled with
# NA so both tables share the same set of column names.
NetworkSubreddits <- Posts %>%
  group_by(subreddit) %>%
  summarize(id = head(subreddit_id, n = 1)) %>%
  mutate(
    type = "subreddit",
    subreddit_id = NA,
    title = NA,
    karma = NA,
    gilded = NA
  )

# Stack post and subreddit vertices into a single table.
NetworkNodes <- rbind(NetworkPosts, NetworkSubreddits)

# Edge table: one edge per post, running from the post's subreddit_id to the
# post id, with the post's karma as the edge weight.
NetworkEdges <- NetworkPosts %>%
  transmute(from = subreddit_id, to = id, weight = karma)

# Network: build a directed igraph object from the edge and vertex tables,
# drop self-loops (multi-edges are kept), then style and draw it.
# graph_from_data_frame() replaces the deprecated graph.data.frame(), and
# TRUE/FALSE replace the reassignable T/F shorthands.
redditNetwork <-
  igraph::graph_from_data_frame(NetworkEdges, directed = TRUE,
                                vertices = NetworkNodes) %>%
  igraph::simplify(remove.multiple = FALSE, remove.loops = TRUE)

# Disabled alternative: size every vertex by its degree.
# V(redditNetwork)$size <- 3 * degree(redditNetwork, mode = "all")

# Subreddit vertices are drawn in "tomato", post vertices in "gold", both
# semi-transparent.
colors <- c("tomato", "gold")
V(redditNetwork)$color <-
  colors[ifelse(V(redditNetwork)$type == "subreddit", 1, 2)] %>%
  adjustcolor(alpha.f = 0.6)

# Subreddits get a fixed size; posts grow with their karma.
V(redditNetwork)$size <- ifelse(V(redditNetwork)$type == "subreddit",
                                3,
                                1 + V(redditNetwork)$karma / 1000)

plot(redditNetwork,
     vertex.frame.color = adjustcolor("white", alpha.f = 0), # invisible frame
     edge.color = adjustcolor("#616161", alpha.f = 0.6),
     edge.arrow.size = 0.05,
     edge.arrow.width = 0.05,
     edge.lty = 1,
     edge.width = 0.5,
     edge.curved = 0.5,
     vertex.label = NA, # suppress vertex labels
     main = "Network baby",
     sub = "too many subreddits wtf")

```