.Rhistory

library(tidyr)
library(tidyselect)
library(tidyverse)
library(ggthemes)
library(nycflights13)
# Ask R for English messages. "en" is the language code this file uses
# further down; the earlier "eng" is not a standard language code.
Sys.setenv(LANG = "en")
# Run the tutorial script once, echoing each expression as it is evaluated.
# (The history recorded six identical consecutive runs of this call.)
source("C:/Users/david/Desktop/R/R-for-data-science.R", echo = TRUE)
# Scatterplot of displ (x) against hwy (y) from the mpg data.
# The history repeated this identical command ten times; once is enough.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))
# ggplot() without any layer: only the empty plot panel is drawn.
ggplot(data = mpg)
# Re-run the tutorial script, as the history did between the plots below.
source("C:/Users/david/Desktop/R/R-for-data-science.R", echo = TRUE)
# Scatterplot of hwy (x) against cyl (y).
ggplot(data = mpg) +
  geom_point(mapping = aes(x = hwy, y = cyl))
library(tidyr)
library(tidyselect)
library(tidyverse)
library(ggthemes)
library(nycflights13)
# Ask R for English messages ("en" matches the code used later in this file).
Sys.setenv(LANG = "en")
# Scatterplot of dep_time (x) against arr_delay (y) for the flights data.
ggplot(data = flights, mapping = aes(x = dep_time, y = arr_delay)) +
  geom_point()
library(tidyr)
library(tidyselect)
library(tidyverse)
library(ggthemes)
library(nycflights13)
# Ask R for English messages ("en" matches the code used later in this file).
Sys.setenv(LANG = "en")
# Inspect the structure of the flights table before plotting it.
str(flights)
# dep_time against arr_delay, drawn as points plus a connecting line.
# The history also held a dangling `min_delay_daytime <- flights %>%` in front
# of this plot and an orphaned tail of a summarise() call after it; both were
# leftovers of an interrupted multi-line entry and are removed. The complete
# summarise() pipeline follows further down in the file.
ggplot(data = flights, mapping = aes(x = dep_time, y = arr_delay)) +
  geom_point() +
  geom_line()
# Number of flights and mean arrival delay for each value of `hour`,
# largest mean delay first. The outer parentheses print the result of the
# assignment.
(min_delay_daytime <- flights %>%
  group_by(hour) %>%
  summarise(
    count = n(),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_arr_delay)))
?n
# Same summary with a filter on the group size. The intermediate attempts in
# the history lacked the `%>%` after filter() (a parse error inside the
# parentheses) or filtered on `n`, which is not a column of the summary; only
# the working form is kept.
(min_delay_daytime <- flights %>%
  group_by(hour) %>%
  summarise(
    count = n(),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 0) %>%
  arrange(desc(avg_arr_delay)))
# Final version: smallest mean arrival delay first.
(min_delay_daytime <- flights %>%
  group_by(hour) %>%
  summarise(
    count = n(),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 0) %>%
  arrange(avg_arr_delay))
# NOTE(review): combined_matrix, diamonds2 and cars.subset are not created
# anywhere in the visible part of this file; they are assumed to exist in the
# workspace when these lines run.
View(combined_matrix)
View(diamonds2)
View(cars.subset)
# List the attached packages and environments.
search()
# systemInfo() is not a base R function and system() needs a command.
# NOTE(review): Sys.info() is assumed to be what was being looked for.
Sys.info()
?system
search()
diamonds
diamonds2
# Environment variable names are case-sensitive outside Windows, so use the
# upper-case LANG with the "en" code used later in this file.
Sys.setenv(LANG = "en")
diamonds2
library("vctrs")
diamonds
# List the available data sets and print a few of them.
data()
swiss
attenu
?data
WorldPhones
cars
chickwts
# Look at co2 in full, by its first values and by its structure.
co2
head(co2)
str(co2)
library(dplyr)
glimpse(co2)
Nile
Titanic
str(Titanic)
glimpse(Titanic)
# NOTE(review): big_matrix is not created in the visible part of this file;
# it is assumed to exist in the workspace.
big_matrix
library(ggplot2)
rock
mpg
# Aesthetics without a layer: only the empty axes are drawn.
ggplot(mpg, aes(cty, hwy))
# Label every observation with its drv value. `label` is an aesthetic of
# geom_text(), not of geom_point(), so the text geom is used here.
ggplot(mpg, aes(cty, hwy)) +
  geom_text(aes(label = drv))
# Same plot labelled with the year.
ggplot(mpg, aes(cty, hwy)) +
  geom_text(aes(label = year))
diamonds
# Show the first rows of a data frame built from ntoken(sentiment_corpus).
# NOTE(review): neither ntoken() nor sentiment_corpus is defined or attached
# in the visible part of this file — presumably they come from a text-analysis
# package and an earlier session; confirm.
head(data.frame(ntoken(sentiment_corpus)))
head(data.frame(ntoken(sentiment_corpus)))
head(data.frame(ntoken(sentiment_corpus)))
# NOTE(review): test_vector1 is not created in the visible part of this file.
test_vector1
# Two subscripts: asks for the element in row 1, column 2.
x <- c(test_vector1[1,2])
# One index vector: selects elements 1 and 2. This overwrites the x above.
x <- test_vector1[c(1,2)]
x
?rm
# Remove every object that ls() lists from the current environment.
rm(list=ls())
library(haven)
library(tidyverse)
library(margins)
library(magrittr)
library(ggeffects)
library(lme4)
library(splines)
library(stargazer)
library(dplyr)
library(tidytext)
library(strex)
library(tm)
library(stringr)
## Data Preparation
# Import the dataset, rename the variables to facilitate subsequent coding,
# and drop the columns that are not needed.
hudoc <- read.csv("hudoc.csv")
head(hudoc)
# NOTE(review): select(-3, -4) drops columns by position; confirm that the
# third and fourth columns of the file are the ones meant to be removed.
hudoc <- hudoc %>%
  rename(
    case = Document.Title,
    case_id = Application.Number,
    concdate = Date,
    conclusion = Conclusion
  ) %>%
  select(-3, -4)
# Recode and create variables.
# Convert everything to lower case to facilitate recoding. tolower() is
# vectorised, so no sapply() wrapper is needed.
hudoc$conclusion <- tolower(hudoc$conclusion)
hudoc$case <- tolower(hudoc$case)
# Create the country variable: everything after the "v. " / "c. " separator
# in the case title (presumably the English and French forms of "versus").
# The dot is escaped and the letter must start a word; the unescaped pattern
# "v. |c. " also matched text such as the "ce " in "france ".
hudoc$case <- gsub("\\b[vc]\\. ", "vs.", hudoc$case)
hudoc$country <- gsub(".*vs\\.", "", hudoc$case)
# Keep only the first word of what follows the separator.
hudoc$country <- word(hudoc$country, 1)
table(hudoc$country)
# Map the French country names onto the English ones.
hudoc$country <- gsub("slovaquie", "slovakia", hudoc$country)
hudoc$country <- gsub("suisse", "switzerland", hudoc$country)
# Look up ylim(): help page, function definition, and the help page again
# once ggplot2 is attached.
help("ylim")
print(ggplot2::ylim)
help("ylim", package = "ggplot2")
library(ggplot2)
help("ylim")
# Number of characters in a string versus length of the vector holding it.
nchar(x = "hi")
length(x = "hi")
lengths(x = "hi")
# Compare the search path before and after attaching more packages.
search()
library(haven)
library(tidyverse)
library(margins)
library(magrittr)
search()
# Build matrices from 1..9 and pull rows 1 and 3 out of the first column.
help("matrix")
matrix(data = 1:9)
matrix(data = 1:9, nrow = 3)
matrix(data = 1:9, nrow = 3)[c(1, 3), 1]
# Browse a few of the built-in data sets.
data()
morley
help("morley")
trees
print(women)
print(women)
print(women)
glimpse(women)
# Inspect the economics data and its documentation.
economics
tail(economics)
?economics
# Aesthetics without a layer: only the empty axes are drawn.
ggplot(economics, aes(date, unemploy))
# unemploy over time.
ggplot(economics, aes(date, unemploy)) +
  geom_line()
# unemploy and pop as two lines on one shared y axis.
ggplot(economics) +
  geom_line(aes(date, unemploy)) +
  geom_line(aes(date, pop))
# The two ratios of pop and unemploy over time.
ggplot(economics) +
  geom_line(aes(date, pop / unemploy))
ggplot(economics) +
  geom_line(aes(date, unemploy / pop))
# Install scales only when it is missing. The package name must be quoted
# (the history first tried install.packages(scales) and library(sclaes)).
if (!requireNamespace("scales", quietly = TRUE)) {
  install.packages("scales")
}
library(scales)
library(ggplot2)
# Show the ratio with percent labels. The formatting belongs on the scale:
# wrapping the value in scales::percent() inside aes() turns y into character
# strings, so no continuous line can be drawn.
ggplot(economics) +
  geom_line(aes(date, unemploy / pop)) +
  scale_y_continuous(labels = scales::percent)
# psavert over time.
ggplot(economics) +
  geom_line(aes(date, psavert))
# Restrict the x axis to 1990-2000. The limits must be Date values to match
# the date axis (assumes `date` is a Date column, as documented for
# ggplot2::economics); bare numbers such as 1990 are not valid limits there.
ggplot(economics) +
  geom_line(aes(date, unemploy / pop)) +
  xlim(as.Date(c("1990-01-01", "2000-01-01")))
# Start from a clean workspace: remove every object that ls() lists.
rm(list = ls())
# Make the folder of the document that is active in the editor the working
# directory, so that the relative path "hudoc.csv" used below resolves.
# NOTE(review): relies on rstudioapi, so this presumably only works when the
# script is run from inside RStudio — confirm.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# Ask R for English messages and warnings instead of German ones.
Sys.setenv(LANG = "en")
library(haven)
library(tidyverse)
library(margins)
library(magrittr)
library(ggeffects)
library(lme4)
library(splines)
library(stargazer)
library(dplyr)
library(tidytext)
library(strex)
library(tm)
library(stringr)
## Data Preparation
# Import the dataset, rename the variables to facilitate subsequent coding,
# drop the columns that are not needed, and remove duplicate applications.
hudoc <- read.csv("hudoc.csv")
head(hudoc)
# NOTE(review): select(-3, -4) drops columns by position; confirm that the
# third and fourth columns of the file are the ones meant to be removed.
hudoc <- hudoc %>%
  rename(
    case = Document.Title,
    case_id = Application.Number,
    concdate = Date,
    conclusion = Conclusion
  ) %>%
  select(-3, -4) %>%
  # Keep the first row for every case_id.
  distinct(case_id, .keep_all = TRUE)
# Recode and create variables.
# Convert everything to lower case to facilitate recoding. tolower() is
# vectorised, so no sapply() wrapper is needed.
hudoc$conclusion <- tolower(hudoc$conclusion)
hudoc$case <- tolower(hudoc$case)
# Create the country variable: everything after the "v. " / "c. " separator
# in the case title (presumably the English and French forms of "versus").
# The dot is escaped and the letter must start a word; the unescaped pattern
# "v. |c. " also matched text such as the "ce " in "france ".
hudoc$case <- gsub("\\b[vc]\\. ", "vs.", hudoc$case)
hudoc$country <- gsub(".*vs\\.", "", hudoc$case)
# Keep only the first word of what follows the separator.
hudoc$country <- word(hudoc$country, 1)
table(hudoc$country)
# Map the French country name onto the English one.
hudoc$country <- gsub("suisse", "switzerland", hudoc$country)
# Create the violation variable: normalise the conclusion text, then extract
# every "violation of art. <number>" phrase from it.
# Work on a copy so the original conclusion column stays untouched.
hudoc$text <- hudoc$conclusion
# Collapse every run of spaces to a single space; the extraction pattern
# below expects exactly one whitespace character between words.
hudoc$text <- gsub(" {2,}", " ", hudoc$text)
# Bring the French wording in line with the English wording.
hudoc$text <- gsub(" de l'", " of ", hudoc$text)
hudoc$text <- gsub("article ", "art. ", hudoc$text)
hudoc$text <- gsub("non ", "no ", hudoc$text)
hudoc$text <- gsub("non-violation", "no violation", hudoc$text)
# "art. 14+3" lists several articles; expand each "+" into its own phrase.
hudoc$text <- gsub("\\+", " violation of art. ", hudoc$text)
# Extract "violation of art", up to 20 non-digit characters, then the article
# number. The phrase must not be preceded by "no " and the number must not be
# followed by " of protocol". The extra `\\d` in the look-ahead forces the
# whole number to be matched: without it "art. 12 of protocol" would backtrack
# and be reported as article 1. The result is a list column with one character
# vector per row.
hudoc$violations_with_text <- stringr::str_extract_all(
  hudoc$text,
  "(?<!no\\s)violation\\sof\\sart\\D{0,20}\\d+(?!\\d|\\sof\\sprotocol)"
)
# Remove duplicate matches within a row (e.g. the same article named twice).
# lapply() also copes with a table that has no rows, unlike 1:length().
hudoc$violations_with_text <- lapply(hudoc$violations_with_text, unique)
# Drop the rows without any extracted violation. is.na() on a list column
# is never TRUE for an empty match, so the row is tested explicitly for
# "no match" as well as for a missing value.
has_violation <- vapply(
  hudoc$violations_with_text,
  function(found) length(found) > 0L && !anyNA(found),
  logical(1)
)
hudoc <- hudoc[has_violation, ]
# Approach used before data visualisation: instead of creating one row per
# violation, all extracted violation phrases are concatenated per country and
# the article labels are then split out of that text. Duplicate applications
# are already removed when the data is imported.

# Flatten every cell to a single string (list cells are unlisted and joined
# with spaces). The result is a character matrix with one column per variable.
bycountry <- apply(
  hudoc, 2,
  function(column) {
    vapply(
      column,
      function(cell) paste(unlist(cell), collapse = " "),
      character(1)
    )
  }
)
# Aggregate by country, concatenating the violation text of all its cases.
agg <- aggregate(
  data = bycountry,
  unlist(violations_with_text) ~ country,
  paste0, collapse = " "
)
# Copy the aggregated text into a column with a readable name.
agg$concat_violations <- agg$`unlist(violations_with_text)`

# Split one concatenated violation string into its article labels.
# Returns a factor with one entry per extracted violation.
split_articles <- function(concat) {
  pieces <- unlist(str_split(concat, "violation of art. "))
  pieces <- gsub(" ", "", pieces, fixed = TRUE)
  # The piece in front of the first separator is empty once the spaces are
  # removed; dropping empty pieces keeps a bogus "" level out of the table.
  factor(pieces[nzchar(pieces)])
}

# Violations for Slovakia.
# NOTE(review): rows are picked by position; this assumes row 1 of `agg` is
# Slovakia and row 2 is Switzerland — confirm against table(hudoc$country).
slovakia <- split_articles(agg$concat_violations[1])
slovakia
# art 6 is the most violated article for Slovakia
table(slovakia)

# Violations for Switzerland.
switz <- split_articles(agg$concat_violations[2])
switz
# Interestingly, art 6 is the most violated article for Switzerland
table(switz)
# Start from a clean workspace: remove every object that ls() lists.
rm(list = ls())
# Make the folder of the document that is active in the editor the working
# directory, so that the relative path "hudoc.csv" used below resolves.
# NOTE(review): relies on rstudioapi, so this presumably only works when the
# script is run from inside RStudio — confirm.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# Ask R for English messages and warnings instead of German ones.
Sys.setenv(LANG = "en")
library(haven)
library(tidyverse)
library(margins)
library(magrittr)
library(ggeffects)
library(lme4)
library(splines)
library(stargazer)
library(dplyr)
library(tidytext)
library(strex)
library(tm)
library(stringr)
## Data Preparation
# Import the dataset, rename the variables to facilitate subsequent coding,
# drop the columns that are not needed, and remove duplicate applications.
hudoc <- read.csv("hudoc.csv")
head(hudoc)
# NOTE(review): select(-3, -4) drops columns by position; confirm that the
# third and fourth columns of the file are the ones meant to be removed.
hudoc <- hudoc %>%
  rename(
    case = Document.Title,
    case_id = Application.Number,
    concdate = Date,
    conclusion = Conclusion
  ) %>%
  select(-3, -4) %>%
  # Keep the first row for every case_id.
  distinct(case_id, .keep_all = TRUE)
# Convert everything to lower case to facilitate recoding. tolower() is
# vectorised, so no sapply() wrapper is needed.
hudoc$conclusion <- tolower(hudoc$conclusion)
hudoc$case <- tolower(hudoc$case)
# Create the country variable: everything after the "v. " / "c. " separator
# in the case title (presumably the English and French forms of "versus").
# The dot is escaped and the letter must start a word; the unescaped pattern
# "v. |c. " also matched text such as the "ce " in "france ".
hudoc$case <- gsub("\\b[vc]\\. ", "vs.", hudoc$case)
hudoc$country <- gsub(".*vs\\.", "", hudoc$case)
# Keep only the first word of what follows the separator.
hudoc$country <- word(hudoc$country, 1)
table(hudoc$country)
# Map the French country name onto the English one.
hudoc$country <- gsub("suisse", "switzerland", hudoc$country)
# Create the text variable as a first step to extract the violated articles.
# Work on a copy so the original conclusion column stays untouched.
hudoc$text <- hudoc$conclusion
# Collapse every run of spaces to a single space; the extraction pattern
# below expects exactly one whitespace character between words.
hudoc$text <- gsub(" {2,}", " ", hudoc$text)
# Bring the French wording in line with the English wording.
hudoc$text <- gsub(" de l'", " of ", hudoc$text)
hudoc$text <- gsub("article ", "art. ", hudoc$text)
hudoc$text <- gsub("non ", "no ", hudoc$text)
hudoc$text <- gsub("non-violation", "no violation", hudoc$text)
# "art. 14+3" lists several articles; expand each "+" into its own phrase.
hudoc$text <- gsub("\\+", " violation of art. ", hudoc$text)
# Extract "violation of art", up to 20 non-digit characters, then the article
# number. The phrase must not be preceded by "no " and the number must not be
# followed by " of protocol". The extra `\\d` in the look-ahead forces the
# whole number to be matched: without it "art. 12 of protocol" would backtrack
# and be reported as article 1. The result is a list column with one character
# vector per row.
hudoc$violations_with_text <- stringr::str_extract_all(
  hudoc$text,
  "(?<!no\\s)violation\\sof\\sart\\D{0,20}\\d+(?!\\d|\\sof\\sprotocol)"
)
# Remove duplicate matches within a row (e.g. the same article named twice).
# lapply() also copes with a table that has no rows, unlike 1:length().
hudoc$violations_with_text <- lapply(hudoc$violations_with_text, unique)
# Rows without any extracted violation are deliberately kept in this version:
# they turn into empty strings when the cells are flattened for aggregation.
# Flatten every cell to a single string (list cells are unlisted and joined
# with spaces). The result is a character matrix with one column per variable.
bycountry <- apply(
  hudoc, 2,
  function(column) {
    vapply(
      column,
      function(cell) paste(unlist(cell), collapse = " "),
      character(1)
    )
  }
)
# Aggregate by country, concatenating the violation text of all its cases.
agg <- aggregate(
  data = bycountry,
  unlist(violations_with_text) ~ country,
  paste0, collapse = " "
)
# Copy the aggregated text into a column with a readable name.
agg$concat_violations <- agg$`unlist(violations_with_text)`

# Split one concatenated violation string into its article labels.
# Returns a factor with one entry per extracted violation.
split_articles <- function(concat) {
  pieces <- unlist(str_split(concat, "violation of art. "))
  pieces <- gsub(" ", "", pieces, fixed = TRUE)
  # The piece in front of the first separator is empty once the spaces are
  # removed; dropping empty pieces keeps a bogus "" level out of the table.
  factor(pieces[nzchar(pieces)])
}

# Violations for Slovakia.
# NOTE(review): rows are picked by position; this assumes row 1 of `agg` is
# Slovakia and row 2 is Switzerland — confirm against table(hudoc$country).
slovakia <- split_articles(agg$concat_violations[1])
slovakia
# art 6 is the most violated article for Slovakia
table(slovakia)

# Violations for Switzerland.
switz <- split_articles(agg$concat_violations[2])
switz
# Interestingly, art 6 is the most violated article for Switzerland
table(switz)
# Inspect the flattened per-case object built by the analysis above.
bycountry
glimpse(bycountry)
str(bycountry)
# `function` is a reserved word, so its help page must be requested with a
# quoted name; a bare `function()` or `?function` cannot be parsed.
?"function"
# Print the definition of aggregate() and open the help pages.
aggregate
?aggregate
?try
# An empty ggplot: no data and no layers.
ggplot()
# Experiments with try(): it reports an error and lets the script continue.
# NOTE(review): both calls look like deliberate error tests — nrow is given
# a vector, and himark is not defined in the visible part of this file.
try(matrix(1:9, 1:8))
try(matrix(himark))
library("readr")
# Extract the number embedded in each string, then show the raw strings.
parse_number(c("this is number 1", "hi mark", "number2<", "3n45"))
c("this is number 1", "hi mark", "number2<", "3n45")