-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sharp Sight Labs - dplyr How to do data manipulation with R.R
72 lines (49 loc) · 2.32 KB
/
Sharp Sight Labs - dplyr How to do data manipulation with R.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
## Tutorial http://sharpsightlabs.com/blog/dplyr-intro-data-manipulation-with-r/
#dplyr: the essential data manipulation toolset
#In data wrangling, what are the main tasks?
#– Filtering rows (to create a subset)
#– Selecting columns of data (i.e., selecting variables)
#– Adding new variables
#– Sorting
#– Aggregating
library(tidyverse)
library(ggplot2)
# filter() ----
# Row selection: filter() subsets the data, keeping only the rows that meet
# the given condition(s). Here: diamonds whose cut is "Ideal".
df.diamonds_ideal <- filter(
  diamonds,
  cut == "Ideal"
)

# select() ----
# Column selection: select() picks specific columns from the data.
df.diamonds_ideal <- select(
  df.diamonds_ideal,
  carat, cut, color, price, clarity
)

# mutate() ----
# Adding variables: mutate() adds new columns to the dataset, here the
# price per carat of each diamond.
df.diamonds_ideal <- mutate(
  df.diamonds_ideal,
  price_per_carat = price / carat
)
# arrange() ----
# arrange() sorts the rows of a data frame.

# Small, deliberately unsorted data frame so the examples below can run;
# nothing else in this script defines it.
df.disordered_data <- data.frame(num_var = c(2, 3, 5, 1, 4))

# Sort by num_var in ascending order.
arrange(df.disordered_data, num_var)

# Sort by num_var in descending order by wrapping the column in desc().
arrange(df.disordered_data, desc(num_var))
# summarize() ----
# summarize() computes summary statistics. It becomes extremely useful when
# combined with group_by().
# Here: the mean price of the ideal-cut diamonds, ignoring missing values.
summarize(
  df.diamonds_ideal,
  avg_price = mean(price, na.rm = TRUE)
)
# "Chaining" in dplyr ----
# The %>% operator: in dplyr syntax, commands are "chained" together with %>%.
# It connects one command to the next, so the output of one command becomes
# the input of the following command.
df.diamonds_ideal_chained <- diamonds %>%
  filter(cut == "Ideal") %>%
  select(carat, cut, color, price, clarity) %>%
  mutate(price_per_carat = price / carat)
# Example 1: Boxplot, dplyr + ggplot ----
# Price distribution of ideal-cut diamonds, one box per color.
diamonds %>%                          # start with the 'diamonds' dataset
  filter(cut == "Ideal") %>%          # keep only rows where cut == "Ideal"
  ggplot(aes(x = color, y = price)) + # pass the filtered rows on to ggplot
  geom_boxplot()                      # and draw a boxplot
# Example 2: Small-multiple histogram, dplyr + ggplot ----
# Histogram of price for ideal-cut diamonds, one panel per color.
diamonds %>%                      # start with the 'diamonds' dataset
  filter(cut == "Ideal") %>%      # keep only rows where cut == "Ideal"
  ggplot(aes(x = price)) +        # map price to the x axis
  # bins = 30 is the count ggplot2 falls back to when none is supplied.
  # Stating it keeps the plot the same and avoids the message asking for
  # an explicit bins/binwidth. Tune it if a different resolution is wanted.
  geom_histogram(bins = 30) +
  facet_wrap(~ color)             # 'small multiple' layout, split by color