-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtreemap_visualization.R
171 lines (142 loc) · 4.83 KB
/
treemap_visualization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Objective
# Using the data, achieve the following objectives in Pakistan.
# 1) Identify the most popular cars in Pakistan.
# 2) Identify the cars most commonly sold in Pakistan across different brands
# 3) Which is better in terms of it's resale value.
# 4) Predict the cars for the cases, which has missing data for the cars.
# Loading packages
library(tidyverse)
library(treemapify)
library(paletteer)
library(ggtext)
library(showtext)
library(gt)
font_add_google(name = "Lora", family = "title_font")
font_add_google(name = "Roboto", family = "body_font")
showtext_auto()
# Loading data
olx_car_data <- read_csv("data/OLX_Car_Data_CSV.csv")
# Data Book
# Sample data contains vehicles registered in the year
range(olx_car_data$Year, na.rm = TRUE)[1]
range(olx_car_data$Year, na.rm = TRUE)[2]
glimpse(olx_car_data)
data_book <- data_frame(
var_name = c("Brand",
"Condition",
"Fuel",
"KMs Driven",
"Model",
"Price",
"Registered City",
"Transaction Type",
"Year"),
data_type = c("character",
"character",
"character",
"double",
"character",
"double",
"character",
"character",
"double"
),
description = c("Brand name of the car in sample dataset",
"Whether the car is new or used",
"Fuel types used to operate the car",
"Kilometers the vehicle is driven",
"Vehicle model from the manufacturer",
"Price is PKR",
"City of Registration",
"Cash transaction or leased vehicle",
"Year of registration"
)
)
data_book_gt <- data_book %>%
gt() %>%
tab_header(title = "Data book",
subtitle = "Data type and description of variables in
the dataset") %>%
cols_label(
var_name = "Variable",
data_type = "Data type",
description = "Description"
)
data_book_gt
# Listing all the different types, so it can be grouped by country/region of
# origin. Country of manufacturing, have different price points and
# consequently market is segmented based on it. In addition, different
# brand manufacturers cater to high end customers based on the prestige,
# and brand value. Thus to make meaningful comparison it is important
# to add variables and thus compare like groups
unique(olx_car_data$Brand)
# Feature engineering
# Creating a new variable to determine the country of origin/manufacturing
european <- c("Range Rover", "Land Rover", "Porsche", "Audi",
"BMW", "Mercedes")
japanese <- c("Toyota", "Honda", "Mitsubishi", "Lexus", "Subaru", "Suzuki",
"Daihatsu", "Daihatsu", "Nissan" , "Mazda")
korean = c("Hyundai" , "KIA", "Daewoo")
chinese = c("FAW", "Changan")
american = c("Chevrolet")
olx_car_data <- olx_car_data %>%
mutate(origin_country = case_when(
Brand %in% european ~ "Europe",
Brand %in% japanese ~ "Japan",
Brand %in% korean ~ "Korea",
Brand %in% chinese ~ "China",
Brand %in% american ~ "American",
TRUE ~ as.character(Brand))
)
# Popularity of used cars in Pakistan
# Popularity of the car is determined based on its proportionate
# representation, in the sample data
olx_car_data_by_org <- olx_car_data %>%
group_by(origin_country) %>%
count() %>%
drop_na() %>%
arrange(desc(n)) %>%
ungroup()
olx_car_data_by_org_gt <- olx_car_data_by_org %>%
gt() %>%
gt::tab_header(
title = "Number of vehicles by country of origin",
subtitle = "Japanese vehicles seem to be the most popular"
) %>%
gt::cols_label(
origin_country = "Origin Country",
n = "# of Vehicle"
) %>%
gt::fmt_number(
columns = c("n"),
use_seps = TRUE,
decimals = 0
)
olx_car_data_by_org_gt
brnd_by_origin <- olx_car_data %>%
group_by(origin_country, `Brand`) %>%
count() %>%
arrange(desc(n)) %>%
drop_na() %>%
ungroup()
glimpse(brnd_by_origin)
ggplot(data = filter(brnd_by_origin, origin_country == "Japan"),
mapping = aes(area = n, label = Brand, fill = Brand)
) +
geom_treemap() +
geom_treemap_text(colour = "white", reflow = TRUE) +
scale_fill_paletteer_d(`"ggsci::default_igv"`) +
labs(
title = "**Japanese vehicles that feature on OLX**",
subtitle = "Despite our reservations, Suzuki represent the highest market share",
caption = "OLX Data Sourced from Kaggle \n "
) +
theme_minimal() +
theme(
legend.position = "none",
plot.caption = element_text(size = 7),
text = element_text(family = "body_font", size = 10),
plot.title = element_markdown(family = "title_font", size = 16,
color = "#0f3e63"),
plot.margin = margin(3, 3, 3, 3, "mm")
)