-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path6_chinese_typhoon.Rmd
211 lines (181 loc) · 5.6 KB
/
6_chinese_typhoon.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
---
title: "TM02_news_typhoon"
output:
html_notebook:
code_folding: hide
number_sections: true
fig_caption: yes
highlight: zenburn
theme: simplex
toc: yes
editor_options:
chunk_output_type: inline
---
# Loading data and library
```{r}
library(tidyverse)
library(stringr)
library(tidytext)
library(jiebaR)
library(lubridate)
```
```{r}
news.df <- readRDS("data/typhoon.rds") %>%
mutate(doc_id = row_number())
```
# Tokenization
## Initialize cutter()
* Also loading stopWords
```{r}
segment_not <- c("第卅六條", "第卅八條", "蘇南成", "災前", "災後", "莫拉克", "颱風", "應變中心", "停班停課", "停課", "停班", "停駛", "路樹", "里長", "賀伯", "採收", "菜價", "蘇迪", "受災戶", "颱風警報", "韋恩", "台東縣", "馬總統", "豪大雨", "梅姬", "台東", "台北市政府", "工務段", "漂流木", "陳菊", "台南縣", "卡玫基", "魚塭", "救助金", "陳情", "全省", "強颱", "中颱", "輕颱", "小林村", "野溪", "蚵民", "農委會", "來襲", "中油公司", "蔣總統經國", "颱風天", "土石流", "蘇迪勒", "水利署", "陳說", "颱風假", "颱風地區", "台灣", "臺灣", "柯羅莎", "八八風災", "紓困","傅崑萁", "傅崐萁","台中", "文旦柚", "鄉鎮市公所", "鄉鎮市", "房屋稅", "高雄", "未達", "台灣省", "台北市")
cutter <- worker()
new_user_word(cutter, segment_not)
stopWords <- readRDS("data/stopWords.rds")
```
```{r}
news.df$time %>% summary
tokenized.df <- news.df %>%
mutate(timestamp=ymd(time)) %>%
filter(timestamp > as.Date("2009-01-01")) %>%
select(-time) %>%
select(title, text, cat, timestamp, everything()) %>%
mutate(word = purrr::map(text, function(x)segment(x, cutter)))
unnested.df <- tokenized.df %>%
select(doc_id, text, word) %>%
unnest(word) %>%
filter(!(word %in% stopWords$word)) %>%
filter(!str_detect(word, "[a-zA-Z0-9]+"))
```
## Word frequency distribution
```{r}
word.count <- tokenized.df %>%
unnest(word) %>%
count(word, sort=T) %>%
filter(!(word %in% stopWords$word)) %>%
filter(nchar(word) > 1) %>%
filter(!str_detect(word, "[a-zA-Z0-9]+")) %>%
filter(n > 3)
word.count %>%
count(n, sort=T) %>%
ggplot(aes(log(n), log(nn))) +
geom_point(alpha=0.5, size = 1, color="#333333")
```
```{r}
cat_word_count <- tokenized.df %>%
unnest(word) %>%
count(cat, word) %>%
ungroup() %>%
filter(!str_detect(word, "[a-zA-Z0-9]+")) %>%
filter(!(word %in% stopWords$word)) %>%
filter(nchar(word)>1)
```
```{r}
early_lat_ratio <- cat_word_count %>%
filter(n>1) %>%
spread(cat, n, fill = 0) %>%
ungroup() %>%
mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
mutate(logratio = log2(early / lat)) %>%
arrange(desc(logratio))
```
```{r}
early_lat_ratio %>%
group_by(logratio > 0) %>%
top_n(20, abs(logratio)) %>%
ungroup() %>%
mutate(word = reorder(word, logratio)) %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
geom_bar(stat = "identity") +
coord_flip() +
ylab("early / recent log ratio") +
scale_fill_manual(name = "", labels = c("early", "recent"),
values = c("tomato", "lightblue")) +
theme(axis.text.y=element_text(colour="black", family="Heiti TC Light"))
```
```{r}
frequency <- cat_word_count %>%
filter(n>3) %>%
group_by(cat) %>%
mutate(proportion = n/sum(n)) %>%
select(-n) %>%
spread(cat, proportion) %>%
na.omit()
```
```{r}
library(scales)
frequency %>%
ggplot(aes(x = early, y = lat, color = abs(early - lat))) +
geom_abline(color = "gray40", lty = 2) +
geom_point(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
theme(legend.position="none") +
coord_fixed(1)
```
```{r}
word_count <- tokenized.df %>%
unnest(word) %>%
count(cat, word, sort=T) %>%
mutate(total_words=sum(n)) %>%
mutate(rank=row_number(), tf=n/total_words)
```
```{r}
ggplot(word_count, aes(tf, fill=cat)) +
geom_histogram(show.legend = F) +
xlim(NA, 0.0009) +
# scale_x_log10() +
# scale_y_log10() +
facet_wrap(~cat, ncol=2, scales="free_y")
```
```{r}
word_count %>%
ggplot(aes(rank, tf, color=cat)) +
geom_line(size=1.1, alpha=0.5, show.legend = F) +
scale_x_log10() +
scale_y_log10()
```
```{r}
rank_subset <- word_count %>%
filter(rank < 500,
rank > 10)
lm_result <- lm(log10(tf) ~ log10(rank), data = rank_subset)
lm_result$coefficients[[1]]
```
```{r}
word_count %>%
ggplot(aes(rank, tf, color = cat)) +
geom_abline(intercept = lm_result$coefficients[[1]],
slope = lm_result$coefficients[[2]],
color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
```
```{r}
news_count <- tokenized.df %>%
unnest(word) %>%
count(link, word) %>%
bind_tf_idf(word, link, n) %>%
arrange(desc(tf_idf))
news_count %>%
ggplot(aes(tf_idf)) +
geom_histogram(bins = 100) +
scale_x_log10()
```
```{r}
news_count %>%
left_join(news.df, by="link") %>%
filter(!(word %in% c("NA"))) %>%
group_by(cat) %>%
arrange(desc(tf_idf)) %>%
top_n(30, tf_idf) %>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = cat)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~cat, ncol = 2, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(family="Heiti TC Light"))
```