# N1_Kurir_networks_1.R
# Necessary libraries.
library(rvest)
library(dplyr)
library(igraph)
library(ggraph)
### Scraping N1
# Base link for the website
linkN1<- "https://rs.n1info.com/"
# Reading the website html
bodyN1<- read_html(linkN1)
# Getting the list of titles. The CSS selectors used throughout this code were obtained with Chrome's SelectorGadget.
elements_titlesN1<- html_elements(bodyN1,
css = ".uc-block-post-grid-title-link")
titlesN1<- html_text(elements_titlesN1)
titlesN1
# Getting the links for each post on the website from the titles.
linksN1<- html_attr(elements_titlesN1, "href")
linksN1
#linksN1<-linksN1[!linksN1%in%""]
#linksN1
# Scraping the tags and titles for every post reached through the scraped links,
# and storing them in a tab separated file as a data table.
for (i in 1:length(linksN1)){
PostBody<-read_html(linksN1[i])
PostElements<-html_elements(PostBody,
css =".tags-wrapper a")
PostTags<-html_text(PostElements)
PostTitle<-html_elements(PostBody,
css = ".main-head .entry-title")
PostTitle_text<- html_text(PostTitle)
tab<-cbind(PostTitle_text, PostTags)
write.table(tab, file="PostTagsN1.tab", sep="\t", col.names=F,row.names=F, append=T, quote=F)
}
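# A hedged, optional variant of the loop above: if one of the scraped links fails to load
# (network error, removed post), read_html() stops the whole loop. The sketch below wraps the
# call in base R's tryCatch() and skips unreadable links; it uses the same selectors and output
# file as above, so it would be used instead of (not in addition to) the loop above.
# for (i in 1:length(linksN1)){
#   PostBody <- tryCatch(read_html(linksN1[i]), error = function(e) NULL)
#   if (is.null(PostBody)) next   # skip links that could not be read
#   PostTags <- html_text(html_elements(PostBody, css = ".tags-wrapper a"))
#   PostTitle_text <- html_text(html_elements(PostBody, css = ".main-head .entry-title"))
#   tab <- cbind(PostTitle_text, PostTags)
#   write.table(tab, file="PostTagsN1.tab", sep="\t", col.names=F, row.names=F, append=T, quote=F)
# }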
# Importing the scraped data table.
TagsLongTableN1<-read.csv("PostTagsN1.tab", sep="\t")
# The column names of the imported data are messy, so we rename them V1 and V2.
# V1 is the column containing the post titles; V2 is the column containing the tags.
colnames(TagsLongTableN1)<- c("V1","V2")
# Missing values, blanks, and rows where the post name and the tag are identical cause errors when creating the network,
# so the code below removes these problematic rows.
TagsLongTableN1_1<- TagsLongTableN1[!(is.na(TagsLongTableN1$V2) |
TagsLongTableN1$V2=="" |
TagsLongTableN1$V1 == TagsLongTableN1$V2), ]
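# For reference, an equivalent filter written with 'dplyr' (already loaded above); this is only
# an alternative phrasing of the base-R subsetting and should produce the same rows:
# TagsLongTableN1_1 <- TagsLongTableN1 %>%
#   filter(!is.na(V2), V2 != "", V1 != V2)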
# Importing the 'igraph' library to turn the table into a bipartite network.
library(igraph)
g<-graph_from_data_frame(TagsLongTableN1_1, directed=F)
V(g)$type <- bipartite_mapping(g)$type
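# Quick sanity check on the bipartite structure: is_bipartite() should return TRUE, and the
# table shows how many vertices ended up on each side (FALSE for the post titles from V1,
# TRUE for the tags from V2, which is what the plotting code below assumes).
is_bipartite(g)
table(V(g)$type)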
# Importing 'ggraph' library to plot the network.
# Note: I have purposely minimized the size of the post-name labels (plotted as red squares) because the sheer size of
# the text makes the network messy. The tags are more interesting, but if you want the network to show the post names instead,
# just change the sizes in the code (the 'size = ifelse(V(g)$type, 2.8, 0)' part).
library(ggraph)
ggraph(g, layout = "stress") +
geom_edge_link(arrow = grid::arrow(type = "closed", length = unit(2, "mm")),
end_cap = circle(1, "mm"), color= "darkgray") +
geom_node_point(color = ifelse(V(g)$type, "lightblue", "salmon"),
shape= ifelse(V(g)$type, "circle", "square"),
size = 3) +
geom_node_text(aes(label = name), color= "black", size =ifelse(V(g)$type, 2.8, 0)) +
theme_void()
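# Optionally, the plot can be written to disk with ggsave() (ggplot2 is attached together with
# 'ggraph'); the file name below is just a placeholder, pick whatever suits you:
# ggsave("N1_tag_network.png", width = 10, height = 8, dpi = 300)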
# Some network centrality measures.
sort(degree(g))
sort(betweenness(g))
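# The sorted vectors above mix posts and tags; assuming bipartite_mapping() put the tags on the
# TRUE side (as the plotting code expects), the most connected tags can be listed like this:
head(sort(degree(g)[V(g)$type], decreasing = TRUE), 10)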
# Emptying the Tags file so that it is ready for future use.
cat("", file="PostTagsN1.tab")
### Scraping Kurir
# The Kurir website is structured differently, and the links acquired for each of the posts need additional attention and cleaning.
# For this reason, though the code is largely the same as for N1, there are some additional changes.
# Base link for the website
linkKurir<- "https://www.kurir.rs/"
# Reading the website html
bodyKurir<- read_html(linkKurir)
# Getting the list of titles.
elements_titlesKurir<- html_elements(bodyKurir,
css = ".title a")
elements_titlesKurir
titlesKurir<- html_text(elements_titlesKurir)
titlesKurir
# Getting the links for each post on the website from the titles.
linksKurir<- html_attr(elements_titlesKurir, "href")
linksKurir
# The first difference compared to N1 scraping is the structure and type of links leading to posts on the website.
# On N1, each post title had a full link that leads to the post.
# On Kurir, however, only the part of the link after the base link is included.
# For example, '/stars/rijaliti/3937963/zola-urnisao-miljanu' is stored without the 'https://www.kurir.rs' part of the full link.
# For this reason, the loop for scraping tags and titles needs to first
# paste the base link with the scraped links for them to be useful (more on that soon).
# In addition, the links scraped from the cover page of Kurir also include links leading to external websites,
# and there is apparently no way to exclude them at the scraping stage. These external links also happen to be the only ones
# that start with 'https', so the code below uses a regex to remove them from the list of scraped links.
linksKurir1<- gsub("^https:.+", "", linksKurir)
linksKurir1
# Removing the blanks.
linksKurir1<-linksKurir1[!linksKurir1%in%""]
linksKurir1
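# An equivalent, slightly more direct way to do both steps above (drop the external 'https' links
# and the resulting blanks) in one go, which should give the same result here:
# linksKurir1 <- linksKurir[!grepl("^https:", linksKurir)]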
# Base link that will be used during scraping
baseLinkKurir<- "https://www.kurir.rs"
# Scraping the tags and titles for every post reached through the scraped links,
# and storing them in a tab separated file as a data table.
for (i in 1:length(linksKurir1)){
PostLink<-paste(baseLinkKurir, linksKurir1[i], sep="")
PostBody<-read_html(PostLink)
PostElements<-html_elements(PostBody,
css =".lnk")
PostTags<-html_text(PostElements)
PostTitle<-html_elements(PostBody,
css = ".singleWrap .titleWrap .title")
PostTitle_text<- html_text(PostTitle)
tab<-cbind(PostTitle_text, PostTags)
write.table(tab, file="PostTagsKurir.tab", sep="\t", col.names=F,row.names=F, append=T, quote=F)
}
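# Optional: when scraping a long list of posts it can be polite to pause briefly between requests,
# e.g. by adding the line below at the end of each iteration of the loop above (the one-second
# delay is an arbitrary choice, not something Kurir requires):
# Sys.sleep(1)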
# Importing the scraped data table.
TagsLongTableKurir<-read.csv("PostTagsKurir.tab", sep="\t")
# Naming the columns.
colnames(TagsLongTableKurir)<- c("V1","V2")
# Removing missing values, blanks and rows with identical V1 and V2 values in order to avoid errors.
# Note: Kurir has many more blanks compared to N1.
TagsLongTableKurir1<- TagsLongTableKurir[!(is.na(TagsLongTableKurir$V2) |
TagsLongTableKurir$V2=="" |
TagsLongTableKurir$V1 == TagsLongTableKurir$V2), ]
# Turning the table into a bipartite network.
g<-graph_from_data_frame(TagsLongTableKurir1, directed=F)
V(g)$type <- bipartite_mapping(g)$type
# Plotting the network.
ggraph(g, layout = "stress") +
geom_edge_link(arrow = grid::arrow(type = "closed", length = unit(2, "mm")),
end_cap = circle(1, "mm"), color= "darkgray") +
geom_node_point(color = ifelse(V(g)$type, "lightblue", "salmon"),
size = 3,
shape= ifelse(V(g)$type, "circle", "square")) +
geom_node_text(aes(label = name), color= "black", size =ifelse(V(g)$type, 2.5, 0)) +
theme_void()
# Some network centrality measures
sort(degree(g))
sort(betweenness(g))
# Emptying the Tags file so that it is ready for future use.
cat("", file="PostTagsKurir.tab")