Multi-request collection using curl
I've modified the code to speed up collection by using curl's multi_run() function. Unfortunately, it covers only the category listings again. As for the keyword part, there are a few things about the Naver service itself that I need to look into first, so I'll set aside some time for it. Thank you.
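Before the full script, here is a minimal sketch of the asynchronous pattern used below. It is only an illustration: the example URLs are placeholders, not part of the original code.

library(curl)

# Queue several requests on one pool, then drive them all at once.
urls <- c("https://httpbin.org/get", "https://httpbin.org/ip")  # placeholder URLs
results <- list()
pool <- new_pool()
for (u in urls) {
  curl_fetch_multi(u,
                   done = function(res) results[[length(results) + 1]] <<- rawToChar(res$content),
                   fail = function(msg) message("request failed: ", msg),
                   pool = pool)
}
multi_run(pool = pool)  # blocks until every queued request has finished
length(results)         # number of successful responses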
# Setup: selectr/rvest for HTML parsing, curl for asynchronous requests,
# and N2H4 (installed from GitHub) for the Naver news helper functions.
install.packages("selectr")
library(selectr)
library(curl)
library(rvest)
devtools::install_github("forkonlp/N2H4")
library(N2H4)

options(stringsAsFactors = F)
# Success callback: re-encode the response body from CP949 to UTF-8 and
# append the response object to the global `data` list.
success <- function(res){
  cat("Request done! Status:", res$status, "\n")
  res$content <- iconv(rawToChar(res$content), from = "CP949", to = "UTF-8")
  data <<- c(data, list(res))
}

# Failure callback: report the error and keep going.
failure <- function(msg){
  cat("Oh noes! Request failed!", msg, "\n")
}
# Build the category table: one row per (main category, subcategory) pair.
cate <- getMainCategory()
subcate <- lapply(cate[, 2], getSubCategory)
scate <- c()
for (i in 1:length(subcate)){
  scate <- rbind(scate, data.frame(cate_name = cate[i, 1], sid1 = cate[i, 2], subcate[[i]]))
}
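The same table can also be built in one pass; the alternative below is only a sketch and is left commented out so the loop above stays the working version.

# Equivalent one-pass construction (alternative sketch, same result):
# scate <- do.call(rbind, lapply(seq_along(subcate), function(i)
#   data.frame(cate_name = cate[i, 1], sid1 = cate[i, 2], subcate[[i]])))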
# Crawl backwards in time: strDate is later than endDate, so
# strDate:endDate yields a descending sequence of dates.
strDate <- as.Date("2017-04-30")
endDate <- as.Date("2017-01-01")

strTime <- Sys.time()
midTime <- Sys.time()
for (date in strDate:endDate){
  date <- gsub("-", "", as.character(as.Date(date, origin = "1970-01-01")))
  for (i in 1:nrow(scate)){
    print(paste0(date, " / ", scate[i, 1], " - ", scate[i, 3], " / start Time: ", strTime, " / spent Time: ", Sys.time() - midTime, " / spent Time at first: ", Sys.time() - strTime))
    midTime <- Sys.time()
    pageUrli <- paste0("http://news.naver.com/main/list.nhn?sid2=", scate[i, 4], "&sid1=", scate[i, 2], "&mid=shm&mode=LS2D&date=", date)
    # Retry up to five times to read the last page number for this category/date.
    trym <- 0
    max <- try(getMaxPageNum(pageUrli), silent = T)
    while (trym <= 5 && inherits(max, "try-error")){
      max <- try(getMaxPageNum(pageUrli), silent = T)
      Sys.sleep(abs(rnorm(1)))
      trym <- trym + 1
      print(paste0("try again max num: ", pageUrli))
    }
    for (pageNum in 1:max){
      print(paste0(date, " / ", scate[i, 1], " / ", scate[i, 3], " / ", pageNum, " / start Time: ", strTime, " / spent Time: ", Sys.time() - midTime, " / spent Time at first: ", Sys.time() - strTime))
      midTime <- Sys.time()
      pageUrl <- paste0(pageUrli, "&page=", pageNum)
      # Retry up to five times to read the article URL list for this page.
      tryp <- 0
      newsList <- try(getUrlListByCategory(pageUrl), silent = T)
      while (tryp <= 5 && inherits(newsList, "try-error")){
        newsList <- try(getUrlListByCategory(pageUrl), silent = T)
        Sys.sleep(abs(rnorm(1)))
        tryp <- tryp + 1
        print(paste0("try again news list: ", pageUrl))
      }
      # Queue every article URL on a fresh pool and fetch them concurrently;
      # `success` appends each response to the global `data` list.
      pool <- new_pool()
      data <- list()
      sapply(newsList$links, function(x) curl_fetch_multi(x, success, failure, pool = pool))
      res <- multi_run(pool = pool)
      if (identical(data, list())){
        res <- multi_run(pool = pool)  # one more pass in case nothing arrived
      }
      # Keep only responses that actually came from news.naver.com, parse
      # each article once, and extract title, body, and press name.
      loc <- sapply(data, function(x) grepl("^http://news.naver", x$url))
      cont <- sapply(data, function(x) x$content)
      cont <- cont[loc]
      pages <- lapply(cont, read_html)
      titles <- unlist(lapply(pages, getContentTitle))
      bodies <- unlist(lapply(pages, getContentBody))
      presses <- unlist(lapply(pages, getContentPress))
      data <- data.frame(title = titles, press = presses, body = bodies)
dir.create("./data",showWarnings=F)
dir.create(paste0("./data/cate_",scate[i,4]),showWarnings=F)
write.csv(data, file=paste0("./data/cate_",scate[i,4],"/news",scate[i,2],"_",scate[i,4],"_",date,"_",pageNum,".csv"),row.names = F)
}
}
}
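The same retry-with-random-sleep pattern appears twice in the script. As an illustration only (retry_try is a hypothetical helper, not part of the original code or of N2H4), it could be factored out like this:

# Hypothetical helper: call `f` until it succeeds or `times` retries are
# used up, sleeping a random moment between attempts.
retry_try <- function(f, times = 5, label = ""){
  res <- try(f(), silent = TRUE)
  n <- 0
  while (n < times && inherits(res, "try-error")){
    Sys.sleep(abs(rnorm(1)))
    res <- try(f(), silent = TRUE)
    n <- n + 1
    print(paste0("try again: ", label))
  }
  res
}

# Usage inside the loops above, e.g.:
# max <- retry_try(function() getMaxPageNum(pageUrli), label = pageUrli)
# newsList <- retry_try(function() getUrlListByCategory(pageUrl), label = pageUrl)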