forked from ovh/summit2016-RankingPredict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep5_extractMajesticSEO.R
74 lines (53 loc) · 3.2 KB
/
step5_extractMajesticSEO.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(dplyr)
library(rjson)
library(urltools)
library(stringr)
library(httr)
# Majestic API key. Taken from the MAJESTIC_API_KEY environment variable when
# it is set and non-empty, so the secret does not have to live in the script;
# otherwise the placeholder below is used and must be replaced by hand.
apiKey <- Sys.getenv("MAJESTIC_API_KEY", unset = "")
if (!nzchar(apiKey)) {
  apiKey <- "XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
}

# Base URL of the JSON endpoint; the API key is appended directly after it.
# Keep the "api" host for production; change it to "developer" for the sandbox.
apiUrl <- "https://api.majestic.com/api/json?app_api_key="

# Data source passed to the API:
#   "historic" : the whole period
#   "fresh"    : the last 90 days
modeUrl <- "historic"

# CSV file in which the results are accumulated.
fileName <- "./majestic/dataset-majestic-historic.csv"

# Reminder: the calling IP address must be authorised for this API key.
# NOTE(review): this disables TLS peer verification for every httr request in
# the session, so the API key is sent without checking the server certificate.
# Kept as in the original script; confirm it is still required.
httr::set_config(config(ssl_verifypeer = 0L))
#
# get majestic data
#
#' Fetch Majestic index information for a single item.
#'
#' Builds a `GetIndexItemInfo` request from the global settings `apiUrl`,
#' `apiKey` and `modeUrl`, sends it with `GET()` and extracts the
#' `DataTables$Results$Data` element of the parsed response body.
#'
#' @param u Item to look up; it is URL-encoded before being placed in the
#'   query string.
#' @return The `DataTables$Results$Data` element of the parsed response.
majesticGetInfoUrl <- function(u) {
  encodedItem <- url_encode(u)
  # Timing trace, printed once per request.
  print(proc.time())
  requestUrl <- paste0(
    apiUrl, apiKey,
    "&cmd=GetIndexItemInfo&items=1&item0=", encodedItem,
    "&datasource=", modeUrl
  )
  response <- GET(requestUrl)
  content(response)$DataTables$Results$Data
}
# Keep one row per distinct subdomain (the first occurrence is retained).
datasetNatUniqueSubdomain <- datasetNat[!duplicated(datasetNat$Subdomain), ]

# Either reload the results accumulated by a previous run, or start from a
# single all-blank row. `init` is the index at which the fetch loop starts.
if (file.exists(fileName)) {
  print("file exist")
  dataDf <- read.csv2(fileName, header = TRUE, stringsAsFactors = FALSE)
  init <- nrow(dataDf)
} else {
  print("file not exist")
  init <- 1
  # One empty-string character column per expected result field.
  dataDf <- data.frame(
    lapply(
      setNames(nm = c(
        "ItemNum", "Item", "ResultCode", "Status",
        "ExtBackLinks", "RefDomains", "AnalysisResUnitsCost", "ACRank",
        "ItemType", "IndexedURLs",
        "GetTopBackLinksAnalysisResUnitsCost",
        "DownloadBacklinksAnalysisResUnitsCost",
        "DownloadRefDomainBacklinksAnalysisResUnitsCost",
        "RefIPs", "RefSubNets",
        "RefDomainsEDU", "ExtBackLinksEDU",
        "RefDomainsGOV", "ExtBackLinksGOV",
        "RefDomainsEDU_Exact", "ExtBackLinksEDU_Exact",
        "RefDomainsGOV_Exact", "ExtBackLinksGOV_Exact",
        "CrawledFlag", "LastCrawlDate", "LastCrawlResult",
        "RedirectFlag", "FinalRedirectResult",
        "OutDomainsExternal", "OutLinksExternal", "OutLinksInternal",
        "LastSeen", "Title", "RedirectTo",
        "CitationFlow", "TrustFlow", "TrustMetric",
        "TopicalTrustFlow_Topic_0", "TopicalTrustFlow_Value_0",
        "TopicalTrustFlow_Topic_1", "TopicalTrustFlow_Value_1",
        "TopicalTrustFlow_Topic_2", "TopicalTrustFlow_Value_2"
      )),
      function(column) ""
    ),
    stringsAsFactors = FALSE
  )
}
# Query Majestic for every subdomain from index `init` onwards, appending one
# row per subdomain to `dataDf` and rewriting the CSV after each request so an
# interrupted run can be resumed.

# All-blank placeholder row with the same columns as `dataDf`; appended when a
# lookup fails or returns nothing, so that every subdomain still contributes
# exactly one row.
emptyRow <- data.frame(
  lapply(setNames(nm = names(dataDf)), function(column) ""),
  stringsAsFactors = FALSE
)

totalSubdomains <- nrow(datasetNatUniqueSubdomain)

# `init:totalSubdomains` would count backwards when `init` is already past the
# end (e.g. resuming a finished run); `length.out` yields an empty sequence
# in that case.
for (i in seq(from = init, length.out = max(0L, totalSubdomains - init + 1L))) {
  url <- datasetNatUniqueSubdomain$Subdomain[i]

  df <- tryCatch(
    as.data.frame(majesticGetInfoUrl(url), stringsAsFactors = FALSE),
    # The handler has to be a function; a bare `{ ... }` block is evaluated
    # up front and then fails with "attempt to apply non-function".
    error = function(e) {
      message("Majestic lookup failed for ", url, ": ", conditionMessage(e))
      emptyRow
    }
  )

  # A response without data converts to an empty data frame, which rbind()
  # would silently drop; keep the row count in step with the loop index.
  if (nrow(df) == 0L) {
    df <- emptyRow
  }

  dataDf <- rbind(dataDf, df)
  write.csv2(dataDf, fileName, row.names = FALSE)
}