Added full support to OpenAlex csv export files.

massimoaria · Feb 23, 2024 · 1b8ca20 · 1b8ca20
1 parent 1f6677a
commit 1b8ca20
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 107 deletions.
diff --git a/R/convert2df.R b/R/convert2df.R
@@ -16,6 +16,7 @@
 #' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
 #' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')} . Default is \code{dbsource = "isi"}.
 #' @param format is a character indicating the SCOPUS, Clarivate Analytics WoS, and other databases export file format. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param remove.duplicates is logical. If TRUE, the function will remove duplicated items checking by DOI and database ID.
 #' @return a data frame with cases corresponding to articles and variables to Field Tags in the original export file.
 #' 
 #' E.g., if we have three files downloaded from Web of Science in plaintext format, file will be:
@@ -58,7 +59,7 @@
 #' 
 #' @export
 
-convert2df<-function(file,dbsource="wos",format="plaintext"){
+convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){
 
   allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed') 
   allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')
@@ -172,7 +173,7 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
     M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR)))  # remove foreign characters from CR (i.e. Chinese, Russian characters)
   }
 
-  if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
+  if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
 
   cat("Done!\n\n")
 
@@ -208,10 +209,37 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
   }
 
   ### SR field creation
+  # Optional de-duplication step, executed before the SR field is built.
+  # Records are compared on a single identifier column chosen from dbsource.
+  if (isTRUE(remove.duplicates)){
+    # Pick the identifier column for the current database. Empty switch
+    # arguments fall through to the next non-empty value; any dbsource not
+    # listed here falls back to the title column ("TI").
+    id_field <- switch(dbsource,
+                       isi = ,
+                       scopus = ,
+                       dimensions = ,
+                       lens = "UT",
+                       openalex = ,
+                       openalex_api = "id_oa",
+                       pubmed = "PMID",
+                       "TI")
+    # duplicated() treats NA values as equal to each other, so records that
+    # merely lack an identifier are explicitly excluded from removal.
+    # M[id_field] still raises an error if the column does not exist.
+    d <- duplicated(M[id_field]) & !is.na(M[[id_field]])
+    if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
+    M <- M[!d,]
+  }
   suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
-  d <- duplicated(M$SR)
-  if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
-  M <- M[!d,]
   row.names(M) <- M$SR
 
   ### bibliometrix>DB class

diff --git a/R/csvLens2df.R b/R/csvLens2df.R
@@ -54,12 +54,13 @@ csvLens2df <- function(file){
 
   # Iso Source Titles
   DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""] 
-  DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
-  DATA$J9 <- gsub("\\.","",DATA$JI)
+  # DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
+  # DATA$J9 <- gsub("\\.","",DATA$JI)
+  DATA$JI <- DATA$J9 <- DATA$SO
   DATA$ID <- DATA$DE
   DI <- DATA$DI
   URL <- DATA$URL
-  DATA <- data.frame(lapply(DATA,toUpper))
+  DATA <- data.frame(lapply(DATA,toupper))
   DATA$DI <- DI
   DATA$URL <- URL
   DATA$AU_CO <- "NA"

diff --git a/R/csvOA2df.R b/R/csvOA2df.R
@@ -1,4 +1,4 @@
-utils::globalVariables(c("all_of", "corr"))
+utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))
 
 csvOA2df <- function(file){
   options(readr.num_columns = 0)
@@ -49,24 +49,24 @@ csvOA2df <- function(file){
   UN <- strsplit(DATA$C1,";")
   corresp <- strsplit(DATA$authorships_is_corresponding,";")
   df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>% 
-    group_by(.data$id_oa) %>% 
+    group_by(id_oa) %>% 
     mutate(n=row_number())
   df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>% 
-    group_by(.data$id_oa) %>% 
+    group_by(id_oa) %>% 
     mutate(n=row_number())
   df_UN <- df_UN %>% 
     left_join(df_COR, by=(c("id_oa","n"))) 
   AU <- strsplit(DATA$AU,";")
   AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>% 
-    group_by(.data$id_oa) %>% 
+    group_by(id_oa) %>% 
     mutate(n=row_number()) %>% 
     left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
               by = c("id_oa","n")) %>% 
     dplyr::filter(corr == "true") %>% 
-    mutate(RP = paste(.data$RP,.data$UN, sep=", ")) %>% 
+    mutate(RP = paste(RP,UN, sep=", ")) %>% 
     ungroup() %>% 
     select("RP", "AU_ID") %>% 
-    distinct(.data$AU_ID, .keep_all = TRUE)
+    distinct(AU_ID, .keep_all = TRUE)
   DATA <- DATA %>% 
     left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))
 
@@ -78,7 +78,9 @@ csvOA2df <- function(file){
   label <- names(ind)[ind==FALSE & !is.na(ind)]
 
   DATA <- DATA %>% 
-    mutate(across(all_of(label), toupper))
+    mutate(across(all_of(label), toupper),
+           DI = gsub("https://doi.org/","",DI),
+           DI = ifelse(DI == "null",NA,DI)) 
 
   return(DATA)
 }

diff --git a/R/histNetwork.R b/R/histNetwork.R
@@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
   return(results)
 }
 
-# scopus <- function(M, min.citations, sep, network, verbose){
-#   
-#   if (isTRUE(verbose)) {
-#     cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
-#   }
-#   
-#   if (!("SR_FULL" %in% names(M))) {
-#     M = metaTagExtraction(M, Field = "SR")
-#   }
-#   
-#   M$nCITING <- 1:nrow(M)
-#   papers <- M$nCITING[M$TC >= min.citations]
-#   
-#   TIpost <-
-#     paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
-#   
-#   CR <- gsub("[[:punct:]]", "", M$CR)
-#   n <- nchar(CR)
-#   n[is.na(n)] <- 2
-#   n <- n + 1
-#   nCum <- c(1, cumsum(n[-length(n)]))
-#   CR <- paste(CR, collapse = " ")
-#   
-#   #L <- str_locate_all(CR, TIpost)
-#   L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
-#   
-#   LCS <- lengths(L) / 2
-#   
-#   M$LCS <- 0
-#   M$LCS[papers] <- LCS
-#   
-# 
-#   ### HistData
-#   histData <- M %>%
-#     select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
-#     rename(
-#       Paper = .data$SR_FULL,
-#       Title = .data$TI,
-#       Author_Keywords = .data$DE,
-#       KeywordsPlus = .data$ID,
-#       DOI = .data$DI,
-#       Year = .data$PY,
-#       GCS = .data$TC
-#     ) %>%
-#     arrange(.data$Year) %>%
-#     dplyr::filter(.data$GCS>=min.citations) %>% 
-#     as.data.frame()
-#   
-#   
-#   if (isTRUE(network)) {
-#     ## Network matrix
-#     df <- lapply(seq_along(L), function(i) {
-#       l <-
-#         data.frame(
-#           ref = L[[i]],
-#           paper = rep(papers[i], length(L[[i]][, 1]))
-#         )
-#     })
-#     df <- (do.call(rbind, df))
-#     
-#     A <- outer(df$ref.start, nCum, "-")
-#     A[A < 0] <- NA
-#     df$CITINGn <- unlist(apply(A, 1, which.min))
-#     df$CITING <- M$SR[df$CITINGn]
-#     df$CITED <- M$SR[df$paper]
-#     df <- df %>% 
-#       dplyr::filter(.data$CITING %in% histData$Paper)
-#     
-#     NetMatrix <-
-#       (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
-#   } else{
-#     NetMatrix = NULL
-#   }
-#   
-#   if (isTRUE(verbose)) {
-#     cat("\nFound",
-#         length(M$LCS[M$LCS > 0]),
-#         "documents with no empty Local Citations (LCS)\n")
-#   }
-#   
-#   results <-
-#     list(
-#       NetMatrix = NetMatrix,
-#       histData = histData,
-#       M = M,
-#       LCS = M$LCS
-#     )
-# }
-
 # New algorithm for Scopus
 # Local citation matching is based on First Author, Year and PP
 scopus <- function(M, min.citations, sep, network, verbose){
@@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){
 
 openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){
 
-  M$CR[is.na(M$CR)] <- "none"
+  M$CR[is.na(M$CR) | M$CR==""] <- "none"
   ids <- M$id_oa
   CR <- strsplit(M$CR, ";")
   CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>% 
@@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v
     SRrow <- WLCR %>% select(.data$id_oa) %>% 
       left_join(M %>% 
                   select(.data$id_oa, .data$SR), 
-                by="id_oa")
+                by="id_oa") 
 
     SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>% 
       left_join(M %>% 

diff --git a/man/convert2df.Rd b/man/convert2df.Rd