Merge pull request #420 from massimoaria/develop

issue #404: OpenAlex integration
massimoaria · Feb 23, 2024 · 76e56a0 · 76e56a0
2 parents bc5a38c + 1b8ca20
commit 76e56a0
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 110 deletions.
diff --git a/R/convert2df.R b/R/convert2df.R
@@ -12,9 +12,11 @@
 #' d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
 #' e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
 #' f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
-#' g)\tab 'openalex' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
+#' g)\tab 'openalex' \tab OpenAlex .csv file;\cr
+#' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
 #' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')} . Default is \code{dbsource = "isi"}.
-#' @param format is a character indicating the format of the SCOPUS and Clarivate Analytics WoS export file. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param format is a character indicating the SCOPUS, Clarivate Analytics WoS, and other databases export file format. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
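+#' @param remove.duplicates is logical. If TRUE, the function removes duplicated documents, identified by the database-specific document ID (or by the document title when no ID field is defined for the database). Default is \code{remove.duplicates = TRUE}.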
 #' @return a data frame with cases corresponding to articles and variables to Field Tags in the original export file.
 #' 
 #' I.e We have three files downlaod from Web of Science in plaintext format, file will be:
@@ -57,10 +59,10 @@
 #' 
 #' @export
 
-convert2df<-function(file,dbsource="wos",format="plaintext"){
+convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){
 
   allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed') 
-  allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')
+  allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')
 
   cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")
   if (length(setdiff(dbsource,allowed_db))>0){
@@ -147,7 +149,10 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
              })
 
     },
-    openalex = {
+    openalex={
+      M <- csvOA2df(file)
+    },
+    openalex_api = {
       if (!"bibliometrixDB" %in% class(file)){
         M <- openalexR::oa2bibliometrix(file)
       } else {
@@ -168,11 +173,11 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
     M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR)))  # remove foreign characters from CR (i.e. Chinese, Russian characters)
   }
 
-  if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
+  if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
 
   cat("Done!\n\n")
 
-  if (!(dbsource %in% c("pubmed", "lens", "openalex"))) {
+  if (!(dbsource %in% c("pubmed", "lens", "openalex_api"))) {
     ## AU_UN field creation
     if ("C1" %in% names(M)) {
       cat("\nGenerating affiliation field tag AU_UN from C1:  ")
@@ -204,10 +209,37 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
   }
 
   ### SR field creation
+  ### Duplicate removal
+  if (isTRUE(remove.duplicates)){
+    # Column holding the database-specific document identifier; databases
+    # without a dedicated branch are de-duplicated by title (TI).
+    # Empty switch arms fall through to the next non-empty value.
+    id_field <- switch(dbsource,
+                       isi=,
+                       scopus=,
+                       dimensions=,
+                       lens="UT",
+                       openalex=,
+                       openalex_api="id_oa",
+                       pubmed="PMID",
+                       "TI")
+    # If the converter did not produce the expected ID column, fall back to
+    # the title instead of failing with "undefined columns selected".
+    if (!(id_field %in% names(M))) id_field <- "TI"
+    d <- duplicated(M[id_field])
+    if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
+    M <- M[!d,]
+  }
   suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
-  d <- duplicated(M$SR)
-  if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
-  M <- M[!d,]
   row.names(M) <- M$SR
 
   ### bibliometrix>DB class

diff --git a/R/csvLens2df.R b/R/csvLens2df.R
@@ -54,12 +54,13 @@ csvLens2df <- function(file){
 
   # Iso Source Titles
   DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""] 
-  DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
-  DATA$J9 <- gsub("\\.","",DATA$JI)
+  # DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
+  # DATA$J9 <- gsub("\\.","",DATA$JI)
+  DATA$JI <- DATA$J9 <- DATA$SO
   DATA$ID <- DATA$DE
   DI <- DATA$DI
   URL <- DATA$URL
-  DATA <- data.frame(lapply(DATA,toUpper))
+  DATA <- data.frame(lapply(DATA,toupper))
   DATA$DI <- DI
   DATA$URL <- URL
   DATA$AU_CO <- "NA"

diff --git a/R/csvOA2df.R b/R/csvOA2df.R
@@ -0,0 +1,123 @@
+utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))
+
+csvOA2df <- function(file){
+  # Import one or more OpenAlex csv exports into a single data frame whose
+  # columns are renamed by relabelling().
+  #
+  # file: character vector of csv file paths. When several files are given,
+  #       only the columns shared by all of them are kept.
+  # Returns a data.frame with the renamed columns plus AF, ID, AB, JI, J9,
+  # DB and RP (corresponding author and affiliation, when available).
+  if (length(file) == 0){
+    stop("csvOA2df: no input file supplied", call. = FALSE)
+  }
+
+  # Set the readr option for this call only and restore the caller's value
+  # when the function exits.
+  old_opts <- options(readr.num_columns = 0)
+  on.exit(options(old_opts), add = TRUE)
+
+  ## import all files in a single data frame
+  for (i in seq_along(file)){
+    # numeric columns are read back as character and missing strings become
+    # "" so that files can be stacked with rbind()
+    D <- read_csv(file[i], na=character(), quote='"', trim_ws = FALSE, progress = show_progress(), show_col_types = FALSE) %>%
+      mutate(across(where(is.numeric), as.character)) %>%
+      mutate(across(where(is.character), \(x) tidyr::replace_na(x,""))) %>%
+      as.data.frame()
+
+    if (i>1){
+      # keep only the columns shared by every file read so far
+      l <- intersect(l,names(D))
+      DATA <- rbind(DATA[l],D[l])
+    }else{
+      l <- names(D)
+      DATA <- D
+    }
+  }
+  rm(D)
+
+  ## Post-Processing
+
+  # column re-labelling
+  DATA <- relabelling(DATA)
+
+  # recode as numeric
+  DATA$TC <- as.numeric(DATA$TC)
+  DATA$PY <- as.numeric(DATA$PY)
+  # relevance_score may be absent from the export; assigning a zero-length
+  # vector to a data frame column would raise an error
+  if ("relevance_score" %in% names(DATA)){
+    DATA$relevance_score <- as.numeric(DATA$relevance_score)
+  }
+
+  # replace | with ;
+  DATA <- DATA %>%
+    mutate(across(where(is.character), ~ stringi::stri_replace_all_regex(.,"\\|",";")))
+
+  DATA$AF <- DATA$AU
+  DATA$ID <- DATA$DE
+  DATA$AB <- ""
+  # strip the OpenAlex URL prefix so that only the bare identifiers remain
+  DATA$CR <- gsub("https://openalex.org/","",DATA$CR)
+  DATA$AU_ID <- gsub("https://openalex.org/","",DATA$AU_ID)
+  DATA$id_oa <- gsub("https://openalex.org/","",DATA$id_oa)
+  DATA$JI <- DATA$J9 <- gsub("https://openalex.org/","",DATA$SO_ID)
+  DATA$corresponding_author_ids <- gsub("https://openalex.org/","",DATA$corresponding_author_ids)
+  DATA$C1 <- gsub("https://", "", DATA$C1)
+  DATA$DB <- "OPENALEX"
+
+  ## corresponding author
+  # Within each record the n-th affiliation string is paired with the n-th
+  # is_corresponding flag (position n inside the ";"-separated lists).
+  UN <- strsplit(DATA$C1,";")
+  corresp <- strsplit(DATA$authorships_is_corresponding,";")
+  df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>%
+    group_by(id_oa) %>%
+    mutate(n=row_number())
+  df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>%
+    group_by(id_oa) %>%
+    mutate(n=row_number())
+  df_UN <- df_UN %>%
+    left_join(df_COR, by=(c("id_oa","n")))
+  # The n-th author of each record is matched to the n-th affiliation/flag;
+  # only authors flagged "true" are kept, RP becomes "author, affiliation",
+  # and a single row is retained per author ID.
+  AU <- strsplit(DATA$AU,";")
+  AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>%
+    group_by(id_oa) %>%
+    mutate(n=row_number()) %>%
+    left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
+              by = c("id_oa","n")) %>%
+    dplyr::filter(corr == "true") %>%
+    mutate(RP = paste(RP,UN, sep=", ")) %>%
+    ungroup() %>%
+    select("RP", "AU_ID") %>%
+    distinct(AU_ID, .keep_all = TRUE)
+  # attach RP to the records whose corresponding_author_ids equals that ID
+  DATA <- DATA %>%
+    left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))
+
+
+  # move all char strings to Upper
+  # Columns containing at least one "https://" value are left untouched, and
+  # numeric columns are skipped because toupper() would turn them back into
+  # character, undoing the numeric recoding above.
+  has_url <- vapply(DATA, function(x){
+    any(grepl("https://", x, fixed = TRUE))
+  }, logical(1))
+  is_num <- vapply(DATA, is.numeric, logical(1))
+  label <- names(has_url)[!has_url & !is_num]
+
+  DATA <- DATA %>%
+    mutate(across(all_of(label), toupper),
+           DI = gsub("https://doi.org/","",DI),
+           # case-insensitive test: DI is itself upper-cased above when no
+           # record carries a "https://" DOI, turning "null" into "NULL"
+           DI = ifelse(toupper(DI) == "NULL", NA_character_, DI))
+
+  return(DATA)
+}
+
+relabelling <- function(DATA){
+  # Rename the OpenAlex csv columns listed in the lookup table below to
+  # their short field tags. Columns not listed keep their original name.
+  #
+  # DATA: data frame as read from an OpenAlex csv export.
+  # Returns DATA with its column names replaced.
+  tag_map <- c(
+    id = "id_oa",
+    display_name = "TI",
+    primary_location_display_name = "SO",
+    primary_location_id = "SO_ID",
+    primary_location_host_organization = "PU",
+    primary_location_issns = "ISSN",
+    primary_location_issn_l = "ISSN_I",
+    primary_location_landing_page_url = "URL",
+    primary_location_pdf_url = "URL_PDF",
+    author_ids = "AU_ID",
+    author_names = "AU",
+    author_orcids = "OI",
+    author_institution_names = "C3",
+    cited_by_count = "TC",
+    publication_year = "PY",
+    type = "DT",
+    biblio_issue = "IS",
+    biblio_volume = "VL",
+    referenced_works = "CR",
+    keywords_keyword = "DE",
+    concepts_display_name = "CONCEPTS",
+    topics_display_name = "TOPICS",
+    sustainable_development_goals_display_name = "SDG",
+    primary_topic_field_display_name = "SC",
+    mesh_descriptor_name = "MESH",
+    referenced_works_count = "NR",
+    language = "LA",
+    authorships_author_position = "AU_POSITION",
+    authorships_raw_affiliation_string = "C1",
+    doi = "DI"
+  )
+
+  col_names <- names(DATA)
+  pos <- match(col_names, names(tag_map))
+  known <- !is.na(pos)
+  col_names[known] <- unname(tag_map[pos[known]])
+  names(DATA) <- col_names
+  DATA
+}
diff --git a/R/histNetwork.R b/R/histNetwork.R
@@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
   return(results)
 }
 
-# scopus <- function(M, min.citations, sep, network, verbose){
-#   
-#   if (isTRUE(verbose)) {
-#     cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
-#   }
-#   
-#   if (!("SR_FULL" %in% names(M))) {
-#     M = metaTagExtraction(M, Field = "SR")
-#   }
-#   
-#   M$nCITING <- 1:nrow(M)
-#   papers <- M$nCITING[M$TC >= min.citations]
-#   
-#   TIpost <-
-#     paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
-#   
-#   CR <- gsub("[[:punct:]]", "", M$CR)
-#   n <- nchar(CR)
-#   n[is.na(n)] <- 2
-#   n <- n + 1
-#   nCum <- c(1, cumsum(n[-length(n)]))
-#   CR <- paste(CR, collapse = " ")
-#   
-#   #L <- str_locate_all(CR, TIpost)
-#   L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
-#   
-#   LCS <- lengths(L) / 2
-#   
-#   M$LCS <- 0
-#   M$LCS[papers] <- LCS
-#   
-# 
-#   ### HistData
-#   histData <- M %>%
-#     select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
-#     rename(
-#       Paper = .data$SR_FULL,
-#       Title = .data$TI,
-#       Author_Keywords = .data$DE,
-#       KeywordsPlus = .data$ID,
-#       DOI = .data$DI,
-#       Year = .data$PY,
-#       GCS = .data$TC
-#     ) %>%
-#     arrange(.data$Year) %>%
-#     dplyr::filter(.data$GCS>=min.citations) %>% 
-#     as.data.frame()
-#   
-#   
-#   if (isTRUE(network)) {
-#     ## Network matrix
-#     df <- lapply(seq_along(L), function(i) {
-#       l <-
-#         data.frame(
-#           ref = L[[i]],
-#           paper = rep(papers[i], length(L[[i]][, 1]))
-#         )
-#     })
-#     df <- (do.call(rbind, df))
-#     
-#     A <- outer(df$ref.start, nCum, "-")
-#     A[A < 0] <- NA
-#     df$CITINGn <- unlist(apply(A, 1, which.min))
-#     df$CITING <- M$SR[df$CITINGn]
-#     df$CITED <- M$SR[df$paper]
-#     df <- df %>% 
-#       dplyr::filter(.data$CITING %in% histData$Paper)
-#     
-#     NetMatrix <-
-#       (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
-#   } else{
-#     NetMatrix = NULL
-#   }
-#   
-#   if (isTRUE(verbose)) {
-#     cat("\nFound",
-#         length(M$LCS[M$LCS > 0]),
-#         "documents with no empty Local Citations (LCS)\n")
-#   }
-#   
-#   results <-
-#     list(
-#       NetMatrix = NetMatrix,
-#       histData = histData,
-#       M = M,
-#       LCS = M$LCS
-#     )
-# }
-
 # New algorithm for Scopus
 # Local citation matching is based on First Author, Year and PP
 scopus <- function(M, min.citations, sep, network, verbose){
@@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){
 
 openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){
 
-  M$CR[is.na(M$CR)] <- "none"
+  M$CR[is.na(M$CR) | M$CR==""] <- "none"
   ids <- M$id_oa
   CR <- strsplit(M$CR, ";")
   CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>% 
@@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v
     SRrow <- WLCR %>% select(.data$id_oa) %>% 
       left_join(M %>% 
                   select(.data$id_oa, .data$SR), 
-                by="id_oa")
+                by="id_oa") 
 
     SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>% 
       left_join(M %>% 

diff --git a/R/metaTagExtraction.R b/R/metaTagExtraction.R
@@ -405,7 +405,7 @@ AU_UN<-function(M,sep){
   })
   AFFL=unlist(AFFL)
   M$AU_UN=AFFL
-  if (M$DB[1]=="ISI" & "C3" %in% names(M)){
+  if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)){
     M$AU_UN[!is.na(M$C3) & M$C3!=""] <- M$C3[!is.na(M$C3) & M$C3!=""]
   }
   M$AU_UN=gsub("\\\\&","AND",M$AU_UN)

diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R
@@ -281,7 +281,15 @@ To ensure the functionality of Biblioshiny,
                                 })
                  })
         },
-        openalex = {
+        openalex={
+          withProgress(message = 'Conversion in progress',
+                         value = 0, {
+                           M <- convert2df(inFile$datapath,
+                                           dbsource = input$dbsource,
+                                           format = "csv")
+                         })
+        },
+        openalex_api = {
           M <- smart_load(inFile$datapath)
         },
         lens = {

diff --git a/inst/biblioshiny/ui.R b/inst/biblioshiny/ui.R
@@ -254,7 +254,8 @@ body <- dashboardBody(
                                    "Web of Science (WoS/WoK)" = "isi",
                                    "Scopus" = "scopus",
                                    "Dimensions" = "dimensions",
-                                   "OpenAlex (via openalexR)" = "openalex",
+                                   "Openalex" ="openalex",
+                                   "OpenAlex API (via openalexR)" = "openalex_api",
                                    "Lens.org" = "lens",
                                    "PubMed" = "pubmed",
                                    "Cochrane Library" = "cochrane"