diff --git a/R/convert2df.R b/R/convert2df.R
index 397217e..067a127 100644
--- a/R/convert2df.R
+++ b/R/convert2df.R
@@ -12,9 +12,11 @@
 #' d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
 #' e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
 #' f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
-#' g)\tab 'openalex' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
+#' g)\tab 'openalex' \tab OpenAlex export file (in csv '.csv');\cr
+#' h)\tab 'openalex_api' \tab a data frame object returned by the openalexR package, containing a collection of works resulting from a query fetched from the OpenAlex database.}
 #' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')} . Default is \code{dbsource = "isi"}.
-#' @param format is a character indicating the format of the SCOPUS and Clarivate Analytics WoS export file. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param format is a character indicating the export file format of SCOPUS, Clarivate Analytics WoS, and the other supported databases. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param remove.duplicates is logical. If TRUE, the function removes duplicated documents, matching them by the database-specific unique identifier (e.g., UT, id_oa, or PMID) or by title when no identifier is available.
 #' @return a data frame with cases corresponding to articles and variables to Field Tags in the original export file.
 #'
 #' I.e., if we have three files downloaded from Web of Science in plaintext format, file will be:
@@ -57,10 +59,10 @@
 #'
 #' @export
-convert2df<-function(file,dbsource="wos",format="plaintext"){
+convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){
 
   allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')
-  allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')
+  allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')
 
   cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")
 
   if (length(setdiff(dbsource,allowed_db))>0){
@@ -147,7 +149,10 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
            })
          },
-         openalex = {
+         openalex={
+           M <- csvOA2df(file)
+         },
+         openalex_api = {
            if (!"bibliometrixDB" %in% class(file)){
              M <- openalexR::oa2bibliometrix(file)
            } else {
@@ -168,11 +173,11 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
     M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR))) # remove foreign characters from CR (i.e. Chinese, Russian characters)
   }
 
-  if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
+  if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
 
   cat("Done!\n\n")
 
-  if (!(dbsource %in% c("pubmed", "lens", "openalex"))) {
+  if (!(dbsource %in% c("pubmed", "lens", "openalex_api"))) {
     ## AU_UN field creation
     if ("C1" %in% names(M)) {
       cat("\nGenerating affiliation field tag AU_UN from C1: ")
@@ -204,10 +209,37 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
   }
 
   ### SR field creation
+  if (isTRUE(remove.duplicates)){
+    switch(dbsource,
+           isi={
+             id_field <- "UT"
+           },
+           scopus={
+             id_field <- "UT"
+           },
+           openalex={
+             id_field <- "id_oa"
+           },
+           openalex_api={
+             id_field <- "id_oa"
+           },
+           dimensions={
+             id_field <- "UT"
+           },
+           pubmed={
+             id_field <- "PMID"
+           },
+           lens={
+             id_field <- "UT"
+           },
+           {
+             id_field <- "TI"
+           })
+    d <- duplicated(M[id_field])
+    if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
+    M <- M[!d,]
+  }
   suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
-  d <- duplicated(M$SR)
-  if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
-  M <- M[!d,]
   row.names(M) <- M$SR
 
   ### bibliometrix>DB class
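For context, a minimal usage sketch of the two import routes this patch introduces. The file name and query are hypothetical; the 'openalex' route reads a raw OpenAlex csv export, while 'openalex_api' converts a works data frame fetched with openalexR:

library(bibliometrix)

# Route 1: raw OpenAlex csv export (hypothetical file name)
M <- convert2df(file = "openalex_works.csv",
                dbsource = "openalex",
                format = "csv",
                remove.duplicates = TRUE)  # drops rows sharing the same id_oa

# Route 2: works fetched through the openalexR package (run online)
# works <- openalexR::oa_fetch(entity = "works", search = "bibliometrics")
# M <- convert2df(file = works, dbsource = "openalex_api", format = "api")
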
diff --git a/R/csvLens2df.R b/R/csvLens2df.R
index b8bf733..e240dc9 100644
--- a/R/csvLens2df.R
+++ b/R/csvLens2df.R
@@ -54,12 +54,13 @@ csvLens2df <- function(file){
 
   # Iso Source Titles
   DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""]
-  DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
-  DATA$J9 <- gsub("\\.","",DATA$JI)
+  # DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
+  # DATA$J9 <- gsub("\\.","",DATA$JI)
+  DATA$JI <- DATA$J9 <- DATA$SO
   DATA$ID <- DATA$DE
   DI <- DATA$DI
   URL <- DATA$URL
-  DATA <- data.frame(lapply(DATA,toUpper))
+  DATA <- data.frame(lapply(DATA,toupper))
   DATA$DI <- DI
   DATA$URL <- URL
   DATA$AU_CO <- "NA"
diff --git a/R/csvOA2df.R b/R/csvOA2df.R
new file mode 100644
index 0000000..7281543
--- /dev/null
+++ b/R/csvOA2df.R
@@ -0,0 +1,123 @@
+utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))
+
+csvOA2df <- function(file){
+  options(readr.num_columns = 0)
+
+  ## import all files in a single data frame
+  for (i in 1:length(file)){
+    #D <- read.csv(file[i], quote='"', check.names = F, stringsAsFactors = F) #fileEncoding = "UTF-8-BOM")
+    D <- read_csv(file[i], na=character(), quote='"', trim_ws = FALSE, progress = show_progress(), show_col_types = FALSE) %>%
+      mutate(across(where(is.numeric), as.character)) %>%
+      mutate(across(where(is.character), \(x) tidyr::replace_na(x,""))) %>%
+      as.data.frame()
+
+    if (i>1){
+      l <- intersect(l,names(D))
+      DATA <- rbind(DATA[l],D[l])
+    }else{
+      l <- names(D)
+      DATA <- D}
+  }
+  rm(D)
+
+  ## Post-Processing
+
+  # column re-labelling
+  DATA <- relabelling(DATA)
+
+  # recode as numeric
+  DATA$TC <- as.numeric(DATA$TC)
+  DATA$PY <- as.numeric(DATA$PY)
+  DATA$relevance_score <- as.numeric(DATA$relevance_score)
+
+  # replace | with ;
+  DATA <- DATA %>%
+    mutate(across(where(is.character), ~ stringi::stri_replace_all_regex(.,"\\|",";")))
+
+  DATA$AF <- DATA$AU
+  DATA$ID <- DATA$DE
+  DATA$AB <- ""
+  DATA$CR <- gsub("https://openalex.org/","",DATA$CR)
+  DATA$AU_ID <- gsub("https://openalex.org/","",DATA$AU_ID)
+  DATA$id_oa <- gsub("https://openalex.org/","",DATA$id_oa)
+  DATA$JI <- DATA$J9 <- gsub("https://openalex.org/","",DATA$SO_ID)
+  DATA$corresponding_author_ids <- gsub("https://openalex.org/","",DATA$corresponding_author_ids)
+  DATA$C1 <- gsub("https://", "", DATA$C1)
+  DATA$DB <- "OPENALEX"
+
+  ## corresponding author
+  UN <- strsplit(DATA$C1,";")
+  corresp <- strsplit(DATA$authorships_is_corresponding,";")
+  df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>%
+    group_by(id_oa) %>%
+    mutate(n=row_number())
+  df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>%
+    group_by(id_oa) %>%
+    mutate(n=row_number())
+  df_UN <- df_UN %>%
+    left_join(df_COR, by=(c("id_oa","n")))
+  AU <- strsplit(DATA$AU,";")
+  AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>%
+    group_by(id_oa) %>%
+    mutate(n=row_number()) %>%
+    left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
+              by = c("id_oa","n")) %>%
+    dplyr::filter(corr == "true") %>%
+    mutate(RP = paste(RP,UN, sep=", ")) %>%
+    ungroup() %>%
+    select("RP", "AU_ID") %>%
+    distinct(AU_ID, .keep_all = TRUE)
+  DATA <- DATA %>%
+    left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))
+
+
+  # move all char strings to Upper
+  ind <- apply(DATA,2,function(x){
+    sum(regexpr("https://",x)>-1, na.rm = TRUE)>0
+  })
+  label <- names(ind)[ind==FALSE & !is.na(ind)]
+
+  DATA <- DATA %>%
+    mutate(across(all_of(label), toupper),
+           DI = gsub("https://doi.org/","",DI),
+           DI = ifelse(DI == "null",NA,DI))
+
+  return(DATA)
+}
+
+relabelling <- function(DATA){
+  ## column re-labelling
+  label <- names(DATA)
+  label[label %in% "id"] <- "id_oa"
+  label[label %in% "display_name"] <- "TI"
+  label[label %in% "primary_location_display_name"] <- "SO"
+  label[label %in% "primary_location_id"] <- "SO_ID"
+  label[label %in% "primary_location_host_organization"] <- "PU"
+  label[label %in% "primary_location_issns"] <- "ISSN"
+  label[label %in% "primary_location_issn_l"] <- "ISSN_I"
+  label[label %in% "primary_location_landing_page_url"] <- "URL"
+  label[label %in% "primary_location_pdf_url"] <- "URL_PDF"
+  label[label %in% "author_ids"] <- "AU_ID"
+  label[label %in% "author_names"] <- "AU"
+  label[label %in% "author_orcids"] <- "OI"
+  label[label %in% "author_institution_names"] <- "C3"
+  label[label %in% "cited_by_count"] <- "TC"
+  label[label %in% "publication_year"] <- "PY"
+  label[label %in% "type"] <- "DT"
+  label[label %in% "biblio_issue"] <- "IS"
+  label[label %in% "biblio_volume"] <- "VL"
+  label[label %in% "referenced_works"] <- "CR"
+  label[label %in% "keywords_keyword"] <- "DE"
+  label[label %in% "concepts_display_name"] <- "CONCEPTS"
+  label[label %in% "topics_display_name"] <- "TOPICS"
+  label[label %in% "sustainable_development_goals_display_name"] <- "SDG"
+  label[label %in% "primary_topic_field_display_name"] <- "SC"
+  label[label %in% "mesh_descriptor_name"] <- "MESH"
+  label[label %in% "referenced_works_count"] <- "NR"
+  label[label %in% "language"] <- "LA"
+  label[label %in% "authorships_author_position"] <- "AU_POSITION"
+  label[label %in% "authorships_raw_affiliation_string"] <- "C1"
+  label[label %in% "doi"] <- "DI"
+  names(DATA) <- label
+  return(DATA)
+}
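A toy illustration of the post-processing above (column names match the OpenAlex export, the values are invented): OpenAlex ships full URLs as identifiers, so csvOA2df strips the https://openalex.org/ prefix, and relabelling() maps the export's column names onto bibliometrix field tags:

df <- data.frame(
  id  = "https://openalex.org/W2741809807",   # hypothetical work id
  doi = "https://doi.org/10.7717/peerj.4375",
  cited_by_count = 12
)

# what relabelling() does, on a toy scale
names(df)[names(df) == "id"]             <- "id_oa"
names(df)[names(df) == "doi"]            <- "DI"
names(df)[names(df) == "cited_by_count"] <- "TC"

# what the prefix clean-up does
df$id_oa <- gsub("https://openalex.org/", "", df$id_oa)  # "W2741809807"
df$DI    <- gsub("https://doi.org/", "", df$DI)          # "10.7717/peerj.4375"
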
diff --git a/R/histNetwork.R b/R/histNetwork.R
index 8881068..49e0ce1 100644
--- a/R/histNetwork.R
+++ b/R/histNetwork.R
@@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
   return(results)
 }
 
-# scopus <- function(M, min.citations, sep, network, verbose){
-#
-#   if (isTRUE(verbose)) {
-#     cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
-#   }
-#
-#   if (!("SR_FULL" %in% names(M))) {
-#     M = metaTagExtraction(M, Field = "SR")
-#   }
-#
-#   M$nCITING <- 1:nrow(M)
-#   papers <- M$nCITING[M$TC >= min.citations]
-#
-#   TIpost <-
-#     paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
-#
-#   CR <- gsub("[[:punct:]]", "", M$CR)
-#   n <- nchar(CR)
-#   n[is.na(n)] <- 2
-#   n <- n + 1
-#   nCum <- c(1, cumsum(n[-length(n)]))
-#   CR <- paste(CR, collapse = " ")
-#
-#   #L <- str_locate_all(CR, TIpost)
-#   L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
-#
-#   LCS <- lengths(L) / 2
-#
-#   M$LCS <- 0
-#   M$LCS[papers] <- LCS
-#
-#
-#   ### HistData
-#   histData <- M %>%
-#     select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
-#     rename(
-#       Paper = .data$SR_FULL,
-#       Title = .data$TI,
-#       Author_Keywords = .data$DE,
-#       KeywordsPlus = .data$ID,
-#       DOI = .data$DI,
-#       Year = .data$PY,
-#       GCS = .data$TC
-#     ) %>%
-#     arrange(.data$Year) %>%
-#     dplyr::filter(.data$GCS>=min.citations) %>%
-#     as.data.frame()
-#
-#
-#   if (isTRUE(network)) {
-#     ## Network matrix
-#     df <- lapply(seq_along(L), function(i) {
-#       l <-
-#         data.frame(
-#           ref = L[[i]],
-#           paper = rep(papers[i], length(L[[i]][, 1]))
-#         )
-#     })
-#     df <- (do.call(rbind, df))
-#
-#     A <- outer(df$ref.start, nCum, "-")
-#     A[A < 0] <- NA
-#     df$CITINGn <- unlist(apply(A, 1, which.min))
-#     df$CITING <- M$SR[df$CITINGn]
-#     df$CITED <- M$SR[df$paper]
-#     df <- df %>%
-#       dplyr::filter(.data$CITING %in% histData$Paper)
-#
-#     NetMatrix <-
-#       (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
-#   } else{
-#     NetMatrix = NULL
-#   }
-#
-#   if (isTRUE(verbose)) {
-#     cat("\nFound",
-#         length(M$LCS[M$LCS > 0]),
-#         "documents with no empty Local Citations (LCS)\n")
-#   }
-#
-#   results <-
-#     list(
-#       NetMatrix = NetMatrix,
-#       histData = histData,
-#       M = M,
-#       LCS = M$LCS
-#     )
-# }
-
 # New algorithm for Scopus
 # Local citation matching is based on First Author, Year and PP
 scopus <- function(M, min.citations, sep, network, verbose){
@@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){
 
 openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){
 
-  M$CR[is.na(M$CR)] <- "none"
+  M$CR[is.na(M$CR) | M$CR==""] <- "none"
   ids <- M$id_oa
   CR <- strsplit(M$CR, ";")
  CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>%
@@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v
   SRrow <- WLCR %>%
     select(.data$id_oa) %>%
     left_join(M %>%
                 select(.data$id_oa, .data$SR),
-              by="id_oa")
+            by="id_oa")
   SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>%
     left_join(M %>%
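The openalex() branch above can match local citations exactly, because CR holds OpenAlex work ids rather than free-text reference strings. A minimal sketch of that matching idea, with toy ids and base R in place of the dplyr pipeline used in the patch:

M <- data.frame(
  id_oa = c("W1", "W2", "W3"),
  CR    = c("W2;W9", "W3", "none"),  # toy referenced-works lists
  stringsAsFactors = FALSE
)

refs  <- strsplit(M$CR, ";")
edges <- data.frame(citing = rep(M$id_oa, lengths(refs)),
                    cited  = unlist(refs))
edges <- edges[edges$cited %in% M$id_oa, ]  # keep only in-collection references

# Local Citation Score: how often each document is cited inside the collection
LCS <- table(factor(edges$cited, levels = M$id_oa))
as.vector(LCS)  # W1 = 0, W2 = 1, W3 = 1
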
"", M$TI[papers]), " ", M$PY[papers], " ", sep = "") -# -# CR <- gsub("[[:punct:]]", "", M$CR) -# n <- nchar(CR) -# n[is.na(n)] <- 2 -# n <- n + 1 -# nCum <- c(1, cumsum(n[-length(n)])) -# CR <- paste(CR, collapse = " ") -# -# #L <- str_locate_all(CR, TIpost) -# L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE) -# -# LCS <- lengths(L) / 2 -# -# M$LCS <- 0 -# M$LCS[papers] <- LCS -# -# -# ### HistData -# histData <- M %>% -# select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>% -# rename( -# Paper = .data$SR_FULL, -# Title = .data$TI, -# Author_Keywords = .data$DE, -# KeywordsPlus = .data$ID, -# DOI = .data$DI, -# Year = .data$PY, -# GCS = .data$TC -# ) %>% -# arrange(.data$Year) %>% -# dplyr::filter(.data$GCS>=min.citations) %>% -# as.data.frame() -# -# -# if (isTRUE(network)) { -# ## Network matrix -# df <- lapply(seq_along(L), function(i) { -# l <- -# data.frame( -# ref = L[[i]], -# paper = rep(papers[i], length(L[[i]][, 1])) -# ) -# }) -# df <- (do.call(rbind, df)) -# -# A <- outer(df$ref.start, nCum, "-") -# A[A < 0] <- NA -# df$CITINGn <- unlist(apply(A, 1, which.min)) -# df$CITING <- M$SR[df$CITINGn] -# df$CITED <- M$SR[df$paper] -# df <- df %>% -# dplyr::filter(.data$CITING %in% histData$Paper) -# -# NetMatrix <- -# (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T))) -# } else{ -# NetMatrix = NULL -# } -# -# if (isTRUE(verbose)) { -# cat("\nFound", -# length(M$LCS[M$LCS > 0]), -# "documents with no empty Local Citations (LCS)\n") -# } -# -# results <- -# list( -# NetMatrix = NetMatrix, -# histData = histData, -# M = M, -# LCS = M$LCS -# ) -# } - # New algorithm for Scopus # Local citation matching is based on First Author, Year and PP scopus <- function(M, min.citations, sep, network, verbose){ @@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){ - M$CR[is.na(M$CR)] <- "none" + M$CR[is.na(M$CR) | M$CR==""] <- "none" ids <- M$id_oa CR <- strsplit(M$CR, ";") CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>% @@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v SRrow <- WLCR %>% select(.data$id_oa) %>% left_join(M %>% select(.data$id_oa, .data$SR), - by="id_oa") + by="id_oa") SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>% left_join(M %>% diff --git a/R/metaTagExtraction.R b/R/metaTagExtraction.R index 4d72e5b..25b5c1c 100644 --- a/R/metaTagExtraction.R +++ b/R/metaTagExtraction.R @@ -405,7 +405,7 @@ AU_UN<-function(M,sep){ }) AFFL=unlist(AFFL) M$AU_UN=AFFL - if (M$DB[1]=="ISI" & "C3" %in% names(M)){ + if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)){ M$AU_UN[!is.na(M$C3) & M$C3!=""] <- M$C3[!is.na(M$C3) & M$C3!=""] } M$AU_UN=gsub("\\\\&","AND",M$AU_UN) diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R index 62e8875..df93647 100644 --- a/inst/biblioshiny/server.R +++ b/inst/biblioshiny/server.R @@ -281,7 +281,15 @@ To ensure the functionality of Biblioshiny, }) }) }, - openalex = { + openalex={ + withProgress(message = 'Conversion in progress', + value = 0, { + M <- convert2df(inFile$datapath, + dbsource = input$dbsource, + format = "csv") + }) + }, + openalex_api = { M <- smart_load(inFile$datapath) }, lens = { diff --git a/inst/biblioshiny/ui.R b/inst/biblioshiny/ui.R index 68b916c..e7f5272 100644 --- a/inst/biblioshiny/ui.R +++ b/inst/biblioshiny/ui.R @@ -254,7 +254,8 @@ body <- 
diff --git a/man/convert2df.Rd b/man/convert2df.Rd
index 18eca39..d3f9eb4 100644
--- a/man/convert2df.Rd
+++ b/man/convert2df.Rd
@@ -4,7 +4,12 @@
 \alias{convert2df}
 \title{Import and Convert bibliographic export files and API objects.}
 \usage{
-convert2df(file, dbsource = "wos", format = "plaintext")
+convert2df(
+  file,
+  dbsource = "wos",
+  format = "plaintext",
+  remove.duplicates = TRUE
+)
 }
 \arguments{
 \item{file}{a character array containing a sequence of filenames coming from WoS, Scopus, Dimensions, Lens.org, and Pubmed. Alternatively, \code{file} can be
@@ -16,11 +21,14 @@ c)\tab 'dimensions' \tab Digital Science Dimensions (in csv '.csv' or excel '.xl
 d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
 e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
 f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
-g)\tab 'openalex' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}}
+g)\tab 'openalex' \tab OpenAlex export file (in csv '.csv');\cr
+h)\tab 'openalex_api' \tab a data frame object returned by the openalexR package, containing a collection of works resulting from a query fetched from the OpenAlex database.}}
 
 \item{dbsource}{is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')} . Default is \code{dbsource = "isi"}.}
 
-\item{format}{is a character indicating the format of the SCOPUS and Clarivate Analytics WoS export file. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.}
+\item{format}{is a character indicating the export file format of SCOPUS, Clarivate Analytics WoS, and the other supported databases. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.}
+
+\item{remove.duplicates}{is logical. If TRUE, the function removes duplicated documents, matching them by the database-specific unique identifier (e.g., UT, id_oa, or PMID) or by title when no identifier is available.}
 }
 \value{
 a data frame with cases corresponding to articles and variables to Field Tags in the original export file.