Skip to content

Commit

Permalink
Added full support to OpenAlex csv export files.
Browse files Browse the repository at this point in the history
  • Loading branch information
massimoaria committed Feb 23, 2024
1 parent 1f6677a commit 1b8ca20
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 107 deletions.
38 changes: 33 additions & 5 deletions R/convert2df.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
#' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')} . Default is \code{dbsource = "isi"}.
#' @param format is a character indicating the SCOPUS, Clarivate Analytics WoS, and other databases export file format. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
#' @param remove.duplicates is logical. If TRUE, the function will remove duplicated items checking by DOI and database ID.
#' @return a data frame with cases corresponding to articles and variables to Field Tags in the original export file.
#'
#' E.g., if we have three files downloaded from Web of Science in plaintext format, \code{file} will be:
Expand Down Expand Up @@ -58,7 +59,7 @@
#'
#' @export

convert2df<-function(file,dbsource="wos",format="plaintext"){
convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){

allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')
allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')
Expand Down Expand Up @@ -172,7 +173,7 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR))) # remove foreign characters from CR (i.e. Chinese, Russian characters)
}

if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}

cat("Done!\n\n")

Expand Down Expand Up @@ -208,10 +209,37 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
}

### SR field creation
# Drop duplicated records before the SR field is built.
# The column holding the record identifier depends on the source database;
# any source not listed below falls back to the title ("TI").
if (isTRUE(remove.duplicates)){
  id_field <- switch(dbsource,
    isi = ,
    scopus = ,
    dimensions = ,  # label was misspelled "dimneisons", so Dimensions data silently fell back to "TI"
    lens = "UT",
    openalex = ,
    openalex_api = "id_oa",
    pubmed = "PMID",
    "TI")
  # Select with M[id_field] (not M[[id_field]]) so a missing column still raises
  # "undefined columns selected" instead of yielding NULL and emptying M.
  id_values <- M[id_field][[1]]
  # duplicated() flags every NA after the first as a repeat; a record whose
  # identifier is missing is not evidence of duplication, so keep it.
  d <- duplicated(id_values) & !is.na(id_values)
  if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
  M <- M[!d,]
}
suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
d <- duplicated(M$SR)
if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
M <- M[!d,]
row.names(M) <- M$SR

### bibliometrix>DB class
Expand Down
7 changes: 4 additions & 3 deletions R/csvLens2df.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,13 @@ csvLens2df <- function(file){

# Iso Source Titles
DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""]
DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
DATA$J9 <- gsub("\\.","",DATA$JI)
# DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
# DATA$J9 <- gsub("\\.","",DATA$JI)
DATA$JI <- DATA$J9 <- DATA$SO
DATA$ID <- DATA$DE
DI <- DATA$DI
URL <- DATA$URL
DATA <- data.frame(lapply(DATA,toUpper))
DATA <- data.frame(lapply(DATA,toupper))
DATA$DI <- DI
DATA$URL <- URL
DATA$AU_CO <- "NA"
Expand Down
16 changes: 9 additions & 7 deletions R/csvOA2df.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
utils::globalVariables(c("all_of", "corr"))
utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))

csvOA2df <- function(file){
options(readr.num_columns = 0)
Expand Down Expand Up @@ -49,24 +49,24 @@ csvOA2df <- function(file){
UN <- strsplit(DATA$C1,";")
corresp <- strsplit(DATA$authorships_is_corresponding,";")
df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>%
group_by(.data$id_oa) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>%
group_by(.data$id_oa) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_UN <- df_UN %>%
left_join(df_COR, by=(c("id_oa","n")))
AU <- strsplit(DATA$AU,";")
AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>%
group_by(.data$id_oa) %>%
group_by(id_oa) %>%
mutate(n=row_number()) %>%
left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
by = c("id_oa","n")) %>%
dplyr::filter(corr == "true") %>%
mutate(RP = paste(.data$RP,.data$UN, sep=", ")) %>%
mutate(RP = paste(RP,UN, sep=", ")) %>%
ungroup() %>%
select("RP", "AU_ID") %>%
distinct(.data$AU_ID, .keep_all = TRUE)
distinct(AU_ID, .keep_all = TRUE)
DATA <- DATA %>%
left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))

Expand All @@ -78,7 +78,9 @@ csvOA2df <- function(file){
label <- names(ind)[ind==FALSE & !is.na(ind)]

DATA <- DATA %>%
mutate(across(all_of(label), toupper))
mutate(across(all_of(label), toupper),
DI = gsub("https://doi.org/","",DI),
DI = ifelse(DI == "null",NA,DI))

return(DATA)
}
Expand Down
93 changes: 2 additions & 91 deletions R/histNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
return(results)
}

# scopus <- function(M, min.citations, sep, network, verbose){
#
# if (isTRUE(verbose)) {
# cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
# }
#
# if (!("SR_FULL" %in% names(M))) {
# M = metaTagExtraction(M, Field = "SR")
# }
#
# M$nCITING <- 1:nrow(M)
# papers <- M$nCITING[M$TC >= min.citations]
#
# TIpost <-
# paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
#
# CR <- gsub("[[:punct:]]", "", M$CR)
# n <- nchar(CR)
# n[is.na(n)] <- 2
# n <- n + 1
# nCum <- c(1, cumsum(n[-length(n)]))
# CR <- paste(CR, collapse = " ")
#
# #L <- str_locate_all(CR, TIpost)
# L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
#
# LCS <- lengths(L) / 2
#
# M$LCS <- 0
# M$LCS[papers] <- LCS
#
#
# ### HistData
# histData <- M %>%
# select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
# rename(
# Paper = .data$SR_FULL,
# Title = .data$TI,
# Author_Keywords = .data$DE,
# KeywordsPlus = .data$ID,
# DOI = .data$DI,
# Year = .data$PY,
# GCS = .data$TC
# ) %>%
# arrange(.data$Year) %>%
# dplyr::filter(.data$GCS>=min.citations) %>%
# as.data.frame()
#
#
# if (isTRUE(network)) {
# ## Network matrix
# df <- lapply(seq_along(L), function(i) {
# l <-
# data.frame(
# ref = L[[i]],
# paper = rep(papers[i], length(L[[i]][, 1]))
# )
# })
# df <- (do.call(rbind, df))
#
# A <- outer(df$ref.start, nCum, "-")
# A[A < 0] <- NA
# df$CITINGn <- unlist(apply(A, 1, which.min))
# df$CITING <- M$SR[df$CITINGn]
# df$CITED <- M$SR[df$paper]
# df <- df %>%
# dplyr::filter(.data$CITING %in% histData$Paper)
#
# NetMatrix <-
# (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
# } else{
# NetMatrix = NULL
# }
#
# if (isTRUE(verbose)) {
# cat("\nFound",
# length(M$LCS[M$LCS > 0]),
# "documents with no empty Local Citations (LCS)\n")
# }
#
# results <-
# list(
# NetMatrix = NetMatrix,
# histData = histData,
# M = M,
# LCS = M$LCS
# )
# }

# New algorithm for Scopus
# Local citation matching is based on First Author, Year and PP
scopus <- function(M, min.citations, sep, network, verbose){
Expand Down Expand Up @@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){

openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){

M$CR[is.na(M$CR)] <- "none"
M$CR[is.na(M$CR) | M$CR==""] <- "none"
ids <- M$id_oa
CR <- strsplit(M$CR, ";")
CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>%
Expand Down Expand Up @@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, v
SRrow <- WLCR %>% select(.data$id_oa) %>%
left_join(M %>%
select(.data$id_oa, .data$SR),
by="id_oa")
by="id_oa")

SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>%
left_join(M %>%
Expand Down
9 changes: 8 additions & 1 deletion man/convert2df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1b8ca20

Please sign in to comment.