Merge pull request #287 from massimoaria/develop

Develop to CRAN
massimoaria · Jan 13, 2023 · 75f663e · 75f663e
2 parents 5e3c37c + 310c25d
commit 75f663e
Show file tree

Hide file tree

Showing 33 changed files with 3,996 additions and 3,564 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,11 +6,14 @@
 .txt
 .bib
 .Ruserdata
+.DS_Store
 VOSviewer.jar
 network.net
 Rubbish
 desktop.ini
 vignette.txt
 inst/doc
+inst/biblioshiny/__MACOSX
+inst/biblioshiny/rsconnect
 _gh-pages
 
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bibliometrix
 Type: Package
 Title: Comprehensive Science Mapping Analysis
-Version: 4.0.2
+Version: 4.1.0
 Authors@R: c(
     person(given = "Massimo",
            family = "Aria",
@@ -48,7 +48,7 @@ Imports: stats,
 		 shiny,
 		 SnowballC,
 		 stringdist,
-		 stringr,
+		 stringi,
 		 tidyr,
 		 tidytext
 Suggests: 
@@ -59,6 +59,6 @@ Suggests:
     shinycssloaders,
     visNetwork,
     wordcloud2
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
 NeedsCompilation: no
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,7 @@ export(localCitations)
 export(lotka)
 export(mergeDbSources)
 export(metaTagExtraction)
+export(missingData)
 export(net2VOSviewer)
 export(networkPlot)
 export(networkStat)
@@ -56,6 +57,7 @@ import(readr)
 import(readxl)
 import(shiny)
 import(stats)
+import(stringi)
 import(tidytext)
 importFrom(DT,DTOutput)
 importFrom(DT,datatable)
@@ -294,9 +296,6 @@ importFrom(rscopus,author_df_orig)
 importFrom(rscopus,author_search)
 importFrom(rscopus,get_complete_author_info)
 importFrom(stringdist,stringdistmatrix)
-importFrom(stringr,str_extract_all)
-importFrom(stringr,str_locate_all)
-importFrom(stringr,str_replace_all)
 importFrom(tidyr,drop_na)
 importFrom(tidyr,gather)
 importFrom(tidyr,pivot_longer)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,17 @@
+bibliometrix v4.1.0 (Release date: 2023-01-13)
+
+Features:
+* Added a new function missingData() to check the completeness of metadata included in a bibliographic data frame 
+* Biblioshiny: Added the ability to create an Excel report by adding, step by step, the results of different analyses
+* Biblioshiny: Added a popup that returns the results of the metadata completeness check of imported collections
+* Biblioshiny: Revamped interface with floating options menu and more space for graphical analysis results
+
+Changes:
+* Several bug fixes
+* Computational speed improvements
+
+
+
 bibliometrix v4.0.1 (Release date: 2022-09-16)
 
 Features:

diff --git a/R/bib2df.R b/R/bib2df.R
@@ -22,8 +22,10 @@ bib2df<-function(D, dbsource = "isi"){
 
   if (dbsource == "isi") D <- gsub(" = \\{","={",D)
 
+  D <- gsub("\\\t","",gsub(" = \\{","=\\{",D)) # to work also with new scopus bib format
+
   D[Papers] <- paste("Paper={",D[Papers],sep="")
-  #ii <- regexpr("\\{",D[Papers])
+
   ind <- regexpr("=\\{",D) # sep among tags and contents
   ind[Papers] <- 6
 

diff --git a/R/conceptualStructure.R b/R/conceptualStructure.R
@@ -409,36 +409,34 @@ conceptualStructure<-function(M,field="ID", ngrams=1, method="MCA", quali.supp=N
     b_doc_TC <- b_doc_TC + annotation_custom(logo, xmin = xl[1], xmax = xl[2], ymin = yl[1], ymax = yl[2]) 
     ##
 
-    params <- list(field = field, 
-                   ngrams = ngrams,
-                   method=method,
-                   quali.supp=quali.supp,
-                   quanti.supp=quanti.supp,
-                   minDegree=minDegree,
-                   clust=clust,
-                   k.max=k.max,
-                   stemming = stemming, 
-                   labelsize=labelsize,
-                   documents=documents,
-                   graph=graph, 
-                   remove.terms = remove.terms, 
-                   synonyms = synonyms)
-
-    params <- data.frame(params=names(unlist(params)),values=unlist(params), row.names = NULL)
 
     if (isTRUE(graph)){plot(b_doc_TC)}
 
     semanticResults=list(net=CW,res=res.mca,km.res=km.res,graph_terms=b,graph_dendogram=b_dend,
-                         graph_documents_Contrib=b_doc,graph_documents_TC=b_doc_TC,docCoord=docCoord,
-                         params=params)
+                         graph_documents_Contrib=b_doc,graph_documents_TC=b_doc_TC,docCoord=docCoord)
 
   }else{
 
     semanticResults=list(net=CW,res=res.mca,km.res=km.res,graph_terms=b,graph_dendogram=b_dend,
-                         graph_documents_Contrib=NULL,graph_documents_TC=NULL,docCoord=NULL,
-                         params=params)
+                         graph_documents_Contrib=NULL,graph_documents_TC=NULL,docCoord=NULL)
     }
 
+  params <- list(field = field, 
+                 ngrams = ngrams,
+                 method=method,
+                 quali.supp=quali.supp,
+                 quanti.supp=quanti.supp,
+                 minDegree=minDegree,
+                 clust=clust,
+                 k.max=k.max,
+                 stemming = stemming, 
+                 labelsize=labelsize,
+                 documents=documents,
+                 graph=graph, 
+                 remove.terms = remove.terms, 
+                 synonyms = synonyms)
+
+  semanticResults$params <- data.frame(params=names(unlist(params)),values=unlist(params), row.names = NULL)
 
 
   return(semanticResults)

diff --git a/R/couplingMap.R b/R/couplingMap.R
@@ -195,7 +195,22 @@ couplingMap <- function(M, analysis = "documents", field="CR", n=500, label.term
   row.names(df)=NULL
   df <- df %>% rename(items = .data$words)
 
-  results=list(map=g, clusters=df, data=df_lab,nclust=dim(df)[1], NCS = D, net=Net)
+  params <- list(analysis = analysis,
+                 field=field, 
+                 n=n, 
+                 minfreq=minfreq,
+                 label.term=label.term, 
+                 ngrams=ngrams, 
+                 impact.measure=impact.measure,
+                 stemming=stemming, 
+                 n.labels=n.labels, 
+                 size=size,
+                 community.repulsion = community.repulsion, 
+                 repel=repel, 
+                 cluster=cluster)
+  params <- data.frame(params=names(unlist(params)),values=unlist(params), row.names = NULL)
+
+  results=list(map=g, clusters=df, data=df_lab,nclust=dim(df)[1], NCS = D, net=Net, params=params)
   return(results)
 }
 
@@ -315,7 +330,7 @@ labeling <- function(M, df_lab, term, n, n.labels, analysis, ngrams){
 
   #clusters <- unique(df$Cluster)
   #w <- character(length(clusters))
-
+  df$SR <- df[,1]
   tab_global <- tableTag(df, term)
   tab_global <- data.frame(label=names(tab_global),tot=as.numeric(tab_global), n=nrow(M),stringsAsFactors = FALSE)
 

diff --git a/R/dimensions2df.R b/R/dimensions2df.R
@@ -76,7 +76,7 @@ dimensions2df <- function(file, format = "csv") {
 
 
 postprocessingDim <- function(DATA) {
-  DATA <- data.frame(lapply(DATA, toupper), stringsAsFactors = FALSE)
+  # DATA <- data.frame(lapply(DATA, toupper), stringsAsFactors = FALSE)
 
   ## Converting original references in WOS format (AU, PY, SO, VOL, NUM, DOI)
   if ("Cited.references" %in% names(DATA)) {
@@ -232,15 +232,20 @@ postprocessingDim <- function(DATA) {
   if (("SO" %in% names(DATA)) & ("Anthology.title" %in% names(DATA))) {
     ind <- which(is.na(DATA$SO) | DATA$SO=="")
     DATA$SO[ind] <- DATA$Anthology.title[ind]
-    DATA$SO[DATA$SO==""] <- NA
+    DATA$SO[is.na(DATA$SO) | DATA$SO==""] <- "NA"
   }
 
   if (!("SO" %in% names(DATA))) {
     DATA$SO <- "NA"
   }
 
+  ####
+  cat("\nCreating ISO Source names...")
   DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
   DATA$J9 <- gsub("\\.","",DATA$JI)
+  ####
+
+  DATA <- data.frame(lapply(DATA, toupper), stringsAsFactors = FALSE)
 
   DATA$PY <- as.numeric(DATA$PY)
 

diff --git a/R/histNetwork.R b/R/histNetwork.R
@@ -203,8 +203,8 @@ scopus <- function(M, min.citations, sep, network, verbose){
   nCum <- c(1, cumsum(n[-length(n)]))
   CR <- paste(CR, collapse = " ")
 
-  L <- str_locate_all(CR, TIpost)
-
+  #L <- str_locate_all(CR, TIpost)
+  L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
 
   LCS <- lengths(L) / 2
 

diff --git a/R/histPlot.R b/R/histPlot.R
@@ -74,14 +74,14 @@ histPlot<-function(histResults, n=20, size = 5, labelsize = 5, title_as_label =
 
   switch(label,
          title={
-           title <- strsplit(stringr::str_to_title(V(bsk.network)$title), " ")
+           title <- strsplit(stringi::stri_trans_totitle(V(bsk.network)$title), " ")
            V(bsk.network)$id <- unlist(lapply(title, function(l){
              n <- floor(length(l)/2)
              paste0(paste(l[1:n], collapse=" ", sep=""),"\n",paste(l[(n+1):length(l)], collapse=" ", sep=""))
            }))
          },
          keywords={
-           kw <- strsplit(stringr::str_to_title(V(bsk.network)$keywords), ";")
+           kw <- strsplit(stringi::stri_trans_totitle(V(bsk.network)$keywords), ";")
            kw[is.na(kw)] <- "Not Available"
            V(bsk.network)$id <- unlist(lapply(kw, function(l){
              if (length(l)>1){
@@ -92,7 +92,7 @@ histPlot<-function(histResults, n=20, size = 5, labelsize = 5, title_as_label =
            }))
          },
          keywordsplus={
-           kw <- strsplit(stringr::str_to_title(V(bsk.network)$keywordsplus), ";")
+           kw <- strsplit(stringi::stri_trans_totitle(V(bsk.network)$keywordsplus), ";")
            kw[is.na(kw)] <- "Not Available"
            V(bsk.network)$id <- unlist(lapply(kw, function(l){
              if (length(l)>1){
@@ -107,24 +107,13 @@ histPlot<-function(histResults, n=20, size = 5, labelsize = 5, title_as_label =
          }
   )
 
-  # if (isTRUE(title_as_label)){
-  #   title <- strsplit(stringr::str_to_title(V(bsk.network)$title), " ")
-  #   V(bsk.network)$id <- unlist(lapply(title, function(l){
-  #     n <- floor(length(l)/2)
-  #     paste0(paste(l[1:n], collapse=" ", sep=""),"\n",paste(l[(n+1):length(l)], collapse=" ", sep=""))
-  #   }))
-  #   #V(bsk.network)$id <- tolower(paste(substr(V(bsk.network)$title,1,50),"...",sep=""))
-  # } else {
-  #   V(bsk.network)$id <- tolower(unlist(RR))
-  # }
-
   # Compute node degrees (#links) and use that to set node size:
   deg <- LCS
   V(bsk.network)$size <- size
     #rep(size,length(V(bsk.network)))}
 
   #Years=histResults$histData$Year[ind]
-  Years <- as.numeric(unlist(str_extract_all(unlist(RR),"[[:digit:]]{4}$")))
+  Years <- as.numeric(unlist(stringi::stri_extract_all_regex(unlist(RR),"[[:digit:]]{4}$")))
   V(bsk.network)$years <- Years
 
   # Remove loops

diff --git a/R/keywordGrowth.R b/R/keywordGrowth.R
@@ -52,8 +52,8 @@ KeywordGrowth <- function(M, Tag = "ID", sep = ";", top=10, cdf=TRUE, remove.ter
       A <- A %>% 
         mutate(
           # Tab = str_replace_all(Tab, paste(sold[[i]], collapse="|",sep=""),snew[i])
-          Tab= str_replace_all(Tab, str_replace_all(str_replace_all(paste(sold[[i]], collapse="|",sep=""),"\\(","\\\\("),"\\)","\\\\)"),snew[i])
-
+          #Tab= str_replace_all(Tab, str_replace_all(str_replace_all(paste(sold[[i]], collapse="|",sep=""),"\\(","\\\\("),"\\)","\\\\)"),snew[i]),
+          Tab= stringi::stri_replace_all_regex(Tab, stringi::stri_replace_all_regex(stringi::stri_replace_all_regex(paste(sold[[i]], collapse="|",sep=""),"\\(","\\\\("),"\\)","\\\\)"),snew[i])
         )
     }
   }

diff --git a/R/missingData.R b/R/missingData.R
@@ -0,0 +1,69 @@
+#' Completeness of bibliographic metadata
+#'
+#' It calculates the percentage of missing data in the metadata of a bibliographic data frame. 
+#' 
+#' Each metadata is assigned a status c("Excellent", "Good", "Acceptable", "Poor", "Critical", "Completely missing") 
+#' depending on the percentage of missing data. In particular, the column *status* classifies the percentage of missing 
+#' values in 6 categories: "Excellent" (0%), "Good" (0.01% to 10.00%), "Acceptable" (from 10.01% to 20.00%), 
+#' "Poor" (from 20.01% to 50.00%), "Critical" (from 50.01% to 99.99%), "Completely missing" (100%).
+#' 
+#' A value is counted as missing when it is \code{NA} or equal to one of the strings 
+#' "NA,0000,NA", "NA" or "" (empty string). Mandatory tags that are not columns of \code{M} 
+#' are reported as "Completely missing".
+#' 
+#' The results of the function allow us to understand which analyses can be performed with bibliometrix 
+#' and which cannot based on the completeness (or status) of different metadata.
+#' @param M is a bibliographic data frame obtained by \code{\link{convert2df}} function.
+#' 
+#' @return The function \code{missingData} returns a list containing two objects:
+#' \tabular{lll}{
+#' \code{allTags}  \tab   \tab is a data frame including results for all original metadata tags from the collection\cr
+#' \code{mandatoryTags}\tab    \tab is a data frame that included only the tags needed for analysis with bibliometrix and biblioshiny.}
+#'
+#' @examples
+#' data(scientometrics, package = "bibliometrixData")
+#' res <- missingData(scientometrics)
+#' print(res$mandatoryTags)
+#'
+#' @export
+#' 
+missingData <- function(M) {
+  cols <- names(M)
+  # Placeholder strings treated as missing values, in addition to NA.
+  missing_strings <- c("NA,0000,NA","NA","")
+  # M[[x]] always extracts the column itself; M[,x] would keep a one-column
+  # tibble when M is a tibble, and %in% would then compare the whole column
+  # as a single element, so placeholder strings would never be counted.
+  # vapply() guarantees one integer count per column.
+  missing_counts <- vapply(cols, function(x){
+    values <- M[[x]]
+    sum(is.na(values) | values %in% missing_strings)
+    }, integer(1))
+  missing_pct <- round(missing_counts/nrow(M) * 100, 2)
+  df_all <- data.frame(cols, missing_counts, missing_pct)
+
+  # Tags (and their human-readable descriptions) reported in mandatoryTags.
+  tag <- unlist(
+    strsplit(
+      "AB,AU,C1,CR,DE,DI,DT,ID,LA,NR,PY,RP,SO,TC,TI,WC",","
+      )
+  )
+  # trimws() removes the blanks and the line break embedded in the string.
+  description <- trimws(unlist(
+    strsplit(
+      "Abstract, Author,Affiliation,Cited References,Keywords,DOI,Document Type,Keywords Plus,Language,Number of Cited References,
+      Publication Year,Corresponding Author, Journal, Total Citation, Title, Science Categories", ","
+    )
+  ))
+
+  df_all <- df_all %>% 
+    mutate(status = status(missing_pct)) %>% 
+    replace_na(replace = list(missing_counts = nrow(M), missing_pct = 100))
+
+  # Tags with no matching column in M get NA from the join and are then
+  # filled in as fully missing.
+  df_tags <- data.frame(tag, description) %>% 
+    left_join(df_all, by = c("tag" = "cols")) %>% 
+    replace_na(replace = list(missing_counts = nrow(M), missing_pct = 100, status = "Completely missing")) %>% 
+    arrange(missing_pct,description)
+
+  results <- list(allTags=df_all, mandatoryTags=df_tags)
+  return(results)
+}
+
+# Translate a vector of missing-data percentages into completeness labels.
+# 0 -> "Excellent"; (0,10] -> "Good"; (10,20] -> "Acceptable"; (20,50] -> "Poor";
+# (50,100) -> "Critical"; 100 or NA -> "Completely missing".
+# Values outside these ranges keep the empty string.
+status <- function(x){
+  labels <- rep("", length(x))
+  # Half-open bands (lower, upper] that share the same comparison shape.
+  bands <- list(
+    Good = c(0, 10),
+    Acceptable = c(10, 20),
+    Poor = c(20, 50)
+  )
+  for (band in names(bands)) {
+    lim <- bands[[band]]
+    labels[which(x > lim[1] & x <= lim[2])] <- band
+  }
+  labels[which(x == 0)] <- "Excellent"
+  labels[which(x > 50 & x < 100)] <- "Critical"
+  labels[is.na(x) | x == 100] <- "Completely missing"
+  labels
+}
+
diff --git a/R/rpys.R b/R/rpys.R
@@ -109,7 +109,7 @@ y <- c(min(c(RPYS$Citations,RPYS$diffMedian)),min(c(RPYS$Citations,RPYS$diffMedi
 
 g=ggplot(RPYS, aes(x=.data$Year ,y=.data$Citations,text=paste("Year: ",.data$Year,"\nN. of References: ",.data$Citations)))+
   geom_line(aes(group="NA")) +
-  geom_area(aes(group="NA"),fill = 'grey90', alpha = .5) +
+  #geom_area(aes(group="NA"),fill = 'grey90', alpha = .5) +
   #geom_hline(aes(yintercept=0, color = 'grey'))+
   geom_line(aes(x=.data$Year,y=.data$diffMedian, color="firebrick", group="NA"))+
   labs(x = 'Year'

diff --git a/R/thematicEvolution.R b/R/thematicEvolution.R
@@ -11,7 +11,7 @@
 #' 
 #' @param M is a bibliographic data frame obtained by the converting function \code{\link{convert2df}}.
 #' @param field is a character object. It indicates the content field to use. Field can be one of c=("ID","DE","TI","AB"). Default value is \code{field="ID"}.
-#' @param years is a numeric vector of two or more unique cut points.
+#' @param years is a numeric vector of one or more unique cut points.
 #' @param n is numerical. It indicates the number of words to use in the network analysis
 #' @param minFreq is numerical. It indicates the min frequency of words included in to a cluster.
 #' @param ngrams is an integer between 1 and 4. It indicates the type of n-gram to extract from texts. 
@@ -57,6 +57,7 @@ thematicEvolution <- function(M, field = "ID", years, n = 250, minFreq = 2, size
     resk <- thematicMap(Mk, field = field, n = n, minfreq = minFreq, ngrams=ngrams,
                         stemming = stemming, size = size, n.labels = n.labels, 
                         repel = repel, remove.terms = remove.terms, synonyms = synonyms, cluster=cluster)
+    resk$params <- resk$params %>% dplyr::filter(.data$params!="minfreq")
     res[[k]] <-  resk
     net[[k]] <-  resk$net
   }

diff --git a/R/toUpper.R b/R/toUpper.R
@@ -1,3 +1,3 @@
 # Convert D to upper case using the English ("en") locale.
 toUpper <- function(D){
-  stringr::str_to_upper(D, locale = "en")
+  stringi::stri_trans_toupper(D, locale = "en")
 }