ropensci review 14-11-24

- the function rpkb() returns directly the generated data set and not a list - packages ggplot2 and foreach not called as full packages but using importFrom - nClust argument of pkbc function has no default value - usage of standardGeneric (see https://adv-r.hadley.nz/s4.html#s4-generics) - goodpractice suggestions - Uniform notation: d is dimension in Euclidean space with spherical data lying on the (d-1)-dimensional sphere - Specify dimension of input matrix x in clustering functions. - Improve explanation of Average Silhouette Width - Update NEWS.md file - Add Hingee Kassel as reviewer in the DESCRIPTION file - Update package version
giovsaraceno · Nov 15, 2024 · e94750c · e94750c
1 parent 32b794e
commit e94750c
Show file tree

Hide file tree

Showing 89 changed files with 551 additions and 501 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,14 @@
 Type: Package
 Package: QuadratiK
 Title: Collection of Methods Constructed using Kernel-Based Quadratic Distances
-Version: 1.1.2
+Version: 1.1.3
 Authors@R: c(
     person("Giovanni", "Saraceno", ,"[email protected]", role = c("aut", "cre"),
            comment = "ORCID 000-0002-1753-2367"),
     person("Marianthi", "Markatou", role = "aut"),
     person("Raktim", "Mukhopadhyay", role = "aut"),
-    person("Mojgan", "Golzy", role = c("aut"))
+    person("Mojgan", "Golzy", role = c("aut")),
+    person("Hingee", "Kassel", role = "rev")
   )
 Maintainer: Giovanni Saraceno <[email protected]>
 Description: It includes test for multivariate normality, test for uniformity on the d-dimensional 

diff --git a/NAMESPACE b/NAMESPACE
@@ -22,11 +22,24 @@ exportMethods(stats_clusters)
 exportMethods(summary)
 import(Rcpp)
 import(RcppEigen)
-import(foreach)
 import(ggplot2)
 import(rlecuyer)
 importFrom(Rcpp,sourceCpp)
 importFrom(doParallel,registerDoParallel)
+importFrom(foreach,"%dopar%")
+importFrom(foreach,foreach)
+importFrom(ggplot2,geom_abline)
+importFrom(ggplot2,geom_line)
+importFrom(ggplot2,geom_point)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggtitle)
+importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_color_brewer)
+importFrom(ggplot2,theme)
+importFrom(ggplot2,theme_light)
+importFrom(ggplot2,theme_minimal)
+importFrom(ggplot2,xlab)
+importFrom(ggplot2,ylab)
 importFrom(ggpubr,ggarrange)
 importFrom(grDevices,rainbow)
 importFrom(graphics,legend)
@@ -56,4 +69,3 @@ importFrom(stats,sd)
 importFrom(stats,uniroot)
 importFrom(utils,install.packages)
 useDynLib(QuadratiK)
-useDynLib(QuadratiK, .registration = TRUE)
diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,35 @@
-QuadratiK 1.1.2 (Development version) 
+QuadratiK 1.1.3 (Development version) 
+=========================
+
+### NEW FEATURES
+
+  * the function rpkb() returns directly the generated data set and not a list
+
+
+### MINOR IMPROVEMENTS
+
+  * packages ggplot2 and foreach not called as full packages but using 
+    importFrom
+
+  * nClust argument of pkbc function has no default value
+
+  * usage of standardGeneric (see https://adv-r.hadley.nz/s4.html#s4-generics)
+
+### BUG FIXES
+
+  * goodpractice suggestions
+
+### DOCUMENTATION FIXES
+
+  * Uniform notation: d is dimension in Euclidean space with spherical data lying
+    on the (d-1)-dimensional sphere
+
+  * Specify dimension of input matrix x in clustering functions.
+
+  * Improve explanation of Average Silhouette Width
+
+
+QuadratiK 1.1.2 (2024-10-29) 
 =========================
 
 ### NEW FEATURES
@@ -25,8 +56,8 @@ QuadratiK 1.1.2 (Development version)
 ### DOCUMENTATION FIXES
 
   * The help documentation for the methods defined for the classes in the 
-    package can be accessed directly (addition of roxygen tag @name and 
-    @aliases)
+    package can be accessed directly (addition of roxygen tag name and 
+    aliases)
 
 
 QuadratiK 1.1.1 (2024-06-05)

diff --git a/R/QuadratiK-package.R b/R/QuadratiK-package.R
@@ -59,7 +59,8 @@
 #' The `QuadratiK` package is also available in Python on PyPI 
 #' <https://pypi.org/project/QuadratiK/> and also as a Dashboard application.
 #' Usage instruction for the Dashboard can be found at 
-#' <https://quadratik.readthedocs.io/en/latest/user_guide/dashboard_application_usage.html>.
+#' <https://quadratik.readthedocs.io/en/latest/user_guide/
+#' dashboard_application_usage.html>.
 #' 
 #' @author 
 #' Giovanni Saraceno, Marianthi Markatou, Raktim Mukhopadhyay, Mojgan Golzy 

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -16,7 +16,7 @@
 #' 
 #' @noRd
 computeKernelMatrix <- function(x_mat, y_mat, H) {
-    .Call(`_QuadratiK_computeKernelMatrix`, x_mat, y_mat, H)
+    .Call('_QuadratiK_computeKernelMatrix', PACKAGE = 'QuadratiK', x_mat, y_mat, H)
 }
 
 #' Compute the Poisson kernel matrix between observations in a sample.
@@ -32,7 +32,7 @@ computeKernelMatrix <- function(x_mat, y_mat, H) {
 #' 
 #' @noRd
 computePoissonMatrix <- function(x_mat, rho) {
-    .Call(`_QuadratiK_computePoissonMatrix`, x_mat, rho)
+    .Call('_QuadratiK_computePoissonMatrix', PACKAGE = 'QuadratiK', x_mat, rho)
 }
 
 #' Non-parametric centered kernel
@@ -53,7 +53,7 @@ computePoissonMatrix <- function(x_mat, rho) {
 #' 
 #' @noRd
 NonparamCentering <- function(kmat_zz, n_z) {
-    .Call(`_QuadratiK_NonparamCentering`, kmat_zz, n_z)
+    .Call('_QuadratiK_NonparamCentering', PACKAGE = 'QuadratiK', kmat_zz, n_z)
 }
 
 #' Parametric centered kernel
@@ -75,7 +75,7 @@ NonparamCentering <- function(kmat_zz, n_z) {
 #' 
 #' @noRd
 ParamCentering <- function(kmat_zz, z_mat, H, mu_hat, Sigma_hat) {
-    .Call(`_QuadratiK_ParamCentering`, kmat_zz, z_mat, H, mu_hat, Sigma_hat)
+    .Call('_QuadratiK_ParamCentering', PACKAGE = 'QuadratiK', kmat_zz, z_mat, H, mu_hat, Sigma_hat)
 }
 
 #' Compute kernel-based quadratic distance test for Normality
@@ -93,7 +93,7 @@ ParamCentering <- function(kmat_zz, z_mat, H, mu_hat, Sigma_hat) {
 #' 
 #' @noRd
 kbNormTest <- function(x_mat, h, mu_hat, Sigma_hat) {
-    .Call(`_QuadratiK_kbNormTest`, x_mat, h, mu_hat, Sigma_hat)
+    .Call('_QuadratiK_kbNormTest', PACKAGE = 'QuadratiK', x_mat, h, mu_hat, Sigma_hat)
 }
 
 #' Poisson kernel-based test for Uniformity on the Sphere
@@ -111,7 +111,7 @@ kbNormTest <- function(x_mat, h, mu_hat, Sigma_hat) {
 #' 
 #' @noRd
 statPoissonUnif <- function(x_mat, rho) {
-    .Call(`_QuadratiK_statPoissonUnif`, x_mat, rho)
+    .Call('_QuadratiK_statPoissonUnif', PACKAGE = 'QuadratiK', x_mat, rho)
 }
 
 #'
@@ -127,7 +127,7 @@ statPoissonUnif <- function(x_mat, rho) {
 #' 
 #' @keywords internal
 var_two <- function(Kcen, nsamples) {
-    .Call(`_QuadratiK_var_two`, Kcen, nsamples)
+    .Call('_QuadratiK_var_two', PACKAGE = 'QuadratiK', Kcen, nsamples)
 }
 
 #' Compute kernel-based quadratic distance two-sample test with Normal kernel
@@ -154,7 +154,7 @@ var_two <- function(Kcen, nsamples) {
 #' 
 #' @noRd
 stat2sample <- function(x_mat, y_mat, h, mu_hat, Sigma_hat, centeringType = "Nonparam", compute_variance = TRUE) {
-    .Call(`_QuadratiK_stat2sample`, x_mat, y_mat, h, mu_hat, Sigma_hat, centeringType, compute_variance)
+    .Call('_QuadratiK_stat2sample', PACKAGE = 'QuadratiK', x_mat, y_mat, h, mu_hat, Sigma_hat, centeringType, compute_variance)
 }
 
 #'
@@ -171,7 +171,7 @@ stat2sample <- function(x_mat, y_mat, h, mu_hat, Sigma_hat, centeringType = "Non
 #' 
 #' @keywords internal
 var_k <- function(Kcen, sizes, cum_size) {
-    .Call(`_QuadratiK_var_k`, Kcen, sizes, cum_size)
+    .Call('_QuadratiK_var_k', PACKAGE = 'QuadratiK', Kcen, sizes, cum_size)
 }
 
 #' Kernel-based quadratic distance k-sample tests
@@ -194,6 +194,6 @@ var_k <- function(Kcen, sizes, cum_size) {
 #' 
 #' @noRd
 stat_ksample_cpp <- function(x, y, h, sizes, cum_size, compute_variance = TRUE) {
-    .Call(`_QuadratiK_stat_ksample_cpp`, x, y, h, sizes, cum_size, compute_variance)
+    .Call('_QuadratiK_stat_ksample_cpp', PACKAGE = 'QuadratiK', x, y, h, sizes, cum_size, compute_variance)
 }
 
diff --git a/R/clustering_functions.R b/R/clustering_functions.R
@@ -20,9 +20,9 @@
 #' are fairly well concentrated around the vectors \eqn{\mu_j} of each cluster;
 #' (3) the percentage of noise in the data increases.
 #' 
-#' @param dat Data matrix or data.frame of data points on the sphere to be 
-#'            clustered. The observations in \code{dat} are normalized by 
-#'            dividing with the length of the vector to ensure
+#' @param dat \eqn{(n \times d)}-data matrix or data.frame of data points on the
+#'             sphere to be clustered. The observations in \code{dat} are 
+#'             normalized by dividing with the length of the vector to ensure
 #'            that they lie on the \eqn{(d-1)}-dimensional sphere. Note that 
 #'            \eqn{d > 1}.
 #' @param nClust Number of clusters. It can be a single value or a numeric 
@@ -68,7 +68,7 @@
 #' \eqn{\mathcal{S}^{d-1}}, but it can also be performed on spherically
 #' transformed observations, i.e. data points on the Euclidean space 
 #' \eqn{\mathbb{R}^d} that are normalized such that they lie on the 
-#' corresponding \eqn{d}-dimensional sphere \eqn{\mathcal{S}^{d-1}}.
+#' corresponding \eqn{(d-1)}-dimensional sphere \eqn{\mathcal{S}^{d-1}}.
 #'
 #' @return An S4 object of class \code{pkbc} containing the results of the 
 #' clustering procedure based on Poisson kernel-based distributions. The object 
@@ -114,7 +114,7 @@
 #' data1<-rpkb(size, c(1,0,0),rho)
 #' data2<-rpkb(size, c(0,1,0),rho)
 #' data3<-rpkb(size, c(0,0,1),rho)
-#' dat<-rbind(data1$x,data2$x, data3$x)
+#' dat<-rbind(data1,data2, data3)
 #'
 #' #Perform the clustering algorithm with number of clusters k=3.
 #' pkbd<- pkbc(dat=dat, nClust=3)
@@ -130,14 +130,12 @@
 #' 
 #' @export
 setGeneric("pkbc",function(dat, 
-                           nClust = NULL,
+                           nClust,
                            maxIter = 300,
                            stoppingRule = "loglik",
                            initMethod = "sampleData",
-                           numInit = 10){
-
-   standardGeneric("pkbc")
-})
+                           numInit = 10)
+   standardGeneric("pkbc"))
 #' @rdname pkbc
 #' 
 #' @srrstats {G2.0} input nClust
@@ -155,7 +153,7 @@ setGeneric("pkbc",function(dat,
 #' @export
 setMethod("pkbc", signature(dat = "ANY"),
     function(dat,
-             nClust = NULL,
+             nClust,
              maxIter = 300,
              stoppingRule = "loglik",
              initMethod = "sampleData",
@@ -490,8 +488,9 @@ setMethod("summary", "pkbc", function(object) {
             function(res) {
                c(LogLik = res$LogLik, wcss = sum(res$wcss))
             }))
+   summaryMatrix <- cbind(object@input$nClust, summaryMatrix)
    rownames(summaryMatrix) <- names(object@res_k[object@input$nClust])
-   colnames(summaryMatrix) <- c("LogLik", "WCSS")
+   colnames(summaryMatrix) <- c("nClust", "LogLik", "WCSS")
    cat("Summary:\n")
    print(summaryMatrix)
    cat("\n")
@@ -706,7 +705,7 @@ setMethod("plot", c(x = "pkbc"),
 #' @importFrom grDevices rainbow
 #' @importFrom graphics legend
 #' @importFrom graphics par
-#' @import ggplot2
+#' @importFrom ggplot2 ggplot geom_point theme_minimal labs theme
 #' @importFrom rrcov PcaLocantore
 #' 
 #' @srrstats {G1.4} roxygen2 is used
@@ -831,7 +830,7 @@ scatterplotMethod <- function(object, k, true_label = NULL, pca_res = FALSE) {
 #'         within sum of squares computed with the Euclidean distance and the 
 #'         cosine similarity.
 #' 
-#' @import ggplot2
+#' @importFrom ggplot2 ggplot geom_line geom_point labs theme_minimal
 #' @importFrom ggpubr ggarrange
 #' 
 #' @srrstats {G1.4} roxygen2 is used
@@ -1008,10 +1007,12 @@ setMethod("predict", signature(object="pkbc"),
 #' between the partitions and a value close to 0 indicates a random assignment 
 #' of data points to clusters.
 #' 
-#' Each cluster can represented by a so-called silhouette which is based on the
-#' comparison of its tightness and separation. The average silhouette width 
-#' provides an evaluation of clustering validity, and might be used to select 
-#' an *appropriate* number of clusters (Rousseeuw 1987). 
+#' The average silhouette width quantifies the quality of clustering by 
+#' measuring how well each object fits within its assigned cluster. It is the 
+#' mean of silhouette values, which compare the tightness of an object within 
+#' its cluster to its separation from other clusters. Higher values indicate 
+#' well-separated, cohesive clusters, making it useful for selecting the 
+#' *appropriate* number of clusters (Rousseeuw 1987). 
 #' 
 #' Macro Precision is a metric used in multi-class classification that 
 #' calculates the precision for each class independently and then takes the 
@@ -1045,8 +1046,8 @@ setMethod("predict", signature(object="pkbc"),
 #'          \linkS4class{pkbc} for the class object definition.
 #'
 #' @references
-#' Kapp, A.V. and Tibshirani, R. (2007) "Are clusters found in one dataset present 
-#' in another dataset?", Biostatistics, 8(1), 9–31, 
+#' Kapp, A.V. and Tibshirani, R. (2007) "Are clusters found in one dataset 
+#' present in another dataset?", Biostatistics, 8(1), 9–31, 
 #' https://doi.org/10.1093/biostatistics/kxj029
 #' 
 #' Rousseeuw, P.J. (1987) Silhouettes: A graphical aid to the interpretation and
@@ -1067,7 +1068,7 @@ setMethod("predict", signature(object="pkbc"),
 #' data1<-rpkb(size, c(1,0,0),rho,method='rejvmf')
 #' data2<-rpkb(size, c(0,1,0),rho,method='rejvmf')
 #' data3<-rpkb(size, c(1,0,0),rho,method='rejvmf')
-#' data<-rbind(data1$x,data2$x, data3$x)
+#' data<-rbind(data1,data2, data3)
 #'
 #' #Perform the clustering algorithm
 #' pkbc_res<- pkbc(data, 3)

diff --git a/R/h_selection.R b/R/h_selection.R
@@ -123,16 +123,17 @@
 #' @importFrom parallel clusterExport
 #' @importFrom parallel detectCores
 #' @importFrom parallel stopCluster
-#' @import foreach 
+#' @importFrom foreach foreach %dopar%
 #' @importFrom stats cov
 #' @importFrom stats aggregate
 #' @importFrom stats power
-#' @import ggplot2
+#' @importFrom ggplot2 ggplot geom_line labs theme_minimal theme_light theme
+#' @importFrom ggplot2 scale_color_brewer
 #' @import RcppEigen
 #' @import rlecuyer
 #' @importFrom Rcpp sourceCpp
 #'
-#' @useDynLib QuadratiK, .registration = TRUE
+#' @useDynLib QuadratiK
 #'
 #' @srrstats {G1.4} roxygen2 is used
 #' @srrstats {G2.0, G2.0a} input y, delta_dim, B, b
@@ -358,12 +359,13 @@ select_h <- function(x, y=NULL, alternative=NULL, method="subsampling", b=0.8,
    #                                       "compute_CV","stat2sample"))
 
    D <- length(delta)
+   len_h <- length(h_values)
 
    k_values <- 1:D
    rep_values <- 1:Nrep
 
    params <- expand.grid(Rep=rep_values, h = h_values)
-   params <- split(params, seq(nrow(params)))
+   params <- split(params, seq_len(Nrep*len_h))
 
    res <- data.frame(delta=numeric(),
                      h=numeric(), power=numeric())

diff --git a/R/kb.test.R b/R/kb.test.R
@@ -256,10 +256,8 @@ setGeneric("kb.test",function(x, y=NULL, h = NULL, method = "subsampling",
                               B = 150, b = NULL, Quantile = 0.95, 
                               mu_hat = NULL, Sigma_hat = NULL, 
                               centeringType="Nonparam", 
-                              K_threshold=10, alternative="skewness"){
-
-   standardGeneric("kb.test")
-})
+                              K_threshold=10, alternative="skewness")
+   standardGeneric("kb.test"))
 #' @rdname kb.test
 #' 
 #' @srrstats {G1.4} roxygen2 is used
@@ -517,7 +515,8 @@ setMethod("show", "kb.test",
 #' @seealso [kb.test()] and \linkS4class{kb.test} for more details.
 #'
 #' @importFrom ggpubr ggarrange
-#' @import ggplot2
+#' @importFrom ggplot2 ggplot geom_line theme_minimal geom_abline ggtitle
+#' @importFrom ggplot2 xlab ylab
 #'
 #'@examples
 #' # create a kb.test object