From 9ed2609f419c9f8aa42ad5751925c77ce1a97200 Mon Sep 17 00:00:00 2001 From: PiotrTymoszuk <80723424+PiotrTymoszuk@users.noreply.github.com> Date: Fri, 17 Nov 2023 11:22:25 +0100 Subject: [PATCH] Regularized KMEANS clustering, new functionalities Implementation of hard threshold regularized KMEANS clustering (HTKmeans, function `htk_cluster()`). Tuning functions for lambda in HTKmeans and k in kNN prediction of the cluster assignment --- DESCRIPTION | 9 +- NAMESPACE | 18 + R/appearance.R | 27 +- R/class_testing.R | 7 +- R/clust_analysis_oop.R | 201 ++++---- R/combi_analysis_oop.R | 16 + R/combi_clustering.R | 7 + R/constructors.R | 123 ++++- R/cross_distance.R | 23 +- R/cross_distance_utils.R | 19 +- R/cross_validation.R | 126 +++-- R/feature_importance.R | 206 ++++----- R/iterative_semi_supervised.R | 242 ++++++++++ R/plotting_utils.R | 113 +++++ R/red_analysis_oop.R | 116 ++++- R/reduction.R | 73 ++- R/regularized_clustering.R | 481 ++++++++++++++++++++ R/sil_oop.R | 12 +- R/tuner_oop.R | 269 +++++++++++ inst/examples/devel_test.R | 152 ++++++- inst/examples/github_examples_multi_layer.R | 2 +- inst/examples/github_examples_som.R | 18 +- inst/examples/github_examples_sparsity.R | 381 ++++++++++++++++ 23 files changed, 2342 insertions(+), 299 deletions(-) create mode 100644 R/iterative_semi_supervised.R create mode 100644 R/regularized_clustering.R create mode 100644 R/tuner_oop.R create mode 100644 inst/examples/github_examples_sparsity.R diff --git a/DESCRIPTION b/DESCRIPTION index ca9adfe..9de7fe6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: clustTools Type: Package Title: Tools for Dimensionality Reduction and Data Clustering -Version: 1.2.1 +Version: 1.3.0 Maintainer: Piotr Tymoszuk Description: Provides a comprehensive set of R functions for dimensionality - reduction (PCA, MDS and UMAP), clustering (HCl, k-means, PAM, DBSCAN, SOM) - and combined reduction-clustering tools. Visualization and QC toolset is - provided as well. + reduction (PCA, MDS and UMAP), clustering (HCl, KMEANS, PAM, DBSCAN, SOM + and regularized KMEANS) and combined reduction-clustering tools. + Visualization and QC toolset is provided as well. 
License: GPL-3 Encoding: UTF-8 LazyData: true @@ -16,6 +16,7 @@ Authors@R: comment = c(ORCID = "0000-0002-0398-6034")) Imports: caret, + clusterHD, cowplot, coxed, dbscan, diff --git a/NAMESPACE b/NAMESPACE index c1f7787..c4b5d43 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ S3method(extract,clust_analysis) S3method(extract,cluster_cv) S3method(extract,combi_analysis) S3method(extract,red_analysis) +S3method(extract,tuner) S3method(impact,clust_analysis) S3method(impact,combi_analysis) S3method(impact,min_analysis) @@ -39,12 +40,14 @@ S3method(np,red_analysis) S3method(pbc,clust_analysis) S3method(pbc,combi_analysis) S3method(plot,clust_analysis) +S3method(plot,clust_red) S3method(plot,combi_analysis) S3method(plot,cross_dist) S3method(plot,importance) S3method(plot,knb) S3method(plot,red_analysis) S3method(plot,sil_extra) +S3method(plot,tuner) S3method(plot,umatrix_analysis) S3method(plot_clust_hm,clust_analysis) S3method(plot_clust_hm,combi_analysis) @@ -55,22 +58,30 @@ S3method(predict,combi_analysis) S3method(predict,min_analysis) S3method(predict,red_analysis) S3method(predict,umatrix_analysis) +S3method(prediter,clust_analysis) +S3method(prediter,combi_analysis) +S3method(prediter,min_analysis) +S3method(prediter,umatrix_analysis) S3method(print,clust_analysis) S3method(print,combi_analysis) S3method(print,cross_dist) S3method(print,red_analysis) +S3method(print,tuner) S3method(qe,clust_analysis) S3method(qe,combi_analysis) S3method(rename,clust_analysis) S3method(rename,combi_analysis) S3method(silhouette,clust_analysis) S3method(silhouette,combi_analysis) +S3method(summary,clust_analysis) S3method(summary,cluster_cv) +S3method(summary,combi_analysis) S3method(summary,cross_dist) S3method(summary,importance) S3method(summary,knb) S3method(summary,red_analysis) S3method(summary,sil_extra) +S3method(summary,tuner) S3method(te,clust_analysis) S3method(te,combi_analysis) S3method(var,clust_analysis) @@ -97,6 +108,7 @@ export(extract.red_analysis) export(get_clust_tendency) export(get_kernel_info) export(hcluster) +export(htk_cluster) export(impact) export(is_clust_analysis) export(is_cluster_cv) @@ -107,6 +119,7 @@ export(is_knb) export(is_min_analysis) export(is_red_analysis) export(is_sil_extra) +export(is_tuner) export(is_umatrix_analysis) export(kcluster) export(min_max) @@ -121,11 +134,13 @@ export(plot.importance) export(plot.knb) export(plot.red_analysis) export(plot.sil_extra) +export(plot.tuner) export(plot_clust_hm) export(plot_nbclust) export(predict.clust_analysis) export(predict.combi_analysis) export(predict.red_analysis) +export(prediter) export(qe) export(red_analysis) export(reduce_data) @@ -136,11 +151,14 @@ export(silhouette.clust_analysis) export(silhouette.combi_analysis) export(som_cluster) export(som_reduce) +export(summary.clust_analysis) +export(summary.combi_analysis) export(summary.cross_dist) export(summary.importance) export(summary.knb) export(summary.sil_extra) export(te) +export(tune_htk) export(var) export(var.clust_analysis) export(var.combi_analysis) diff --git a/R/appearance.R b/R/appearance.R index e5cbace..be92371 100644 --- a/R/appearance.R +++ b/R/appearance.R @@ -1,6 +1,6 @@ # Appearance for the S3 class objects -#' Printing of a clust_analysis object. +#' Printing of objects. 
#' #' @description #' Prints a `clust_analysis`, `combi_analysis`, `red_analysis` @@ -33,7 +33,12 @@ stopifnot(is_combi_analysis(x)) - purrr::walk(x$clust_analyses, print) + cat(paste('Combined SOM/clusetring analysis with', + x$dist_method, 'distance method.')) + + cat('\nCluster assignment:\n') + + print(as_tibble(x$clust_assignment)) } @@ -61,9 +66,27 @@ stopifnot(is_red_analysis(x)) cat(paste0(toupper(x$red_fun), ' reduction analysis object.')) + cat('\nComponents:\n') + print(as_tibble(x$component_tbl)) } +#' @rdname print.clust_analysis +#' @export + + print.tuner <- function(x, ...) { + + stopifnot(is_tuner(x)) + + cat(paste('Tuning results for', + paste(x$tune_params, collapse = ', '))) + + cat('\nBest parameter combination:\n') + + print(as_tibble(x$best_tune)) + + } + # END ------ diff --git a/R/class_testing.R b/R/class_testing.R index b47926b..91e73bf 100644 --- a/R/class_testing.R +++ b/R/class_testing.R @@ -5,7 +5,7 @@ #' @description #' Tests if the object is an instance of the `red_analysis`, `clust_analysis`, #' `combi_analysis`, `importance`, `cross_dist`, `sil_extra`, `min_analysis`, -#' `umatrix_analysis`, `cluster_cv` or `knb` class. +#' `umatrix_analysis`, `cluster_cv`, `knb` or `tuner` class. #' #' @return a logical value. #' @@ -60,4 +60,9 @@ is_knb <- function(x) inherits(x, 'knb') +#' @rdname is_clust_analysis +#' @export + + is_tuner <- function(x) inherits(x, 'tuner') + # END ------ diff --git a/R/clust_analysis_oop.R b/R/clust_analysis_oop.R index 86228b3..9eedc50 100644 --- a/R/clust_analysis_oop.R +++ b/R/clust_analysis_oop.R @@ -105,6 +105,7 @@ clust_algorithm <- switch(x$clust_fun, hclust = paste(',', x$hc_method, 'method'), kmeans = '', + htk = '', pam = '', som = ', single-layer', supersom = ', multi-layer', @@ -113,6 +114,7 @@ clust_method <- switch(x$clust_fun, hclust = 'Hierarchical clustering', kmeans = 'Kmeans clustering', + htk = 'HT-KMEANS clustering', pam = 'PAM clustering', som = 'SOM clustering', supersom = 'SOM clustering', @@ -137,7 +139,7 @@ if(type == 'data' & x$clust_fun %in% c('supersom', 'supersom_prediction')) { - warning("The 'data' option is not available for mMulti-layer SOM.", + warning("The 'data' option is not available for multi-layer SOM.", call. = FALSE) return(NULL) @@ -152,8 +154,6 @@ if(x$clust_fun %in% c('hclust', 'kmeans', 'pam')) { - plot_title <- 'Optimal cluster number' - k <- length(unique(x$clust_assignment$clust_id)) clust_args <- switch(x$clust_fun, @@ -170,7 +170,7 @@ diss = dist(x, 'distance'), k = k, !!!clust_args, - plot_title = plot_title, + plot_title = 'Optimal cluster number', plot_subtitle = plot_subtitle, plot_tag = plot_tag, cust_theme = cust_theme)) @@ -207,6 +207,14 @@ plot_list <- plot_som(x$clust_obj) + } else if(x$clust_fun == 'htk') { + + plot_list <- plot_htk(x = x, + plot_title = 'Optimal cluster number', + plot_subtitle = plot_subtitle, + plot_tag = plot_tag, + cust_theme = cust_theme) + } else { warning('No training plots available for cluster predictions', @@ -230,113 +238,28 @@ if(is_red_analysis(red_results)) { - ## providing a 1d PCA, MDS or UMAP should throw an error - - if(!all(c('comp_1', 'comp_2') %in% names(red_results$component_tbl))) { - - stop('Atempt to plot a 1-dimensional reduction analysis result. Adjust kdim?', - call. 
= FALSE) - - } - - sdevs <- var(red_results) - - if(red_results$red_fun == 'pca') { - - ax_labs <- map2(c('PC1', 'PC2'), - signif(sdevs$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%')) - - } else { - - ax_labs <- map2(c('Dim 1', 'Dim 2'), - signif(sdevs$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%')) - - } - - point_plot <- - plot_point(data = red_results$component_tbl, - x_var = 'comp_1', - y_var = 'comp_2', - fill_var = 'clust_id', - plot_title = switch(red_results$red_fun , - pca = 'PCA', - mds = 'MDS', - umap = 'UMAP'), - plot_subtitle = plot_subtitle, - plot_tag = plot_tag, - x_lab = ax_labs[[1]], - y_lab = ax_labs[[2]], - cust_theme = cust_theme, - jitter_height = jitter_height, - jitter_width = jitter_width, - fill_lab = 'Cluster ID', - point_alpha = point_alpha) + point_plot <- plot.clust_red(red_results, + type = 'score', + label_clust = TRUE, + cust_theme = cust_theme, + plot_subtitle = plot_subtitle, + jitter_height = jitter_height, + jitter_width = jitter_width, + point_alpha = point_alpha) return(point_plot) } else { - plot_data <- map(red_results, ~.x$component_tbl) - - name_check <- - map_lgl(plot_data, - function(x) all(c('comp_1', 'comp_2') %in% names(x))) - - if(any(!name_check)) { - - stop('Atempt to plot a 1-dimensional reduction analysis result. Adjust kdim?', - call. = FALSE) - - } - - sdev <- map(red_results, var) - - if(red_results[[1]]$red_fun == 'pca') { - - ax_labs <- - map(sdev, - function(layer) map2(c('PC1', 'PC2'), - signif(layer$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%'))) - - } else { - - ax_labs <- - map(sdev, - function(layer) map2(c('Dim1', 'Dim2'), - signif(layer$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%'))) - - } - - ax_labs <- purrr::transpose(ax_labs) - - n_numbers <- map(red_results, nobs) - - plot_tags <- map(n_numbers, - ~paste('Observations: n = ', .x[['observations']], - '\nVariables: n = ', .x[['variables']])) - point_plots <- - pmap(list(data = plot_data, - x_lab = ax_labs[[1]], - y_lab = ax_labs[[2]], - plot_tag = plot_tags), - plot_point, - x_var = 'comp_1', - y_var = 'comp_2', - fill_var = 'clust_id', - plot_title = switch(red_results[[1]]$red_fun, - pca = 'PCA', - mds = 'MDS', - umap = 'UMAP'), - plot_subtitle = plot_subtitle, + pmap(list(x = red_results), + plot.clust_red, + type = 'score', + label_clust = TRUE, cust_theme = cust_theme, + plot_subtitle = plot_subtitle, jitter_height = jitter_height, jitter_width = jitter_width, - fill_lab = 'Cluster ID', point_alpha = point_alpha) return(point_plots) @@ -474,4 +397,78 @@ } +# Summary quality stats ------- + +#' Quality control of clustering solutions. +#' +#' @description +#' Computes basic global statistics of quality of a clustering analysis object. +#' +#' @details +#' The statistics retrieved by the `summary()` method are: +#' +#' * _silhouette width_ (`sil_width`) +#' +#' * _fraction of potentially misclassified observations_ with negative +#' silhouette widths (`frac_misclassified`) +#' +#' * _fraction of explained clustering variance_ expressed as the ratio of total +#' between sum of squares to total sum of squares (`frac_var`) +#' +#' * _fraction of preserved nearest neighbors_ (`frac_np`) +#' +#' The statistics are computed with \code{\link{silhouette}}, \code{\link{var}}, +#' and \code{\link{np}} methods for the entire clustering structure and not +#' for particular clusters. +#' +#' @references +#' Rousseeuw PJ. Silhouettes: A graphical aid to the interpretation and +#' validation of cluster analysis. J Comput Appl Math (1987) 20:53–65. 
+#' doi:10.1016/0377-0427(87)90125-7 +#' +#' @references +#' Venna J, Kaski S. Neighborhood preservation in nonlinear projection methods: +#' An experimental study. Lect Notes Comput Sci (including Subser Lect Notes +#' Artif Intell Lect Notes Bioinformatics) (2001) 2130:485–491. +#' doi:10.1007/3-540-44668-0_68 +#' +#' @return a data frame with columns characterized in Details. +#' +#' @param object a `clust_analysis` or `combi_analysis` object. +#' @param ... extra arguments passed to \code{\link{np}}. +#' +#' @export summary.clust_analysis +#' @export + + summary.clust_analysis <- function(object, ...) { + + stopifnot(is_clust_analysis(object) | is_combi_analysis(object)) + + sil_res <- try(silhouette(object), silent = TRUE) + + variance <- try(var(object), silent = TRUE) + + neighbor_preservation <- try(np(object, type = 'final', ...), silent = TRUE) + + stats <- + tibble(sil_width = tryCatch(mean(sil_res$sil_width, na.rm = TRUE), + error = function(e) NA), + frac_misclassified = tryCatch(sum(sil_res$sil_width < 0)/nrow(sil_res), + error = function(e) NA), + frac_var = tryCatch(variance$frac_var, + error = function(e) NA), + frac_np = tryCatch(mean(neighbor_preservation$frac_np, na.rm = TRUE), + error = function(e) NA)) + + if(any(map_lgl(stats, is.na))) { + + warning('At least one statistic could not be calculated.', + call. = FALSE) + + } + + stats + + } + # END ------ diff --git a/R/combi_analysis_oop.R b/R/combi_analysis_oop.R index c6251af..ae8ed63 100644 --- a/R/combi_analysis_oop.R +++ b/R/combi_analysis_oop.R @@ -122,6 +122,10 @@ obs_red <- components(x$clust_analyses$observation, ...) + ## handling bad requests, e.g. training objects for distance reductions + + if(is.null(obs_red)) return(NULL) + score_tbl <- select(obs_red$component_tbl, observation, dplyr::starts_with('comp')) @@ -278,4 +282,16 @@ } +# Summary ------ + +#' @rdname summary.clust_analysis +#' @export summary.combi_analysis +#' @export + + summary.combi_analysis <- function(object, ...) { + + summary.clust_analysis(object, ...) + + } + # END ------ diff --git a/R/combi_clustering.R b/R/combi_clustering.R index 0b40eb8..a6028ad 100644 --- a/R/combi_clustering.R +++ b/R/combi_clustering.R @@ -156,6 +156,11 @@ dots <- list2(...) + ## special handling of lambda vectors for regularized + ## cluster functions of the nodes + + dots$lambdas <- NULL + if(!is.null(som_args)) { dots <- c(dots, @@ -163,6 +168,8 @@ } + dots <- compact(dots) + combi_analysis(list(clust_analyses = list(observation = som_clust, node = node_clust), clust_assignment = combi_ass, diff --git a/R/constructors.R b/R/constructors.R index c8ad646..7a4dbfd 100644 --- a/R/constructors.R +++ b/R/constructors.R @@ -24,6 +24,12 @@ #' #' `data` a quosure calling the original data set. #' +#' If the `component_tbl` data frame contains the cluster assignment information +#' an object of the `clust_red` sub-class is created. This sub-class inherits +#' almost all methods from the superclass `red_analysis`. The only difference +#' concerns the `plot()` method, which, by default generates layout (score) +#' scatter plots with the cluster assignment coded by the point color. +#' #' @param x a named list, see Details. #' #' @return a `red_analysis` object with the elements listed in Details. 
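[editor's note: illustration, not part of the patch] A minimal usage sketch of the new `clust_red` subclass and `summary()` method described above; `my_data` is a hypothetical numeric data frame, and the calls assume the signatures introduced by this patch:

  library(clustTools)

  ## clustering followed by the new one-line quality summary
  clust_obj <- kcluster(my_data, k = 3, distance_method = 'euclidean')
  summary(clust_obj)

  ## components() now tags its result with the 'clust_red' subclass whenever
  ## cluster assignment is available; plot() then colors the scores by cluster
  red_obj <- components(clust_obj, kdim = 2, red_fun = 'pca', with = 'data')
  plot(red_obj, label_clust = TRUE)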
@@ -67,7 +73,19 @@

   ## output

-  structure(x, class = 'red_analysis')
+  x <- structure(x, class = 'red_analysis')
+
+  if(is.data.frame(x$component_tbl)) {
+
+    if('clust_id' %in% names(x$component_tbl)) {
+
+      x <- structure(x, class = c('clust_red', class(x)))
+
+    }
+
+  }
+
+  x

 }
@@ -155,6 +173,7 @@

   stopifnot(x$clust_fun %in% c('hclust',
                                'kmeans',
+                               'htk',
                                'dbscan',
                                'som',
                                'pam',
@@ -529,12 +548,12 @@

   err_txt <-
     paste("The 'fold_stats' element has to be a data frame with the",
-          "'fold', 'corr_rate', 'err_rate', 'frac_var' and 'sil_width'",
+          "'fold', 'accuracy', 'error', 'frac_var' and 'sil_width'",
           "variables")

   if(!is.data.frame(x$fold_stats)) stop(err_txt, call. = FALSE)

-  if(any(!c('fold', 'corr_rate', 'err_rate', 'frac_var', 'sil_width') %in% names(x$fold_stats))) {
+  if(any(!c('fold', 'accuracy', 'error', 'frac_var', 'sil_width') %in% names(x$fold_stats))) {

     stop(err_txt, call. = FALSE)

@@ -628,4 +647,102 @@

 }

+# Tuning results ------
+
+#' Create a `tuner` class object.
+#'
+#' @description
+#' Creates an object of the `tuner` class on top of a list with results of
+#' tuning of parameters of cluster analysis or prediction.
+#'
+#' @details
+#' The input list has to have the following elements:
+#'
+#' * `analysis`: a `clust_analysis` or `combi_analysis` object created with the
+#' best set of the tuning parameters
+#'
+#' * `stats`: a data frame with values of quality stats (silhouette width,
+#' fraction of potentially misclassified observations/negative silhouette width,
+#' fraction of explained clustering variance, and fraction of preserved nearest
+#' neighbors)
+#'
+#' * `fun`: name of the tuning function
+#'
+#' * `dataset`: a string specifying which data was used during the tuning: the
+#' training data set or cross-validation
+#'
+#' * `type`: type of analysis, development or prediction
+#'
+#' * `clust_vars`: a vector of names of clustering variables
+#'
+#' * `tune_params`: a vector of names of the tuning parameters
+#'
+#' * `tune_criteria`: a data frame that specifies which criteria were applied
+#' to select the best combination of the tuning parameters
+#'
+#' * `best_tune`: a data frame storing the best values of the tuning parameters
+#'
+#' @param x a list with elements specified in Details.
+#'
+#' @return an instance of the `tuner` class as described in Details.
+
+  tuner <- function(x) {
+
+    ## entry control -------
+
+    elements <- c('analysis',
+                  'stats',
+                  'fun',
+                  'dataset',
+                  'type',
+                  'clust_vars',
+                  'tune_params',
+                  'tune_criteria',
+                  'best_tune')
+
+    err_txt <-
+      paste("'x' has to be a list with the",
+            paste(elements, collapse = ', '),
+            "elements.")
+
+    if(!is.list(x)) stop(err_txt, call. = FALSE)
+
+    if(any(!elements %in% names(x))) {
+
+      stop(err_txt, call. = FALSE)
+
+    }
+
+    stat_vars <- c('sil_width',
+                   'frac_misclassified',
+                   'frac_var',
+                   'frac_np')
+
+    err_txt <-
+      paste("The `stats` element of 'x' has to be a data frame with the",
+            paste(stat_vars, collapse = ', '),
+            "columns")
+
+    if(!is.data.frame(x$stats)) stop(err_txt, call. = FALSE)
+
+    if(any(!stat_vars %in% names(x$stats))) {
+
+      stop(err_txt, call. = FALSE)
+
+    }
+
+    if(!is_clust_analysis(x$analysis) & !is_combi_analysis(x$analysis)) {
+
+      stop(paste("The `analysis` element of 'x' has to be an instance",
+                 "of the 'clust_analysis' or 'combi_analysis' class."),
+           call. = FALSE)
+
+    }
+
+    ## class setting -------
+
+    structure(x, class = 'tuner')
+
+  }
+
# END ------
diff --git a/R/cross_distance.R b/R/cross_distance.R
index e491785..8dd3b00 100644
--- a/R/cross_distance.R
+++ b/R/cross_distance.R
@@ -162,7 +162,19 @@

   if(!is_multi_layer(x)) {

-    return(cross_single_homolog(x, method))
+    if(!is.null(method)) {
+
+      return(cross_single_homolog(x, method))
+
+    } else {
+
+      dist_obj <- cross_distance.min_analysis(x)
+
+      attr(dist_obj, 'dist_method') <- x$dist_method
+
+      return(dist_obj)
+
+    }

   } else {

@@ -239,11 +251,16 @@
        ~list(clust_assignment[[.x[1]]],
              clust_assignment[[.x[2]]]))

-  ## cross-distance objects
+  ## cross-distance objects: subsetting done in two steps
+  ## to prevent returning one-dimensional vectors

   dist_lst <-
     map(pairs,
-        ~general_dist[.x[[1]], .x[[2]]], drop = FALSE)
+        ~general_dist[.x[[1]], , drop = FALSE])
+
+  dist_lst <-
+    map2(dist_lst, pairs,
+         ~.x[, .y[[2]], drop = FALSE])

   cross_dist(dist_lst,
              type = 'homologous',
diff --git a/R/cross_distance_utils.R b/R/cross_distance_utils.R
index a949dd6..8568244 100644
--- a/R/cross_distance_utils.R
+++ b/R/cross_distance_utils.R
@@ -201,14 +201,27 @@
        function(data, clust) map(clust, ~data[.x, , drop = FALSE]))

+  ## I'm working with safely() here: it is possible that some clusters
+  ## are empty in one of the tested objects

   dist_lst <-
     furrr::future_map(pairs,
-                      ~cross_distance(clust_data$x[[.x[1]]],
-                                      clust_data$y[[.x[2]]],
-                                      method = method),
+                      ~purrr::safely(cross_distance)(clust_data$x[[.x[1]]],
+                                                     clust_data$y[[.x[2]]],
+                                                     method = method),
                       .options = furrr::furrr_options(seed = TRUE))

+  dist_errors <- compact(map(dist_lst, ~.x$error))
+
+  if(length(dist_errors) > 0) {
+
+    warning('Distance calculation failed for some cluster pairs.',
+            call. = FALSE)
+
+  }
+
+  dist_lst <- compact(map(dist_lst, ~.x$result))
+
   cross_dist(dist_lst,
              type = 'heterologous',
              method = method,
diff --git a/R/cross_validation.R b/R/cross_validation.R
index 6123c5c..3fe60c0 100644
--- a/R/cross_validation.R
+++ b/R/cross_validation.R
@@ -44,6 +44,11 @@
 #' @references
 #' Wehrens R, Kruisselbrink J. Flexible self-organizing maps in kohonen 3.0.
 #' J Stat Softw (2018) 87:1–18. doi:10.18637/jss.v087.i07
+#' @references
+#' Venna J, Kaski S. Neighborhood preservation in nonlinear projection
+#' methods: An experimental study. Lect Notes Comput Sci (including Subser Lect
+#' Notes Artif Intell Lect Notes Bioinformatics) (2001) 2130:485–491.
+#' doi:10.1007/3-540-44668-0_68
 #'
 #' @param data a numeric data frame, matrix or a `red_analysis` object. If a
 #' `red_analysis` object is provided as the data argument, the observation
@@ -66,6 +71,12 @@
 #' @param kernel_fun kernel function transforming the distance into weight.
 #' @param clustering_fun clustering function. Should return a
 #' `clust_analysis` object.
+#' @param kNN_data number of the nearest neighbors in the genuine data set
+#' used for calculation of neighborhood preservation. See \code{\link{np}}
+#' for details.
+#' @param kNN_cluster number of the nearest neighbors of the given cluster used
+#' for calculation of neighborhood preservation. See \code{\link{np}} for
+#' details.
 #' @param seed initial setting of the random number generator.
 #' @param .parallel logical, should the CV be run in parallel?
 #' @param ... extra arguments passed to the clustering_fun.
@@ -92,6 +103,8 @@
                          resolve_ties = FALSE,
                          kernel_fun = function(x) 1/x,
                          clustering_fun = clustTools::kcluster,
+                         kNN_data = 5,
+                         kNN_cluster = NULL,
                          seed = 1234,
                          .parallel = FALSE, ...)
{ @@ -242,8 +255,8 @@ by = 'observation')) correct <- NULL - corr_rate <- NULL - err_rate <- NULL + accuracy <- NULL + error <- NULL fold <- NULL test_preds <- @@ -254,54 +267,57 @@ fold = .y)) test_error <- dplyr::summarise(test_preds, - corr_rate = mean(as.numeric(correct), - na.rm = TRUE), - err_rate = 1 - corr_rate, + accuracy = mean(as.numeric(correct), + na.rm = TRUE), + error = 1 - accuracy, .by = 'fold') - ## variances and silhouettes ------- - - frac_var <- NULL - sil_width <- NULL - - test_variances <- map(test_obj, var) + ## variances, silhouettes and neighborhood preservation, out-of-fold ------- - test_variances <- map2_dfr(test_variances, - names(test_variances), - ~tibble(fold = .y, - frac_var = .x$frac_var)) + ## neighborhood preservation seems to be a bit tricky statistic to calculate + ## with some distance metrics for a large dimension number, hence, + ## I'm working with a safe solution - test_silhouettes <- map(test_obj, purrr::safely(silhouette)) + test_stats <- map(test_obj, + purrr::safely(summary), + kNN_data = kNN_data, + kNN_cluster = kNN_cluster) - test_silhouettes <- - compact(map(test_silhouettes, ~.x$result)) + test_stats <- map(test_stats, ~.x$result) - test_silhouettes <- map2_dfr(test_silhouettes, - names(test_silhouettes), - ~tibble(fold = .y, - sil_width = mean(.x$sil_width))) + test_stats <- map2_dfr(test_stats, names(test_stats), + ~mutate(.x, fold = .y)) ## summary stats -------- - test_stats <- reduce(list(test_error, test_variances, test_silhouettes), - left_join, - by = 'fold') + test_stats <- left_join(test_error, + test_stats, + by = 'fold') + + stat_names <- + c('accuracy', 'error', + 'sil_width', 'frac_misclassified', 'frac_var', 'frac_np') test_stats <- - test_stats[c('fold', 'corr_rate', 'err_rate', 'frac_var', 'sil_width')] + select(test_stats, + fold, + dplyr::all_of(stat_names)) oof_means <- - map(test_stats[c('corr_rate', 'err_rate', 'frac_var', 'sil_width')], - mean, na.rm = TRUE) + map(select(test_stats, + dplyr::all_of(stat_names)), + mean, + na.rm = TRUE) bca_err <- - map(test_stats[c('corr_rate', 'err_rate', 'frac_var', 'sil_width')], + map(select(test_stats, + dplyr::all_of(stat_names)), ~.x[!is.na(.x)]) bca_err <- map(bca_err, coxed::bca, conf.level = 0.95) test_summary <- - pmap(list(c('accuracy', 'error', 'frac_var', 'sil_width'), + pmap(list(stat_names[stat_names %in% names(test_stats)], oof_means, bca_err), function(x, y, z) tibble(!!paste0(x, '_mean') := y, @@ -371,6 +387,11 @@ #' @references #' Wehrens R, Kruisselbrink J. Flexible self-organizing maps in kohonen 3.0. #' J Stat Softw (2018) 87:1–18. doi:10.18637/jss.v087.i07 +#' @references +#' Venna J, Kaski S. Neighborhood preservation in nonlinear projection +#' methods: An experimental study. Lect Notes Comput Sci (including Subser Lect +#' Notes Artif Intell Lect Notes Bioinformatics) (2001) 2130:485–491. +#' doi:10.1007/3-540-44668-0_68 #' #' @param x an object. #' @param nfolds number of CV folds. @@ -384,6 +405,12 @@ #' @param resolve_ties logical, should the ties be resolved at random? Applies #' only to the simple unweighted voting algorithm. #' @param kernel_fun kernel function transforming the distance into weight. +#' @param kNN_data number of the nearest neighbors in the genuine data set +#' used for calculation of neighborhood preservation. See \code{\link{np}} +#' for details. +#' @param kNN_cluster number of the nearest neighbors of the given cluster used +#' for calculation of neighborhood preservation. See \code{\link{np}} for +#' details. 
#' @param seed initial setting of the random number generator. #' @param .parallel logical, should the CV be run in parallel? #' @param ... extra arguments, currently none. @@ -395,11 +422,12 @@ #' * kNN projection (prediction) results (`predictions`) #' #' * a data frame with the classification error, accuracy, fraction of -#' explained clustering variance and silhouette for the out-of-fold -#' predictions (`fold_stats`) +#' explained clustering variance, silhouette and neighbor preservation for +#' the out-of-fold predictions (`fold_stats`) #' #' * means and BCA's 95% confidence intervals for the classification error, -#' accuracy, fraction of explained variance and silhouette (`summary`) +#' accuracy, fraction of explained variance, silhouette and neighborhood +#' preservation (`summary`) #' #' Note the \code{\link{summary.cluster_cv}} and #' \code{\link{extract.cluster_cv}} methods. @@ -418,6 +446,8 @@ simple_vote = TRUE, resolve_ties = FALSE, kernel_fun = function(x) 1/x, + kNN_data = 5, + kNN_cluster = NULL, seed = 1234, .parallel = FALSE, ...) { @@ -430,6 +460,21 @@ nfolds <- as.integer(nfolds) kNN <- as.integer(kNN) + kNN_data <- as.integer(kNN_data) + + if(is.null(kNN_cluster)) { + + if(x$clust_fun %in% c('som', 'supersom')) { + + kNN_cluster <- kNN_data + + } else { + + kNN_cluster <- 1 + + } + + } stopifnot(is.logical(simple_vote)) stopifnot(is.logical(resolve_ties)) @@ -485,11 +530,14 @@ resolve_ties = resolve_ties, kernel_fun = kernel_fun, distance_method = distance_names, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster, seed = seed, .parallel = .parallel, k = nrow(ngroups(x)), hc_method = x$hc_method, clust_fun = x$clust_fun, + lambdas = x$lambdas, xdim = x$grid$xdim, ydim = x$grid$ydim, topo = x$grid$topo, @@ -502,7 +550,7 @@ ## function calls ------- - if(x$clust_fun %in% c('hclust', 'som', 'supersom')) { + if(x$clust_fun %in% c('hclust', 'som', 'supersom', 'htk')) { cmm_args$clust_fun <- NULL @@ -520,6 +568,7 @@ clustering_fun = switch(x$clust_fun, hclust = hcluster, kmeans = kcluster, + htk = htk_cluster, pam = kcluster, dbscan = dbscan_cluster, som = som_cluster, @@ -554,6 +603,8 @@ simple_vote = TRUE, resolve_ties = FALSE, kernel_fun = function(x) 1/x, + kNN_data = 5, + kNN_cluster = NULL, seed = 1234, .parallel = FALSE, ...) 
{ @@ -566,6 +617,9 @@ nfolds <- as.integer(nfolds) kNN <- as.integer(kNN) + kNN_data <- as.integer(kNN_data) + + if(is.null(kNN_cluster)) kNN_cluster <- 1 stopifnot(is.logical(simple_vote)) stopifnot(is.logical(resolve_ties)) @@ -604,7 +658,6 @@ } - args <- list2(data = model.frame(x)$observation, nfolds = nfolds, @@ -622,12 +675,15 @@ node_clust_fun = switch(x$clust_analyses$node$clust_fun, hclust = hcluster, kmeans = kcluster, + htk = htk_cluster, pam = kcluster, dbscan = dbscan_cluster, som = som_cluster), kernel_fun = kernel_fun, simple_vote = simple_vote, resolve_ties = resolve_ties, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster, seed = seed, .parallel = .parallel) diff --git a/R/feature_importance.R b/R/feature_importance.R index 7e9add1..06c0b34 100644 --- a/R/feature_importance.R +++ b/R/feature_importance.R @@ -163,6 +163,8 @@ ## entry control -------- + start_time <- Sys.time() + stopifnot(is_clust_analysis(x)) stopifnot(is.numeric(n_iter)) @@ -180,25 +182,49 @@ } + clustering_fun <- + switch(x$clust_fun, + hclust = hcluster, + kmeans = kcluster, + htk = htk_cluster, + pam = kcluster, + dbscan = dbscan_cluster, + som = som_cluster, + supersom = som_cluster) + + ## parallel backend and benchmarking ------- + + message(paste('Permutation importance testing with', + n_iter, 'iterations')) + + if(.parallel) future::plan('multisession') + + on.exit(message(paste('Elapsed:', Sys.time() - start_time)), + add = TRUE) + + on.exit(future::plan('sequential')) + ## common parameters -------- + cmm_list <- + list2(data = model.frame(x), + distance_method = distance_names, + .parallel = .parallel, + k = nrow(ngroups(x)), + hc_method = x$hc_method, + clust_fun = x$clust_fun, + lambdas = x$lambdas, + xdim = x$grid$xdim, + ydim = x$grid$ydim, + topo = x$grid$topo, + eps = x$eps, + minPts = x$minPts, + neighbourhood.fct = as.character(x$grid$neighbourhood.fct), + toroidal = x$grid$toroidal) + if(n_iter == 1) { - cmm_args <- - list2(data = model.frame(x), - distance_method = distance_names, - seed = seed, - .parallel = .parallel, - k = nrow(ngroups(x)), - hc_method = x$hc_method, - clust_fun = x$clust_fun, - xdim = x$grid$xdim, - ydim = x$grid$ydim, - topo = x$grid$topo, - eps = x$eps, - minPts = x$minPts, - neighbourhood.fct = as.character(x$grid$neighbourhood.fct), - toroidal = x$grid$toroidal) + cmm_args <- c(cmm_list, list2(seed = seed)) cmm_args <- c(cmm_args, x$dots) @@ -211,21 +237,7 @@ seeds <- set_names(seeds, paste0('run_', 1:n_iter)) cmm_args <- - map(seeds, - ~list2(data = model.frame(x), - distance_method = distance_names, - seed = .x, - .parallel = FALSE, - k = nrow(ngroups(x)), - hc_method = x$hc_method, - clust_fun = x$clust_fun, - xdim = x$grid$xdim, - ydim = x$grid$ydim, - topo = x$grid$topo, - eps = x$eps, - minPts = x$minPts, - neighbourhood.fct = as.character(x$grid$neighbourhood.fct), - toroidal = x$grid$toroidal)) + map(seeds, ~c(cmm_list, list2(seed = .x))) cmm_args <- map(cmm_args, ~c(.x, x$dots)) @@ -235,7 +247,7 @@ if(n_iter == 1) { - if(x$clust_fun %in% c('hclust', 'som', 'supersom')) { + if(x$clust_fun %in% c('hclust', 'som', 'supersom', 'htk')) { cmm_args$clust_fun <- NULL @@ -250,13 +262,7 @@ imp_call <- call2('importance_cluster', !!!compact(cmm_args), - clustering_fun = switch(x$clust_fun, - hclust = hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster, - supersom = som_cluster)) + clustering_fun = clustering_fun) return(eval(imp_call)) @@ -264,7 +270,7 @@ ## calls: multiple iterations ---------- - if(x$clust_fun %in% 
c('hclust', 'som', 'supersom')) { + if(x$clust_fun %in% c('hclust', 'som', 'supersom', 'htk')) { for(i in names(cmm_args)) { @@ -288,30 +294,12 @@ map(cmm_args, ~call2('importance_cluster', !!!compact(.x), - clustering_fun = switch(x$clust_fun, - hclust = hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster, - supersom = som_cluster))) + clustering_fun = clustering_fun)) - if(.parallel) { - - future::plan('multisession') - - res <- - furrr::future_map(imp_call, - eval, - .options = furrr::furrr_options(seed = TRUE)) - - future::plan('sequential') - - } else { - - res <- map(imp_call, eval) - - } + res <- + suppressMessages(furrr::future_map(imp_call, + eval, + .options = furrr::furrr_options(seed = TRUE))) res <- map2_dfr(res, names(res), @@ -345,6 +333,8 @@ ## entry control ------- + start_time <- Sys.time() + stopifnot(is_combi_analysis(x)) stopifnot(is.numeric(n_iter)) @@ -368,29 +358,45 @@ } + node_clust_fun = switch(x$clust_analyses$node$clust_fun, + hclust = hcluster, + kmeans = kcluster, + htk = htk_cluster, + pam = kcluster, + dbscan = dbscan_cluster, + som = som_cluster) + + ## parallel backend and benchmarking -------- + + message(paste('Permutation importance testing with', + n_iter, 'iterations')) + + if(.parallel) future::plan('multisession') + + on.exit(message(paste('Elapsed:', Sys.time() - start_time)), + add = TRUE) + + on.exit(future::plan('sequential')) + ## common parameters ------- + cmm_list <- + list2(data = model.frame(x)$observation, + clustering_fun = clustering_fun, + distance_som = distance_som, + distance_method = distance_method, + xdim = x$clust_analyses$observation$grid$xdim, + ydim = x$clust_analyses$observation$grid$ydim, + topo = x$clust_analyses$observation$grid$topo, + neighbourhood.fct = as.character(x$clust_analyses$observation$grid$neighbourhood.fct), + toroidal = x$clust_analyses$observation$grid$toroidal, + rlen = nrow(x$clust_analyses$observation$clust_obj$changes), + node_clust_fun = node_clust_fun, + .parallel = .parallel) + if(n_iter == 1) { - args <- - list2(data = model.frame(x)$observation, - clustering_fun = clustering_fun, - distance_som = distance_som, - distance_method = distance_method, - xdim = x$clust_analyses$observation$grid$xdim, - ydim = x$clust_analyses$observation$grid$ydim, - topo = x$clust_analyses$observation$grid$topo, - neighbourhood.fct = as.character(x$clust_analyses$observation$grid$neighbourhood.fct), - toroidal = x$clust_analyses$observation$grid$toroidal, - rlen = nrow(x$clust_analyses$observation$clust_obj$changes), - node_clust_fun = switch(x$clust_analyses$node$clust_fun, - hclust = hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster), - seed = seed, - .parallel = .parallel) + args <- c(cmm_list, list(seed = seed)) args <- c(args, x$dots) @@ -404,26 +410,10 @@ seeds <- set_names(seeds, paste0('run_', 1:n_iter)) + cmm_list$.parallel <- FALSE + args <- - map(seeds, - ~list2(data = model.frame(x)$observation, - clustering_fun = clustering_fun, - distance_som = distance_som, - distance_method = distance_method, - xdim = x$clust_analyses$observation$grid$xdim, - ydim = x$clust_analyses$observation$grid$ydim, - topo = x$clust_analyses$observation$grid$topo, - neighbourhood.fct = as.character(x$clust_analyses$observation$grid$neighbourhood.fct), - toroidal = x$clust_analyses$observation$grid$toroidal, - rlen = nrow(x$clust_analyses$observation$clust_obj$changes), - node_clust_fun = switch(x$clust_analyses$node$clust_fun, - hclust = 
hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster), - seed = .x, - .parallel = FALSE)) + map(seeds, ~c(cmm_list, list(seed = .x))) args <- map(args, ~c(.x, x$dots)) @@ -447,14 +437,6 @@ imp_call <- map(args, ~call2('importance_cluster', !!!.x)) - if(.parallel) { - - future::plan('multisession') - - } - - on.exit(future::plan('sequential')) - exports <- c('tidyverse', 'rlang', 'cluster', @@ -467,10 +449,10 @@ 'clustTools') res <- - furrr::future_map(imp_call, - eval, - .options = furrr::furrr_options(seed = TRUE, - packages = exports)) + suppressMessages(furrr::future_map(imp_call, + eval, + .options = furrr::furrr_options(seed = TRUE, + packages = exports))) res <- map2_dfr(res, names(res), diff --git a/R/iterative_semi_supervised.R b/R/iterative_semi_supervised.R new file mode 100644 index 0000000..9c3c27c --- /dev/null +++ b/R/iterative_semi_supervised.R @@ -0,0 +1,242 @@ +# Iterative kNN label propagation algorithm used for semi-supervised clustering + +# S3 interface ------- + +#' Iterative k-nearest neighbor label propagation algorithm. +#' +#' @description +#' Prediction of cluster assignment by the k-nearest neighbor label propagation +#' algorithm with an automated choice of the k value based on a loss function. +#' +#' @details +#' The function finds the optimal value of k for the k-nearest neighbor +#' classifier by iteratively checking quality of the cluster assignment. The +#' quality check is accomplished by one of the loss functions: +#' silhouette width (`select_stat = 'silhouette'`, default), +#' percentage of observations with negative silhouette widths +#' ('misclassification'), +#' fraction of explained clustering variance (i.e. ratio of the between-cluster +#' sum of squares to the total sum of squares, `select_stat = 'variance'`), +#' or neighbor preservation (`select_stat = 'np'`). +#' The `prediter()` function is a S3 generic function. +#' The function works only for clustering analyses and combined SOM - clustering +#' analyses with the data provided as data frames but not as distance matrices. +#' +#' @references +#' Leng M, Wang J, Cheng J, Zhou H, Chen X. Adaptive semi-supervised +#' clustering algorithm with label propagation. J Softw Eng (2014) 8:14–22. +#' doi:10.3923/jse.2014.14.22 +#' +#' @return an object of the \code{\link{tuner}} class with the `plot()` and `summary()` +#' methods. +#' +#' @param x a `clust_analysis` or `combi_analysis` object. +#' @param newdata a data frame or a matrix with the new data. If `NULL`, +#' the training data is used for fitting the clustering structure. +#' @param select_stat a name of the loss function defining the quality measure +#' of the prediction. For details, see: Details. +#' @param max_k the maximal number of the nearest neighbors to be tested. +#' @param kNN_data number of the nearest neighbors in the dataset, used for +#' determination of neighborhood preservation statistic. See: \code{\link{np}} +#' for details. +#' @param kNN_cluster number of the nearest neighbors in the cluster, used for +#' determination of neighborhood preservation statistic. See: \code{\link{np}} +#' for details. +#' @param .parallel logical, should the analysis be run in parallel? +#' @param ... extra arguments passed to \code{\link{predict.clust_analysis}} or +#' \code{\link{predict.combi_analysis}} such as kernel weighting. Note that you +#' cannot specify the `kNN` and `type` arguments. +#' +#' @export + + prediter <- function(x, ...) 
UseMethod('prediter') + +#' @rdname prediter +#' @export + + prediter.clust_analysis <- function(x, + newdata = NULL, + select_stat = c('silhouette', + 'misclassification', + 'variance', + 'np'), + max_k = 20, + kNN_data = 5, + kNN_cluster = NULL, + .parallel = FALSE, ...) { + + ## entry control ------- + + start_time <- Sys.time() + + stopifnot(is_clust_analysis(x) | is_combi_analysis(x)) + + select_stat <- match.arg(select_stat[1], + c('silhouette', + 'misclassification', + 'variance', + 'np')) + + stopifnot(is.numeric(max_k)) + stopifnot(is.logical(.parallel)) + + n_observations <- nobs(x)$observations + + if(max_k > n_observations) { + + stop("'max_k' has to be lower than the observation number.", + call. = FALSE) + + } + + max_k <- as.integer(max_k) + + k_vec <- 1:max_k + + ## Benchmarking --------- + + message(paste('Finding optimal number of nearest neighbors for', + length(k_vec), 'k values')) + + on.exit(message(paste('Elapsed:', Sys.time() - start_time)), + add = TRUE, + after = FALSE) + + on.exit(future::plan('sequential')) + + ## data check: essentially done by `predict()` ------ + + if(is.null(newdata)) newdata <- model.frame(x) + + check_numeric(newdata) + + ## predictions and quality stats --------- + + if(.parallel) { + + future::plan('multisession') + + preds <- + furrr::future_pmap(list(kNN = k_vec), + propagate, + object = x, + newdata = newdata, + ..., + .options = furrr::furrr_options(seed = TRUE)) + + stats <- + furrr::future_map_dfr(preds, + summary, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster, + .options = furrr::furrr_options(seed = TRUE)) + + } else { + + preds <- pmap(list(kNN = k_vec), + propagate, + object = x, + newdata = newdata, ...) + + stats <- map_dfr(preds, + summary, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster) + + } + + kNN <- NULL + + stats <- mutate(stats, kNN = k_vec) + + stats <- dplyr::relocate(stats, kNN) + + ## choice of the optimal k and output ------- + + select_var <- switch(select_stat, + silhouette = 'sil_width', + misclassification = 'frac_misclassified', + variance = 'frac_var', + np = 'frac_np') + + select_fun <- switch(select_stat, + silhouette = function(x) max(x, na.rm = TRUE), + misclassification = function(x) min(x, na.rm = TRUE), + variance = function(x) max(x, na.rm = TRUE), + np = function(x) max(x, na.rm = TRUE)) + + select_lab <- switch(select_stat, + silhouette = 'max', + misclassification = 'min', + variance = 'max', + np = 'max') + + opt_k <- + filter(stats, + .data[[select_var]] == select_fun(.data[[select_var]])) + + opt_k <- opt_k$kNN[1] + + tuner(list(analysis = preds[[opt_k]], + stats = stats, + fun = 'prediter', + dataset = 'train', + type = 'prediction', + clust_vars = names(model.frame(x)), + tune_params = 'kNN', + tune_criteria = tibble(!!select_var := select_lab), + best_tune = tibble(kNN = opt_k))) + + } + +#' @rdname prediter +#' @export + + prediter.min_analysis <- function(x, ...) { + + warning(paste('Iterative predictions are not available for clustering', + 'analyses done with distance matrices.'), + call. = FALSE) + + return(NULL) + + } + +#' @rdname prediter +#' @export + + prediter.umatrix_analysis <- function(x, ...) { + + warning(paste('Iterative predictions are not available for clustering', + 'analyses done with multiple data layers.'), + call. 
= FALSE)
+
+    return(NULL)
+
+  }
+
+#' @rdname prediter
+#' @export
+
+  prediter.combi_analysis <- function(x,
+                                      newdata = NULL,
+                                      select_stat = c('silhouette',
+                                                      'misclassification',
+                                                      'variance',
+                                                      'np'),
+                                      max_k = 20,
+                                      kNN_data = 5,
+                                      kNN_cluster = NULL,
+                                      .parallel = FALSE, ...) {
+
+    ## delegation: entry control is done by the clust_analysis method -------
+
+    prediter.clust_analysis(x,
+                            newdata = newdata,
+                            select_stat = select_stat,
+                            max_k = max_k,
+                            kNN_data = kNN_data,
+                            kNN_cluster = kNN_cluster,
+                            .parallel = .parallel, ...)
+
+  }
+
+# END -----
diff --git a/R/plotting_utils.R b/R/plotting_utils.R
index ede9541..10da687 100644
--- a/R/plotting_utils.R
+++ b/R/plotting_utils.R
@@ -382,6 +382,119 @@

 }

+# Diagnostic plots for HTK clustering -------
+
+#' Generate diagnostic plots for hard threshold KMEANS clustering.
+#'
+#' @description
+#' Creates plots of within-cluster sum of squares and mean silhouette width
+#' for varying cluster numbers.
+#'
+#' @details
+#' Corresponds to the output of \code{\link[factoextra]{fviz_nbclust}}.
+#' Intended for internal use.
+#'
+#' @return a list of two elements `wss` and `silhouette` storing `ggplot`
+#' objects with the plots of within-cluster sum of squares and silhouette widths.
+#'
+#' @param x a `clust_analysis` object.
+#' @param k a numeric vector with the k cluster numbers. If `NULL`, it
+#' will be determined automatically.
+#' @param plot_title title of the plots.
+#' @param plot_subtitle subtitle of the plots.
+#' @param plot_tag plot tag.
+#' @param cust_theme a custom `ggplot` theme.
+
+  plot_htk <- function(x,
+                       k = NULL,
+                       plot_title = NULL,
+                       plot_subtitle = NULL,
+                       plot_tag = NULL,
+                       cust_theme = ggplot2::theme_classic()) {
+
+    ## input control -----
+
+    stopifnot(is_clust_analysis(x))
+    stopifnot(x$clust_fun == 'htk')
+    stopifnot(inherits(cust_theme, 'theme'))
+
+    ## determining the cluster number and the range of k ------
+
+    clust_n <- nrow(ngroups(x))
+
+    k_vec <- c(1:clust_n,
+               (clust_n + 1):(clust_n + 6))
+
+    ## fitting the clustering objects -------
+
+    clust_calls <-
+      map(k_vec,
+          ~call2('htk_cluster',
+                 data = model.frame(x),
+                 k = .x,
+                 lambdas = x$lambdas,
+                 !!!compact(x$dots)))
+
+    ## safe evaluation: for high cluster numbers
+    ## and high lambda values, the algorithm may not have enough
+    ## unique data points to initialize the centers
+
+    clust_objects <- map(clust_calls, purrr::safely(eval))
+
+    errors <- map(clust_objects, ~.x$error)
+
+    correct <- map_lgl(errors, is.null)
+
+    clust_objects <- compact(map(clust_objects, ~.x$result))
+
+    ## stats --------
+
+    variances <- map(clust_objects, var)
+
+    variances <- map_dbl(variances, ~.x$total_wss)
+
+    ## no point in computing the silhouette for a single cluster:
+    ## it is zero by definition
+
+    sil_w <- map(clust_objects[-1], silhouette)
+
+    sil_w <- map_dbl(sil_w, ~mean(.x$sil_width))
+
+    variance <- NULL
+    silhouette <- NULL
+
+    stats <- tibble(k = k_vec[correct],
+                    variance = variances,
+                    silhouette = c(0, sil_w))
+
+    ## plots -------
+
+    plots <-
+      pmap(list(x = c('variance', 'silhouette'),
+                y = c('Total Within Sum of Square',
+                      'Average silhouette width')),
+           function(x, y) ggplot(stats,
+                                 aes(x = k,
+                                     y = .data[[x]])) +
+             ggplot2::geom_path(color = 'steelblue') +
+             ggplot2::geom_point(shape = 16,
+                                 size = 2,
+                                 color = 'steelblue') +
+             ggplot2::geom_vline(xintercept = clust_n,
+                                 linetype = 'dashed',
+                                 color = 'coral3') +
+             ggplot2::scale_x_continuous(breaks = k_vec) +
+             cust_theme +
+             ggplot2::labs(title = plot_title,
+                           subtitle = plot_subtitle,
+                           tag = plot_tag,
+                           y = y,
+                           x = 'Number of clusters k'))
+
+    set_names(plots, c('wss', 'silhouette'))
+
+  }
+
# General plotting
functions ------- #' Generate a custom scatter ggplot. diff --git a/R/red_analysis_oop.R b/R/red_analysis_oop.R index 65caedf..8f51c68 100644 --- a/R/red_analysis_oop.R +++ b/R/red_analysis_oop.R @@ -38,7 +38,7 @@ # plotting ---- -#' Plot features of a red_analysis object. +#' Plot features of a `red_analysis` object. #' #' @description #' Plots the component table, loadings table - in both cases the @@ -47,13 +47,20 @@ #' the components/dimensions. #' #' @details -#' The loadings table plot is available only for the PCA `red_analysis` objects. +#' The loadings table plot is available only for the PCA and factor +#' analysis `red_analysis` objects. +#' For `red_analysis` objects created with `clust_analysis` objects, scatter +#' plots of the scores/components table/layout can convey +#' the cluster assignment information coded by the point or bar color +#' (`label_clust = TRUE`). #' -#' @param x a `red_analysis` object, created with \code{\link{reduce_data}}. +#' @param x a `red_analysis` object, created with \code{\link{reduce_data}} or +#' `components()` called for clustering analyses. #' @param type plot type: #' -#' * 'component_tbl' or 'score' present the scores for particular observations -#' in a scatter plot +#' * 'component_tbl' or 'score' present the scores (layout) for particular +#' observations in a scatter plot. If `label_clust = TRUE`, point color codes +#' for the optional cluster assignment information. #' #' * 'loadings' plot the variable PCA loadings as a scatter plot. #' @@ -64,6 +71,8 @@ #' #' @param label_points logical, should the variable names be displayed in the #' plot? Valid only for the PCA loadings plot. +#' @param label_clust logical, should the cluster assignment (if available) be +#' coded be the point color? #' @param cust_theme a ggplot plot theme. #' @param segment_color color of the lines presented in the PCA loading plot. #' @param ... extra arguments passed to \code{\link{plot_point}} @@ -81,6 +90,7 @@ 'scree', 'neighborhood'), label_points = TRUE, + label_clust = FALSE, cust_theme = ggplot2::theme_classic(), segment_color = 'steelblue', ...) { @@ -232,6 +242,102 @@ } +#' @rdname plot.red_analysis +#' @export + + plot.clust_red <- function(x, + type = c('component_tbl', + 'scores', + 'loadings', + 'scree', + 'neighborhood'), + label_points = TRUE, + label_clust = TRUE, + cust_theme = ggplot2::theme_classic(), + segment_color = 'steelblue', ...) { + + ## entry control is done be the superclass method ------- + + stopifnot(is.logical(label_clust)) + + type <- match.arg(type[1], + c('component_tbl', + 'scores', + 'loadings', + 'scree', + 'neighborhood')) + + ## calls for the superclass method for plots without cluster assignment ------ + + if(!label_clust) return(NextMethod()) + + if(!type %in% c('component_tbl', 'scores', 'neighborhood')) { + + return(NextMethod()) + + } + + ## numbers of observations and variables ------ + + plot_n <- nobs(x) + + plot_tag <- paste0('\nObservations: n = ', + plot_n$observations, + '\nVariables: n = ', + plot_n$variables) + + ## scatter plots with the cluster assignment information ------ + + if(type %in% c('scores', 'component_tbl')) { + + plot_data <- extract(x, 'scores') + + if(!all(c('comp_1', 'comp_2') %in% names(plot_data))) { + + stop(paste('Atempt to plot a 1-dimensional reduction analysis result.', + 'Adjust kdim?'), + call. 
= FALSE) + + } + + sdevs <- var(x) + + if(x$red_fun == 'pca') { + + ax_labs <- map2(c('PC1', 'PC2'), + signif(sdevs$perc_var[1:2], 3), + ~paste0(.x, ', ', .y, '%')) + + } else { + + ax_labs <- map2(c('Dim 1', 'Dim 2'), + signif(sdevs$perc_var[1:2], 3), + ~paste0(.x, ', ', .y, '%')) + + } + + point_plot <- + plot_point(data = plot_data, + x_var = 'comp_1', + y_var = 'comp_2', + fill_var = 'clust_id', + plot_title = switch(x$red_fun , + pca = 'PCA', + mds = 'MDS', + umap = 'UMAP'), + plot_tag = plot_tag, + x_lab = ax_labs[[1]], + y_lab = ax_labs[[2]], + cust_theme = cust_theme, + fill_lab = 'Cluster ID', ...) + + return(point_plot) + + } + + } + + # prediction ------ #' Project new data onto a reduction analysis layout. diff --git a/R/reduction.R b/R/reduction.R index 7fc92ae..a3543c5 100644 --- a/R/reduction.R +++ b/R/reduction.R @@ -30,6 +30,11 @@ #' @param with type of the input data for the reduction analysis: #' the clustering data ('data'), the matrix of distances between observations #' ('distance') or U matrix between SOM nodes ('umatrix'). +#' @param train_object an optional `red_analysis` object wchich will be used as +#' a training layout. This works currently only for PCA and UMAP and in case +#' of single layer data frames as clustering data. +#' Please refer to \code{\link[stats]{prcomp}} and \code{\link[umap]{umap}} for +#' details. #' @param ... extra arguments passed to \code{\link{reduce_data}}. #' #' @return a `red_analysis` object with the component/score table containing @@ -43,7 +48,8 @@ components.clust_analysis <- function(object, kdim = NULL, red_fun = c('pca', 'mds', 'umap'), - with = c('distance', 'data', 'umatrix'), ...) { + with = c('distance', 'data', 'umatrix'), + train_object = NULL, ...) { ## entry control ------- @@ -55,7 +61,7 @@ if(is.null(kdim)) { - kdim <- length(unique(object$clust_assignment$clust_id)) + kdim <- nrow(ngroups(object)) } @@ -77,6 +83,56 @@ node <- NULL clust_id <- NULL + ## reduction with a trained object ------ + + if(!is.null(train_object)) { + + if(check_supersom) { + + warning(paste('Predictions for multi-layer data are not implemented', + 'at the moment.'), + call. = FALSE) + + return(NULL) + + } + + if(!is_red_analysis(train_object)) { + + stop("'train_object' has to be a 'red_analysis' object.", + call. = FALSE) + + } + + red_fun <- train_object$red_fun + + if(!red_fun %in% c('pca', 'umap')) { + + stop("'train_object' has to be a PCA or UMAP analysis.", + call. = FALSE) + + } + + if(with %in% c('distance', 'umatrix')) { + + warning('Predictions for distances are not supported.', call. = FALSE) + + return(NULL) + + } + + red_obj <- predict(train_object, + newdata = extract(object, type = with), ...) + + red_obj$component_tbl <- + left_join(red_obj$component_tbl, + object$clust_assignment, + by = 'observation') + + return(red_analysis(red_obj)) + + } + ## reduction: most cases -------- ## concerns clustering without multi-layer SOM and distances @@ -102,7 +158,7 @@ } - return(red_obj) + return(red_analysis(red_obj)) } @@ -134,7 +190,7 @@ } - red_objects + map(red_objects, red_analysis) } @@ -147,6 +203,7 @@ NextMethod() } + #' @rdname components.clust_analysis #' @export components.combi_analysis #' @export @@ -154,7 +211,8 @@ components.combi_analysis <- function(object, kdim = NULL, red_fun = c('pca', 'mds', 'umap'), - with = c('distance', 'data', 'umatrix'), ...) { + with = c('distance', 'data', 'umatrix'), + train_object = NULL, ...) 
{ ## entry control ------- @@ -169,7 +227,8 @@ red_analysis <- components(object$clust_analyses$observation, kdim = kdim, red_fun = red_fun, - with = with, ...) + with = with, + train_object = train_object, ...) observation <- NULL @@ -237,7 +296,7 @@ } - data_comps + map(data_comps, red_analysis) } diff --git a/R/regularized_clustering.R b/R/regularized_clustering.R new file mode 100644 index 0000000..d455e6b --- /dev/null +++ b/R/regularized_clustering.R @@ -0,0 +1,481 @@ +# Functions for regularized clustering + +# Regularized K-means clustering ----- + +#' Regularized hard threshold KMEANS clustering. +#' +#' @description +#' Implements the hard threshold KMEANS algorithm proposed by Raymaekers +#' and Zamar and provided by the `clusterHD` package. +#' +#' @details +#' The algorithm offers an interesting approach to clustering of +#' multi-dimensional data, especially those containing variables +#' of little relevance for the clustering analysis (e.g. as investigated by +#' permutation importance with \code{\link{impact}}). For details, please refer +#' to the genuine R function \code{\link[clusterHD]{HTKmeans}} and the seminal +#' paper. The function works with the squared Euclidean distance metric and +#' accepts only a numeric data frame as the input data. +#' There are two crucial parameters to be provided by the user, the number of +#' centers/clusters `k`, which can be determined by methods such as peak mean +#' silhouette width, and the regularization argument `lambdas`, whose value can +#' be found by tuning (e.g. comparing silhouette widths or clustering variances +#' for various lambda values). If `lambdas` is set to `NULL` or provided as +#' a numeric vector, the best lambda value is found with the +#' \code{\link[clusterHD]{getLambda}}. If a single value is provided, it +#' will be used for clustering. +#' Tuning of the lambda parameter using explained clustering variance, +#' silhouette widths and neighbor preservation statistic is facilitated by the +#' `tune_htk()` function. +#' +#' @references +#' Raymaekers J, Zamar RH. Regularized K-means Through Hard-Thresholding. +#' J Mach Learn Res (2022) 23:1–48. +#' Available at: http://jmlr.org/papers/v23/21-0052.html +#' +#' @param data a numeric data frame with observations in the rows and +#' variables in the columns. +#' @param k number of centers (clusters). +#' @param lambdas a numeric vector of the regularization parameter. See Details. +#' @param select_stat statistic used for selection of the optimal lambda value. +#' Ignored if `lambdas` is a single numeric value. For `htk_cluster()` they are +#' 'AIC' (Akaike Information Criterion) or 'BIC' (Bayesian Information +#' Criterion). For `tune_htk()` they are silhouette width ('silhouette'), +#' fraction of observations with negative silhouette values +#' ('misclassification'), fraction of explained clustering variance ('variance'), +#' or neighborhood preservation ('np'). +#' @param seed root of the random number generator. +#' @param type type of the tuning procedure. When set to 'train' (default), +#' cluster structure quality statistics are computed for the entire data set. +#' When set to 'cv', cross-validated statistics for subsequent lambda values +#' are calculated. +#' @param nfolds number of CV folds. +#' @param kNN number of the nearest neighbors used by the cluster assignment +#' classifier in the cross-validation folds. Ignored if `type = 'train'`. +#' @param simple_vote logical, should classical unweighted k-NN classification +#' be applied? 
If FALSE, distance-weighted k-NN is used with the provided kernel +#' function. Ignored if `type = 'train'`. +#' @param resolve_ties logical, should the ties be resolved at random? Applies +#' only to the simple unweighted voting algorithm. Ignored if `type = 'train'`. +#' @param kernel_fun kernel function transforming the distance into weight. +#' Ignored if `type = 'train'`. +#' @param kNN_data number of the nearest neighbors in the genuine data set +#' used for calculation of neighborhood preservation. See \code{\link{np}} +#' for details. +#' @param kNN_cluster number of the nearest neighbors of the given cluster used +#' for calculation of neighborhood preservation. See \code{\link{np}} for +#' details. +#' @param .parallel logical, shoudl the analysis be run in parallel? +#' @param ... extra arguments provided to \code{\link[clusterHD]{HTKmeans}}. +#' +#' @return `htk_cluster()` returns an object of the +#' class \code{\link{clust_analysis}}. +#' +#' @export + + htk_cluster <- function(data, + k = 2, + lambdas = NULL, + select_stat = c('AIC', 'BIC'), + seed = 1234, ...) { + + ## entry control ------- + + check_numeric(data) + + stopifnot(is.numeric(k)) + + k <- as.integer(k[1]) + + if(!is.null(lambdas)) { + + if(!is.numeric(lambdas)) { + + stop(paste("'lambdas' has to be a numeric vector, NULL or a", + "single numeric value."), + call. = FALSE) + + } + + } + + select_stat <- match.arg(select_stat[1], c('AIC', 'BIC')) + + ## check for the data ------ + + check_numeric(data) + + mat_data <- as.matrix(data) + + if(is.null(rownames(mat_data))) { + + obs_id <- 1:nrow(mat_data) + + } else { + + obs_id <- rownames(mat_data) + + } + + model_frame <- enexpr(data) + + ## clustering ------ + + ## handling distance method requests, to make the function compatible + ## with combined SOM procedures + + extra_args <- list2(...) + + extra_args$distance_method <- NULL + + extra_args <- compact(extra_args) + + fun <- clusterHD::HTKmeans + + res_call <- call2('fun', + X = mat_data, + k = k, + lambdas = lambdas, + !!!extra_args) + + res <- eval(res_call) + + ## lambdas as a single number -------- + + if(length(lambdas) == 1) { + + set.seed(seed) + + clust_ass <- + tibble(observation = obs_id, + clust_id = factor(res$HTKmeans.out[[1]]$cluster)) + + return(clust_analysis(list(data = quo(!!model_frame), + dist_mtx = calculate_dist(data, 'squared_euclidean'), + dist_method = 'squared_euclidean', + lambdas = lambdas, + clust_fun = 'htk', + clust_obj = res, + clust_assignment = clust_ass, + dots = list2(...)))) + + } + + ## lambda as NULL or a numeric vector ------ + + opt_lambda <- clusterHD::getLambda(res, type = select_stat) + + return(htk_cluster(data = data, + k = k, + lambdas = opt_lambda, + seed = seed, ...)) + + } + +#' @rdname htk_cluster +#' @export + + tune_htk <- function(data, + k = 2, + lambdas = NULL, + select_stat = c('silhouette', + 'misclassification', + 'variance', + 'np'), + type = c('train', 'cv'), + nfolds = 5, + kNN = 5, + simple_vote = TRUE, + resolve_ties = FALSE, + kernel_fun = function(x) 1/x, + kNN_data = 5, + kNN_cluster = NULL, + seed = 1234, + .parallel = FALSE, ...) 
+#' @rdname htk_cluster
+#' @export
+
+  tune_htk <- function(data,
+                       k = 2,
+                       lambdas = NULL,
+                       select_stat = c('silhouette',
+                                       'misclassification',
+                                       'variance',
+                                       'np'),
+                       type = c('train', 'cv'),
+                       nfolds = 5,
+                       kNN = 5,
+                       simple_vote = TRUE,
+                       resolve_ties = FALSE,
+                       kernel_fun = function(x) 1/x,
+                       kNN_data = 5,
+                       kNN_cluster = NULL,
+                       seed = 1234,
+                       .parallel = FALSE, ...) {
+
+    ## input check ------
+
+    start_time <- Sys.time()
+
+    check_numeric(data)
+
+    clust_features <- names(data)
+
+    stopifnot(is.numeric(k))
+
+    k <- as.integer(k[1])
+
+    type <- match.arg(type[1], c('train', 'cv'))
+
+    stopifnot(is.logical(.parallel))
+
+    select_stat <- match.arg(select_stat[1],
+                             c('silhouette',
+                               'misclassification',
+                               'variance',
+                               'np'))
+
+    ## the CV arguments are checked by a downstream function
+
+    ## if no lambdas are provided, the lambda grid is extracted from
+    ## the output of the HTKmeans() function
+
+    if(is.null(lambdas)) {
+
+      res <- clusterHD::HTKmeans(X = as.matrix(data),
+                                 k = k,
+                                 lambdas = lambdas, ...)
+
+      lambdas <- res$lambdas
+
+    }
+
+    stopifnot(is.numeric(lambdas))
+
+    ## benchmarking and parallel backend ---------
+
+    message(paste('Tuning for', length(lambdas), 'lambda values'))
+
+    if(.parallel) future::plan('multisession')
+
+    on.exit(message(paste('Elapsed:', Sys.time() - start_time)),
+            add = TRUE,
+            after = FALSE)
+
+    ## add = TRUE keeps the elapsed-time handler registered above
+
+    on.exit(future::plan('sequential'), add = TRUE)
+
+    ## tuning objects --------
+
+    extra_args <- list2(...)
+
+    tune_calls <-
+      map(lambdas,
+          ~call2('htk_cluster',
+                 data = data,
+                 k = k,
+                 lambdas = .x,
+                 select_stat = 'AIC',
+                 seed = seed,
+                 !!!compact(extra_args)))
+
+    tune_objects <-
+      furrr::future_map(tune_calls, eval,
+                        .options = furrr::furrr_options(seed = seed))
+
+    tune_objects <- set_names(tune_objects,
+                              paste0('obj_', 1:length(tune_objects)))
+
+    lambda <- NULL
+    object_id <- NULL
+
+    lambda_assign <- tibble(lambda = lambdas,
+                            object_id = names(tune_objects))
+
+    ## training data stats ------
+
+    if(type == 'train') {
+
+      ## working with safely(), because clustering for some lambda values
+      ## can fail
+
+      stats <-
+        furrr::future_map2(tune_objects,
+                           names(tune_objects),
+                           ~purrr::safely(tibble)(summary(.x,
+                                                          kNN_data = kNN_data,
+                                                          kNN_cluster = kNN_cluster),
+                                                  object_id = .y),
+                           .options = furrr::furrr_options(seed = TRUE))
+
+      stats <- map_dfr(stats, ~.x$result)
+
+      stats <- left_join(stats, lambda_assign, by = 'object_id')
+
+    }
+
+    ## CV stats -------
+
+    if(type == 'cv') {
+
+      if(.parallel) {
+
+        ## parallelization over the CV folds if there are more folds than
+        ## lambda values, otherwise over the lambda values
+
+        if(nfolds > length(lambdas)) {
+
+          cv_objects <-
+            map(tune_objects,
+                ~purrr::safely(cv)(.x,
+                                   type = 'propagation',
+                                   nfolds = nfolds,
+                                   kNN = kNN,
+                                   simple_vote = simple_vote,
+                                   resolve_ties = resolve_ties,
+                                   kernel_fun = kernel_fun,
+                                   kNN_data = kNN_data,
+                                   kNN_cluster = kNN_cluster,
+                                   seed = seed,
+                                   .parallel = TRUE))
+
+        } else {
+
+          cv_objects <-
+            furrr::future_map(tune_objects,
+                              ~purrr::safely(cv)(.x,
+                                                 type = 'propagation',
+                                                 nfolds = nfolds,
+                                                 kNN = kNN,
+                                                 simple_vote = simple_vote,
+                                                 resolve_ties = resolve_ties,
+                                                 kernel_fun = kernel_fun,
+                                                 kNN_data = kNN_data,
+                                                 kNN_cluster = kNN_cluster,
+                                                 seed = seed,
+                                                 .parallel = FALSE),
+                              .options = furrr::furrr_options(seed = TRUE))
+
+        }
+
+      } else {
+
+        cv_objects <-
+          map(tune_objects,
+              ~purrr::safely(cv)(.x,
+                                 type = 'propagation',
+                                 nfolds = nfolds,
+                                 kNN = kNN,
+                                 simple_vote = simple_vote,
+                                 resolve_ties = resolve_ties,
+                                 kernel_fun = kernel_fun,
+                                 kNN_data = kNN_data,
+                                 kNN_cluster = kNN_cluster,
+                                 seed = seed,
+                                 .parallel = FALSE))
+
+      }
+
+      correct <- map_lgl(cv_objects, ~is.null(.x$error))
+
+      cv_objects <- compact(map(cv_objects, ~.x$result))
+
+      stats <- map(cv_objects, summary)
+
+      stats <- map2_dfr(stats, names(stats),
+                        ~mutate(.x, object_id = .y))
+
+      stats <- left_join(stats, lambda_assign, by = 'object_id')
+
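+      ## the NULL assignments below declare the non-standard-evaluation
+      ## variables and silence the 'no visible binding' NOTEs of R CMD check
+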
+      frac_var_mean <- NULL
+      sil_width_mean <- NULL
+      frac_np_mean <- NULL
+      frac_misclassified_mean <- NULL
+
+      sil_width <- NULL
+      frac_misclassified <- NULL
+      frac_var <- NULL
+      frac_np <- NULL
+
+      stats <- dplyr::transmute(stats,
+                                object_id = object_id,
+                                lambda = lambda,
+                                sil_width = sil_width_mean,
+                                frac_misclassified = frac_misclassified_mean,
+                                frac_var = frac_var_mean,
+                                frac_np = frac_np_mean)
+
+      stats <- mutate(stats, lambda = lambdas[correct])
+
+    }
+
+    stats <- dplyr::relocate(stats, lambda)
+
+    ## centers and identification of active variables -------
+
+    ## active variables are those with at least one non-zero center
+    ## coordinate; this is checked by computing the Euclidean distance
+    ## of the center coordinates to the zero point
+
+    center_coords <-
+      map(tune_objects,
+          ~.x$clust_obj$HTKmeans.out[[1]]$centers)
+
+    center_coords <- map(center_coords, ~.x^2)
+
+    center_dists <- map(center_coords, colSums)
+
+    center_dists <- do.call('rbind', center_dists)
+
+    center_dists <- sqrt(center_dists)
+
+    center_dists <- set_names(as.data.frame(center_dists),
+                              clust_features)
+
+    active_var_flags <-
+      map(1:nrow(center_dists),
+          ~center_dists[.x, ] != 0)
+
+    n_active_vars <- NULL
+
+    center_dists <- mutate(center_dists,
+                           lambda = lambdas,
+                           n_active_vars = map_dbl(active_var_flags, sum))
+
+    center_dists <- as_tibble(center_dists)
+
+    stats <- left_join(stats,
+                       center_dists[c('lambda', 'n_active_vars')],
+                       by = 'lambda')
+
+    ## best tune and the analysis object -------
+
+    select_var <- switch(select_stat,
+                         silhouette = 'sil_width',
+                         misclassification = 'frac_misclassified',
+                         variance = 'frac_var',
+                         np = 'frac_np')
+
+    select_fun <-
+      switch(select_stat,
+             silhouette = function(x) max(x, na.rm = TRUE),
+             misclassification = function(x) min(x, na.rm = TRUE),
+             variance = function(x) max(x, na.rm = TRUE),
+             np = function(x) max(x, na.rm = TRUE))
+
+    select_lab <- switch(select_stat,
+                         silhouette = 'max',
+                         misclassification = 'min',
+                         variance = 'max',
+                         np = 'max')
+
+    ## if more than one tune is optimal by the numeric statistic criterion,
+    ## the scenario with the lowest number of active variables is selected
+
+    best_stats <-
+      filter(stats,
+             .data[[select_var]] == select_fun(.data[[select_var]]))
+
+    best_stats <-
+      filter(best_stats,
+             n_active_vars == min(n_active_vars))
+
+    best_stats <- best_stats[1, ]
+
+    ## the output object ---------
+
+    tuner(list(analysis = tune_objects[[best_stats$object_id[[1]]]],
+               stats = stats,
+               center_distances = center_dists,
+               fun = 'tune_htk',
+               dataset = type,
+               type = 'development',
+               clust_vars = clust_features,
+               tune_params = 'lambda',
+               tune_criteria = tibble(!!select_var := select_lab,
+                                      n_active_vars = 'min'),
+               best_tune = tibble(lambda = best_stats$lambda[[1]])))
+
+  }
+
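+# A minimal tuning sketch (illustration only, mirroring the package examples;
+# `centered_wines` stands for any mean-centered numeric data frame):
+#
+# tune_obj <- tune_htk(data = centered_wines,
+#                      k = 3,
+#                      lambdas = seq(0, 1, by = 0.025),
+#                      select_stat = 'silhouette',
+#                      type = 'cv',
+#                      nfolds = 10,
+#                      kNN = 11)
+#
+# plot(tune_obj)
+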
+# END -------
diff --git a/R/sil_oop.R b/R/sil_oop.R
index be7a128..9d40e40 100644
--- a/R/sil_oop.R
+++ b/R/sil_oop.R
@@ -18,7 +18,9 @@
 #' @param ... extra arguments, currently none.
 #'
 #' @return a data frame with numeric statistics for the whole clustering
-#' structure (`clust_id` = 'global') and particular clusters.
+#' structure (`clust_id` = 'global') and particular clusters (mean, SD, median,
+#' interquartile range, 95% percentile range, range, number and fraction of
+#' potentially misclassified observations with negative silhouette widths).
 #'
 #' @export summary.sil_extra
 #' @export
@@ -77,7 +79,7 @@
   neg <- map2_dfr(neg, names(neg),
                   ~mutate(.x,
                           clust_id = .y,
-                          perc_negative = 100 * n_negative/n))
+                          frac_misclassified = n_negative/n))
 
   stats <- left_join(neg, stats, by = 'clust_id')
 
@@ -158,7 +160,7 @@
   sil_width <- NULL
   sil_sign <- NULL
   n <- NULL
-  perc_negative <- NULL
+  frac_misclassified <- NULL
   clust_id <- NULL
   observation <- NULL
   neighbor_id <- NULL
@@ -190,8 +192,8 @@
   stats <- mutate(summary(x),
                   plot_lab = paste0('total: n = ', n,
-                                    '\nnegative: ',
-                                    signif(perc_negative, signif_digits), '%',
+                                    '\nnegative fraction = ',
+                                    signif(frac_misclassified, signif_digits),
                                     '\navg(s) = ',
                                     signif(mean, signif_digits)),
                   plot_lab = paste(clust_id, plot_lab, sep = '\n'))
diff --git a/R/tuner_oop.R b/R/tuner_oop.R
new file mode 100644
index 0000000..fd5f88d
--- /dev/null
+++ b/R/tuner_oop.R
@@ -0,0 +1,269 @@
+# S3 OOP interface for the `tuner` class
+
+# Extraction -------
+
+#' Extract features of a `tuner` object.
+#'
+#' @description
+#' The function extracts the `clust_analysis` object, quality statistics or
+#' the best combination of the tuning parameters.
+#'
+#' @param x a `tuner` object.
+#' @param type feature to be extracted from the object:
+#'
+#' * `clust_object` or `analysis` returns the `clust_analysis` object generated
+#' with the best combination of the tuning parameters
+#'
+#' * `stats` extracts quality statistics for combinations of the tuning
+#' parameters
+#'
+#' * `criteria` returns a data frame with criteria of selection of the best
+#' combination of the tuning parameters
+#'
+#' * `best_tune` extracts a data frame with the best values of the tuning
+#' parameters
+#'
+#' @param ... extra arguments, currently none.
+#'
+#' @return the requested feature: a `clust_analysis` object for
+#' `type = 'clust_object'` or `type = 'analysis'`, otherwise a data frame.
+#'
+#' @export
+
+  extract.tuner <- function(x,
+                            type = c('clust_object',
+                                     'analysis',
+                                     'stats',
+                                     'criteria',
+                                     'best_tune'), ...) {
+
+    stopifnot(is_tuner(x))
+
+    type <- match.arg(type[1],
+                      c('clust_object',
+                        'analysis',
+                        'stats',
+                        'criteria',
+                        'best_tune'))
+
+    switch(type,
+           clust_object = x$analysis,
+           analysis = x$analysis,
+           stats = x$stats,
+           criteria = x$tune_criteria,
+           best_tune = x$best_tune)
+
+  }
+
+# Summary -------
+
+#' Summary of quality statistics.
+#'
+#' @description
+#' The `summary()` method called for `tuner` class objects extracts cluster
+#' quality statistics for subsequent combinations of the tuning
+#' parameters from the object.
+#'
+#' @param object a `tuner` object.
+#' @param ... extra arguments, currently none.
+#'
+#' @return a data frame with the following columns:
+#'
+#' * `sil_width`: silhouette width
+#'
+#' * `frac_misclassified`: fraction of observations with negative silhouette
+#' widths suggestive of misclassification
+#'
+#' * `frac_var`: fraction of explained clustering variance
+#'
+#' * `frac_np`: fraction of preserved nearest neighbors
+#'
+#' * columns named after names of the tuning parameters and containing their
+#' values
+#'
+#' @export
+
+  summary.tuner <- function(object, ...) object$stats
+
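+# A minimal usage sketch (illustration only; `tune_obj` is a hypothetical
+# tuner returned e.g. by tune_htk()):
+#
+# summary(tune_obj)                ## quality stats for all tuning scenarios
+# extract(tune_obj, 'best_tune')   ## the best parameter values
+# extract(tune_obj, 'analysis')    ## clust_analysis for the best tune
+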
+# Plotting -------
+
+#' Plot cluster quality statistic values for tuning.
+#'
+#' @description
+#' The function plots quality statistic values (such as explained clustering
+#' variance, silhouette width or mean neighborhood preservation) for
+#' combinations of the tuning parameters.
+#'
+#' @param x a `tuner` object.
+#' @param cust_theme a custom `ggplot` theme.
+#' @param line_alpha alpha of the lines, applies only to plots of
+#' regularization paths.
+#' @param point_size size of the data points.
+#' @param ... extra arguments, currently none.
+#'
+#' @return a list of `ggplot` graphic objects.
+#'
+#' @export plot.tuner
+#' @export
+
+  plot.tuner <- function(x,
+                         cust_theme = ggplot2::theme_classic(),
+                         line_alpha = 1,
+                         point_size = 2, ...) {
+
+    ## entry control ------
+
+    stopifnot(is_tuner(x))
+
+    if(!inherits(cust_theme, 'theme')) {
+
+      stop("'cust_theme' has to be a valid ggplot theme.",
+           call. = FALSE)
+
+    }
+
+    ## variable declaration, compatibility with CHECK ------
+
+    n_active_vars <- NULL
+    value <- NULL
+    statistic <- NULL
+    lambda <- NULL
+    distance <- NULL
+    variable <- NULL
+
+    ## plot data and metadata --------
+
+    stat_vars <- c('sil_width', 'frac_misclassified', 'frac_var', 'frac_np')
+
+    stat_titles <-
+      set_names(c('Cluster separation',
+                  'Potential misclassification',
+                  'Explained clustering variance',
+                  'Neighborhood preservation'),
+                stat_vars)
+
+    stat_labels <-
+      set_names(c('mean silhouette width',
+                  'fraction of negative silhouette widths',
+                  'fraction of explained clustering variance',
+                  'fraction of preserved nearest neighbors',
+                  'fraction of active variables'),
+                c(stat_vars, 'n_active_vars'))
+
+    stat_colors <-
+      set_names(c('coral3',
+                  'steelblue3',
+                  'darkolivegreen',
+                  'plum4',
+                  'firebrick'),
+                c(stat_vars, 'n_active_vars'))
+
+    data_label <- switch(x$dataset,
+                         train = 'training',
+                         cv = 'CV')
+
+    criteria <- paste(stat_labels[names(x$tune_criteria)], collapse = ', ')
+
+    best_label <- map_dbl(x$best_tune, signif, 2)
+
+    best_label <- map2_chr(names(best_label), best_label,
+                           paste, sep = ' = ')
+
+    best_label <- paste(best_label, collapse = ', ')
+
+    plot_subtitle <- paste0('dataset: ', data_label,
+                            '\ncriteria: ', criteria,
+                            '\nbest tune: ', best_label)
+
+    ## plotting table -------
+
+    stats <- summary(x)
+
+    if(x$fun == 'tune_htk') {
+
+      stat_vars <- c(stat_vars, 'n_active_vars')
+
+      stats <- mutate(stats,
+                      n_active_vars = n_active_vars/length(x$clust_vars))
+
+    }
+
+    stats <-
+      tidyr::pivot_longer(stats,
+                          cols = dplyr::all_of(stat_vars),
+                          names_to = 'statistic',
+                          values_to = 'value')
+
+    ## plotting: base plots --------
+
+    if(x$fun %in% c('prediter', 'tune_htk')) {
+
+      base_var <- x$tune_params[1]
+
+      x_intercept <- x$best_tune[[base_var]][1]
+
+      stat_plot <-
+        ggplot(stats,
+               aes(x = .data[[base_var]],
+                   y = value,
+                   color = statistic)) +
+        ggplot2::geom_path() +
+        ggplot2::geom_point(shape = 16,
+                            size = point_size) +
+        ggplot2::geom_vline(xintercept = x_intercept,
+                            linetype = 'dashed') +
+        ggplot2::scale_color_manual(values = stat_colors,
+                                    labels = stat_labels,
+                                    name = 'statistic') +
+        cust_theme +
+        ggplot2::labs(title = paste('Tuning of', base_var),
+                      subtitle = plot_subtitle,
+                      x = base_var,
+                      y = 'statistic value')
+
+    }
+
+    ## plotting: regularization paths ------
+
+    if(x$fun %in% c('tune_htk')) {
+
+      x_intercept <- x$best_tune[['lambda']][1]
+
+      reg_data <-
+        tidyr::pivot_longer(x$center_distances,
+                            cols = dplyr::all_of(x$clust_vars),
+                            names_to = 'variable',
+                            values_to = 'distance')
+
+      reg_plot <-
+        ggplot(reg_data,
+               aes(x = lambda,
+                   y = distance,
+                   color = variable)) +
+        ggplot2::geom_line(alpha = line_alpha) +
+        ggplot2::geom_vline(xintercept = x_intercept,
+                            linetype = 'dashed') +
+        cust_theme +
+        ggplot2::labs(title = 'Regularization paths',
+                      subtitle = plot_subtitle,
+                      x = 'lambda',
+                      y = 'distance from mean')
+
+    }
+
+    ## output ----
+
+    if(x$fun == 'prediter') {
+
+      return(stat_plot)
+
+    } else if(x$fun == 'tune_htk') {
+
+      return(list(statistics = 
stat_plot, + regularization = reg_plot)) + + } + + } + +# END ------ diff --git a/inst/examples/devel_test.R b/inst/examples/devel_test.R index 84cfed7..0a07897 100644 --- a/inst/examples/devel_test.R +++ b/inst/examples/devel_test.R @@ -175,7 +175,7 @@ clust_fun = 'kmeans', k = 3) - test_pam <- kcluster(data = test_red$umap, + test_pam <- kcluster(data = test_data, distance_method = 'manhattan', clust_fun = 'pam', k = 3) @@ -275,7 +275,7 @@ ## predictions for reduction analysis objects - test_pred_red <- predict(object = test_pam, + test_pred_red <- predict(object = test_hcl, newdata = reduce_data(data = new_data, distance_method = 'cosine', kdim = 3, @@ -284,7 +284,7 @@ plot(test_pred_red, type = 'data') - plot(test_pam, type = 'data') + plot(test_hcl, type = 'data') test_pred_pca <- predict(object = test_dbscan, newdata = reduce_data(data = new_data, @@ -331,7 +331,7 @@ # SOM combi clustering and its OOP ----- test_combi <- combi_cluster(data = test_data, - distance_som = 'canberra', + distance_som = 'cosine', xdim = 5, ydim = 4, topo = 'hexagonal', @@ -440,6 +440,12 @@ cross_distance(test_pam) + cross_distance(test_pam, + predict(test_pam, + newdata = new_data, + type = 'propagation')) %>% + plot + test_cross <- cross_distance(test_combi, test_new_combi) summary(cross_distance(test_combi, test_new_combi)) @@ -480,9 +486,6 @@ plot(silhouette(test_combi), fill_by = 'sign') plot(silhouette(test_hcl), fill_by = 'neighbor') -# Batch - - # Multi-layer SOM ------- biopsy_data <- MASS::biopsy %>% @@ -534,8 +537,18 @@ components(train_batch, kdim = 2, red_fun = 'umap', with = 'data') plot(train_batch, type = 'diagnostic') - plot(train_batch, type = 'components', with = 'data', kdim = 2, red_fun = 'mds') - plot(train_batch, type = 'components', with = 'umatrix', kdim = 2, red_fun = 'umap') + + plot(train_batch, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'mds') + + plot(train_batch, + type = 'components', + with = 'umatrix', + kdim = 2, + red_fun = 'umap') plot(train_batch, 'training') @@ -568,7 +581,11 @@ components(test_batch, kdim = 2, red_fun = 'umap', with = 'data') - plot(test_batch, type = 'components', with = 'distance', kdim = 2, red_fun = 'umap') + plot(test_batch, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'umap') test_homo_cross <- cross_distance(train_batch, .parallel = TRUE) @@ -618,6 +635,8 @@ silhouette(car_batch) %>% plot + components(car_batch, kdim = 2, with = 'data') + plot(car_batch, 'heat_map') car_batch %>% @@ -700,6 +719,10 @@ components(biopsy_umatrix_kmeans, kdim = 2, red_fun = 'mds') %>% plot components(biopsy_umatrix_pam, kdim = 2, red_fun = 'pca') %>% plot + plot(biopsy_umatrix_hcl, + type = 'components', + with = 'data') + cross_distance(biopsy_umatrix_hcl) %>% plot('mean') cross_distance(biopsy_umatrix_kmeans) %>% plot('mean') cross_distance(biopsy_umatrix_pam) %>% plot('histogram') @@ -921,4 +944,113 @@ te(test_combi, type = 'final') +# adaptive label propagation ------ + + test_adapt <- test_pam %>% + prediter(newdata = new_data, + select_stat = 'misclassification', + simple_vote = FALSE, + resolve_ties = TRUE, + .parallel = TRUE) + + test_adapt %>% + extract + + test_adapt %>% + summary + + plot(test_adapt) + +# reduction analysis of the clustering data, training object ------- + + train_components <- test_pam %>% + components(kdim = 2, red_fun = 'umap', with = 'data') + + train_components %>% + plot(type = 'component_tbl') + + test_pam %>% + plot('components', + with = 'data') + + new_pam <- predict(test_pam, + newdata = 
new_data, + type = 'propagation') + + ## using the trained components + + new_components <- new_pam %>% + components(kdim = 2, + red_fun = 'pca', + with = 'data', + train_object = train_components) + + plot(new_components, type = 'scores') + + plot(new_pam, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'umap') + + plot(new_pam, + type = 'components', + with = 'data', + train_object = train_components) + +# reduction analysis of the clustering analysis, single layer SOM ------- + + train_components <- test_som %>% + components(kdim = 2, + red_fun = 'pca', + with = 'data') + + plot(train_components) + + new_som <- test_som %>% + predict(newdata = new_data, + type = 'som') + + plot(new_som, + type = 'components', + kdim = 2, + with = 'data', + red_fun = 'pca') + + plot(new_som, + type = 'components', + kdim = 2, + with = 'data', + train_object = train_components) + +# reduction analysis of the combi analysis ------- + + train_components <- test_combi %>% + components(kdim = 2, + with = 'data', + red_fun = 'umap') + + test_new_combi %>% + components(with = 'data', + train_object = train_components) + + new_components <- test_new_neuro_combi %>% + components(with = 'data', + train_object = train_components) + + plot(train_components) + plot(new_components) + + plot(test_combi, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'umap', + train_object = train_components) + + plot(test_new_combi, + type = 'components', + with = 'data', + train_object = train_components) + # END ------ diff --git a/inst/examples/github_examples_multi_layer.R b/inst/examples/github_examples_multi_layer.R index 97f90d7..1674344 100644 --- a/inst/examples/github_examples_multi_layer.R +++ b/inst/examples/github_examples_multi_layer.R @@ -76,7 +76,7 @@ # fitting the SOM ------- ## the layers will be handled by three various distances: - ## Euclidean for numeric variables, Manhattan for ordinal variables + ## cosine for numeric variables, Manhattan for ordinal variables ## and Tanimoto for binary features car_som <- diff --git a/inst/examples/github_examples_som.R b/inst/examples/github_examples_som.R index 52ae319..933ed48 100644 --- a/inst/examples/github_examples_som.R +++ b/inst/examples/github_examples_som.R @@ -328,11 +328,17 @@ cosine_feature_hm$train + cosine_feature_hm$test +# Cross distances ----- + + cross_distance(cosine_clusters$train, + cosine_clusters$test) %>% + plot('mean') + # Feature importance ------- - cosine_importance <- impact.combi_analysis(cosine_clusters$train, - n_iter = 50, - .parallel = TRUE) + cosine_importance <- impact(cosine_clusters$train, + n_iter = 50, + .parallel = TRUE) plot(cosine_importance) @@ -360,9 +366,9 @@ map(mutate, obs = vintage, pred = car::recode(clust_id, - "'1' = 'Barolo'; - '2' = 'Grignolino'; - '3' = 'Barbera'"), + "'1' = 'Barbera'; + '2' = 'Barolo'; + '3' = 'Grignolino'"), pred = factor(pred, levels = c('Barbera', 'Barolo', 'Grignolino'))) diff --git a/inst/examples/github_examples_sparsity.R b/inst/examples/github_examples_sparsity.R new file mode 100644 index 0000000..691fdb5 --- /dev/null +++ b/inst/examples/github_examples_sparsity.R @@ -0,0 +1,381 @@ +# Fitting a clustering structure with the hard-threshold KMEANS algorithm + +# tools -------- + + library(kohonen) + + library(clustTools) + library(tidyverse) + library(clusterHD) + library(caret) + library(somKernels) + + extract <- clustTools::extract + map <- purrr::map + + library(patchwork) + +# analysis data set ------- + + ## the wines dataset pre-processing: elimination of 
invariant features + ## (low variance to mean ratio) and mean centering + + data("wines") + + my_wines <- wines %>% + as.data.frame %>% + mutate(ID = paste0('wine_', 1:nrow(.))) %>% + column_to_rownames('ID') + + distr_stats <- my_wines %>% + map_dfr(~tibble(variance = var(.x), + mean = mean(.x), + var_mean_ratio = var(.x)/mean(.x))) %>% + mutate(variable = names(my_wines)) %>% + relocate(variable) + + top_variant_features <- distr_stats %>% + filter(var_mean_ratio > 0.1) %>% + .$variable + + my_wines <- my_wines %>% + select(all_of(top_variant_features)) %>% + center_data('mean') + + ## appending the parental data frame with wine classification + ## it will be used for the final validation of the results + + my_wines <- my_wines %>% + mutate(vintage = vintages) + + ## training: 100 randomly chosen observations + ## the rest used for validation# + ## created with caret's createDataPartition() to keep + ## the vintage distribution + + set.seed(12345) + + train_ids <- createDataPartition(my_wines$vintage, p = 100/177)[[1]] + + wine_lst <- + list(train = my_wines[train_ids, ], + test = my_wines[-train_ids, ]) %>% + map(select, -vintage) + +# variable distribution ------ + + ## variable violin plots + + clust_variables <- names(wine_lst$train) + + var_distr_plot <- wine_lst$train %>% + pivot_longer(cols = all_of(clust_variables), + names_to = 'variable', + values_to = 'z_score') %>% + ggplot(aes(x = variable, + y = z_score)) + + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75), + scale = 'width', + fill = 'cornsilk') + + geom_point(shape = 16, + size = 1, + alpha = 0.5, + position = position_jitter(width = 0.1)) + +# constructing clustering structures in the training data set ------- + + train_clusters <- list() + + ## canonical KMEANS + + train_clusters$kmeans <- kcluster(data = wine_lst$train, + distance_method = 'squared_euclidean', + clust_fun = 'kmeans', + k = 3) + + ## regularized KMEANS, finding lambda by CV-tuning + + train_tune <- tune_htk(data = wine_lst$train, + k = 3, + lambdas = seq(0, 1, by = 0.025), + select_stat = 'silhouette', + type = 'cv', + nfolds = 10, + kNN = 11, + .parallel = TRUE) + + summary(train_tune) + + plot(train_tune) + + train_clusters$htk <- extract(train_tune, 'analysis') + + ## renaming the clusters after their predominant vintages (checked elsewhere) + + train_clusters$kmeans <- train_clusters$kmeans %>% + rename(c('2' = 'Barbera', + '1' = 'Barolo', + '3' = 'Grignolino')) + + train_clusters$htk <- train_clusters$htk %>% + rename(c('3' = 'Barbera', + '1' = 'Barolo', + '2' = 'Grignolino')) + +# performance in the training data and cross-validation ------- + + train_clusters %>% + map(summary) + + train_clusters %>% + map(cv, + type = 'propagation', + nfolds = 10, + kNN = 11) %>% + map(summary) %>% + map(select, ends_with('mean')) + +# performance in the test data ------- + + test_clusters <- train_clusters %>% + map(predict, + newdata = wine_lst$test, + type = 'propagation', + kNN = 11) + + test_clusters %>% + map(summary) + + ## cross distances + + map2(train_clusters, + test_clusters, + cross_distance) %>% + map(plot, 'mean') %>% + map(~.x + + scale_fill_gradient2(low = 'firebrick', + mid = 'white', + high = 'steelblue', + midpoint = 20, + limits = c(8, 32))) + +# summary of the performance stats ----- + + clust_performance <- + c(train_clusters, test_clusters) %>% + map_dfr(summary) %>% + mutate(dataset = c(rep('training', 2), + rep('test', 2)), + algorithm = rep(c('KMEANS', 'HTKmeans'), 2)) + + clust_performance %>% + pivot_longer(cols = c(sil_width, + 
frac_misclassified, + frac_var, + frac_np), + names_to = 'statistic', + values_to = 'value') %>% + ggplot(aes(x = value, + y = algorithm, + fill = dataset)) + + geom_bar(stat = 'identity', + color = 'black', + position = position_dodge(0.9)) + + facet_wrap(facets = vars(statistic), + scales = 'free', + labeller = as_labeller(c(sil_width = 'silhouette\nwidth', + frac_misclassified = 'fraction\nmisclassified', + frac_var = 'explained\nvariance', + frac_np = 'fraction\npreserved neighbors'))) + + labs(title = 'Clustering of the wines dataset') + + +# UMAP plots ------ + + umap_train <- wine_lst$train %>% + reduce_data(distance_method = 'euclidean', + kdim = 2, + red_fun = 'umap', + random_state = 12345) + + kmeans_plots <- c(train_clusters, + test_clusters) %>% + set_names(c('train_kmeans', 'train_htk', + 'test_kmeans', 'test_htk')) %>% + map(plot, + type = 'components', + with = 'data', + red_fun = 'umap', + train_object = umap_train) + + kmeans_plots <- + map2(kmeans_plots, + c('Wines, training, KMEANS', + 'Wines, training, HTKmeans', + 'Wines, test, KMEANS', + 'Wines, test, HTKmeans'), + ~.x + + labs(title = .y) + + theme(plot.tag = element_blank(), + legend.position = 'none')) + + kmeans_plots$train_kmeans + + kmeans_plots$test_kmeans + + kmeans_plots$train_htk + + kmeans_plots$test_htk + +# variable importance ------- + + var_imp <- train_clusters %>% + map(impact, + n_iter = 50, + .parallel = TRUE) + + ## plotting and customizing the title + + var_imp_plots <- var_imp %>% + map(plot) %>% + map2(., + paste('Variable importance, wines', + c('KMEANS', 'HTKmeans')), + ~.x + + geom_vline(xintercept = 0, + linetype = 'dashed') + + labs(title = .y)) + + var_imp_plots$kmeans + + var_imp_plots$htk + +# Accuracy of vintage prediction ------ + + vintage_assignment <- my_wines %>% + rownames_to_column('observation') %>% + select(observation, vintage) + + ## assignment of samples to the clusters and vintages + + kmeans_assignment <- list(train = train_clusters$kmeans, + test = test_clusters$kmeans) %>% + map(extract, 'assignment') %>% + map(left_join, vintage_assignment, by = 'observation') + + htk_assignment <- list(train = train_clusters$htk, + test = test_clusters$htk) %>% + map(extract, 'assignment') %>% + map(left_join, vintage_assignment, by = 'observation') + + ## frequencies of vintages in the clusters + + kmeans_wine_counts <- kmeans_assignment %>% + map(count, clust_id, vintage) + + htk_wine_counts <- htk_assignment %>% + map(count, clust_id, vintage) + + ## receiver-operating characteristic + + kmeans_roc <- kmeans_assignment %>% + map(transmute, + obs = vintage, + pred = clust_id) %>% + map(as.data.frame) %>% + map(multiClassSummary, + lev = c('Barbera', 'Barolo', 'Grignolino')) + + htk_roc <- htk_assignment %>% + map(transmute, + obs = vintage, + pred = clust_id) %>% + map(as.data.frame) %>% + map(multiClassSummary, + lev = c('Barbera', 'Barolo', 'Grignolino')) + +# summary of roc stats ------- + + roc_summary <- c(kmeans_roc, htk_roc) %>% + reduce(rbind) %>% + as_tibble %>% + mutate(dataset = rep(c('training', 'test'), 2), + algorithm = c(rep('KMEANS', 2), + rep('HTKmeans', 2))) + + roc_summary %>% + pivot_longer(cols = c(Accuracy, + Kappa, + Mean_Sensitivity, + Mean_Specificity), + names_to = 'statistic', + values_to = 'value') %>% + ggplot(aes(x = value, + y = algorithm, + fill = dataset)) + + geom_bar(stat = 'identity', + color = 'black', + position = position_dodge(0.9)) + + facet_wrap(facets = vars(statistic), + scales = 'free') + + labs(title = 'Vintage prediction') + + +# combi 
clustering -------- + + ## training, comparing kmeans with regularized kmeans + + train_combi <- list() + + train_combi$kmeans <- combi_cluster(data = wine_lst$train, + distance_som = 'cosine', + xdim = 5, + ydim = 4, + topo = 'hexagonal', + neighbourhood.fct = 'gaussian', + toroidal = FALSE, + rlen = 3000, + node_clust_fun = kcluster, + distance_nodes = 'squared_euclidean', + k = 3, + clust_fun = 'kmeans') + + train_combi$htk <- combi_cluster(data = wine_lst$train, + distance_som = 'cosine', + xdim = 5, + ydim = 4, + topo = 'hexagonal', + neighbourhood.fct = 'gaussian', + toroidal = FALSE, + rlen = 3000, + node_clust_fun = htk_cluster, + lambdas = seq(0, 1, by = 0.025), + standardize = FALSE, + k = 3) + + ## comparing performance in the training data set and cross-validation + + train_combi %>% + map(summary) + + train_combi %>% + map(cv, + nfolds = 10, + type = 'som', + .parallel = FALSE) %>% + map(summary) %>% + map(select, ends_with('mean')) + + ## comparing performance in the test data set + + train_combi %>% + map(predict, + newdata = wine_lst$test, + type = 'som') %>% + map(summary) + + ## variable importance + + train_combi %>% + map(impact, + n_iter = 10, + .parallel = TRUE) + + +# END ------
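+# Appendix: identification of active variables ------
+
+  ## a minimal sketch (illustration only) based on the internals of
+  ## tune_htk(): variables whose HTKmeans center coordinates are all zero
+  ## take no part in cluster formation; the fitted clusterHD object is
+  ## assumed to be stored in the `clust_obj` element, as done by htk_cluster()
+
+  htk_centers <- train_clusters$htk$clust_obj$HTKmeans.out[[1]]$centers
+
+  active_vars <- clust_variables[colSums(htk_centers^2) > 0]
+
+  active_vars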