From 9ed2609f419c9f8aa42ad5751925c77ce1a97200 Mon Sep 17 00:00:00 2001 From: PiotrTymoszuk <80723424+PiotrTymoszuk@users.noreply.github.com> Date: Fri, 17 Nov 2023 11:22:25 +0100 Subject: [PATCH] Regularized KMEANS clustering, new functionalities Implementation of hard threshold regularized KMEANS clustering (HTKmeans, function `htk_cluster()`). Tuning functions for lambda in HTKmeans and k in kNN prediction of the cluster assignment --- DESCRIPTION | 9 +- NAMESPACE | 18 + R/appearance.R | 27 +- R/class_testing.R | 7 +- R/clust_analysis_oop.R | 201 ++++---- R/combi_analysis_oop.R | 16 + R/combi_clustering.R | 7 + R/constructors.R | 123 ++++- R/cross_distance.R | 23 +- R/cross_distance_utils.R | 19 +- R/cross_validation.R | 126 +++-- R/feature_importance.R | 206 ++++----- R/iterative_semi_supervised.R | 242 ++++++++++ R/plotting_utils.R | 113 +++++ R/red_analysis_oop.R | 116 ++++- R/reduction.R | 73 ++- R/regularized_clustering.R | 481 ++++++++++++++++++++ R/sil_oop.R | 12 +- R/tuner_oop.R | 269 +++++++++++ inst/examples/devel_test.R | 152 ++++++- inst/examples/github_examples_multi_layer.R | 2 +- inst/examples/github_examples_som.R | 18 +- inst/examples/github_examples_sparsity.R | 381 ++++++++++++++++ 23 files changed, 2342 insertions(+), 299 deletions(-) create mode 100644 R/iterative_semi_supervised.R create mode 100644 R/regularized_clustering.R create mode 100644 R/tuner_oop.R create mode 100644 inst/examples/github_examples_sparsity.R diff --git a/DESCRIPTION b/DESCRIPTION index ca9adfe..9de7fe6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: clustTools Type: Package Title: Tools for Dimensionality Reduction and Data Clustering -Version: 1.2.1 +Version: 1.3.0 Maintainer: Piotr Tymoszuk Description: Provides a comprehensive set of R functions for dimensionality - reduction (PCA, MDS and UMAP), clustering (HCl, k-means, PAM, DBSCAN, SOM) - and combined reduction-clustering tools. Visualization and QC toolset is - provided as well. + reduction (PCA, MDS and UMAP), clustering (HCl, KMEANS, PAM, DBSCAN, SOM + and regularized KMEANS) and combined reduction-clustering tools. + Visualization and QC toolset is provided as well. 
License: GPL-3 Encoding: UTF-8 LazyData: true @@ -16,6 +16,7 @@ Authors@R: comment = c(ORCID = "0000-0002-0398-6034")) Imports: caret, + clusterHD, cowplot, coxed, dbscan, diff --git a/NAMESPACE b/NAMESPACE index c1f7787..c4b5d43 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ S3method(extract,clust_analysis) S3method(extract,cluster_cv) S3method(extract,combi_analysis) S3method(extract,red_analysis) +S3method(extract,tuner) S3method(impact,clust_analysis) S3method(impact,combi_analysis) S3method(impact,min_analysis) @@ -39,12 +40,14 @@ S3method(np,red_analysis) S3method(pbc,clust_analysis) S3method(pbc,combi_analysis) S3method(plot,clust_analysis) +S3method(plot,clust_red) S3method(plot,combi_analysis) S3method(plot,cross_dist) S3method(plot,importance) S3method(plot,knb) S3method(plot,red_analysis) S3method(plot,sil_extra) +S3method(plot,tuner) S3method(plot,umatrix_analysis) S3method(plot_clust_hm,clust_analysis) S3method(plot_clust_hm,combi_analysis) @@ -55,22 +58,30 @@ S3method(predict,combi_analysis) S3method(predict,min_analysis) S3method(predict,red_analysis) S3method(predict,umatrix_analysis) +S3method(prediter,clust_analysis) +S3method(prediter,combi_analysis) +S3method(prediter,min_analysis) +S3method(prediter,umatrix_analysis) S3method(print,clust_analysis) S3method(print,combi_analysis) S3method(print,cross_dist) S3method(print,red_analysis) +S3method(print,tuner) S3method(qe,clust_analysis) S3method(qe,combi_analysis) S3method(rename,clust_analysis) S3method(rename,combi_analysis) S3method(silhouette,clust_analysis) S3method(silhouette,combi_analysis) +S3method(summary,clust_analysis) S3method(summary,cluster_cv) +S3method(summary,combi_analysis) S3method(summary,cross_dist) S3method(summary,importance) S3method(summary,knb) S3method(summary,red_analysis) S3method(summary,sil_extra) +S3method(summary,tuner) S3method(te,clust_analysis) S3method(te,combi_analysis) S3method(var,clust_analysis) @@ -97,6 +108,7 @@ export(extract.red_analysis) export(get_clust_tendency) export(get_kernel_info) export(hcluster) +export(htk_cluster) export(impact) export(is_clust_analysis) export(is_cluster_cv) @@ -107,6 +119,7 @@ export(is_knb) export(is_min_analysis) export(is_red_analysis) export(is_sil_extra) +export(is_tuner) export(is_umatrix_analysis) export(kcluster) export(min_max) @@ -121,11 +134,13 @@ export(plot.importance) export(plot.knb) export(plot.red_analysis) export(plot.sil_extra) +export(plot.tuner) export(plot_clust_hm) export(plot_nbclust) export(predict.clust_analysis) export(predict.combi_analysis) export(predict.red_analysis) +export(prediter) export(qe) export(red_analysis) export(reduce_data) @@ -136,11 +151,14 @@ export(silhouette.clust_analysis) export(silhouette.combi_analysis) export(som_cluster) export(som_reduce) +export(summary.clust_analysis) +export(summary.combi_analysis) export(summary.cross_dist) export(summary.importance) export(summary.knb) export(summary.sil_extra) export(te) +export(tune_htk) export(var) export(var.clust_analysis) export(var.combi_analysis) diff --git a/R/appearance.R b/R/appearance.R index e5cbace..be92371 100644 --- a/R/appearance.R +++ b/R/appearance.R @@ -1,6 +1,6 @@ # Appearance for the S3 class objects -#' Printing of a clust_analysis object. +#' Printing of objects. 
#' #' @description #' Prints a `clust_analysis`, `combi_analysis`, `red_analysis` @@ -33,7 +33,12 @@ stopifnot(is_combi_analysis(x)) - purrr::walk(x$clust_analyses, print) + cat(paste('Combined SOM/clusetring analysis with', + x$dist_method, 'distance method.')) + + cat('\nCluster assignment:\n') + + print(as_tibble(x$clust_assignment)) } @@ -61,9 +66,27 @@ stopifnot(is_red_analysis(x)) cat(paste0(toupper(x$red_fun), ' reduction analysis object.')) + cat('\nComponents:\n') + print(as_tibble(x$component_tbl)) } +#' @rdname print.clust_analysis +#' @export + + print.tuner <- function(x, ...) { + + stopifnot(is_tuner(x)) + + cat(paste('Tuning results for', + paste(x$tune_params, collapse = ', '))) + + cat('\nBest parameter combination:\n') + + print(as_tibble(x$best_tune)) + + } + # END ------ diff --git a/R/class_testing.R b/R/class_testing.R index b47926b..91e73bf 100644 --- a/R/class_testing.R +++ b/R/class_testing.R @@ -5,7 +5,7 @@ #' @description #' Tests if the object is an instance of the `red_analysis`, `clust_analysis`, #' `combi_analysis`, `importance`, `cross_dist`, `sil_extra`, `min_analysis`, -#' `umatrix_analysis`, `cluster_cv` or `knb` class. +#' `umatrix_analysis`, `cluster_cv`, `knb` or `tuner` class. #' #' @return a logical value. #' @@ -60,4 +60,9 @@ is_knb <- function(x) inherits(x, 'knb') +#' @rdname is_clust_analysis +#' @export + + is_tuner <- function(x) inherits(x, 'tuner') + # END ------ diff --git a/R/clust_analysis_oop.R b/R/clust_analysis_oop.R index 86228b3..9eedc50 100644 --- a/R/clust_analysis_oop.R +++ b/R/clust_analysis_oop.R @@ -105,6 +105,7 @@ clust_algorithm <- switch(x$clust_fun, hclust = paste(',', x$hc_method, 'method'), kmeans = '', + htk = '', pam = '', som = ', single-layer', supersom = ', multi-layer', @@ -113,6 +114,7 @@ clust_method <- switch(x$clust_fun, hclust = 'Hierarchical clustering', kmeans = 'Kmeans clustering', + htk = 'HT-KMEANS clustering', pam = 'PAM clustering', som = 'SOM clustering', supersom = 'SOM clustering', @@ -137,7 +139,7 @@ if(type == 'data' & x$clust_fun %in% c('supersom', 'supersom_prediction')) { - warning("The 'data' option is not available for mMulti-layer SOM.", + warning("The 'data' option is not available for multi-layer SOM.", call. = FALSE) return(NULL) @@ -152,8 +154,6 @@ if(x$clust_fun %in% c('hclust', 'kmeans', 'pam')) { - plot_title <- 'Optimal cluster number' - k <- length(unique(x$clust_assignment$clust_id)) clust_args <- switch(x$clust_fun, @@ -170,7 +170,7 @@ diss = dist(x, 'distance'), k = k, !!!clust_args, - plot_title = plot_title, + plot_title = 'Optimal cluster number', plot_subtitle = plot_subtitle, plot_tag = plot_tag, cust_theme = cust_theme)) @@ -207,6 +207,14 @@ plot_list <- plot_som(x$clust_obj) + } else if(x$clust_fun == 'htk') { + + plot_list <- plot_htk(x = x, + plot_title = 'Optimal cluster number', + plot_subtitle = plot_subtitle, + plot_tag = plot_tag, + cust_theme = cust_theme) + } else { warning('No training plots available for cluster predictions', @@ -230,113 +238,28 @@ if(is_red_analysis(red_results)) { - ## providing a 1d PCA, MDS or UMAP should throw an error - - if(!all(c('comp_1', 'comp_2') %in% names(red_results$component_tbl))) { - - stop('Atempt to plot a 1-dimensional reduction analysis result. Adjust kdim?', - call. 
= FALSE) - - } - - sdevs <- var(red_results) - - if(red_results$red_fun == 'pca') { - - ax_labs <- map2(c('PC1', 'PC2'), - signif(sdevs$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%')) - - } else { - - ax_labs <- map2(c('Dim 1', 'Dim 2'), - signif(sdevs$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%')) - - } - - point_plot <- - plot_point(data = red_results$component_tbl, - x_var = 'comp_1', - y_var = 'comp_2', - fill_var = 'clust_id', - plot_title = switch(red_results$red_fun , - pca = 'PCA', - mds = 'MDS', - umap = 'UMAP'), - plot_subtitle = plot_subtitle, - plot_tag = plot_tag, - x_lab = ax_labs[[1]], - y_lab = ax_labs[[2]], - cust_theme = cust_theme, - jitter_height = jitter_height, - jitter_width = jitter_width, - fill_lab = 'Cluster ID', - point_alpha = point_alpha) + point_plot <- plot.clust_red(red_results, + type = 'score', + label_clust = TRUE, + cust_theme = cust_theme, + plot_subtitle = plot_subtitle, + jitter_height = jitter_height, + jitter_width = jitter_width, + point_alpha = point_alpha) return(point_plot) } else { - plot_data <- map(red_results, ~.x$component_tbl) - - name_check <- - map_lgl(plot_data, - function(x) all(c('comp_1', 'comp_2') %in% names(x))) - - if(any(!name_check)) { - - stop('Atempt to plot a 1-dimensional reduction analysis result. Adjust kdim?', - call. = FALSE) - - } - - sdev <- map(red_results, var) - - if(red_results[[1]]$red_fun == 'pca') { - - ax_labs <- - map(sdev, - function(layer) map2(c('PC1', 'PC2'), - signif(layer$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%'))) - - } else { - - ax_labs <- - map(sdev, - function(layer) map2(c('Dim1', 'Dim2'), - signif(layer$perc_var[1:2], 3), - ~paste0(.x, ', ', .y, '%'))) - - } - - ax_labs <- purrr::transpose(ax_labs) - - n_numbers <- map(red_results, nobs) - - plot_tags <- map(n_numbers, - ~paste('Observations: n = ', .x[['observations']], - '\nVariables: n = ', .x[['variables']])) - point_plots <- - pmap(list(data = plot_data, - x_lab = ax_labs[[1]], - y_lab = ax_labs[[2]], - plot_tag = plot_tags), - plot_point, - x_var = 'comp_1', - y_var = 'comp_2', - fill_var = 'clust_id', - plot_title = switch(red_results[[1]]$red_fun, - pca = 'PCA', - mds = 'MDS', - umap = 'UMAP'), - plot_subtitle = plot_subtitle, + pmap(list(x = red_results), + plot.clust_red, + type = 'score', + label_clust = TRUE, cust_theme = cust_theme, + plot_subtitle = plot_subtitle, jitter_height = jitter_height, jitter_width = jitter_width, - fill_lab = 'Cluster ID', point_alpha = point_alpha) return(point_plots) @@ -474,4 +397,78 @@ } +# Summary quality stats ------- + +#' Quality control of clustering solutions. +#' +#' @description +#' Computes basic global statistics of quality of a clustering analysis object. +#' +#' @details +#' The statistics retrieved by the `summary()` method are: +#' +#' * _silhouette width_ (`sil_width`) +#' +#' * _fraction of potentially misclassified observations_ with negative +#' silhouette widths (`frac_misclassified`) +#' +#' * _fraction of explained clustering variance_ expressed as the ratio of total +#' between sum of squares to total sum of squares (`frac_var`) +#' +#' * _fraction of preserved nearest neighbors_ (`frac_np`) +#' +#' The statistics are computed with \code{\link{silhouette}}, \code{\link{var}}, +#' and \code{\link{np}} methods for the entire clustering structure and not +#' for particular clusters. +#' +#' @references +#' Rousseeuw PJ. Silhouettes: A graphical aid to the interpretation and +#' validation of cluster analysis. J Comput Appl Math (1987) 20:53–65. 
+#' doi:10.1016/0377-0427(87)90125-7 +#' +#' @references +#' Venna J, Kaski S. Neighborhood preservation in nonlinear projection methods: +#' An experimental study. Lect Notes Comput Sci (including Subser Lect Notes +#' Artif Intell Lect Notes Bioinformatics) (2001) 2130:485–491. +#' doi:10.1007/3-540-44668-0_68 +#' +#' @return a data frame with columns characterized in Details. +#' +#' @param object a `clust_analysis` or `combi_analysis` object. +#' @param ... extra arguments passed to \code{\link{np}}. +#' +#' @export summary.clust_analysis +#' @export + + summary.clust_analysis <- function(object, ...) { + + stopifnot(is_clust_analysis(object) | is_combi_analysis(object)) + + sil_res <- try(silhouette(object), silent = TRUE) + + variance <- try(var(object), silent = TRUE) + + neighbor_preservation <- try(np(object, type = 'final', ...), silent = TRUE) + + stats <- + tibble(sil_width = tryCatch(mean(sil_res$sil_width, na.rm = TRUE), + error = function(e) NA), + frac_misclassified = tryCatch(sum(sil_res$sil_width < 0)/nrow(sil_res), + error = function(e) NA), + frac_var = tryCatch(variance$frac_var, + error = function(e) NA), + frac_np = tryCatch(mean(neighbor_preservation$frac_np, na.rm = TRUE), + error = function(e) NA)) + + if(any(map_lgl(stats, is.na))) { + + warning('At least one statistic could not be calculated.', + call. = FALSE) + + } + + stats + + } + # END ------ diff --git a/R/combi_analysis_oop.R b/R/combi_analysis_oop.R index c6251af..ae8ed63 100644 --- a/R/combi_analysis_oop.R +++ b/R/combi_analysis_oop.R @@ -122,6 +122,10 @@ obs_red <- components(x$clust_analyses$observation, ...) + ## handling bad requests, e.g. training objects for distance reductions + + if(is.null(obs_red)) return(NULL) + score_tbl <- select(obs_red$component_tbl, observation, dplyr::starts_with('comp')) @@ -278,4 +282,16 @@ } +# Summary ------ + +#' @rdname summary.clust_analysis +#' @export summary.combi_analysis +#' @export + + summary.combi_analysis <- function(object, ...) { + + summary.clust_analysis(object, ...) + + } + # END ------ diff --git a/R/combi_clustering.R b/R/combi_clustering.R index 0b40eb8..a6028ad 100644 --- a/R/combi_clustering.R +++ b/R/combi_clustering.R @@ -156,6 +156,11 @@ dots <- list2(...) + ## special handling of lambda vectors for regularized + ## cluster functions of the nodes + + dots$lambdas <- NULL + if(!is.null(som_args)) { dots <- c(dots, @@ -163,6 +168,8 @@ } + dots <- compact(dots) + combi_analysis(list(clust_analyses = list(observation = som_clust, node = node_clust), clust_assignment = combi_ass, diff --git a/R/constructors.R b/R/constructors.R index c8ad646..7a4dbfd 100644 --- a/R/constructors.R +++ b/R/constructors.R @@ -24,6 +24,12 @@ #' #' `data` a quosure calling the original data set. #' +#' If the `component_tbl` data frame contains the cluster assignment information +#' an object of the `clust_red` sub-class is created. This sub-class inherits +#' almost all methods from the superclass `red_analysis`. The only difference +#' concerns the `plot()` method, which, by default generates layout (score) +#' scatter plots with the cluster assignment coded by the point color. +#' #' @param x a named list, see Details. #' #' @return a `red_analysis` object with the elements listed in Details. 
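[editor's note: illustration, not part of the patch] A minimal usage sketch of the new `clust_red` subclass and `summary()` method described above; `my_data` is a hypothetical numeric data frame, and the calls assume the signatures introduced by this patch:

  library(clustTools)

  ## clustering followed by the new one-line quality summary
  clust_obj <- kcluster(my_data, k = 3, distance_method = 'euclidean')
  summary(clust_obj)

  ## components() now tags its result with the 'clust_red' subclass whenever
  ## cluster assignment is available; plot() then colors the scores by cluster
  red_obj <- components(clust_obj, kdim = 2, red_fun = 'pca', with = 'data')
  plot(red_obj, label_clust = TRUE)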
@@ -67,7 +73,19 @@

   ## output

-  structure(x, class = 'red_analysis')
+  x <- structure(x, class = 'red_analysis')
+
+  if(is.data.frame(x$component_tbl)) {
+
+    if('clust_id' %in% names(x$component_tbl)) {
+
+      x <- structure(x, class = c('clust_red', class(x)))
+
+    }
+
+  }
+
+  x

 }
@@ -155,6 +173,7 @@

   stopifnot(x$clust_fun %in% c('hclust',
                                'kmeans',
+                               'htk',
                                'dbscan',
                                'som',
                                'pam',
@@ -529,12 +548,12 @@

   err_txt <-
     paste("The 'fold_stats' element has to be a data frame with the",
-          "'fold', 'corr_rate', 'err_rate', 'frac_var' and 'sil_width'",
+          "'fold', 'accuracy', 'error', 'frac_var' and 'sil_width'",
           "variables")

   if(!is.data.frame(x$fold_stats)) stop(err_txt, call. = FALSE)

-  if(any(!c('fold', 'corr_rate', 'err_rate', 'frac_var', 'sil_width') %in% names(x$fold_stats))) {
+  if(any(!c('fold', 'accuracy', 'error', 'frac_var', 'sil_width') %in% names(x$fold_stats))) {

     stop(err_txt, call. = FALSE)

@@ -628,4 +647,102 @@

 }

+# Tuning results ------
+
+#' Create a `tuner` class object.
+#'
+#' @description
+#' Creates an object of the `tuner` class on top of a list with results of
+#' tuning of parameters of cluster analysis or prediction.
+#'
+#' @details
+#' The input list has to have the following elements:
+#'
+#' * `analysis`: a `clust_analysis` or `combi_analysis` object created with the
+#' best set of the tuning parameters
+#'
+#' * `stats`: a data frame with values of quality stats (silhouette width,
+#' fraction of potentially misclassified observations/negative silhouette width,
+#' fraction of explained clustering variance, and fraction of preserved nearest
+#' neighbors)
+#'
+#' * `fun`: name of the tuning function
+#'
+#' * `dataset`: a string specifying which data was used during the tuning: the
+#' training data set or cross-validation
+#'
+#' * `type`: type of analysis, development or prediction
+#'
+#' * `clust_vars`: a vector of names of clustering variables
+#'
+#' * `tune_params`: a vector of names of the tuning parameters
+#'
+#' * `tune_criteria`: a data frame that specifies which criteria were applied
+#' to select the best combination of the tuning parameters
+#'
+#' * `best_tune`: a data frame storing the best values of the tuning parameters
+#'
+#' @param x a list with elements specified in Details.
+#'
+#' @return an instance of the `tuner` class as described in Details.
+
+  tuner <- function(x) {
+
+    ## entry control -------
+
+    elements <- c('analysis',
+                  'stats',
+                  'fun',
+                  'dataset',
+                  'type',
+                  'clust_vars',
+                  'tune_params',
+                  'tune_criteria',
+                  'best_tune')
+
+    err_txt <-
+      paste("'x' has to be a list with the",
+            paste(elements, collapse = ', '),
+            "elements.")
+
+    if(!is.list(x)) stop(err_txt, call. = FALSE)
+
+    if(any(!elements %in% names(x))) {
+
+      stop(err_txt, call. = FALSE)
+
+    }
+
+    stat_vars <- c('sil_width',
+                   'frac_misclassified',
+                   'frac_var',
+                   'frac_np')
+
+    err_txt <-
+      paste("The `stats` element of 'x' has to be a data frame with the",
+            paste(stat_vars, collapse = ', '),
+            "columns")
+
+    if(!is.data.frame(x$stats)) stop(err_txt, call. = FALSE)
+
+    if(any(!stat_vars %in% names(x$stats))) {
+
+      stop(err_txt, call. = FALSE)
+
+    }
+
+    if(!is_clust_analysis(x$analysis) & !is_combi_analysis(x$analysis)) {
+
+      stop(paste("The `analysis` element of 'x' has to be an instance",
+                 "of the 'clust_analysis' or 'combi_analysis' class."),
+           call. = FALSE)
+
+    }
+
+    ## class setting -------
+
+    structure(x, class = 'tuner')
+
+  }
+
# END ------
diff --git a/R/cross_distance.R b/R/cross_distance.R
index e491785..8dd3b00 100644
--- a/R/cross_distance.R
+++ b/R/cross_distance.R
@@ -162,7 +162,19 @@

   if(!is_multi_layer(x)) {

-    return(cross_single_homolog(x, method))
+    if(!is.null(method)) {
+
+      return(cross_single_homolog(x, method))
+
+    } else {
+
+      dist_obj <- cross_distance.min_analysis(x)
+
+      attr(dist_obj, 'dist_method') <- x$dist_method
+
+      return(dist_obj)
+
+    }

   } else {

@@ -239,11 +251,16 @@
        ~list(clust_assignment[[.x[1]]],
              clust_assignment[[.x[2]]]))

-  ## cross-distance objects
+  ## cross-distance objects: subsetting done in two steps
+  ## to prevent returning one-dimensional vectors

   dist_lst <-
     map(pairs,
-        ~general_dist[.x[[1]], .x[[2]]], drop = FALSE)
+        ~general_dist[.x[[1]], , drop = FALSE])
+
+  dist_lst <-
+    map2(dist_lst, pairs,
+         ~.x[, .y[[2]], drop = FALSE])

   cross_dist(dist_lst,
              type = 'homologous',
diff --git a/R/cross_distance_utils.R b/R/cross_distance_utils.R
index a949dd6..8568244 100644
--- a/R/cross_distance_utils.R
+++ b/R/cross_distance_utils.R
@@ -201,14 +201,27 @@
        function(data, clust) map(clust, ~data[.x, , drop = FALSE]))

+  ## I'm working with safely() here: it is possible that some clusters
+  ## are empty in one of the tested objects

   dist_lst <-
     furrr::future_map(pairs,
-                      ~cross_distance(clust_data$x[[.x[1]]],
-                                      clust_data$y[[.x[2]]],
-                                      method = method),
+                      ~purrr::safely(cross_distance)(clust_data$x[[.x[1]]],
+                                                     clust_data$y[[.x[2]]],
+                                                     method = method),
                       .options = furrr::furrr_options(seed = TRUE))

+  dist_errors <- compact(map(dist_lst, ~.x$error))
+
+  if(length(dist_errors) > 0) {
+
+    warning('Distance calculation failed for some cluster pairs.',
+            call. = FALSE)
+
+  }
+
+  dist_lst <- compact(map(dist_lst, ~.x$result))
+
   cross_dist(dist_lst,
              type = 'heterologous',
              method = method,
diff --git a/R/cross_validation.R b/R/cross_validation.R
index 6123c5c..3fe60c0 100644
--- a/R/cross_validation.R
+++ b/R/cross_validation.R
@@ -44,6 +44,11 @@
 #' @references
 #' Wehrens R, Kruisselbrink J. Flexible self-organizing maps in kohonen 3.0.
 #' J Stat Softw (2018) 87:1–18. doi:10.18637/jss.v087.i07
+#' @references
+#' Venna J, Kaski S. Neighborhood preservation in nonlinear projection
+#' methods: An experimental study. Lect Notes Comput Sci (including Subser Lect
+#' Notes Artif Intell Lect Notes Bioinformatics) (2001) 2130:485–491.
+#' doi:10.1007/3-540-44668-0_68
 #'
 #' @param data a numeric data frame, matrix or a `red_analysis` object. If a
 #' `red_analysis` object is provided as the data argument, the observation
@@ -66,6 +71,12 @@
 #' @param kernel_fun kernel function transforming the distance into weight.
 #' @param clustering_fun clustering function. Should return a
 #' `clust_analysis` object.
+#' @param kNN_data number of the nearest neighbors in the genuine data set
+#' used for calculation of neighborhood preservation. See \code{\link{np}}
+#' for details.
+#' @param kNN_cluster number of the nearest neighbors of the given cluster used
+#' for calculation of neighborhood preservation. See \code{\link{np}} for
+#' details.
 #' @param seed initial setting of the random number generator.
 #' @param .parallel logical, should the CV be run in parallel?
 #' @param ... extra arguments passed to the clustering_fun.
@@ -92,6 +103,8 @@
                          resolve_ties = FALSE,
                          kernel_fun = function(x) 1/x,
                          clustering_fun = clustTools::kcluster,
+                         kNN_data = 5,
+                         kNN_cluster = NULL,
                          seed = 1234,
                          .parallel = FALSE, ...)
{ @@ -242,8 +255,8 @@ by = 'observation')) correct <- NULL - corr_rate <- NULL - err_rate <- NULL + accuracy <- NULL + error <- NULL fold <- NULL test_preds <- @@ -254,54 +267,57 @@ fold = .y)) test_error <- dplyr::summarise(test_preds, - corr_rate = mean(as.numeric(correct), - na.rm = TRUE), - err_rate = 1 - corr_rate, + accuracy = mean(as.numeric(correct), + na.rm = TRUE), + error = 1 - accuracy, .by = 'fold') - ## variances and silhouettes ------- - - frac_var <- NULL - sil_width <- NULL - - test_variances <- map(test_obj, var) + ## variances, silhouettes and neighborhood preservation, out-of-fold ------- - test_variances <- map2_dfr(test_variances, - names(test_variances), - ~tibble(fold = .y, - frac_var = .x$frac_var)) + ## neighborhood preservation seems to be a bit tricky statistic to calculate + ## with some distance metrics for a large dimension number, hence, + ## I'm working with a safe solution - test_silhouettes <- map(test_obj, purrr::safely(silhouette)) + test_stats <- map(test_obj, + purrr::safely(summary), + kNN_data = kNN_data, + kNN_cluster = kNN_cluster) - test_silhouettes <- - compact(map(test_silhouettes, ~.x$result)) + test_stats <- map(test_stats, ~.x$result) - test_silhouettes <- map2_dfr(test_silhouettes, - names(test_silhouettes), - ~tibble(fold = .y, - sil_width = mean(.x$sil_width))) + test_stats <- map2_dfr(test_stats, names(test_stats), + ~mutate(.x, fold = .y)) ## summary stats -------- - test_stats <- reduce(list(test_error, test_variances, test_silhouettes), - left_join, - by = 'fold') + test_stats <- left_join(test_error, + test_stats, + by = 'fold') + + stat_names <- + c('accuracy', 'error', + 'sil_width', 'frac_misclassified', 'frac_var', 'frac_np') test_stats <- - test_stats[c('fold', 'corr_rate', 'err_rate', 'frac_var', 'sil_width')] + select(test_stats, + fold, + dplyr::all_of(stat_names)) oof_means <- - map(test_stats[c('corr_rate', 'err_rate', 'frac_var', 'sil_width')], - mean, na.rm = TRUE) + map(select(test_stats, + dplyr::all_of(stat_names)), + mean, + na.rm = TRUE) bca_err <- - map(test_stats[c('corr_rate', 'err_rate', 'frac_var', 'sil_width')], + map(select(test_stats, + dplyr::all_of(stat_names)), ~.x[!is.na(.x)]) bca_err <- map(bca_err, coxed::bca, conf.level = 0.95) test_summary <- - pmap(list(c('accuracy', 'error', 'frac_var', 'sil_width'), + pmap(list(stat_names[stat_names %in% names(test_stats)], oof_means, bca_err), function(x, y, z) tibble(!!paste0(x, '_mean') := y, @@ -371,6 +387,11 @@ #' @references #' Wehrens R, Kruisselbrink J. Flexible self-organizing maps in kohonen 3.0. #' J Stat Softw (2018) 87:1–18. doi:10.18637/jss.v087.i07 +#' @references +#' Venna J, Kaski S. Neighborhood preservation in nonlinear projection +#' methods: An experimental study. Lect Notes Comput Sci (including Subser Lect +#' Notes Artif Intell Lect Notes Bioinformatics) (2001) 2130:485–491. +#' doi:10.1007/3-540-44668-0_68 #' #' @param x an object. #' @param nfolds number of CV folds. @@ -384,6 +405,12 @@ #' @param resolve_ties logical, should the ties be resolved at random? Applies #' only to the simple unweighted voting algorithm. #' @param kernel_fun kernel function transforming the distance into weight. +#' @param kNN_data number of the nearest neighbors in the genuine data set +#' used for calculation of neighborhood preservation. See \code{\link{np}} +#' for details. +#' @param kNN_cluster number of the nearest neighbors of the given cluster used +#' for calculation of neighborhood preservation. See \code{\link{np}} for +#' details. 
#' @param seed initial setting of the random number generator. #' @param .parallel logical, should the CV be run in parallel? #' @param ... extra arguments, currently none. @@ -395,11 +422,12 @@ #' * kNN projection (prediction) results (`predictions`) #' #' * a data frame with the classification error, accuracy, fraction of -#' explained clustering variance and silhouette for the out-of-fold -#' predictions (`fold_stats`) +#' explained clustering variance, silhouette and neighbor preservation for +#' the out-of-fold predictions (`fold_stats`) #' #' * means and BCA's 95% confidence intervals for the classification error, -#' accuracy, fraction of explained variance and silhouette (`summary`) +#' accuracy, fraction of explained variance, silhouette and neighborhood +#' preservation (`summary`) #' #' Note the \code{\link{summary.cluster_cv}} and #' \code{\link{extract.cluster_cv}} methods. @@ -418,6 +446,8 @@ simple_vote = TRUE, resolve_ties = FALSE, kernel_fun = function(x) 1/x, + kNN_data = 5, + kNN_cluster = NULL, seed = 1234, .parallel = FALSE, ...) { @@ -430,6 +460,21 @@ nfolds <- as.integer(nfolds) kNN <- as.integer(kNN) + kNN_data <- as.integer(kNN_data) + + if(is.null(kNN_cluster)) { + + if(x$clust_fun %in% c('som', 'supersom')) { + + kNN_cluster <- kNN_data + + } else { + + kNN_cluster <- 1 + + } + + } stopifnot(is.logical(simple_vote)) stopifnot(is.logical(resolve_ties)) @@ -485,11 +530,14 @@ resolve_ties = resolve_ties, kernel_fun = kernel_fun, distance_method = distance_names, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster, seed = seed, .parallel = .parallel, k = nrow(ngroups(x)), hc_method = x$hc_method, clust_fun = x$clust_fun, + lambdas = x$lambdas, xdim = x$grid$xdim, ydim = x$grid$ydim, topo = x$grid$topo, @@ -502,7 +550,7 @@ ## function calls ------- - if(x$clust_fun %in% c('hclust', 'som', 'supersom')) { + if(x$clust_fun %in% c('hclust', 'som', 'supersom', 'htk')) { cmm_args$clust_fun <- NULL @@ -520,6 +568,7 @@ clustering_fun = switch(x$clust_fun, hclust = hcluster, kmeans = kcluster, + htk = htk_cluster, pam = kcluster, dbscan = dbscan_cluster, som = som_cluster, @@ -554,6 +603,8 @@ simple_vote = TRUE, resolve_ties = FALSE, kernel_fun = function(x) 1/x, + kNN_data = 5, + kNN_cluster = NULL, seed = 1234, .parallel = FALSE, ...) 
{ @@ -566,6 +617,9 @@ nfolds <- as.integer(nfolds) kNN <- as.integer(kNN) + kNN_data <- as.integer(kNN_data) + + if(is.null(kNN_cluster)) kNN_cluster <- 1 stopifnot(is.logical(simple_vote)) stopifnot(is.logical(resolve_ties)) @@ -604,7 +658,6 @@ } - args <- list2(data = model.frame(x)$observation, nfolds = nfolds, @@ -622,12 +675,15 @@ node_clust_fun = switch(x$clust_analyses$node$clust_fun, hclust = hcluster, kmeans = kcluster, + htk = htk_cluster, pam = kcluster, dbscan = dbscan_cluster, som = som_cluster), kernel_fun = kernel_fun, simple_vote = simple_vote, resolve_ties = resolve_ties, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster, seed = seed, .parallel = .parallel) diff --git a/R/feature_importance.R b/R/feature_importance.R index 7e9add1..06c0b34 100644 --- a/R/feature_importance.R +++ b/R/feature_importance.R @@ -163,6 +163,8 @@ ## entry control -------- + start_time <- Sys.time() + stopifnot(is_clust_analysis(x)) stopifnot(is.numeric(n_iter)) @@ -180,25 +182,49 @@ } + clustering_fun <- + switch(x$clust_fun, + hclust = hcluster, + kmeans = kcluster, + htk = htk_cluster, + pam = kcluster, + dbscan = dbscan_cluster, + som = som_cluster, + supersom = som_cluster) + + ## parallel backend and benchmarking ------- + + message(paste('Permutation importance testing with', + n_iter, 'iterations')) + + if(.parallel) future::plan('multisession') + + on.exit(message(paste('Elapsed:', Sys.time() - start_time)), + add = TRUE) + + on.exit(future::plan('sequential')) + ## common parameters -------- + cmm_list <- + list2(data = model.frame(x), + distance_method = distance_names, + .parallel = .parallel, + k = nrow(ngroups(x)), + hc_method = x$hc_method, + clust_fun = x$clust_fun, + lambdas = x$lambdas, + xdim = x$grid$xdim, + ydim = x$grid$ydim, + topo = x$grid$topo, + eps = x$eps, + minPts = x$minPts, + neighbourhood.fct = as.character(x$grid$neighbourhood.fct), + toroidal = x$grid$toroidal) + if(n_iter == 1) { - cmm_args <- - list2(data = model.frame(x), - distance_method = distance_names, - seed = seed, - .parallel = .parallel, - k = nrow(ngroups(x)), - hc_method = x$hc_method, - clust_fun = x$clust_fun, - xdim = x$grid$xdim, - ydim = x$grid$ydim, - topo = x$grid$topo, - eps = x$eps, - minPts = x$minPts, - neighbourhood.fct = as.character(x$grid$neighbourhood.fct), - toroidal = x$grid$toroidal) + cmm_args <- c(cmm_list, list2(seed = seed)) cmm_args <- c(cmm_args, x$dots) @@ -211,21 +237,7 @@ seeds <- set_names(seeds, paste0('run_', 1:n_iter)) cmm_args <- - map(seeds, - ~list2(data = model.frame(x), - distance_method = distance_names, - seed = .x, - .parallel = FALSE, - k = nrow(ngroups(x)), - hc_method = x$hc_method, - clust_fun = x$clust_fun, - xdim = x$grid$xdim, - ydim = x$grid$ydim, - topo = x$grid$topo, - eps = x$eps, - minPts = x$minPts, - neighbourhood.fct = as.character(x$grid$neighbourhood.fct), - toroidal = x$grid$toroidal)) + map(seeds, ~c(cmm_list, list2(seed = .x))) cmm_args <- map(cmm_args, ~c(.x, x$dots)) @@ -235,7 +247,7 @@ if(n_iter == 1) { - if(x$clust_fun %in% c('hclust', 'som', 'supersom')) { + if(x$clust_fun %in% c('hclust', 'som', 'supersom', 'htk')) { cmm_args$clust_fun <- NULL @@ -250,13 +262,7 @@ imp_call <- call2('importance_cluster', !!!compact(cmm_args), - clustering_fun = switch(x$clust_fun, - hclust = hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster, - supersom = som_cluster)) + clustering_fun = clustering_fun) return(eval(imp_call)) @@ -264,7 +270,7 @@ ## calls: multiple iterations ---------- - if(x$clust_fun %in% 
c('hclust', 'som', 'supersom')) { + if(x$clust_fun %in% c('hclust', 'som', 'supersom', 'htk')) { for(i in names(cmm_args)) { @@ -288,30 +294,12 @@ map(cmm_args, ~call2('importance_cluster', !!!compact(.x), - clustering_fun = switch(x$clust_fun, - hclust = hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster, - supersom = som_cluster))) + clustering_fun = clustering_fun)) - if(.parallel) { - - future::plan('multisession') - - res <- - furrr::future_map(imp_call, - eval, - .options = furrr::furrr_options(seed = TRUE)) - - future::plan('sequential') - - } else { - - res <- map(imp_call, eval) - - } + res <- + suppressMessages(furrr::future_map(imp_call, + eval, + .options = furrr::furrr_options(seed = TRUE))) res <- map2_dfr(res, names(res), @@ -345,6 +333,8 @@ ## entry control ------- + start_time <- Sys.time() + stopifnot(is_combi_analysis(x)) stopifnot(is.numeric(n_iter)) @@ -368,29 +358,45 @@ } + node_clust_fun = switch(x$clust_analyses$node$clust_fun, + hclust = hcluster, + kmeans = kcluster, + htk = htk_cluster, + pam = kcluster, + dbscan = dbscan_cluster, + som = som_cluster) + + ## parallel backend and benchmarking -------- + + message(paste('Permutation importance testing with', + n_iter, 'iterations')) + + if(.parallel) future::plan('multisession') + + on.exit(message(paste('Elapsed:', Sys.time() - start_time)), + add = TRUE) + + on.exit(future::plan('sequential')) + ## common parameters ------- + cmm_list <- + list2(data = model.frame(x)$observation, + clustering_fun = clustering_fun, + distance_som = distance_som, + distance_method = distance_method, + xdim = x$clust_analyses$observation$grid$xdim, + ydim = x$clust_analyses$observation$grid$ydim, + topo = x$clust_analyses$observation$grid$topo, + neighbourhood.fct = as.character(x$clust_analyses$observation$grid$neighbourhood.fct), + toroidal = x$clust_analyses$observation$grid$toroidal, + rlen = nrow(x$clust_analyses$observation$clust_obj$changes), + node_clust_fun = node_clust_fun, + .parallel = .parallel) + if(n_iter == 1) { - args <- - list2(data = model.frame(x)$observation, - clustering_fun = clustering_fun, - distance_som = distance_som, - distance_method = distance_method, - xdim = x$clust_analyses$observation$grid$xdim, - ydim = x$clust_analyses$observation$grid$ydim, - topo = x$clust_analyses$observation$grid$topo, - neighbourhood.fct = as.character(x$clust_analyses$observation$grid$neighbourhood.fct), - toroidal = x$clust_analyses$observation$grid$toroidal, - rlen = nrow(x$clust_analyses$observation$clust_obj$changes), - node_clust_fun = switch(x$clust_analyses$node$clust_fun, - hclust = hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster), - seed = seed, - .parallel = .parallel) + args <- c(cmm_list, list(seed = seed)) args <- c(args, x$dots) @@ -404,26 +410,10 @@ seeds <- set_names(seeds, paste0('run_', 1:n_iter)) + cmm_list$.parallel <- FALSE + args <- - map(seeds, - ~list2(data = model.frame(x)$observation, - clustering_fun = clustering_fun, - distance_som = distance_som, - distance_method = distance_method, - xdim = x$clust_analyses$observation$grid$xdim, - ydim = x$clust_analyses$observation$grid$ydim, - topo = x$clust_analyses$observation$grid$topo, - neighbourhood.fct = as.character(x$clust_analyses$observation$grid$neighbourhood.fct), - toroidal = x$clust_analyses$observation$grid$toroidal, - rlen = nrow(x$clust_analyses$observation$clust_obj$changes), - node_clust_fun = switch(x$clust_analyses$node$clust_fun, - hclust = 
hcluster, - kmeans = kcluster, - pam = kcluster, - dbscan = dbscan_cluster, - som = som_cluster), - seed = .x, - .parallel = FALSE)) + map(seeds, ~c(cmm_list, list(seed = .x))) args <- map(args, ~c(.x, x$dots)) @@ -447,14 +437,6 @@ imp_call <- map(args, ~call2('importance_cluster', !!!.x)) - if(.parallel) { - - future::plan('multisession') - - } - - on.exit(future::plan('sequential')) - exports <- c('tidyverse', 'rlang', 'cluster', @@ -467,10 +449,10 @@ 'clustTools') res <- - furrr::future_map(imp_call, - eval, - .options = furrr::furrr_options(seed = TRUE, - packages = exports)) + suppressMessages(furrr::future_map(imp_call, + eval, + .options = furrr::furrr_options(seed = TRUE, + packages = exports))) res <- map2_dfr(res, names(res), diff --git a/R/iterative_semi_supervised.R b/R/iterative_semi_supervised.R new file mode 100644 index 0000000..9c3c27c --- /dev/null +++ b/R/iterative_semi_supervised.R @@ -0,0 +1,242 @@ +# Iterative kNN label propagation algorithm used for semi-supervised clustering + +# S3 interface ------- + +#' Iterative k-nearest neighbor label propagation algorithm. +#' +#' @description +#' Prediction of cluster assignment by the k-nearest neighbor label propagation +#' algorithm with an automated choice of the k value based on a loss function. +#' +#' @details +#' The function finds the optimal value of k for the k-nearest neighbor +#' classifier by iteratively checking quality of the cluster assignment. The +#' quality check is accomplished by one of the loss functions: +#' silhouette width (`select_stat = 'silhouette'`, default), +#' percentage of observations with negative silhouette widths +#' ('misclassification'), +#' fraction of explained clustering variance (i.e. ratio of the between-cluster +#' sum of squares to the total sum of squares, `select_stat = 'variance'`), +#' or neighbor preservation (`select_stat = 'np'`). +#' The `prediter()` function is a S3 generic function. +#' The function works only for clustering analyses and combined SOM - clustering +#' analyses with the data provided as data frames but not as distance matrices. +#' +#' @references +#' Leng M, Wang J, Cheng J, Zhou H, Chen X. Adaptive semi-supervised +#' clustering algorithm with label propagation. J Softw Eng (2014) 8:14–22. +#' doi:10.3923/jse.2014.14.22 +#' +#' @return an object of the \code{\link{tuner}} class with the `plot()` and `summary()` +#' methods. +#' +#' @param x a `clust_analysis` or `combi_analysis` object. +#' @param newdata a data frame or a matrix with the new data. If `NULL`, +#' the training data is used for fitting the clustering structure. +#' @param select_stat a name of the loss function defining the quality measure +#' of the prediction. For details, see: Details. +#' @param max_k the maximal number of the nearest neighbors to be tested. +#' @param kNN_data number of the nearest neighbors in the dataset, used for +#' determination of neighborhood preservation statistic. See: \code{\link{np}} +#' for details. +#' @param kNN_cluster number of the nearest neighbors in the cluster, used for +#' determination of neighborhood preservation statistic. See: \code{\link{np}} +#' for details. +#' @param .parallel logical, should the analysis be run in parallel? +#' @param ... extra arguments passed to \code{\link{predict.clust_analysis}} or +#' \code{\link{predict.combi_analysis}} such as kernel weighting. Note that you +#' cannot specify the `kNN` and `type` arguments. +#' +#' @export + + prediter <- function(x, ...) 
UseMethod('prediter') + +#' @rdname prediter +#' @export + + prediter.clust_analysis <- function(x, + newdata = NULL, + select_stat = c('silhouette', + 'misclassification', + 'variance', + 'np'), + max_k = 20, + kNN_data = 5, + kNN_cluster = NULL, + .parallel = FALSE, ...) { + + ## entry control ------- + + start_time <- Sys.time() + + stopifnot(is_clust_analysis(x) | is_combi_analysis(x)) + + select_stat <- match.arg(select_stat[1], + c('silhouette', + 'misclassification', + 'variance', + 'np')) + + stopifnot(is.numeric(max_k)) + stopifnot(is.logical(.parallel)) + + n_observations <- nobs(x)$observations + + if(max_k > n_observations) { + + stop("'max_k' has to be lower than the observation number.", + call. = FALSE) + + } + + max_k <- as.integer(max_k) + + k_vec <- 1:max_k + + ## Benchmarking --------- + + message(paste('Finding optimal number of nearest neighbors for', + length(k_vec), 'k values')) + + on.exit(message(paste('Elapsed:', Sys.time() - start_time)), + add = TRUE, + after = FALSE) + + on.exit(future::plan('sequential')) + + ## data check: essentially done by `predict()` ------ + + if(is.null(newdata)) newdata <- model.frame(x) + + check_numeric(newdata) + + ## predictions and quality stats --------- + + if(.parallel) { + + future::plan('multisession') + + preds <- + furrr::future_pmap(list(kNN = k_vec), + propagate, + object = x, + newdata = newdata, + ..., + .options = furrr::furrr_options(seed = TRUE)) + + stats <- + furrr::future_map_dfr(preds, + summary, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster, + .options = furrr::furrr_options(seed = TRUE)) + + } else { + + preds <- pmap(list(kNN = k_vec), + propagate, + object = x, + newdata = newdata, ...) + + stats <- map_dfr(preds, + summary, + kNN_data = kNN_data, + kNN_cluster = kNN_cluster) + + } + + kNN <- NULL + + stats <- mutate(stats, kNN = k_vec) + + stats <- dplyr::relocate(stats, kNN) + + ## choice of the optimal k and output ------- + + select_var <- switch(select_stat, + silhouette = 'sil_width', + misclassification = 'frac_misclassified', + variance = 'frac_var', + np = 'frac_np') + + select_fun <- switch(select_stat, + silhouette = function(x) max(x, na.rm = TRUE), + misclassification = function(x) min(x, na.rm = TRUE), + variance = function(x) max(x, na.rm = TRUE), + np = function(x) max(x, na.rm = TRUE)) + + select_lab <- switch(select_stat, + silhouette = 'max', + misclassification = 'min', + variance = 'max', + np = 'max') + + opt_k <- + filter(stats, + .data[[select_var]] == select_fun(.data[[select_var]])) + + opt_k <- opt_k$kNN[1] + + tuner(list(analysis = preds[[opt_k]], + stats = stats, + fun = 'prediter', + dataset = 'train', + type = 'prediction', + clust_vars = names(model.frame(x)), + tune_params = 'kNN', + tune_criteria = tibble(!!select_var := select_lab), + best_tune = tibble(kNN = opt_k))) + + } + +#' @rdname prediter +#' @export + + prediter.min_analysis <- function(x, ...) { + + warning(paste('Iterative predictions are not available for clustering', + 'analyses done with distance matrices.'), + call. = FALSE) + + return(NULL) + + } + +#' @rdname prediter +#' @export + + prediter.umatrix_analysis <- function(x, ...) { + + warning(paste('Iterative predictions are not available for clustering', + 'analyses done with multiple data layers.'), + call. 
= FALSE)
+
+    return(NULL)
+
+  }
+
+#' @rdname prediter
+#' @export
+
+  prediter.combi_analysis <- function(x,
+                                      newdata = NULL,
+                                      select_stat = c('silhouette',
+                                                      'misclassification',
+                                                      'variance',
+                                                      'np'),
+                                      max_k = 20,
+                                      kNN_data = 5,
+                                      kNN_cluster = NULL,
+                                      .parallel = FALSE, ...) {
+
+    ## delegation: entry control is done by the clust_analysis method -------
+
+    prediter.clust_analysis(x,
+                            newdata = newdata,
+                            select_stat = select_stat,
+                            max_k = max_k,
+                            kNN_data = kNN_data,
+                            kNN_cluster = kNN_cluster,
+                            .parallel = .parallel, ...)
+
+  }
+
+# END -----
diff --git a/R/plotting_utils.R b/R/plotting_utils.R
index ede9541..10da687 100644
--- a/R/plotting_utils.R
+++ b/R/plotting_utils.R
@@ -382,6 +382,119 @@

 }

+# Diagnostic plots for HTK clustering -------
+
+#' Generate diagnostic plots for hard threshold KMEANS clustering.
+#'
+#' @description
+#' Creates plots of within-cluster sum of squares and mean silhouette width
+#' for varying cluster numbers.
+#'
+#' @details
+#' Corresponds to the output of \code{\link[factoextra]{fviz_nbclust}}.
+#' Intended for internal use.
+#'
+#' @return a list of two elements `wss` and `silhouette` storing `ggplot`
+#' objects with the plots of within-cluster sum of squares and silhouette widths.
+#'
+#' @param x a `clust_analysis` object.
+#' @param k a numeric vector with the k cluster numbers. If `NULL`, it
+#' will be determined automatically.
+#' @param plot_title title of the plots.
+#' @param plot_subtitle subtitle of the plots.
+#' @param plot_tag plot tag.
+#' @param cust_theme a custom `ggplot` theme.
+
+  plot_htk <- function(x,
+                       k = NULL,
+                       plot_title = NULL,
+                       plot_subtitle = NULL,
+                       plot_tag = NULL,
+                       cust_theme = ggplot2::theme_classic()) {
+
+    ## input control -----
+
+    stopifnot(is_clust_analysis(x))
+    stopifnot(x$clust_fun == 'htk')
+    stopifnot(inherits(cust_theme, 'theme'))
+
+    ## determining the cluster number and the range of k ------
+
+    clust_n <- nrow(ngroups(x))
+
+    k_vec <- c(1:clust_n,
+               (clust_n + 1):(clust_n + 6))
+
+    ## fitting the clustering objects -------
+
+    clust_calls <-
+      map(k_vec,
+          ~call2('htk_cluster',
+                 data = model.frame(x),
+                 k = .x,
+                 lambdas = x$lambdas,
+                 !!!compact(x$dots)))
+
+    ## safe evaluation: for high cluster numbers
+    ## and high lambda values, the algorithm may not have enough
+    ## unique data points to initialize the centers
+
+    clust_objects <- map(clust_calls, purrr::safely(eval))
+
+    errors <- map(clust_objects, ~.x$error)
+
+    correct <- map_lgl(errors, is.null)
+
+    clust_objects <- compact(map(clust_objects, ~.x$result))
+
+    ## stats --------
+
+    variances <- map(clust_objects, var)
+
+    variances <- map_dbl(variances, ~.x$total_wss)
+
+    ## no point in computing the silhouette for a single cluster:
+    ## it is zero by definition
+
+    sil_w <- map(clust_objects[-1], silhouette)
+
+    sil_w <- map_dbl(sil_w, ~mean(.x$sil_width))
+
+    variance <- NULL
+    silhouette <- NULL
+
+    stats <- tibble(k = k_vec[correct],
+                    variance = variances,
+                    silhouette = c(0, sil_w))
+
+    ## plots -------
+
+    plots <-
+      pmap(list(x = c('variance', 'silhouette'),
+                y = c('Total Within Sum of Square',
+                      'Average silhouette width')),
+           function(x, y) ggplot(stats,
+                                 aes(x = k,
+                                     y = .data[[x]])) +
+             ggplot2::geom_path(color = 'steelblue') +
+             ggplot2::geom_point(shape = 16,
+                                 size = 2,
+                                 color = 'steelblue') +
+             ggplot2::geom_vline(xintercept = clust_n,
+                                 linetype = 'dashed',
+                                 color = 'coral3') +
+             ggplot2::scale_x_continuous(breaks = k_vec) +
+             cust_theme +
+             ggplot2::labs(title = plot_title,
+                           subtitle = plot_subtitle,
+                           tag = plot_tag,
+                           y = y,
+                           x = 'Number of clusters k'))
+
+    set_names(plots, c('wss', 'silhouette'))
+
+  }
+
# General plotting
functions ------- #' Generate a custom scatter ggplot. diff --git a/R/red_analysis_oop.R b/R/red_analysis_oop.R index 65caedf..8f51c68 100644 --- a/R/red_analysis_oop.R +++ b/R/red_analysis_oop.R @@ -38,7 +38,7 @@ # plotting ---- -#' Plot features of a red_analysis object. +#' Plot features of a `red_analysis` object. #' #' @description #' Plots the component table, loadings table - in both cases the @@ -47,13 +47,20 @@ #' the components/dimensions. #' #' @details -#' The loadings table plot is available only for the PCA `red_analysis` objects. +#' The loadings table plot is available only for the PCA and factor +#' analysis `red_analysis` objects. +#' For `red_analysis` objects created with `clust_analysis` objects, scatter +#' plots of the scores/components table/layout can convey +#' the cluster assignment information coded by the point or bar color +#' (`label_clust = TRUE`). #' -#' @param x a `red_analysis` object, created with \code{\link{reduce_data}}. +#' @param x a `red_analysis` object, created with \code{\link{reduce_data}} or +#' `components()` called for clustering analyses. #' @param type plot type: #' -#' * 'component_tbl' or 'score' present the scores for particular observations -#' in a scatter plot +#' * 'component_tbl' or 'score' present the scores (layout) for particular +#' observations in a scatter plot. If `label_clust = TRUE`, point color codes +#' for the optional cluster assignment information. #' #' * 'loadings' plot the variable PCA loadings as a scatter plot. #' @@ -64,6 +71,8 @@ #' #' @param label_points logical, should the variable names be displayed in the #' plot? Valid only for the PCA loadings plot. +#' @param label_clust logical, should the cluster assignment (if available) be +#' coded be the point color? #' @param cust_theme a ggplot plot theme. #' @param segment_color color of the lines presented in the PCA loading plot. #' @param ... extra arguments passed to \code{\link{plot_point}} @@ -81,6 +90,7 @@ 'scree', 'neighborhood'), label_points = TRUE, + label_clust = FALSE, cust_theme = ggplot2::theme_classic(), segment_color = 'steelblue', ...) { @@ -232,6 +242,102 @@ } +#' @rdname plot.red_analysis +#' @export + + plot.clust_red <- function(x, + type = c('component_tbl', + 'scores', + 'loadings', + 'scree', + 'neighborhood'), + label_points = TRUE, + label_clust = TRUE, + cust_theme = ggplot2::theme_classic(), + segment_color = 'steelblue', ...) { + + ## entry control is done be the superclass method ------- + + stopifnot(is.logical(label_clust)) + + type <- match.arg(type[1], + c('component_tbl', + 'scores', + 'loadings', + 'scree', + 'neighborhood')) + + ## calls for the superclass method for plots without cluster assignment ------ + + if(!label_clust) return(NextMethod()) + + if(!type %in% c('component_tbl', 'scores', 'neighborhood')) { + + return(NextMethod()) + + } + + ## numbers of observations and variables ------ + + plot_n <- nobs(x) + + plot_tag <- paste0('\nObservations: n = ', + plot_n$observations, + '\nVariables: n = ', + plot_n$variables) + + ## scatter plots with the cluster assignment information ------ + + if(type %in% c('scores', 'component_tbl')) { + + plot_data <- extract(x, 'scores') + + if(!all(c('comp_1', 'comp_2') %in% names(plot_data))) { + + stop(paste('Atempt to plot a 1-dimensional reduction analysis result.', + 'Adjust kdim?'), + call. 
= FALSE) + + } + + sdevs <- var(x) + + if(x$red_fun == 'pca') { + + ax_labs <- map2(c('PC1', 'PC2'), + signif(sdevs$perc_var[1:2], 3), + ~paste0(.x, ', ', .y, '%')) + + } else { + + ax_labs <- map2(c('Dim 1', 'Dim 2'), + signif(sdevs$perc_var[1:2], 3), + ~paste0(.x, ', ', .y, '%')) + + } + + point_plot <- + plot_point(data = plot_data, + x_var = 'comp_1', + y_var = 'comp_2', + fill_var = 'clust_id', + plot_title = switch(x$red_fun , + pca = 'PCA', + mds = 'MDS', + umap = 'UMAP'), + plot_tag = plot_tag, + x_lab = ax_labs[[1]], + y_lab = ax_labs[[2]], + cust_theme = cust_theme, + fill_lab = 'Cluster ID', ...) + + return(point_plot) + + } + + } + + # prediction ------ #' Project new data onto a reduction analysis layout. diff --git a/R/reduction.R b/R/reduction.R index 7fc92ae..a3543c5 100644 --- a/R/reduction.R +++ b/R/reduction.R @@ -30,6 +30,11 @@ #' @param with type of the input data for the reduction analysis: #' the clustering data ('data'), the matrix of distances between observations #' ('distance') or U matrix between SOM nodes ('umatrix'). +#' @param train_object an optional `red_analysis` object wchich will be used as +#' a training layout. This works currently only for PCA and UMAP and in case +#' of single layer data frames as clustering data. +#' Please refer to \code{\link[stats]{prcomp}} and \code{\link[umap]{umap}} for +#' details. #' @param ... extra arguments passed to \code{\link{reduce_data}}. #' #' @return a `red_analysis` object with the component/score table containing @@ -43,7 +48,8 @@ components.clust_analysis <- function(object, kdim = NULL, red_fun = c('pca', 'mds', 'umap'), - with = c('distance', 'data', 'umatrix'), ...) { + with = c('distance', 'data', 'umatrix'), + train_object = NULL, ...) { ## entry control ------- @@ -55,7 +61,7 @@ if(is.null(kdim)) { - kdim <- length(unique(object$clust_assignment$clust_id)) + kdim <- nrow(ngroups(object)) } @@ -77,6 +83,56 @@ node <- NULL clust_id <- NULL + ## reduction with a trained object ------ + + if(!is.null(train_object)) { + + if(check_supersom) { + + warning(paste('Predictions for multi-layer data are not implemented', + 'at the moment.'), + call. = FALSE) + + return(NULL) + + } + + if(!is_red_analysis(train_object)) { + + stop("'train_object' has to be a 'red_analysis' object.", + call. = FALSE) + + } + + red_fun <- train_object$red_fun + + if(!red_fun %in% c('pca', 'umap')) { + + stop("'train_object' has to be a PCA or UMAP analysis.", + call. = FALSE) + + } + + if(with %in% c('distance', 'umatrix')) { + + warning('Predictions for distances are not supported.', call. = FALSE) + + return(NULL) + + } + + red_obj <- predict(train_object, + newdata = extract(object, type = with), ...) + + red_obj$component_tbl <- + left_join(red_obj$component_tbl, + object$clust_assignment, + by = 'observation') + + return(red_analysis(red_obj)) + + } + ## reduction: most cases -------- ## concerns clustering without multi-layer SOM and distances @@ -102,7 +158,7 @@ } - return(red_obj) + return(red_analysis(red_obj)) } @@ -134,7 +190,7 @@ } - red_objects + map(red_objects, red_analysis) } @@ -147,6 +203,7 @@ NextMethod() } + #' @rdname components.clust_analysis #' @export components.combi_analysis #' @export @@ -154,7 +211,8 @@ components.combi_analysis <- function(object, kdim = NULL, red_fun = c('pca', 'mds', 'umap'), - with = c('distance', 'data', 'umatrix'), ...) { + with = c('distance', 'data', 'umatrix'), + train_object = NULL, ...) 
{ ## entry control ------- @@ -169,7 +227,8 @@ red_analysis <- components(object$clust_analyses$observation, kdim = kdim, red_fun = red_fun, - with = with, ...) + with = with, + train_object = train_object, ...) observation <- NULL @@ -237,7 +296,7 @@ } - data_comps + map(data_comps, red_analysis) } diff --git a/R/regularized_clustering.R b/R/regularized_clustering.R new file mode 100644 index 0000000..d455e6b --- /dev/null +++ b/R/regularized_clustering.R @@ -0,0 +1,481 @@ +# Functions for regularized clustering + +# Regularized K-means clustering ----- + +#' Regularized hard threshold KMEANS clustering. +#' +#' @description +#' Implements the hard threshold KMEANS algorithm proposed by Raymaekers +#' and Zamar and provided by the `clusterHD` package. +#' +#' @details +#' The algorithm offers an interesting approach to clustering of +#' multi-dimensional data, especially those containing variables +#' of little relevance for the clustering analysis (e.g. as investigated by +#' permutation importance with \code{\link{impact}}). For details, please refer +#' to the genuine R function \code{\link[clusterHD]{HTKmeans}} and the seminal +#' paper. The function works with the squared Euclidean distance metric and +#' accepts only a numeric data frame as the input data. +#' There are two crucial parameters to be provided by the user, the number of +#' centers/clusters `k`, which can be determined by methods such as peak mean +#' silhouette width, and the regularization argument `lambdas`, whose value can +#' be found by tuning (e.g. comparing silhouette widths or clustering variances +#' for various lambda values). If `lambdas` is set to `NULL` or provided as +#' a numeric vector, the best lambda value is found with the +#' \code{\link[clusterHD]{getLambda}}. If a single value is provided, it +#' will be used for clustering. +#' Tuning of the lambda parameter using explained clustering variance, +#' silhouette widths and neighbor preservation statistic is facilitated by the +#' `tune_htk()` function. +#' +#' @references +#' Raymaekers J, Zamar RH. Regularized K-means Through Hard-Thresholding. +#' J Mach Learn Res (2022) 23:1–48. +#' Available at: http://jmlr.org/papers/v23/21-0052.html +#' +#' @param data a numeric data frame with observations in the rows and +#' variables in the columns. +#' @param k number of centers (clusters). +#' @param lambdas a numeric vector of the regularization parameter. See Details. +#' @param select_stat statistic used for selection of the optimal lambda value. +#' Ignored if `lambdas` is a single numeric value. For `htk_cluster()` they are +#' 'AIC' (Akaike Information Criterion) or 'BIC' (Bayesian Information +#' Criterion). For `tune_htk()` they are silhouette width ('silhouette'), +#' fraction of observations with negative silhouette values +#' ('misclassification'), fraction of explained clustering variance ('variance'), +#' or neighborhood preservation ('np'). +#' @param seed root of the random number generator. +#' @param type type of the tuning procedure. When set to 'train' (default), +#' cluster structure quality statistics are computed for the entire data set. +#' When set to 'cv', cross-validated statistics for subsequent lambda values +#' are calculated. +#' @param nfolds number of CV folds. +#' @param kNN number of the nearest neighbors used by the cluster assignment +#' classifier in the cross-validation folds. Ignored if `type = 'train'`. +#' @param simple_vote logical, should classical unweighted k-NN classification +#' be applied? 
If FALSE, distance-weighted k-NN is used with the provided kernel +#' function. Ignored if `type = 'train'`. +#' @param resolve_ties logical, should the ties be resolved at random? Applies +#' only to the simple unweighted voting algorithm. Ignored if `type = 'train'`. +#' @param kernel_fun kernel function transforming the distance into weight. +#' Ignored if `type = 'train'`. +#' @param kNN_data number of the nearest neighbors in the genuine data set +#' used for calculation of neighborhood preservation. See \code{\link{np}} +#' for details. +#' @param kNN_cluster number of the nearest neighbors of the given cluster used +#' for calculation of neighborhood preservation. See \code{\link{np}} for +#' details. +#' @param .parallel logical, shoudl the analysis be run in parallel? +#' @param ... extra arguments provided to \code{\link[clusterHD]{HTKmeans}}. +#' +#' @return `htk_cluster()` returns an object of the +#' class \code{\link{clust_analysis}}. +#' +#' @export + + htk_cluster <- function(data, + k = 2, + lambdas = NULL, + select_stat = c('AIC', 'BIC'), + seed = 1234, ...) { + + ## entry control ------- + + check_numeric(data) + + stopifnot(is.numeric(k)) + + k <- as.integer(k[1]) + + if(!is.null(lambdas)) { + + if(!is.numeric(lambdas)) { + + stop(paste("'lambdas' has to be a numeric vector, NULL or a", + "single numeric value."), + call. = FALSE) + + } + + } + + select_stat <- match.arg(select_stat[1], c('AIC', 'BIC')) + + ## check for the data ------ + + check_numeric(data) + + mat_data <- as.matrix(data) + + if(is.null(rownames(mat_data))) { + + obs_id <- 1:nrow(mat_data) + + } else { + + obs_id <- rownames(mat_data) + + } + + model_frame <- enexpr(data) + + ## clustering ------ + + ## handling distance method requests, to make the function compatible + ## with combined SOM procedures + + extra_args <- list2(...) + + extra_args$distance_method <- NULL + + extra_args <- compact(extra_args) + + fun <- clusterHD::HTKmeans + + res_call <- call2('fun', + X = mat_data, + k = k, + lambdas = lambdas, + !!!extra_args) + + res <- eval(res_call) + + ## lambdas as a single number -------- + + if(length(lambdas) == 1) { + + set.seed(seed) + + clust_ass <- + tibble(observation = obs_id, + clust_id = factor(res$HTKmeans.out[[1]]$cluster)) + + return(clust_analysis(list(data = quo(!!model_frame), + dist_mtx = calculate_dist(data, 'squared_euclidean'), + dist_method = 'squared_euclidean', + lambdas = lambdas, + clust_fun = 'htk', + clust_obj = res, + clust_assignment = clust_ass, + dots = list2(...)))) + + } + + ## lambda as NULL or a numeric vector ------ + + opt_lambda <- clusterHD::getLambda(res, type = select_stat) + + return(htk_cluster(data = data, + k = k, + lambdas = opt_lambda, + seed = seed, ...)) + + } + +#' @rdname htk_cluster +#' @export + + tune_htk <- function(data, + k = 2, + lambdas = NULL, + select_stat = c('silhouette', + 'misclassification', + 'variance', + 'np'), + type = c('train', 'cv'), + nfolds = 5, + kNN = 5, + simple_vote = TRUE, + resolve_ties = FALSE, + kernel_fun = function(x) 1/x, + kNN_data = 5, + kNN_cluster = NULL, + seed = 1234, + .parallel = FALSE, ...) 
+#' @rdname htk_cluster
+#' @export
+
+  tune_htk <- function(data,
+                       k = 2,
+                       lambdas = NULL,
+                       select_stat = c('silhouette',
+                                       'misclassification',
+                                       'variance',
+                                       'np'),
+                       type = c('train', 'cv'),
+                       nfolds = 5,
+                       kNN = 5,
+                       simple_vote = TRUE,
+                       resolve_ties = FALSE,
+                       kernel_fun = function(x) 1/x,
+                       kNN_data = 5,
+                       kNN_cluster = NULL,
+                       seed = 1234,
+                       .parallel = FALSE, ...) {
+
+    ## input check ------
+
+    start_time <- Sys.time()
+
+    check_numeric(data)
+
+    clust_features <- names(data)
+
+    stopifnot(is.numeric(k))
+
+    k <- as.integer(k[1])
+
+    type <- match.arg(type[1], c('train', 'cv'))
+
+    stopifnot(is.logical(.parallel))
+
+    select_stat <- match.arg(select_stat[1],
+                             c('silhouette',
+                               'misclassification',
+                               'variance',
+                               'np'))
+
+    ## the CV arguments are checked by a downstream function
+
+    ## if no lambdas are provided, the lambda grid is extracted from
+    ## the output of the HTKmeans() function
+
+    if(is.null(lambdas)) {
+
+      res <- clusterHD::HTKmeans(X = as.matrix(data),
+                                 k = k,
+                                 lambdas = lambdas, ...)
+
+      lambdas <- res$lambdas
+
+    }
+
+    stopifnot(is.numeric(lambdas))
+
+    ## benchmarking and parallel backend ---------
+
+    message(paste('Tuning for', length(lambdas), 'lambda values'))
+
+    if(.parallel) future::plan('multisession')
+
+    on.exit(message(paste('Elapsed:', Sys.time() - start_time)),
+            add = TRUE,
+            after = FALSE)
+
+    ## add = TRUE keeps the elapsed-time handler registered above
+
+    on.exit(future::plan('sequential'), add = TRUE)
+
+    ## tuning objects --------
+
+    extra_args <- list2(...)
+
+    tune_calls <-
+      map(lambdas,
+          ~call2('htk_cluster',
+                 data = data,
+                 k = k,
+                 lambdas = .x,
+                 select_stat = 'AIC',
+                 seed = seed,
+                 !!!compact(extra_args)))
+
+    tune_objects <-
+      furrr::future_map(tune_calls, eval,
+                        .options = furrr::furrr_options(seed = seed))
+
+    tune_objects <- set_names(tune_objects,
+                              paste0('obj_', 1:length(tune_objects)))
+
+    lambda <- NULL
+    object_id <- NULL
+
+    lambda_assign <- tibble(lambda = lambdas,
+                            object_id = names(tune_objects))
+
+    ## training data stats ------
+
+    if(type == 'train') {
+
+      ## working with safely(), because clustering for some lambda values
+      ## can fail
+
+      stats <-
+        furrr::future_map2(tune_objects,
+                           names(tune_objects),
+                           ~purrr::safely(tibble)(summary(.x,
+                                                          kNN_data = kNN_data,
+                                                          kNN_cluster = kNN_cluster),
+                                                  object_id = .y),
+                           .options = furrr::furrr_options(seed = TRUE))
+
+      stats <- map_dfr(stats, ~.x$result)
+
+      stats <- left_join(stats, lambda_assign, by = 'object_id')
+
+    }
+
+    ## CV stats -------
+
+    if(type == 'cv') {
+
+      if(.parallel) {
+
+        ## parallelization over the CV folds if there are more folds than
+        ## lambda values, otherwise over the lambda values
+
+        if(nfolds > length(lambdas)) {
+
+          cv_objects <-
+            map(tune_objects,
+                ~purrr::safely(cv)(.x,
+                                   type = 'propagation',
+                                   nfolds = nfolds,
+                                   kNN = kNN,
+                                   simple_vote = simple_vote,
+                                   resolve_ties = resolve_ties,
+                                   kernel_fun = kernel_fun,
+                                   kNN_data = kNN_data,
+                                   kNN_cluster = kNN_cluster,
+                                   seed = seed,
+                                   .parallel = TRUE))
+
+        } else {
+
+          cv_objects <-
+            furrr::future_map(tune_objects,
+                              ~purrr::safely(cv)(.x,
+                                                 type = 'propagation',
+                                                 nfolds = nfolds,
+                                                 kNN = kNN,
+                                                 simple_vote = simple_vote,
+                                                 resolve_ties = resolve_ties,
+                                                 kernel_fun = kernel_fun,
+                                                 kNN_data = kNN_data,
+                                                 kNN_cluster = kNN_cluster,
+                                                 seed = seed,
+                                                 .parallel = FALSE),
+                              .options = furrr::furrr_options(seed = TRUE))
+
+        }
+
+      } else {
+
+        cv_objects <-
+          map(tune_objects,
+              ~purrr::safely(cv)(.x,
+                                 type = 'propagation',
+                                 nfolds = nfolds,
+                                 kNN = kNN,
+                                 simple_vote = simple_vote,
+                                 resolve_ties = resolve_ties,
+                                 kernel_fun = kernel_fun,
+                                 kNN_data = kNN_data,
+                                 kNN_cluster = kNN_cluster,
+                                 seed = seed,
+                                 .parallel = FALSE))
+
+      }
+
+      correct <- map_lgl(cv_objects, ~is.null(.x$error))
+
+      cv_objects <- compact(map(cv_objects, ~.x$result))
+
+      stats <- map(cv_objects, summary)
+
+      stats <- map2_dfr(stats, names(stats),
+                        ~mutate(.x, object_id = .y))
+
+      stats <- left_join(stats, lambda_assign, by = 'object_id')
+
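+      ## the NULL assignments below declare the non-standard-evaluation
+      ## variables and silence the 'no visible binding' NOTEs of R CMD check
+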
+      frac_var_mean <- NULL
+      sil_width_mean <- NULL
+      frac_np_mean <- NULL
+      frac_misclassified_mean <- NULL
+
+      sil_width <- NULL
+      frac_misclassified <- NULL
+      frac_var <- NULL
+      frac_np <- NULL
+
+      stats <- dplyr::transmute(stats,
+                                object_id = object_id,
+                                lambda = lambda,
+                                sil_width = sil_width_mean,
+                                frac_misclassified = frac_misclassified_mean,
+                                frac_var = frac_var_mean,
+                                frac_np = frac_np_mean)
+
+      stats <- mutate(stats, lambda = lambdas[correct])
+
+    }
+
+    stats <- dplyr::relocate(stats, lambda)
+
+    ## centers and identification of active variables -------
+
+    ## active variables are those with at least one non-zero center
+    ## coordinate; this is checked by computing the Euclidean distance
+    ## of the center coordinates to the zero point
+
+    center_coords <-
+      map(tune_objects,
+          ~.x$clust_obj$HTKmeans.out[[1]]$centers)
+
+    center_coords <- map(center_coords, ~.x^2)
+
+    center_dists <- map(center_coords, colSums)
+
+    center_dists <- do.call('rbind', center_dists)
+
+    center_dists <- sqrt(center_dists)
+
+    center_dists <- set_names(as.data.frame(center_dists),
+                              clust_features)
+
+    active_var_flags <-
+      map(1:nrow(center_dists),
+          ~center_dists[.x, ] != 0)
+
+    n_active_vars <- NULL
+
+    center_dists <- mutate(center_dists,
+                           lambda = lambdas,
+                           n_active_vars = map_dbl(active_var_flags, sum))
+
+    center_dists <- as_tibble(center_dists)
+
+    stats <- left_join(stats,
+                       center_dists[c('lambda', 'n_active_vars')],
+                       by = 'lambda')
+
+    ## best tune and the analysis object -------
+
+    select_var <- switch(select_stat,
+                         silhouette = 'sil_width',
+                         misclassification = 'frac_misclassified',
+                         variance = 'frac_var',
+                         np = 'frac_np')
+
+    select_fun <-
+      switch(select_stat,
+             silhouette = function(x) max(x, na.rm = TRUE),
+             misclassification = function(x) min(x, na.rm = TRUE),
+             variance = function(x) max(x, na.rm = TRUE),
+             np = function(x) max(x, na.rm = TRUE))
+
+    select_lab <- switch(select_stat,
+                         silhouette = 'max',
+                         misclassification = 'min',
+                         variance = 'max',
+                         np = 'max')
+
+    ## if more than one tune is optimal by the numeric statistic criterion,
+    ## the scenario with the lowest number of active variables is selected
+
+    best_stats <-
+      filter(stats,
+             .data[[select_var]] == select_fun(.data[[select_var]]))
+
+    best_stats <-
+      filter(best_stats,
+             n_active_vars == min(n_active_vars))
+
+    best_stats <- best_stats[1, ]
+
+    ## the output object ---------
+
+    tuner(list(analysis = tune_objects[[best_stats$object_id[[1]]]],
+               stats = stats,
+               center_distances = center_dists,
+               fun = 'tune_htk',
+               dataset = type,
+               type = 'development',
+               clust_vars = clust_features,
+               tune_params = 'lambda',
+               tune_criteria = tibble(!!select_var := select_lab,
+                                      n_active_vars = 'min'),
+               best_tune = tibble(lambda = best_stats$lambda[[1]])))
+
+  }
+
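+# A minimal tuning sketch (illustration only, mirroring the package examples;
+# `centered_wines` stands for any mean-centered numeric data frame):
+#
+# tune_obj <- tune_htk(data = centered_wines,
+#                      k = 3,
+#                      lambdas = seq(0, 1, by = 0.025),
+#                      select_stat = 'silhouette',
+#                      type = 'cv',
+#                      nfolds = 10,
+#                      kNN = 11)
+#
+# plot(tune_obj)
+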
+# END -------
diff --git a/R/sil_oop.R b/R/sil_oop.R
index be7a128..9d40e40 100644
--- a/R/sil_oop.R
+++ b/R/sil_oop.R
@@ -18,7 +18,9 @@
 #' @param ... extra arguments, currently none.
 #'
 #' @return a data frame with numeric statistics for the whole clustering
-#' structure (`clust_id` = 'global') and particular clusters.
+#' structure (`clust_id` = 'global') and particular clusters (mean, SD, median,
+#' interquartile range, 95% percentile range, range, number and fraction of
+#' potentially misclassified observations with negative silhouette widths).
 #'
 #' @export summary.sil_extra
 #' @export
@@ -77,7 +79,7 @@
   neg <- map2_dfr(neg, names(neg),
                   ~mutate(.x,
                           clust_id = .y,
-                          perc_negative = 100 * n_negative/n))
+                          frac_misclassified = n_negative/n))
 
   stats <- left_join(neg, stats, by = 'clust_id')
 
@@ -158,7 +160,7 @@
   sil_width <- NULL
   sil_sign <- NULL
   n <- NULL
-  perc_negative <- NULL
+  frac_misclassified <- NULL
   clust_id <- NULL
   observation <- NULL
   neighbor_id <- NULL
@@ -190,8 +192,8 @@
   stats <- mutate(summary(x),
                   plot_lab = paste0('total: n = ', n,
-                                    '\nnegative: ',
-                                    signif(perc_negative, signif_digits), '%',
+                                    '\nnegative fraction = ',
+                                    signif(frac_misclassified, signif_digits),
                                     '\navg(s) = ',
                                     signif(mean, signif_digits)),
                   plot_lab = paste(clust_id, plot_lab, sep = '\n'))
diff --git a/R/tuner_oop.R b/R/tuner_oop.R
new file mode 100644
index 0000000..fd5f88d
--- /dev/null
+++ b/R/tuner_oop.R
@@ -0,0 +1,269 @@
+# S3 OOP interface for the `tuner` class
+
+# Extraction -------
+
+#' Extract features of a `tuner` object.
+#'
+#' @description
+#' The function extracts the `clust_analysis` object, quality statistics or
+#' the best combination of the tuning parameters.
+#'
+#' @param x a `tuner` object.
+#' @param type feature to be extracted from the object:
+#'
+#' * `clust_object` or `analysis` returns the `clust_analysis` object generated
+#' with the best combination of the tuning parameters
+#'
+#' * `stats` extracts quality statistics for combinations of the tuning
+#' parameters
+#'
+#' * `criteria` returns a data frame with criteria of selection of the best
+#' combination of the tuning parameters
+#'
+#' * `best_tune` extracts a data frame with the best values of the tuning
+#' parameters
+#'
+#' @param ... extra arguments, currently none.
+#'
+#' @return the requested feature: a `clust_analysis` object for
+#' `type = 'clust_object'` or `type = 'analysis'`, otherwise a data frame.
+#'
+#' @export
+
+  extract.tuner <- function(x,
+                            type = c('clust_object',
+                                     'analysis',
+                                     'stats',
+                                     'criteria',
+                                     'best_tune'), ...) {
+
+    stopifnot(is_tuner(x))
+
+    type <- match.arg(type[1],
+                      c('clust_object',
+                        'analysis',
+                        'stats',
+                        'criteria',
+                        'best_tune'))
+
+    switch(type,
+           clust_object = x$analysis,
+           analysis = x$analysis,
+           stats = x$stats,
+           criteria = x$tune_criteria,
+           best_tune = x$best_tune)
+
+  }
+
+# Summary -------
+
+#' Summary of quality statistics.
+#'
+#' @description
+#' The `summary()` method called for `tuner` class objects extracts cluster
+#' quality statistics for subsequent combinations of the tuning
+#' parameters from the object.
+#'
+#' @param object a `tuner` object.
+#' @param ... extra arguments, currently none.
+#'
+#' @return a data frame with the following columns:
+#'
+#' * `sil_width`: silhouette width
+#'
+#' * `frac_misclassified`: fraction of observations with negative silhouette
+#' widths suggestive of misclassification
+#'
+#' * `frac_var`: fraction of explained clustering variance
+#'
+#' * `frac_np`: fraction of preserved nearest neighbors
+#'
+#' * columns named after names of the tuning parameters and containing their
+#' values
+#'
+#' @export
+
+  summary.tuner <- function(object, ...) object$stats
+
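+# A minimal usage sketch (illustration only; `tune_obj` is a hypothetical
+# tuner returned e.g. by tune_htk()):
+#
+# summary(tune_obj)                ## quality stats for all tuning scenarios
+# extract(tune_obj, 'best_tune')   ## the best parameter values
+# extract(tune_obj, 'analysis')    ## clust_analysis for the best tune
+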
+# Plotting -------
+
+#' Plot cluster quality statistic values for tuning.
+#'
+#' @description
+#' The function plots quality statistic values (such as explained clustering
+#' variance, silhouette width or mean neighborhood preservation) for
+#' combinations of the tuning parameters.
+#'
+#' @param x a `tuner` object.
+#' @param cust_theme a custom `ggplot` theme.
+#' @param line_alpha alpha of the lines, applies only to plots of
+#' regularization paths.
+#' @param point_size size of the data points.
+#' @param ... extra arguments, currently none.
+#'
+#' @return a list of `ggplot` graphic objects.
+#'
+#' @export plot.tuner
+#' @export
+
+  plot.tuner <- function(x,
+                         cust_theme = ggplot2::theme_classic(),
+                         line_alpha = 1,
+                         point_size = 2, ...) {
+
+    ## entry control ------
+
+    stopifnot(is_tuner(x))
+
+    if(!inherits(cust_theme, 'theme')) {
+
+      stop("'cust_theme' has to be a valid ggplot theme.",
+           call. = FALSE)
+
+    }
+
+    ## variable declaration, compatibility with CHECK ------
+
+    n_active_vars <- NULL
+    value <- NULL
+    statistic <- NULL
+    lambda <- NULL
+    distance <- NULL
+    variable <- NULL
+
+    ## plot data and metadata --------
+
+    stat_vars <- c('sil_width', 'frac_misclassified', 'frac_var', 'frac_np')
+
+    stat_titles <-
+      set_names(c('Cluster separation',
+                  'Potential misclassification',
+                  'Explained clustering variance',
+                  'Neighborhood preservation'),
+                stat_vars)
+
+    stat_labels <-
+      set_names(c('mean silhouette width',
+                  'fraction of negative silhouette widths',
+                  'fraction of explained clustering variance',
+                  'fraction of preserved nearest neighbors',
+                  'fraction of active variables'),
+                c(stat_vars, 'n_active_vars'))
+
+    stat_colors <-
+      set_names(c('coral3',
+                  'steelblue3',
+                  'darkolivegreen',
+                  'plum4',
+                  'firebrick'),
+                c(stat_vars, 'n_active_vars'))
+
+    data_label <- switch(x$dataset,
+                         train = 'training',
+                         cv = 'CV')
+
+    criteria <- paste(stat_labels[names(x$tune_criteria)], collapse = ', ')
+
+    best_label <- map_dbl(x$best_tune, signif, 2)
+
+    best_label <- map2_chr(names(best_label), best_label,
+                           paste, sep = ' = ')
+
+    best_label <- paste(best_label, collapse = ', ')
+
+    plot_subtitle <- paste0('dataset: ', data_label,
+                            '\ncriteria: ', criteria,
+                            '\nbest tune: ', best_label)
+
+    ## plotting table -------
+
+    stats <- summary(x)
+
+    if(x$fun == 'tune_htk') {
+
+      stat_vars <- c(stat_vars, 'n_active_vars')
+
+      stats <- mutate(stats,
+                      n_active_vars = n_active_vars/length(x$clust_vars))
+
+    }
+
+    stats <-
+      tidyr::pivot_longer(stats,
+                          cols = dplyr::all_of(stat_vars),
+                          names_to = 'statistic',
+                          values_to = 'value')
+
+    ## plotting: base plots --------
+
+    if(x$fun %in% c('prediter', 'tune_htk')) {
+
+      base_var <- x$tune_params[1]
+
+      x_intercept <- x$best_tune[[base_var]][1]
+
+      stat_plot <-
+        ggplot(stats,
+               aes(x = .data[[base_var]],
+                   y = value,
+                   color = statistic)) +
+        ggplot2::geom_path() +
+        ggplot2::geom_point(shape = 16,
+                            size = point_size) +
+        ggplot2::geom_vline(xintercept = x_intercept,
+                            linetype = 'dashed') +
+        ggplot2::scale_color_manual(values = stat_colors,
+                                    labels = stat_labels,
+                                    name = 'statistic') +
+        cust_theme +
+        ggplot2::labs(title = paste('Tuning of', base_var),
+                      subtitle = plot_subtitle,
+                      x = base_var,
+                      y = 'statistic value')
+
+    }
+
+    ## plotting: regularization paths ------
+
+    if(x$fun %in% c('tune_htk')) {
+
+      x_intercept <- x$best_tune[['lambda']][1]
+
+      reg_data <-
+        tidyr::pivot_longer(x$center_distances,
+                            cols = dplyr::all_of(x$clust_vars),
+                            names_to = 'variable',
+                            values_to = 'distance')
+
+      reg_plot <-
+        ggplot(reg_data,
+               aes(x = lambda,
+                   y = distance,
+                   color = variable)) +
+        ggplot2::geom_line(alpha = line_alpha) +
+        ggplot2::geom_vline(xintercept = x_intercept,
+                            linetype = 'dashed') +
+        cust_theme +
+        ggplot2::labs(title = 'Regularization paths',
+                      subtitle = plot_subtitle,
+                      x = 'lambda',
+                      y = 'distance from mean')
+
+    }
+
+    ## output ----
+
+    if(x$fun == 'prediter') {
+
+      return(stat_plot)
+
+    } else if(x$fun == 'tune_htk') {
+
+      return(list(statistics = 
stat_plot, + regularization = reg_plot)) + + } + + } + +# END ------ diff --git a/inst/examples/devel_test.R b/inst/examples/devel_test.R index 84cfed7..0a07897 100644 --- a/inst/examples/devel_test.R +++ b/inst/examples/devel_test.R @@ -175,7 +175,7 @@ clust_fun = 'kmeans', k = 3) - test_pam <- kcluster(data = test_red$umap, + test_pam <- kcluster(data = test_data, distance_method = 'manhattan', clust_fun = 'pam', k = 3) @@ -275,7 +275,7 @@ ## predictions for reduction analysis objects - test_pred_red <- predict(object = test_pam, + test_pred_red <- predict(object = test_hcl, newdata = reduce_data(data = new_data, distance_method = 'cosine', kdim = 3, @@ -284,7 +284,7 @@ plot(test_pred_red, type = 'data') - plot(test_pam, type = 'data') + plot(test_hcl, type = 'data') test_pred_pca <- predict(object = test_dbscan, newdata = reduce_data(data = new_data, @@ -331,7 +331,7 @@ # SOM combi clustering and its OOP ----- test_combi <- combi_cluster(data = test_data, - distance_som = 'canberra', + distance_som = 'cosine', xdim = 5, ydim = 4, topo = 'hexagonal', @@ -440,6 +440,12 @@ cross_distance(test_pam) + cross_distance(test_pam, + predict(test_pam, + newdata = new_data, + type = 'propagation')) %>% + plot + test_cross <- cross_distance(test_combi, test_new_combi) summary(cross_distance(test_combi, test_new_combi)) @@ -480,9 +486,6 @@ plot(silhouette(test_combi), fill_by = 'sign') plot(silhouette(test_hcl), fill_by = 'neighbor') -# Batch - - # Multi-layer SOM ------- biopsy_data <- MASS::biopsy %>% @@ -534,8 +537,18 @@ components(train_batch, kdim = 2, red_fun = 'umap', with = 'data') plot(train_batch, type = 'diagnostic') - plot(train_batch, type = 'components', with = 'data', kdim = 2, red_fun = 'mds') - plot(train_batch, type = 'components', with = 'umatrix', kdim = 2, red_fun = 'umap') + + plot(train_batch, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'mds') + + plot(train_batch, + type = 'components', + with = 'umatrix', + kdim = 2, + red_fun = 'umap') plot(train_batch, 'training') @@ -568,7 +581,11 @@ components(test_batch, kdim = 2, red_fun = 'umap', with = 'data') - plot(test_batch, type = 'components', with = 'distance', kdim = 2, red_fun = 'umap') + plot(test_batch, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'umap') test_homo_cross <- cross_distance(train_batch, .parallel = TRUE) @@ -618,6 +635,8 @@ silhouette(car_batch) %>% plot + components(car_batch, kdim = 2, with = 'data') + plot(car_batch, 'heat_map') car_batch %>% @@ -700,6 +719,10 @@ components(biopsy_umatrix_kmeans, kdim = 2, red_fun = 'mds') %>% plot components(biopsy_umatrix_pam, kdim = 2, red_fun = 'pca') %>% plot + plot(biopsy_umatrix_hcl, + type = 'components', + with = 'data') + cross_distance(biopsy_umatrix_hcl) %>% plot('mean') cross_distance(biopsy_umatrix_kmeans) %>% plot('mean') cross_distance(biopsy_umatrix_pam) %>% plot('histogram') @@ -921,4 +944,113 @@ te(test_combi, type = 'final') +# adaptive label propagation ------ + + test_adapt <- test_pam %>% + prediter(newdata = new_data, + select_stat = 'misclassification', + simple_vote = FALSE, + resolve_ties = TRUE, + .parallel = TRUE) + + test_adapt %>% + extract + + test_adapt %>% + summary + + plot(test_adapt) + +# reduction analysis of the clustering data, training object ------- + + train_components <- test_pam %>% + components(kdim = 2, red_fun = 'umap', with = 'data') + + train_components %>% + plot(type = 'component_tbl') + + test_pam %>% + plot('components', + with = 'data') + + new_pam <- predict(test_pam, + newdata = 
new_data, + type = 'propagation') + + ## using the trained components + + new_components <- new_pam %>% + components(kdim = 2, + red_fun = 'pca', + with = 'data', + train_object = train_components) + + plot(new_components, type = 'scores') + + plot(new_pam, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'umap') + + plot(new_pam, + type = 'components', + with = 'data', + train_object = train_components) + +# reduction analysis of the clustering analysis, single layer SOM ------- + + train_components <- test_som %>% + components(kdim = 2, + red_fun = 'pca', + with = 'data') + + plot(train_components) + + new_som <- test_som %>% + predict(newdata = new_data, + type = 'som') + + plot(new_som, + type = 'components', + kdim = 2, + with = 'data', + red_fun = 'pca') + + plot(new_som, + type = 'components', + kdim = 2, + with = 'data', + train_object = train_components) + +# reduction analysis of the combi analysis ------- + + train_components <- test_combi %>% + components(kdim = 2, + with = 'data', + red_fun = 'umap') + + test_new_combi %>% + components(with = 'data', + train_object = train_components) + + new_components <- test_new_neuro_combi %>% + components(with = 'data', + train_object = train_components) + + plot(train_components) + plot(new_components) + + plot(test_combi, + type = 'components', + with = 'data', + kdim = 2, + red_fun = 'umap', + train_object = train_components) + + plot(test_new_combi, + type = 'components', + with = 'data', + train_object = train_components) + # END ------ diff --git a/inst/examples/github_examples_multi_layer.R b/inst/examples/github_examples_multi_layer.R index 97f90d7..1674344 100644 --- a/inst/examples/github_examples_multi_layer.R +++ b/inst/examples/github_examples_multi_layer.R @@ -76,7 +76,7 @@ # fitting the SOM ------- ## the layers will be handled by three various distances: - ## Euclidean for numeric variables, Manhattan for ordinal variables + ## cosine for numeric variables, Manhattan for ordinal variables ## and Tanimoto for binary features car_som <- diff --git a/inst/examples/github_examples_som.R b/inst/examples/github_examples_som.R index 52ae319..933ed48 100644 --- a/inst/examples/github_examples_som.R +++ b/inst/examples/github_examples_som.R @@ -328,11 +328,17 @@ cosine_feature_hm$train + cosine_feature_hm$test +# Cross distances ----- + + cross_distance(cosine_clusters$train, + cosine_clusters$test) %>% + plot('mean') + # Feature importance ------- - cosine_importance <- impact.combi_analysis(cosine_clusters$train, - n_iter = 50, - .parallel = TRUE) + cosine_importance <- impact(cosine_clusters$train, + n_iter = 50, + .parallel = TRUE) plot(cosine_importance) @@ -360,9 +366,9 @@ map(mutate, obs = vintage, pred = car::recode(clust_id, - "'1' = 'Barolo'; - '2' = 'Grignolino'; - '3' = 'Barbera'"), + "'1' = 'Barbera'; + '2' = 'Barolo'; + '3' = 'Grignolino'"), pred = factor(pred, levels = c('Barbera', 'Barolo', 'Grignolino'))) diff --git a/inst/examples/github_examples_sparsity.R b/inst/examples/github_examples_sparsity.R new file mode 100644 index 0000000..691fdb5 --- /dev/null +++ b/inst/examples/github_examples_sparsity.R @@ -0,0 +1,381 @@ +# Fitting a clustering structure with the hard-threshold KMEANS algorithm + +# tools -------- + + library(kohonen) + + library(clustTools) + library(tidyverse) + library(clusterHD) + library(caret) + library(somKernels) + + extract <- clustTools::extract + map <- purrr::map + + library(patchwork) + +# analysis data set ------- + + ## the wines dataset pre-processing: elimination of 
invariant features + ## (low variance to mean ratio) and mean centering + + data("wines") + + my_wines <- wines %>% + as.data.frame %>% + mutate(ID = paste0('wine_', 1:nrow(.))) %>% + column_to_rownames('ID') + + distr_stats <- my_wines %>% + map_dfr(~tibble(variance = var(.x), + mean = mean(.x), + var_mean_ratio = var(.x)/mean(.x))) %>% + mutate(variable = names(my_wines)) %>% + relocate(variable) + + top_variant_features <- distr_stats %>% + filter(var_mean_ratio > 0.1) %>% + .$variable + + my_wines <- my_wines %>% + select(all_of(top_variant_features)) %>% + center_data('mean') + + ## appending the parental data frame with wine classification + ## it will be used for the final validation of the results + + my_wines <- my_wines %>% + mutate(vintage = vintages) + + ## training: 100 randomly chosen observations + ## the rest used for validation# + ## created with caret's createDataPartition() to keep + ## the vintage distribution + + set.seed(12345) + + train_ids <- createDataPartition(my_wines$vintage, p = 100/177)[[1]] + + wine_lst <- + list(train = my_wines[train_ids, ], + test = my_wines[-train_ids, ]) %>% + map(select, -vintage) + +# variable distribution ------ + + ## variable violin plots + + clust_variables <- names(wine_lst$train) + + var_distr_plot <- wine_lst$train %>% + pivot_longer(cols = all_of(clust_variables), + names_to = 'variable', + values_to = 'z_score') %>% + ggplot(aes(x = variable, + y = z_score)) + + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75), + scale = 'width', + fill = 'cornsilk') + + geom_point(shape = 16, + size = 1, + alpha = 0.5, + position = position_jitter(width = 0.1)) + +# constructing clustering structures in the training data set ------- + + train_clusters <- list() + + ## canonical KMEANS + + train_clusters$kmeans <- kcluster(data = wine_lst$train, + distance_method = 'squared_euclidean', + clust_fun = 'kmeans', + k = 3) + + ## regularized KMEANS, finding lambda by CV-tuning + + train_tune <- tune_htk(data = wine_lst$train, + k = 3, + lambdas = seq(0, 1, by = 0.025), + select_stat = 'silhouette', + type = 'cv', + nfolds = 10, + kNN = 11, + .parallel = TRUE) + + summary(train_tune) + + plot(train_tune) + + train_clusters$htk <- extract(train_tune, 'analysis') + + ## renaming the clusters after their predominant vintages (checked elsewhere) + + train_clusters$kmeans <- train_clusters$kmeans %>% + rename(c('2' = 'Barbera', + '1' = 'Barolo', + '3' = 'Grignolino')) + + train_clusters$htk <- train_clusters$htk %>% + rename(c('3' = 'Barbera', + '1' = 'Barolo', + '2' = 'Grignolino')) + +# performance in the training data and cross-validation ------- + + train_clusters %>% + map(summary) + + train_clusters %>% + map(cv, + type = 'propagation', + nfolds = 10, + kNN = 11) %>% + map(summary) %>% + map(select, ends_with('mean')) + +# performance in the test data ------- + + test_clusters <- train_clusters %>% + map(predict, + newdata = wine_lst$test, + type = 'propagation', + kNN = 11) + + test_clusters %>% + map(summary) + + ## cross distances + + map2(train_clusters, + test_clusters, + cross_distance) %>% + map(plot, 'mean') %>% + map(~.x + + scale_fill_gradient2(low = 'firebrick', + mid = 'white', + high = 'steelblue', + midpoint = 20, + limits = c(8, 32))) + +# summary of the performance stats ----- + + clust_performance <- + c(train_clusters, test_clusters) %>% + map_dfr(summary) %>% + mutate(dataset = c(rep('training', 2), + rep('test', 2)), + algorithm = rep(c('KMEANS', 'HTKmeans'), 2)) + + clust_performance %>% + pivot_longer(cols = c(sil_width, + 
frac_misclassified, + frac_var, + frac_np), + names_to = 'statistic', + values_to = 'value') %>% + ggplot(aes(x = value, + y = algorithm, + fill = dataset)) + + geom_bar(stat = 'identity', + color = 'black', + position = position_dodge(0.9)) + + facet_wrap(facets = vars(statistic), + scales = 'free', + labeller = as_labeller(c(sil_width = 'silhouette\nwidth', + frac_misclassified = 'fraction\nmisclassified', + frac_var = 'explained\nvariance', + frac_np = 'fraction\npreserved neighbors'))) + + labs(title = 'Clustering of the wines dataset') + + +# UMAP plots ------ + + umap_train <- wine_lst$train %>% + reduce_data(distance_method = 'euclidean', + kdim = 2, + red_fun = 'umap', + random_state = 12345) + + kmeans_plots <- c(train_clusters, + test_clusters) %>% + set_names(c('train_kmeans', 'train_htk', + 'test_kmeans', 'test_htk')) %>% + map(plot, + type = 'components', + with = 'data', + red_fun = 'umap', + train_object = umap_train) + + kmeans_plots <- + map2(kmeans_plots, + c('Wines, training, KMEANS', + 'Wines, training, HTKmeans', + 'Wines, test, KMEANS', + 'Wines, test, HTKmeans'), + ~.x + + labs(title = .y) + + theme(plot.tag = element_blank(), + legend.position = 'none')) + + kmeans_plots$train_kmeans + + kmeans_plots$test_kmeans + + kmeans_plots$train_htk + + kmeans_plots$test_htk + +# variable importance ------- + + var_imp <- train_clusters %>% + map(impact, + n_iter = 50, + .parallel = TRUE) + + ## plotting and customizing the title + + var_imp_plots <- var_imp %>% + map(plot) %>% + map2(., + paste('Variable importance, wines', + c('KMEANS', 'HTKmeans')), + ~.x + + geom_vline(xintercept = 0, + linetype = 'dashed') + + labs(title = .y)) + + var_imp_plots$kmeans + + var_imp_plots$htk + +# Accuracy of vintage prediction ------ + + vintage_assignment <- my_wines %>% + rownames_to_column('observation') %>% + select(observation, vintage) + + ## assignment of samples to the clusters and vintages + + kmeans_assignment <- list(train = train_clusters$kmeans, + test = test_clusters$kmeans) %>% + map(extract, 'assignment') %>% + map(left_join, vintage_assignment, by = 'observation') + + htk_assignment <- list(train = train_clusters$htk, + test = test_clusters$htk) %>% + map(extract, 'assignment') %>% + map(left_join, vintage_assignment, by = 'observation') + + ## frequencies of vintages in the clusters + + kmeans_wine_counts <- kmeans_assignment %>% + map(count, clust_id, vintage) + + htk_wine_counts <- htk_assignment %>% + map(count, clust_id, vintage) + + ## receiver-operating characteristic + + kmeans_roc <- kmeans_assignment %>% + map(transmute, + obs = vintage, + pred = clust_id) %>% + map(as.data.frame) %>% + map(multiClassSummary, + lev = c('Barbera', 'Barolo', 'Grignolino')) + + htk_roc <- htk_assignment %>% + map(transmute, + obs = vintage, + pred = clust_id) %>% + map(as.data.frame) %>% + map(multiClassSummary, + lev = c('Barbera', 'Barolo', 'Grignolino')) + +# summary of roc stats ------- + + roc_summary <- c(kmeans_roc, htk_roc) %>% + reduce(rbind) %>% + as_tibble %>% + mutate(dataset = rep(c('training', 'test'), 2), + algorithm = c(rep('KMEANS', 2), + rep('HTKmeans', 2))) + + roc_summary %>% + pivot_longer(cols = c(Accuracy, + Kappa, + Mean_Sensitivity, + Mean_Specificity), + names_to = 'statistic', + values_to = 'value') %>% + ggplot(aes(x = value, + y = algorithm, + fill = dataset)) + + geom_bar(stat = 'identity', + color = 'black', + position = position_dodge(0.9)) + + facet_wrap(facets = vars(statistic), + scales = 'free') + + labs(title = 'Vintage prediction') + + +# combi 
clustering -------- + + ## training, comparing kmeans with regularized kmeans + + train_combi <- list() + + train_combi$kmeans <- combi_cluster(data = wine_lst$train, + distance_som = 'cosine', + xdim = 5, + ydim = 4, + topo = 'hexagonal', + neighbourhood.fct = 'gaussian', + toroidal = FALSE, + rlen = 3000, + node_clust_fun = kcluster, + distance_nodes = 'squared_euclidean', + k = 3, + clust_fun = 'kmeans') + + train_combi$htk <- combi_cluster(data = wine_lst$train, + distance_som = 'cosine', + xdim = 5, + ydim = 4, + topo = 'hexagonal', + neighbourhood.fct = 'gaussian', + toroidal = FALSE, + rlen = 3000, + node_clust_fun = htk_cluster, + lambdas = seq(0, 1, by = 0.025), + standardize = FALSE, + k = 3) + + ## comparing performance in the training data set and cross-validation + + train_combi %>% + map(summary) + + train_combi %>% + map(cv, + nfolds = 10, + type = 'som', + .parallel = FALSE) %>% + map(summary) %>% + map(select, ends_with('mean')) + + ## comparing performance in the test data set + + train_combi %>% + map(predict, + newdata = wine_lst$test, + type = 'som') %>% + map(summary) + + ## variable importance + + train_combi %>% + map(impact, + n_iter = 10, + .parallel = TRUE) + + +# END ------
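+# Appendix: identification of active variables ------
+
+  ## a minimal sketch (illustration only) based on the internals of
+  ## tune_htk(): variables whose HTKmeans center coordinates are all zero
+  ## take no part in cluster formation; the fitted clusterHD object is
+  ## assumed to be stored in the `clust_obj` element, as done by htk_cluster()
+
+  htk_centers <- train_clusters$htk$clust_obj$HTKmeans.out[[1]]$centers
+
+  active_vars <- clust_variables[colSums(htk_centers^2) > 0]
+
+  active_vars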