diff --git a/DESCRIPTION b/DESCRIPTION index 358d23e..1d2a99e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: miaTime Type: Package Title: Microbiome Time Series Analysis -Version: 0.1.22 +Version: 0.1.23 Authors@R: c(person(given = "Leo", family = "Lahti", role = c("aut"), email = "leo.lahti@iki.fi", @@ -22,28 +22,29 @@ Description: biocViews: Microbiome, Software, Sequencing, Coverage License: Artistic-2.0 | file LICENSE Depends: - R (>= 4.0) + R (>= 4.0), + mia Imports: dplyr, - methods, - mia, + methods, S4Vectors, - SummarizedExperiment, SingleCellExperiment, - vegan, - scater + SummarizedExperiment, + TreeSummarizedExperiment Suggests: - TreeSummarizedExperiment, - tidySingleCellExperiment, - tidySummarizedExperiment, + BiocStyle, + devtools, ggplot2, - miaViz, + knitr, lubridate, + miaViz, rmarkdown, - knitr, - devtools, - BiocStyle, + scater, testthat, + tidySingleCellExperiment, + tidySummarizedExperiment, + TreeSummarizedExperiment, + vegan, zoo Remotes: github::microbiome/mia diff --git a/NAMESPACE b/NAMESPACE index 0dae9a6..eec4d77 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,20 +1,15 @@ # Generated by roxygen2: do not edit by hand +export(addBaselineDivergence) +export(addStepwiseDivergence) export(getBaselineDivergence) export(getStepwiseDivergence) -export(getTimeDivergence) -exportMethods(getTimeDivergence) -importFrom(S4Vectors,DataFrame) -importFrom(SingleCellExperiment,altExp) -importFrom(SummarizedExperiment,"colData<-") -importFrom(SummarizedExperiment,assay) -importFrom(SummarizedExperiment,colData) -importFrom(dplyr,"%>%") -importFrom(dplyr,filter) +exportMethods(addBaselineDivergence) +exportMethods(addStepwiseDivergence) +exportMethods(getBaselineDivergence) +exportMethods(getStepwiseDivergence) +importFrom(dplyr,arrange) importFrom(dplyr,group_by) +importFrom(dplyr,lag) importFrom(dplyr,mutate) -importFrom(dplyr,select) -importFrom(methods,is) -importFrom(mia,estimateDivergence) -importFrom(mia,mergeSEs) -importFrom(vegan,vegdist) +importFrom(dplyr,ungroup) diff --git a/R/data.R b/R/data.R index 2d5f659..fa7df16 100755 --- a/R/data.R +++ b/R/data.R @@ -84,7 +84,8 @@ NULL NULL #' @title SilvermanAGutData -#' @description The SilvermanAGutData dataset contains 16S rRNA gene based high-throughput +#' @description +#' The SilvermanAGutData dataset contains 16S rRNA gene based high-throughput #' profiling of 4 in vitro artificial gut models. The sampling was done hourly #' and daily to capture sub-daily dynamics of microbial community originating #' from human feces. The data consists of 413 taxa from 639 samples. diff --git a/R/getBaselineDivergence.R b/R/getBaselineDivergence.R index a03377e..0307e6a 100644 --- a/R/getBaselineDivergence.R +++ b/R/getBaselineDivergence.R @@ -1,244 +1,352 @@ +#' @name addBaselineDivergence +#' @export +#' +#' @title #' Beta diversity between the baseline and later time steps -#' +#' +#' @description #' Calculates sample dissimilarity between the given baseline and other #' time points, optionally within a group (subject, reaction chamber, or #' similar). The corresponding time difference is returned as well. -#' The method operates on `SummarizedExperiment` objects, and the results -#' are stored in `colData`. -#' -#' @inheritParams getStepwiseDivergence -#' @param baseline_sample \code{Character vector}. Specifies the baseline sample(s) to be used. If the -#' "group" argument is given, this must be a named vector; one element per group. (Default: \code{NULL}) -#' -#' @return a -#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -#' or -#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' containing the sample dissimilarity and corresponding time difference between -#' samples (across n time steps), within each level of the grouping factor. -#' +#' #' @details -#' The group argument allows calculating divergence per group. Otherwise, this is done across all samples at once. +#' The group argument allows calculating divergence per group. If given, the +#' divergence is calculated per group. e.g. subject, chamber, group etc. +#' Otherwise, this is done across all samples at once. #' -#' The baseline sample/s always need to belong to the data object i.e. they can be merged into it before -#' applying this function. The reason is that they need to have comparable sample data, at least some time point +#' The baseline sample(s) always need to belong to the data object i.e. they +#' can be merged into it before +#' applying this function. The reason is that they need to have comparable +#' sample data, at least some time point #' information for calculating time differences w.r.t. baseline sample. #' -#' The baseline time point is by default defined as the smallest time point (per group). Alternatively, -#' the user can provide the baseline vector, or a list of baseline vectors per group (named list per group). -#' -#' @importFrom mia mergeSEs -#' @importFrom dplyr %>% -#' @importFrom dplyr filter -#' @importFrom dplyr group_by -#' @importFrom dplyr mutate -#' @importFrom dplyr select -#' @importFrom vegan vegdist -#' @importFrom SummarizedExperiment assay -#' @importFrom SummarizedExperiment colData -#' @importFrom SummarizedExperiment colData<- -#' @importFrom SingleCellExperiment altExp +#' The baseline time point is by default defined as the smallest time point +#' (per group). Alternatively, +#' the user can provide the baseline vector, or a list of baseline vectors per +#' group (named list per group). +#' +#' @return +#' \code{getBaselineDivergence} returns \code{DataFrame} object +#' containing the sample dissimilarity and corresponding time difference between +#' samples. \code{addBaselineDivergence}, on the other hand, returns a +#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' object with these results in its \code{colData}. +#' +#' @param x A +#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' object. +#' +#' @param assay.type \code{Character scalar}. Specifies which assay values are +#' used in the dissimilarity estimation. (Default: \code{"counts"}) +#' +#' @param group \code{Character scalar}. Specifies a name of the column from +#' \code{colData} that identifies the grouping of the samples. +#' (Default: \code{NULL}) +#' +#' @param time.col \code{Character scalar}. Specifies a name of the column from +#' \code{colData} that identifies the sampling time points for the samples. +#' +#' @param method \code{Character scalar}. Used to calculate the dissimilarity +#' Method is passed to the function that is specified by \code{dis.fun}. +#' (Default: \code{"bray"}) +#' +#' @param reference \code{Character scalar}. Specifies a name of the column from +#' \code{colData} that identifies the baseline samples to be used. +#' (Default: \code{NULL}) +#' +#' @param name \code{Character scalar}. Specifies a column name for storing +#' divergence results. (Default: \code{"divergence"}) +#' +#' @param name.time \code{Character scalar}. Specifies a column name for storing +#' time differences. (Default: \code{"time_diff"}) +#' +#' @param ... Optional arguments passed into +#' \code{\link[mia:addDivergence]{mia::addDivergence()}}. #' #' @examples -#' #library(miaTime) -#' library(TreeSummarizedExperiment) -#' library(dplyr) +#' library(miaTime) +#' library(mia) #' #' data(hitchip1006) -#' tse <- mia::transformCounts(hitchip1006, method = "relabundance") -#' -#' # Subset to speed up example -#' tse <- tse[, colData(tse)$subject %in% c("900", "934", "843", "875")] -#' -#' tse2 <- getBaselineDivergence(tse, -#' group = "subject", -#' time_field = "time", -#' name_divergence = "divergence_from_baseline", -#' name_timedifference = "time_from_baseline", -#' assay.type="relabundance", -#' FUN = vegan::vegdist, -#' method="bray") -#' -#' tse2 <- getBaselineDivergence(tse, -#' baseline_sample = "Sample-875", -#' group = "subject", -#' time_field = "time", -#' name_divergence = "divergence_from_baseline", -#' name_timedifference = "time_from_baseline", -#' assay.type="relabundance", -#' FUN = vegan::vegdist, -#' method="bray") -#' -#' @name getBaselineDivergence -#' @export -getBaselineDivergence <- function(x, - group=NULL, - time_field, - name_divergence = "divergence_from_baseline", - name_timedifference = "time_from_baseline", - assay.type = "counts", - FUN = vegan::vegdist, - method="bray", - baseline_sample=NULL, - altexp = NULL, - dimred = NULL, - n_dimred = NULL, - ...){ - - # Store the original data object - xorig <- x - - # Use altExp if mentioned and available - if (!is.null(altexp)) { - .check_altExp_present(altexp, x) - x <- altExp(x, altexp) - } - - if (is.null(colnames(x))) { - colnames(x) <- as.character(seq_len(ncol(x))) - } - original.names <- colnames(x) - - # global vars - is <- NULL - group_by <- NULL - tmp_group_for_groupwise_splitting <- NULL - time <- NULL - filter <- NULL +#' tse <- transformAssay(hitchip1006, method = "relabundance") +#' +#' # By default, reference samples are the samples from the first timepoint +#' tse <- addBaselineDivergence( +#' tse, +#' group = "subject", +#' time.col = "time", +#' assay.type = "relabundance", +#' method = "bray") +#' +#' # Add reference samples to colData, if you want to specify reference +#' # samples manually +#' colData(tse)[["reference"]] <- "Sample-875" +#' tse <- addBaselineDivergence( +#' tse, +#' reference = "reference", +#' group = "subject", +#' time.col = "time", +#' name = "divergence_from_baseline", +#' name.time = "time_from_baseline", +#' assay.type = "relabundance", +#' method = "bray") +#' +#' @seealso +#' \code{\link[mia:addDivergence]{mia::addDivergence()}} +#' +NULL - # Add time - # colData(x)$time <- colData(x)[[time_field]] - x <- .add_values_to_colData(x, list(colData(x)[[time_field]]), "time") +#' @rdname addBaselineDivergence +#' @export +setGeneric("getBaselineDivergence", signature = "x", function(x, ...) + standardGeneric("getBaselineDivergence")) - # If group is not given, assume that all samples come from a single group - if (is.null(group)) { - colData(x)$tmp_group_for_groupwise_splitting <- rep(1, nrow=nrow(x)) - } else if (is.character(group)) { - colData(x)$tmp_group_for_groupwise_splitting <- as.character(colData(x)[[group]]) - } else { - stop("The group argument in getBaselineDivergence should be NULL or a character i.e. name of a colData grouping field.") +#' @rdname addBaselineDivergence +#' @export +setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"), + function( + x, + time.col, + assay.type = "counts", + reference = NULL, + group = NULL, + method = "bray", + ...){ + ############################# INPUT CHECK ############################## + x <- .check_and_get_altExp(x, ...) + # time.col must specify numeric column from colData + temp <- .check_input( + time.col, list("character scalar"), colnames(colData(x))) + if( !is.numeric(x[[time.col]]) ){ + stop("'time.col' must specify numeric column from colData(x)", + call. = FALSE) + } + # + .check_assay_present(assay.type, x) + # + temp <- .check_input( + reference, + list(NULL, "character scalar", "character vector")) + # + temp <- .check_input( + group, list(NULL, "character scalar"), colnames(colData(x))) + # + temp <- .check_input(method, list("character scalar")) + # + if( is.null(rownames(x)) ){ + rownames(x) <- paste0("row", seq_len(nrow(x))) + } + if( is.null(colnames(x)) ){ + colnames(x) <- paste0("col", seq_len(ncol(x))) + } + ########################### INPUT CHECK END ############################ + # Add baseline samples to colData + args <- .add_reference_samples_to_coldata( + x, time.col, group, reference, assay.type, method, + reference.method = "baseline", ...) + # Create an argument list. Do not include altexp as it is already taken + # into account. + args <- c( + args, + list(assay.type = assay.type, method = method), + list(...)[!names(list(...)) %in% c("altexp")] + ) + # Calculate divergences + res <- do.call(getDivergence, args) + # Add time difference + x <- args[["x"]] + reference <- args[["reference"]] + time_res <- .get_time_difference(x, time.col, reference) + # Create a DF to return + res <- .convert_divergence_to_df(x, res, time_res, ...) + return(res) } +) - # Split SE into a list, by grouping - # TODO: switch to mia::splitOn - spl <- split(seq_len(ncol(x)), colData(x)$tmp_group_for_groupwise_splitting) +#' @rdname addBaselineDivergence +#' @export +setGeneric("addBaselineDivergence", signature = "x", function(x, ...) + standardGeneric("addBaselineDivergence")) - # Sample with the smallest time point within each subject - # Use the smallest time point as the baseline - if (is.null(baseline_sample)) { - colData(x)$sample <- colnames(x) - baseline <- colData(x) %>% as.data.frame() %>% - group_by(tmp_group_for_groupwise_splitting) %>% - mutate(rank = rank(time, ties.method="first")) %>% - filter(rank==1) %>% - select(sample, tmp_group_for_groupwise_splitting) - baseline_sample <- baseline$sample - names(baseline_sample) <- baseline$tmp_group_for_groupwise_splitting - nams <- names(baseline_sample) - baseline_sample <- vapply(nams, function (g) {baseline_sample[[g]]}, "a") - names(baseline_sample) <- nams +#' @rdname addBaselineDivergence +#' @export +setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"), + function(x, name = "divergence", name.time = "time_diff", ...){ + # Calculate divergence + res <- getBaselineDivergence(x, ...) + # Add to colData + res <- as.list(res) |> unname() + x <- .add_values_to_colData(x, res, list(name, name.time), ...) + return(x) } +) - # Then make sure that the baseline is an SE object - if (is.character(baseline_sample)) { - if (length(baseline_sample)==1) { - baseline <- x[, baseline_sample] - } else { - if (is.null(names(baseline_sample))) {stop("Baseline sample has to be a named vector per group if it contains group-wise elements.")} - # Just make sure that the given baseline samples are in the same order than the grouping variable - baseline <- x[, baseline_sample[unique(colData(x)$tmp_group_for_groupwise_splitting)]] +################################ HELP FUNCTIONS ################################ - } - } else if (is(baseline_sample, "SummarizedExperiment")) { - baseline <- baseline_sample - } else { - stop("Baseline sample not recognized in getBaselineDivergence. Should be NULL or a (named) character vector.") +# This function unifies the input of baseline samples. Despite on how the +# baseline information was provided, this function output TreeSE with baseline +# info for each sample in colData. +.add_reference_samples_to_coldata <- function( + x, time.col, group, reference = NULL, + ref.name = "temporal_reference_for_divergence", + group.name = "temporal_group_for_divergence", + time.interval = NULL, + reference.method = "baseline", + ...){ + # + temp <- .check_input( + reference, + list(NULL, "character scalar", "character vector")) + # + temp <- .check_input(ref.name, list("character scalar")) + # + temp <- .check_input(group.name, list("character scalar")) + # + temp <- .check_input(time.interval, list(NULL, "numeric scalar")) + # + temp <- .check_input( + reference.method, list("character scalar"), + list("baseline", "stepwise")) + # + if( reference.method == "stepwise" && is.null(time.interval) ){ + stop("'time.interval' must be specified.", call. = FALSE) } - - # Apply the operation per group; with group-specific baselines - if (ncol(baseline) == 1) { - xli <- lapply(names(spl), function (g) { - .calculate_divergence_from_baseline(x[,spl[[g]]], baseline, - time_field, name_divergence, name_timedifference, assay.type, FUN, - method, dimred, n_dimred, ...)}) - } else { - xli <- lapply(names(spl), function (g) { - .calculate_divergence_from_baseline(x[,spl[[g]]], baseline[, baseline_sample[[g]]], - time_field, name_divergence, name_timedifference, assay.type, FUN, - method, dimred, n_dimred, ...)}) + # Get colData + cd <- colData(x) + + # Check that group is correctly defined. It can be either NULL, a column + # from colData or a vector that has group information for all samples. + # If it is NULL, add group info --> all samples are in same group + if( is.null(group) ){ + cd[[group.name]] <- rep("group", nrow(cd)) + group <- group.name } - - # Return the elements in a list - # FIXME: use SummarizedExperiment merge here or the new TreeSE merge thing - if (length(xli) > 1) { - x2 <- xli[[1]] - for (i in seq(2, length(xli), 1)) { - x2 <- TreeSummarizedExperiment::cbind(x2, xli[[i]]) - } - } else { - x2 <- xli[[1]] + # If it is a single character value, it should specify a column from + # colData + is_colname <- .is_non_empty_string(group) && group %in% colnames(cd) + # If it is a vector, then it should have values for all the samples + is_vector <- .is_non_empty_character(group) && length(group) == nrow(cd) + if( !(is_colname || is_vector) ){ + stop("'group' must be NULL or a single character value specifying ", + "a column from colData(x).", call. = FALSE) } - - # FIXME: reimplement the splitting so that we do not need intermediate variable like this - colData(x2)$tmp_group_for_groupwise_splitting <- NULL - - # Return - return(x2) - -} - - -# First define the function that calculates divergence for a given SE object -#' @importFrom mia estimateDivergence -#' @importFrom methods is -.calculate_divergence_from_baseline <- function (x, baseline, time_field, - name_divergence, name_timedifference, - assay.type, FUN, method, - dimred, n_dimred) { - - # Global vars - is <- NULL - - # If baseline is SE object then just ensure it has exactly one sample (well-defined baseline). - # Otherwise, split the baseline from the data object. - # Baseline is either an SE object with the same time field than x - # or baseline specifies one sample from x - if (is(baseline, "SummarizedExperiment")) { - if (ncol(baseline)>1) { - stop("If baseline is an SE object it should have a single sample.") - } else { - reference <- baseline - } - } else if (is.character(baseline) || is.numeric(baseline)) { - reference <- x[, baseline] - } else { - stop("Baseline must be character or numeric vector specifying the SE sample; or it must be an SE object.") + # If it was correctly defined vector, add it to colData + if( is_vector ){ + cd[[group.name]] <- group + group <- group.name } - - # Getting corresponding matrices, to calculate divergence - mat <- .get_mat_from_sce(x, assay.type, dimred, n_dimred) - ref_mat <- .get_mat_from_sce(reference, assay.type, dimred, n_dimred) - # transposing mat if taken from reducedDim - if (!is.null(dimred)){ - mat <- t(mat) - ref_mat <- t(ref_mat) + # If the reference is NULL, it means that user did not specify it. + # Get the reference samples. + if( is.null(reference) ){ + ref <- .get_reference_samples( + cd, time.col, time.interval, group, reference.method) + cd[[ref.name]] <- ref + reference <- ref.name + } + # If reference was specified, check that it is specifying samples + # correctly. + # It can be a single character value specifying a column from colData + # (preferred) or single character value specifying a sample. + is_colname <- .is_non_empty_string(reference) && reference %in% colnames(cd) + is_sample <- .is_non_empty_string(reference) && reference %in% rownames(cd) + # Column name from colData takes precedence + is_sample <- is_sample && !is_colname + # It can also be a character vector. Then its length should match with + # the length of sample or groups if "group" is specified. (At this point, + # group cannot be NULL, because we defined it earlier if it was not + # specified by user). Moreover, if the vector specified reference for each + # group, it must include names that links to groups. + is_vector_sam <- .is_non_empty_character(reference) && + length(reference) == nrow(cd) + is_vector_group <- .is_non_empty_character(reference) && + length(reference) == length(unique(cd[[group]])) && + !is.null(names(reference)) && all(names(reference) %in% cd[[group]]) + # Give warning if the input was incorrect + if( !(is_colname || is_sample || is_vector_sam || + is_vector_group) ){ + stop("'reference' must be NULL or a single character value specifying ", + "a column from colData(x).", call. = FALSE) + } + # If the vector was for each group, extend the vector for each sample + if( is_vector_group ){ + reference <- reference[ match(cd[[group]], names(reference)) ] + } + # If it was character vector or a sample name, add it to colData + if( is_vector_sam || is_vector_group || is_sample ){ + cd[[ref.name]] <- reference + reference <- ref.name } - # Beta divergence from baseline info - divergencevalues <- mia:::.calc_divergence( - cbind(mat, ref_mat), colnames(ref_mat), FUN = FUN, method = method) - divergencevalues <- divergencevalues[seq_len(ncol(mat)), "value"] + # Add modified colData back to TreeSE + colData(x) <- cd + # The returned value includes the TreeSE along with reference + # column name because it might be that we have modified it. + res <- list(x = x, reference = reference) + return(res) +} - # Add time divergence from baseline info; note this has to be a list - timevalues <- list(colData(x)[, time_field] - colData(reference)[, time_field]) +# This function returns the first sample for each group by default. +# Alternatively, it returns the previous ith sample for each sample in each +# group. +#' @importFrom dplyr group_by mutate arrange ungroup lag +.get_reference_samples <- function( + df, time.col, time.interval, group, reference.method){ + rowname_col <- "temporary_rownames_column" + reference_col <- "temporary_reference_column" + # Store rownames and add rownames as a column + df[[rowname_col]] <- original_order <- rownames(df) + # Convert to data.frame and group data based on group + df <- df |> + as.data.frame() |> + group_by(.data[[group]]) - x <- .add_values_to_colData(x, timevalues, name_timedifference) - x <- .add_values_to_colData(x, list(divergencevalues), name_divergence) - - # Return - return(x) - + # Determine the method and perform the respective operations + if( reference.method == "baseline" ){ + # Find first timepoint within a group + df <- df |> + mutate(!!reference_col := + .data[[rowname_col]][which.min(.data[[time.col]])]) + } else if( reference.method == "stepwise" ){ + # For each sample, get the previous ith sample. + # For each subject, get previous sample based on time. + df <- df |> + mutate(!!reference_col := lag( + .data[[rowname_col]], n = time.interval, + order_by = .data[[time.col]])) + } + # Ungroup to revert to the original structure and convert to DataFrame + df <- df |> + ungroup() |> + DataFrame() + # Put the data into original order + rownames(df) <- df[[rowname_col]] + df <- df[original_order, ] + # Get only reference samples + res <- df[[reference_col]] + return(res) } +# This function get time difference between a sample and its referene sample +.get_time_difference <- function(x, time.col, reference){ + # Get timepoints + time_point <- x[[time.col]] + # Get reference time points + ref <- colData(x)[x[[reference]], time.col] + # Get difference + res <- time_point - ref + return(res) +} +# This function converts time divergence results to DF object +.convert_divergence_to_df <- function( + x, res, time_res, name = "divergence", name.time = "time_diff", ...){ + # + temp <- .check_input(name, list("character scalar")) + # + temp <- .check_input(name.time, list("character scalar")) + # + df <- DataFrame(res, time_res, row.names = colnames(x)) + colnames(df) <- c(name, name.time) + return(df) +} diff --git a/R/getStepwiseDivergence.R b/R/getStepwiseDivergence.R index 11e6e01..adf9c95 100644 --- a/R/getStepwiseDivergence.R +++ b/R/getStepwiseDivergence.R @@ -1,246 +1,132 @@ -#' Beta diversity between consecutive time steps -#' -#' Calculates sample dissimilarity between consecutive time points (t, t+i), -#' within a group (subject, reaction chamber, or similar). The corresponding -#' time difference is returned as well. The method operates on -#' `SummarizedExperiment` objects, and the results are stored in `colData`. -#' -#' @param x A -#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -#' object. -#' @param group \code{Character scalar}. Specifies the grouping -#' factor (name of a `colData` field). If given, the divergence is calculated -#' per group. e.g. subject, chamber, group etc.). (Default: \code{NULL}) -#' @param time_field \code{Character scalar}. Specifies the name of the -#' time series field in `colData`. -#' @param time_interval \code{Integer scalar}. Indicates the increment between time -#' steps. If you need to take every second, every third, or so, time step only, then -#' increase this accordingly. (Default: \code{1}) -#' @param name_divergence \code{Character scalar}. Shows beta diversity between samples. -#' (Default: \code{"time_divergence"}) -#' @param name_timedifference \code{Character scalar}. Field name for adding the time difference between -#' samples used to calculate beta diversity. (Default: \code{"time_difference"}) -#' @param assay.type \code{Character scalar}. Specifies which assay values are used in -#' the dissimilarity estimation. (Default: \code{"counts"}) -#' @param FUN \code{Function} for dissimilarity calculation. The function must -#' expect the input matrix as its first argument. With rows as samples -#' and columns as features. (Default: \code{vegan::vegdist}) -#' @param method \code{Character scalar}. Used to calculate the distance. Method is -#' passed to the function that is specified by \code{FUN}. (Default: \code{"bray"}) -#' @param altexp \code{Character scalar} or \code{integer scalar}. Specifies the alternative experiment -#' containing the input data. (Default: \code{NULL}) -#' @param dimred \code{Character scalar} or \code{integer scalar}. indicates the reduced dimension -#' result in `reducedDims` to use in the estimation. (Default: \code{NULL}) -#' @param n_dimred \code{Integer vector}. Specifies the dimensions to use if -#' \code{dimred} is specified. (Default: \code{NULL}) -#' @param ... Arguments to be passed -#' -#' @return a -#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -#' or -#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} +#' @name addStepwiseDivergence +#' @export +#' +#' @title +#' Beta diversity between consecutive time steps +#' +#' @description +#' Calculates sample dissimilarity between consecutive time points along with +#' time difference. +#' +#' @details +#' These functions calculate time-wise divergence, meaning each sample is +#' compared to the previous i-th sample, where i is the specified time +#' interval (see \code{time.interval}). By default, the function calculates +#' divergence by comparing all samples with each other. However, it is often +#' more meaningful to calculate divergence within a specific patient or group +#' (see the \code{group} parameter). +#' +#' @return +#' \code{getStepwiseDivergence} returns \code{DataFrame} object #' containing the sample dissimilarity and corresponding time difference between -#' samples (across n time steps), within each level of the grouping factor. -#' -#' @importFrom mia mergeSEs -#' @importFrom vegan vegdist -#' @importFrom SummarizedExperiment assay -#' @importFrom SummarizedExperiment colData -#' @importFrom SummarizedExperiment colData<- -#' @importFrom SingleCellExperiment altExp -#' -#' @aliases getTimeDivergence -#' +#' samples. \code{addStepwiseDivergence}, on the other hand, returns a +#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' object with these results in its \code{colData}. +#' +#' @inheritParams addBaselineDivergence +#' +#' @param time.interval \code{Integer scalar}. Indicates the increment between +#' time steps. By default, the function compares each sample to the +#' previous one. If you need to take every second, every third, or so, time +#' step, then increase this accordingly. (Default: \code{1L}) +# #' @examples -#' #library(miaTime) -#' library(TreeSummarizedExperiment) +#' library(miaTime) #' #' data(hitchip1006) -#' tse <- mia::transformCounts(hitchip1006, method = "relabundance") -#' -#' # Subset to speed up example -#' tse <- tse[, colData(tse)$subject %in% c("900", "934", "843", "875")] -#' -#' # Using vegdist for divergence calculation, one can pass -#' # the dissimilarity method from the vegan::vegdist options -#' # via the "method" argument -#' tse2 <- getStepwiseDivergence(tse, group = "subject", -#' time_interval = 1, -#' time_field = "time", -#' assay.type="relabundance", -#' FUN = vegan::vegdist, -#' method="bray") -#' -#' @name getStepwiseDivergence +#' tse <- transformAssay(hitchip1006, method = "relabundance") +#' +#' # Calculate divergence +#' tse <- addStepwiseDivergence( +#' tse, +#' group = "subject", +#' time.interval = 1, +#' time.col = "time", +#' assay.type = "relabundance" +#' ) +#' +#' @seealso +#' \code{\link[mia:addDivergence]{mia::addDivergence()}} +#' +NULL + +#' @rdname addStepwiseDivergence #' @export -getStepwiseDivergence <- function(x, - group=NULL, - time_field, - time_interval=1, - name_divergence = "time_divergence", - name_timedifference = "time_difference", - assay.type = "counts", - FUN = vegan::vegdist, - method="bray", - altexp = NULL, - dimred = NULL, - n_dimred = NULL, - ...){ - - # Store the original x - xorig <- x - - # Use altExp if mentioned and available - if (!is.null(altexp)) { - .check_altExp_present(altexp, x) - x <- altExp(x, altexp) - } - - # Temporary sample ID - x$tmp_sample_identifier_for_getStepwiseDivergence <- paste("SampleID", 1:ncol(x), sep="-") - - # If group is not given, assume that all samples come from a single group - # TODO: switch to mia::splitOn - if (is.null(group)) { - spl <- split(seq_len(ncol(x)), rep(1, nrow(x))) - } else { - # Split SE into a list, by grouping - if (is.factor(colData(x)[, group])) { - colData(x)[, group] <- droplevels(colData(x)[, group]) - } - spl <- split(seq_len(ncol(x)), colData(x)[, group]) - } - - # Separate the groups with multiple time points - spl_more <- spl[lapply(spl,length) > 1] - spl_one <- spl[lapply(spl,length) == 1] - - # Manipulate each subobject - x_more_list <- lapply(seq_along(spl_more), - function(i){.check_pairwise_dist(x = x[, spl_more[[i]]], - FUN=FUN, - time_interval, - name_divergence = name_divergence, - name_timedifference = name_timedifference, - time_field, - assay.type, - method, - altexp, - dimred, - n_dimred)}) - - x_one_list <- lapply(seq_along(spl_one), function(i) { - x[, spl_one[[i]]]} - ) - - for(i in seq_along(x_one_list)){ - colData(x_one_list[[i]])[, name_timedifference] <- NA - colData(x_one_list[[i]])[, name_divergence] <- NA - } - - # assign the names back to (T)SE objects - names(x_more_list) <- names(spl_more) - names(x_one_list) <- names(spl_one) - - # put lists together and put them in order - whole_x <- do.call(c, list(x_one_list, x_more_list)) - whole_x <- whole_x[order(as.numeric(names(whole_x)))] - - # Merge the objects back into a single X - whole_x <- whole_x[!sapply(whole_x,is.null)] +#' +setGeneric("getStepwiseDivergence", signature = c("x"), function(x, ...) + standardGeneric("getStepwiseDivergence")) - # Return the SE elements in a list - if (length(whole_x) > 1) { - x_new <- mergeSEs(whole_x) - } else { - x_new <- whole_x[[1]] +#' @rdname addStepwiseDivergence +#' @export +setMethod("getStepwiseDivergence", signature = c(x = "ANY"), + function( + x, + time.col, + assay.type = "counts", + time.interval = 1L, + group = NULL, + method = "bray", + ...){ + ############################# INPUT CHECK ############################## + temp <- .check_input( + time.col, list("character scalar"), colnames(colData(x))) + if( !is.numeric(x[[time.col]]) ){ + stop("'time.col' must specify numeric column from colData(x)", + call. = FALSE) + } + # + .check_assay_present(assay.type, x) + # + temp <- .check_input( + group, list(NULL, "character scalar"), colnames(colData(x))) + # + temp <- .check_input(method, list("character scalar")) + # + if( is.null(rownames(x)) ){ + rownames(x) <- paste0("row", seq_len(nrow(x))) + } + if( is.null(colnames(x)) ){ + colnames(x) <- paste0("col", seq_len(ncol(x))) + } + ########################### INPUT CHECK END ############################ + x <- .check_and_get_altExp(x, ...) + # Add stepwise samples to colData + args <- .add_reference_samples_to_coldata( + x, time.col, group, time.interval = time.interval, + reference.method = "stepwise", ...) + # Create an argument list. Do not include altexp as it is already taken + # into account. + args <- c( + args, + list(assay.type = assay.type, method = method), + list(...)[!names(list(...)) %in% c("altexp")] + ) + # Calculate divergences + res <- do.call(getDivergence, args) + # Add time difference + x <- args[["x"]] + reference <- args[["reference"]] + time_res <- .get_time_difference(x, time.col, reference) + # Create a DF to return + res <- .convert_divergence_to_df(x, res, time_res, ...) + return(res) } +) - # Ensure that sample sorting matches between the input and output data - inds <- match(x$tmp_sample_identifier_for_getStepwiseDivergence, - x_new$tmp_sample_identifier_for_getStepwiseDivergence) - x_new <- x_new[, inds] - - # Add the new fields to colData - # Just replace the colData for the original input - # colData(xorig) <- colData(x_new) - - # Add beta divergence from baseline info; note this has to be a list - timevalues <- list(colData(x_new)[, name_timedifference]) - divergencevalues <- list(colData(x_new)[, name_divergence]) - - xorig <- .add_values_to_colData(xorig, timevalues, name_timedifference) - xorig <- .add_values_to_colData(xorig, divergencevalues, name_divergence) - - return(xorig) - -} - - -#' @rdname getStepwiseDivergence +#' @rdname addStepwiseDivergence #' @export -setGeneric("getTimeDivergence", signature = c("x"), - function(x, ... ) - standardGeneric("getTimeDivergence")) +setGeneric("addStepwiseDivergence", signature = "x", function(x, ...) + standardGeneric("addStepwiseDivergence")) -#' @rdname getStepwiseDivergence +#' @rdname addStepwiseDivergence #' @export -setMethod("getTimeDivergence", - signature = c(x = "ANY"), - function(x, ...){ - - .Deprecated( msg = paste0("The name of the function 'getTimeDivergence' is", - " changed to 'getStepwiseDivergence'. \nPlease use the new", - " name instead.\n", - "See help('Deprecated')") ) - - getStepwiseDivergence(x, ...) +setMethod("addStepwiseDivergence", signature = c(x = "SummarizedExperiment"), + function(x, name = "divergence", name.time = "time_diff", ...){ + # Calculate divergence + res <- getStepwiseDivergence(x, ...) + # Add to colData + res <- as.list(res) |> unname() + x <- .add_values_to_colData(x, res, list(name, name.time), ...) + return(x) } ) - - - -.check_pairwise_dist <- function (x, - FUN, - time_interval, - name_divergence = "time_divergence", - name_timedifference = "time_difference", - time_field, - assay.type, - method, - altexp, - dimred, - n_dimred, - ...){ - - mat <- .get_mat_from_sce(x, assay.type, dimred, n_dimred) - ## transposing mat if taken from assay - if (is.null(dimred)) mat <- t(mat) - - time <- colData(x)[, time_field] - - ## Add new field to coldata - colData(x)[, name_divergence] <- rep(NA, nrow(mat)) - colData(x)[, name_timedifference] <- rep(NA, nrow(mat)) - - if (nrow(mat) > time_interval) { - - ## beta diversity calculation - n <- sapply(seq((time_interval+1), nrow(mat)), - function (i) {FUN(mat[c(i, i-time_interval), ], method=method, ...)}) - - for(i in seq((time_interval+1), ncol(x))){ - colData(x)[, name_divergence][[i]] <- n[[i-time_interval]] - } - - ## time difference calculation - time <- sapply((time_interval+1):nrow(mat), - function (i) {diff(colData(x)[c(i-time_interval, i), time_field])}) - - for(i in seq((time_interval+1), nrow(colData(x)))){ - colData(x)[, name_timedifference][[i]] <- time[[i-time_interval]] - } - } - return(x) - -} diff --git a/R/utils.R b/R/utils.R index f3a0077..6a4d25b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,43 +1,168 @@ -################################################################################ -# internal methods loaded from other packages - -.check_altExp_present <- mia:::.check_altExp_present -.get_mat_from_sce <- scater:::.get_mat_from_sce - -################################################################################ -# internal wrappers for getter/setter +################################### TESTING ################################### +# Methods for testing -#' @importFrom SummarizedExperiment colData colData<- -#' @importFrom S4Vectors DataFrame -.add_values_to_colData <- function(x, values, name){ - # converts each value:name pair into a DataFrame - values <- mapply( - function(value, n){ - value <- DataFrame(value) - colnames(value)[1L] <- n - if(ncol(value) > 1L){ - i <- seq.int(2,ncol(value)) - colnames(value)[i] <- paste0(n,"_",colnames(value)[i]) +# This function unifies input testing. The message will always be in same format +# also it makes the code simpler in main function since testing is done here. +# Borrowed from HoloFoodR. +.check_input <- function( + variable, supported_class, supported_values = NULL, limits = NULL, + variable_name = .get_name_in_parent(variable)){ + # Convert supported classes to character + classes_char <- lapply(supported_class, function(class){ + if( is.null(class) ){ + class <- "NULL" + } + return(class) + }) + classes_char <- unlist(classes_char) + # Based on number of acceptable classes, the msg is different + class_txt <- .create_msg_from_list(classes_char) + # Create a message + msg <- paste0("'", variable_name, "' must be ", class_txt, "." ) + + # If supported values were provided + if( !is.null(supported_values) ){ + # Convert supported values to character + values_char <- lapply(supported_values, function(value){ + if( is.null(value) ){ + value <- "NULL" + } + value <- as.character(value) + return(value) + }) + values_char <- unlist(values_char) + # Collapse into text + values_txt <- paste0("'", paste(values_char, collapse = "', '"), "'") + msg <- paste0( + msg, " It must be one of the following options: ", values_txt) + } + + # If limits were provided + if( !is.null(limits) ){ + msg <- paste0(msg, " (Numeric constrains: ") + # Add thresholds to message + if( !is.null(limits$upper) ){ + msg <- paste0(msg, limits$upper, ">x") + } else if(!is.null(limits$upper_include)){ + msg <- paste0(msg, limits$upper, ">=x") + } + if( !is.null(limits$lower) ){ + msg <- paste0(msg, "x>", limits$lower) + } else if(!is.null(limits$lower_include)){ + msg <- paste0(msg, "x>=", limits$lower_include) + } + msg <- paste0(msg, ")") + } + + # List all the input types. Run the check if the variable must be that type. + # If correct type was found, change the result to TRUE. + input_correct <- FALSE + if( "NULL" %in% classes_char && is.null(variable) ){ + input_correct <- TRUE + } + if( "logical scalar" %in% classes_char && .is_a_bool(variable) ){ + input_correct <- TRUE + } + if( "logical vector" %in% classes_char && is.logical(variable) ){ + input_correct <- TRUE + } + if( "character scalar" %in% classes_char && .is_non_empty_string( + variable) ){ + input_correct <- TRUE + } + if( "character vector" %in% classes_char && .is_non_empty_character( + variable) ){ + input_correct <- TRUE + } + if( "numeric scalar" %in% classes_char && .is_a_numeric(variable) ){ + input_correct <- TRUE + } + if( "numeric vector" %in% classes_char && is.numeric(variable) ){ + input_correct <- TRUE + } + if( "integer vector" %in% classes_char && .is_integer(variable) ){ + input_correct <- TRUE + } + if( "integer scalar" %in% classes_char && .is_an_integer(variable) ){ + input_correct <- TRUE + } + if( "list" %in% classes_char && is.list(variable) && !is.data.frame( + variable) ){ + input_correct <- TRUE + } + if( "data.frame" %in% classes_char && is.data.frame(variable) ){ + input_correct <- TRUE + } + if( "matrix" %in% classes_char && is.matrix(variable) ){ + input_correct <- TRUE + } + # If supported values were provided + if( !is.null(supported_values) && !is.null(variable) ){ + # Test that if variable is in supported values + values_correct <- lapply(supported_values, function(value){ + res <- FALSE + if( is.null(value) && is.null(variable) || value %in% variable){ + res <- TRUE } - value - }, - values, - name) + return(res) + }) + values_correct <- unlist(values_correct) + # If not, then give FALSE even though class checks were correct + if( !any(values_correct) ){ + input_correct <- FALSE + } + } + # If limits were provided + if( !is.null(limits) && !is.null(variable) ){ + if( !is.null(limits$upper) && variable >= limits$upper ){ + input_correct <- FALSE + } else if( !is.null( + limits$upper_include) && variable > limits$upper_include ){ + input_correct <- FALSE + } + + if( !is.null(limits$lower) && variable <= limits$lower ){ + input_correct <- FALSE + } else if( !is.null( + limits$upper_include) && variable < limits$upper_include ){ + input_correct <- FALSE + } + } + # Give error if variable was not correct type + if( !input_correct ){ + stop(msg, call. = FALSE) + } + return(input_correct) +} - values <- do.call(cbind, values) +# This function creates a string from character values provided. The string +# can be used to messages. It creates a tidy list from list of values. +.create_msg_from_list <- function(classes_char, and_or = "or", ...){ + if( length(classes_char) > 2 ){ + class_txt <- paste0( + paste( + classes_char[seq_len(length(classes_char)-1)], collapse = ", "), + " ", and_or, " ", classes_char[length(classes_char)]) + } else if( length(classes_char) == 2 ){ + class_txt <- paste0( + classes_char[[1]], " ", and_or, " ", classes_char[[2]]) + } else{ + class_txt <- classes_char + } + return(class_txt) +} - # check for duplicated values - f <- colnames(colData(x)) %in% colnames(values) - if(any(f)) { - warning("The following values are already present in `colData` and ", - "will be overwritten: '", - paste(colnames(colData(x))[f], collapse = "', '"), - "'. Consider using the 'name' argument(s) to specify alternative ", - "names.", - call. = FALSE) - } - # keep only unique values - colData(x) <- cbind(colData(x)[!f], values) +#################### INTERNAL METHODS FROM EXTERNAL PACKAGES ################### +# internal methods loaded from other packages - x -} +.is_a_bool <- mia:::.is_a_bool +.is_non_empty_character <- mia:::.is_non_empty_character +.is_non_empty_string <- mia:::.is_non_empty_string +.is_an_integer <- mia:::.is_an_integer +.is_a_numeric <- mia:::.is_a_numeric +.get_name_in_parent <- mia:::.get_name_in_parent +.safe_deparse <- mia:::.safe_deparse +.check_altExp_present <- mia:::.check_altExp_present +.check_assay_present <- mia:::.check_assay_present +.add_values_to_colData <- mia:::.add_values_to_colData +.check_and_get_altExp <- mia:::.check_and_get_altExp diff --git a/man/addBaselineDivergence.Rd b/man/addBaselineDivergence.Rd new file mode 100644 index 0000000..b86ae1c --- /dev/null +++ b/man/addBaselineDivergence.Rd @@ -0,0 +1,117 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getBaselineDivergence.R +\name{addBaselineDivergence} +\alias{addBaselineDivergence} +\alias{getBaselineDivergence} +\alias{getBaselineDivergence,SummarizedExperiment-method} +\alias{addBaselineDivergence,SummarizedExperiment-method} +\title{Beta diversity between the baseline and later time steps} +\usage{ +getBaselineDivergence(x, ...) + +\S4method{getBaselineDivergence}{SummarizedExperiment}( + x, + time.col, + assay.type = "counts", + reference = NULL, + group = NULL, + method = "bray", + ... +) + +addBaselineDivergence(x, ...) + +\S4method{addBaselineDivergence}{SummarizedExperiment}(x, name = "divergence", name.time = "time_diff", ...) +} +\arguments{ +\item{x}{A +\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +object.} + +\item{...}{Optional arguments passed into +\code{\link[mia:addDivergence]{mia::addDivergence()}}.} + +\item{time.col}{\code{Character scalar}. Specifies a name of the column from +\code{colData} that identifies the sampling time points for the samples.} + +\item{assay.type}{\code{Character scalar}. Specifies which assay values are +used in the dissimilarity estimation. (Default: \code{"counts"})} + +\item{reference}{\code{Character scalar}. Specifies a name of the column from +\code{colData} that identifies the baseline samples to be used. +(Default: \code{NULL})} + +\item{group}{\code{Character scalar}. Specifies a name of the column from +\code{colData} that identifies the grouping of the samples. +(Default: \code{NULL})} + +\item{method}{\code{Character scalar}. Used to calculate the dissimilarity +Method is passed to the function that is specified by \code{dis.fun}. +(Default: \code{"bray"})} + +\item{name}{\code{Character scalar}. Specifies a column name for storing +divergence results. (Default: \code{"divergence"})} + +\item{name.time}{\code{Character scalar}. Specifies a column name for storing +time differences. (Default: \code{"time_diff"})} +} +\value{ +\code{getBaselineDivergence} returns \code{DataFrame} object +containing the sample dissimilarity and corresponding time difference between +samples. \code{addBaselineDivergence}, on the other hand, returns a +\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +object with these results in its \code{colData}. +} +\description{ +Calculates sample dissimilarity between the given baseline and other +time points, optionally within a group (subject, reaction chamber, or +similar). The corresponding time difference is returned as well. +} +\details{ +The group argument allows calculating divergence per group. If given, the +divergence is calculated per group. e.g. subject, chamber, group etc. +Otherwise, this is done across all samples at once. + +The baseline sample(s) always need to belong to the data object i.e. they +can be merged into it before +applying this function. The reason is that they need to have comparable +sample data, at least some time point +information for calculating time differences w.r.t. baseline sample. + +The baseline time point is by default defined as the smallest time point +(per group). Alternatively, +the user can provide the baseline vector, or a list of baseline vectors per +group (named list per group). +} +\examples{ +library(miaTime) +library(mia) + +data(hitchip1006) +tse <- transformAssay(hitchip1006, method = "relabundance") + +# By default, reference samples are the samples from the first timepoint +tse <- addBaselineDivergence( + tse, + group = "subject", + time.col = "time", + assay.type = "relabundance", + method = "bray") + +# Add reference samples to colData, if you want to specify reference +# samples manually +colData(tse)[["reference"]] <- "Sample-875" +tse <- addBaselineDivergence( + tse, + reference = "reference", + group = "subject", + time.col = "time", + name = "divergence_from_baseline", + name.time = "time_from_baseline", + assay.type = "relabundance", + method = "bray") + +} +\seealso{ +\code{\link[mia:addDivergence]{mia::addDivergence()}} +} diff --git a/man/addStepwiseDivergence.Rd b/man/addStepwiseDivergence.Rd new file mode 100644 index 0000000..cd47736 --- /dev/null +++ b/man/addStepwiseDivergence.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getStepwiseDivergence.R +\name{addStepwiseDivergence} +\alias{addStepwiseDivergence} +\alias{getStepwiseDivergence} +\alias{getStepwiseDivergence,ANY-method} +\alias{addStepwiseDivergence,SummarizedExperiment-method} +\title{Beta diversity between consecutive time steps} +\usage{ +getStepwiseDivergence(x, ...) + +\S4method{getStepwiseDivergence}{ANY}( + x, + time.col, + assay.type = "counts", + time.interval = 1L, + group = NULL, + method = "bray", + ... +) + +addStepwiseDivergence(x, ...) + +\S4method{addStepwiseDivergence}{SummarizedExperiment}(x, name = "divergence", name.time = "time_diff", ...) +} +\arguments{ +\item{x}{A +\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +object.} + +\item{...}{Optional arguments passed into +\code{\link[mia:addDivergence]{mia::addDivergence()}}.} + +\item{time.col}{\code{Character scalar}. Specifies a name of the column from +\code{colData} that identifies the sampling time points for the samples.} + +\item{assay.type}{\code{Character scalar}. Specifies which assay values are +used in the dissimilarity estimation. (Default: \code{"counts"})} + +\item{time.interval}{\code{Integer scalar}. Indicates the increment between +time steps. By default, the function compares each sample to the +previous one. If you need to take every second, every third, or so, time +step, then increase this accordingly. (Default: \code{1L})} + +\item{group}{\code{Character scalar}. Specifies a name of the column from +\code{colData} that identifies the grouping of the samples. +(Default: \code{NULL})} + +\item{method}{\code{Character scalar}. Used to calculate the dissimilarity +Method is passed to the function that is specified by \code{dis.fun}. +(Default: \code{"bray"})} + +\item{name}{\code{Character scalar}. Specifies a column name for storing +divergence results. (Default: \code{"divergence"})} + +\item{name.time}{\code{Character scalar}. Specifies a column name for storing +time differences. (Default: \code{"time_diff"})} +} +\value{ +\code{getStepwiseDivergence} returns \code{DataFrame} object +containing the sample dissimilarity and corresponding time difference between +samples. \code{addStepwiseDivergence}, on the other hand, returns a +\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +object with these results in its \code{colData}. +} +\description{ +Calculates sample dissimilarity between consecutive time points along with +time difference. +} +\details{ +These functions calculate time-wise divergence, meaning each sample is +compared to the previous i-th sample, where i is the specified time +interval (see \code{time.interval}). By default, the function calculates +divergence by comparing all samples with each other. However, it is often +more meaningful to calculate divergence within a specific patient or group +(see the \code{group} parameter). +} +\examples{ +library(miaTime) + +data(hitchip1006) +tse <- transformAssay(hitchip1006, method = "relabundance") + +# Calculate divergence +tse <- addStepwiseDivergence( + tse, + group = "subject", + time.interval = 1, + time.col = "time", + assay.type = "relabundance" + ) + +} +\seealso{ +\code{\link[mia:addDivergence]{mia::addDivergence()}} +} diff --git a/man/getBaselineDivergence.Rd b/man/getBaselineDivergence.Rd deleted file mode 100644 index 7775ba4..0000000 --- a/man/getBaselineDivergence.Rd +++ /dev/null @@ -1,120 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getBaselineDivergence.R -\name{getBaselineDivergence} -\alias{getBaselineDivergence} -\title{Beta diversity between the baseline and later time steps} -\usage{ -getBaselineDivergence( - x, - group = NULL, - time_field, - name_divergence = "divergence_from_baseline", - name_timedifference = "time_from_baseline", - assay.type = "counts", - FUN = vegan::vegdist, - method = "bray", - baseline_sample = NULL, - altexp = NULL, - dimred = NULL, - n_dimred = NULL, - ... -) -} -\arguments{ -\item{x}{A -\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -object.} - -\item{group}{\code{Character scalar}. Specifies the grouping -factor (name of a \code{colData} field). If given, the divergence is calculated -per group. e.g. subject, chamber, group etc.). (Default: \code{NULL})} - -\item{time_field}{\code{Character scalar}. Specifies the name of the -time series field in \code{colData}.} - -\item{name_divergence}{\code{Character scalar}. Shows beta diversity between samples. -(Default: \code{"time_divergence"})} - -\item{name_timedifference}{\code{Character scalar}. Field name for adding the time difference between -samples used to calculate beta diversity. (Default: \code{"time_difference"})} - -\item{assay.type}{\code{Character scalar}. Specifies which assay values are used in -the dissimilarity estimation. (Default: \code{"counts"})} - -\item{FUN}{\code{Function} for dissimilarity calculation. The function must -expect the input matrix as its first argument. With rows as samples -and columns as features. (Default: \code{vegan::vegdist})} - -\item{method}{\code{Character scalar}. Used to calculate the distance. Method is -passed to the function that is specified by \code{FUN}. (Default: \code{"bray"})} - -\item{baseline_sample}{\code{Character vector}. Specifies the baseline sample(s) to be used. If the -"group" argument is given, this must be a named vector; one element per group. (Default: \code{NULL})} - -\item{altexp}{\code{Character scalar} or \code{integer scalar}. Specifies the alternative experiment -containing the input data. (Default: \code{NULL})} - -\item{dimred}{\code{Character scalar} or \code{integer scalar}. indicates the reduced dimension -result in \code{reducedDims} to use in the estimation. (Default: \code{NULL})} - -\item{n_dimred}{\code{Integer vector}. Specifies the dimensions to use if -\code{dimred} is specified. (Default: \code{NULL})} - -\item{...}{Arguments to be passed} -} -\value{ -a -\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -or -\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -containing the sample dissimilarity and corresponding time difference between -samples (across n time steps), within each level of the grouping factor. -} -\description{ -Calculates sample dissimilarity between the given baseline and other -time points, optionally within a group (subject, reaction chamber, or -similar). The corresponding time difference is returned as well. -The method operates on \code{SummarizedExperiment} objects, and the results -are stored in \code{colData}. -} -\details{ -The group argument allows calculating divergence per group. Otherwise, this is done across all samples at once. - -The baseline sample/s always need to belong to the data object i.e. they can be merged into it before -applying this function. The reason is that they need to have comparable sample data, at least some time point -information for calculating time differences w.r.t. baseline sample. - -The baseline time point is by default defined as the smallest time point (per group). Alternatively, -the user can provide the baseline vector, or a list of baseline vectors per group (named list per group). -} -\examples{ -#library(miaTime) -library(TreeSummarizedExperiment) -library(dplyr) - -data(hitchip1006) -tse <- mia::transformCounts(hitchip1006, method = "relabundance") - -# Subset to speed up example -tse <- tse[, colData(tse)$subject \%in\% c("900", "934", "843", "875")] - -tse2 <- getBaselineDivergence(tse, - group = "subject", - time_field = "time", - name_divergence = "divergence_from_baseline", - name_timedifference = "time_from_baseline", - assay.type="relabundance", - FUN = vegan::vegdist, - method="bray") - -tse2 <- getBaselineDivergence(tse, - baseline_sample = "Sample-875", - group = "subject", - time_field = "time", - name_divergence = "divergence_from_baseline", - name_timedifference = "time_from_baseline", - assay.type="relabundance", - FUN = vegan::vegdist, - method="bray") - -} diff --git a/man/getStepwiseDivergence.Rd b/man/getStepwiseDivergence.Rd deleted file mode 100644 index c1e1c74..0000000 --- a/man/getStepwiseDivergence.Rd +++ /dev/null @@ -1,106 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getStepwiseDivergence.R -\name{getStepwiseDivergence} -\alias{getStepwiseDivergence} -\alias{getTimeDivergence} -\alias{getTimeDivergence,ANY-method} -\title{Beta diversity between consecutive time steps} -\usage{ -getStepwiseDivergence( - x, - group = NULL, - time_field, - time_interval = 1, - name_divergence = "time_divergence", - name_timedifference = "time_difference", - assay.type = "counts", - FUN = vegan::vegdist, - method = "bray", - altexp = NULL, - dimred = NULL, - n_dimred = NULL, - ... -) - -getTimeDivergence(x, ...) - -\S4method{getTimeDivergence}{ANY}(x, ...) -} -\arguments{ -\item{x}{A -\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -object.} - -\item{group}{\code{Character scalar}. Specifies the grouping -factor (name of a \code{colData} field). If given, the divergence is calculated -per group. e.g. subject, chamber, group etc.). (Default: \code{NULL})} - -\item{time_field}{\code{Character scalar}. Specifies the name of the -time series field in \code{colData}.} - -\item{time_interval}{\code{Integer scalar}. Indicates the increment between time -steps. If you need to take every second, every third, or so, time step only, then -increase this accordingly. (Default: \code{1})} - -\item{name_divergence}{\code{Character scalar}. Shows beta diversity between samples. -(Default: \code{"time_divergence"})} - -\item{name_timedifference}{\code{Character scalar}. Field name for adding the time difference between -samples used to calculate beta diversity. (Default: \code{"time_difference"})} - -\item{assay.type}{\code{Character scalar}. Specifies which assay values are used in -the dissimilarity estimation. (Default: \code{"counts"})} - -\item{FUN}{\code{Function} for dissimilarity calculation. The function must -expect the input matrix as its first argument. With rows as samples -and columns as features. (Default: \code{vegan::vegdist})} - -\item{method}{\code{Character scalar}. Used to calculate the distance. Method is -passed to the function that is specified by \code{FUN}. (Default: \code{"bray"})} - -\item{altexp}{\code{Character scalar} or \code{integer scalar}. Specifies the alternative experiment -containing the input data. (Default: \code{NULL})} - -\item{dimred}{\code{Character scalar} or \code{integer scalar}. indicates the reduced dimension -result in \code{reducedDims} to use in the estimation. (Default: \code{NULL})} - -\item{n_dimred}{\code{Integer vector}. Specifies the dimensions to use if -\code{dimred} is specified. (Default: \code{NULL})} - -\item{...}{Arguments to be passed} -} -\value{ -a -\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -or -\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -containing the sample dissimilarity and corresponding time difference between -samples (across n time steps), within each level of the grouping factor. -} -\description{ -Calculates sample dissimilarity between consecutive time points (t, t+i), -within a group (subject, reaction chamber, or similar). The corresponding -time difference is returned as well. The method operates on -\code{SummarizedExperiment} objects, and the results are stored in \code{colData}. -} -\examples{ -#library(miaTime) -library(TreeSummarizedExperiment) - -data(hitchip1006) -tse <- mia::transformCounts(hitchip1006, method = "relabundance") - -# Subset to speed up example -tse <- tse[, colData(tse)$subject \%in\% c("900", "934", "843", "875")] - -# Using vegdist for divergence calculation, one can pass -# the dissimilarity method from the vegan::vegdist options -# via the "method" argument -tse2 <- getStepwiseDivergence(tse, group = "subject", - time_interval = 1, - time_field = "time", - assay.type="relabundance", - FUN = vegan::vegdist, - method="bray") - -} diff --git a/tests/testthat/test-getBaselineDivergence.R b/tests/testthat/test-getBaselineDivergence.R index fe48ef2..057b9da 100644 --- a/tests/testthat/test-getBaselineDivergence.R +++ b/tests/testthat/test-getBaselineDivergence.R @@ -1,119 +1,243 @@ -test_that("getBaselineDivergence", { - - library(dplyr) - data(hitchip1006) - tse <- hitchip1006 - # Subset to speed up computing - # Just pick 4 subjects with 1-5 time points - tse <- tse[, colData(tse)$subject %in% c("900", "934", "843", "875", "836")] - tse2 <- getBaselineDivergence(tse, group = "subject", time_field = "time") - - # Input and output classes should match - expect_equal(class(tse), class(tse2)) - - # A subject to check time difference calculation - time2 <- colData(tse2)[, "time"][which(colData(tse2)[, "subject"] == "843")] - time_diff_2 <- colData(tse2)[, "time_from_baseline"][which(colData(tse2)[, "subject"] == "843")] - expect_true(all(time2==time_diff_2)) - - # Test divergences - inds0 <- which(colData(tse)[, "subject"] == "843") - inds <- which(colData(tse2)[, "subject"] == "843") - original.divergence <- as.matrix(vegan::vegdist(t(assay(tse[, inds0], "counts"))))[,1] - calculated.divergence <- colData(tse2)[inds, "divergence_from_baseline"] - expect_true(all(original.divergence==calculated.divergence)) - - # Should also work when baseline is not 0 - inds <- which(colData(tse)[, "subject"] == "843")[2:5] - tse2 <- getBaselineDivergence(tse[, inds], group = "subject", time_field = "time") - time2 <- colData(tse[, inds])[, "time"] - min(colData(tse[, inds])[, "time"]) - time_diff_2 <- colData(tse2)[, "time_from_baseline"] - expect_true(all(time2==time_diff_2)) - - # ----------------------------------------------------------- - - # devtools::load_all("~/Rpackages/microbiome/miaverse/miaTime/") - - data(hitchip1006) - tse <- hitchip1006 - # Just pick 1 subject with many time points - tse <- tse[, colData(tse)$subject == "843"] # The baseline time point 0 is Sample-843 - - # Should now work also without the "group" argument because there is just a single group (subject) - tse2a <- getBaselineDivergence(tse, time_field = "time") - tse2b <- getBaselineDivergence(tse, group="subject", time_field = "time") - expect_identical(tse2a, tse2b) - - # Define the baseline sample manually - tse2c <- getBaselineDivergence(tse, time_field = "time", baseline_sample="Sample-843") - tse2d <- getBaselineDivergence(tse, time_field = "time", baseline_sample="Sample-1075") - # Now the times from baseline should be shifted and dissimilarities differ - - # Sample baseline when the zero time baseline is automatically checked or manually set - expect_true(all(tse2b$time_from_baseline==tse2c$time_from_baseline)) - # The shifted case (different, middle sample as baseline) - expect_true(all(tse2c$time_from_baseline == tse2d$time_from_baseline + 0.7)) - - tse <- hitchip1006 - # Subset to speed up computing - # Just pick 4 subjects with 1-5 time points - tse <- tse[, colData(tse)$subject %in% c("900", "934", "843", "875", "836")] - tse2e <- getBaselineDivergence(tse[, colData(tse)$subject == "843"], group="subject", time_field = "time") - tse2f <- getBaselineDivergence(tse, group = "subject", time_field = "time") - tse2g <- getBaselineDivergence(tse, group = "subject", time_field = "time", baseline_sample="Sample-1075") - expect_identical(colData(tse2e)["Sample-843", "time_from_baseline"], colData(tse2f)["Sample-843", "time_from_baseline"]) - expect_identical(colData(tse2e)["Sample-843", "time_from_baseline"] - 0.7, colData(tse2g)["Sample-843", "time_from_baseline"]) - - # Test with full baseline list - baselines <- c("Sample-1041", "Sample-1075", "Sample-875", "Sample-900", "Sample-934") - names(baselines) <- names(split(colnames(tse), as.character(tse$subject))) - tse2h <- getBaselineDivergence(tse, group = "subject", time_field = "time", baseline_sample=baselines) - expect_identical(colData(tse2h)["Sample-843", "time_from_baseline"], colData(tse2g)["Sample-843", "time_from_baseline"]) - - # Single baseline - tse2i <- getBaselineDivergence(tse, group = "subject", time_field = "time", baseline_sample=tse[, "Sample-1075"]) - expect_identical(colData(tse2i)["Sample-1075", "time_from_baseline"], colData(tse2g)["Sample-1075", "time_from_baseline"]) - expect_identical(colData(tse2i)["Sample-843", "time_from_baseline"] + 0.7, colData(tse2g)["Sample-1075", "time_from_baseline"]) - - ## Test with ordination values - tse <- scater::runMDS(tse, FUN = vegan::vegdist, method = "bray", - name = "PCoA_BC", exprs_values = "counts", - na.rm = TRUE, ncomponents=4) - # testing with all ordination components; n_dimred=NULL --> all 4 components - tse2 <- getBaselineDivergence(tse, group = "subject", - time_field = "time", - name_timedifference="time_from_baseline_ord_4", - name_divergence="divergence_from_baseline_ord_4", - dimred = "PCoA_BC", - FUN=vegan::vegdist, - method="euclidean") - # Time differences should still match - expect_true(identical(tse2$time_from_baseline_ord_4, tse2f$time_from_baseline)) - # ordination based divergence values should not be equal to the ones on counts - expect_false(identical(tse2$divergence_from_baseline_ord_4, tse2f$divergence_from_baseline)) - # testing with 2 ordination components - tse2 <- getBaselineDivergence(tse2, group = "subject", - time_field = "time", - name_timedifference="time_from_baseline_ord_2", - name_divergence="divergence_from_baseline_ord_2", - dimred = "PCoA_BC", - n_dimred = 2, - FUN=vegan::vegdist, - method="euclidean") - # Time differences should still match - expect_true(identical(tse2$time_from_baseline_ord_4, tse2$time_from_baseline_ord_2)) - # ordination based divergence values should not be equal to the ones on counts - expect_false(identical(tse2$divergence_from_baseline_ord_4, tse2$divergence_from_baseline_ord_2)) - ## testing with altExp - SingleCellExperiment::altExp(tse2, "Family") <- mia::agglomerateByRank(tse2, rank="Family") - tse2 <- getBaselineDivergence(tse2, group = "subject", - time_field = "time", - altexp="Family", - name_timedifference="time_from_baseline_Fam", - name_divergence="divergence_from_baseline_Fam") - # Time differences should still match - expect_true(identical(tse2$time_from_baseline_Fam, tse2f$time_from_baseline)) - # divergence values based on Family rank counts should not be equal to the - # ones with Genus counts - expect_false(identical(tse2$divergence_from_baseline_Fam, tse2f$divergence_from_baseline)) +# Test that the divergence and time difference is correct +test_that("addBaselineDivergence output", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- addBaselineDivergence( + tse, group = "subject", time.col = "time", + name = "divergence_from_baseline", name.time = "time_from_baseline") + # Input and output classes should match + expect_equal(class(tse), class(tse2)) + # A subject to check time difference calculation + time2 <- colData(tse2)[which(tse2[["subject"]] == "843"), "time"] + time_diff_2 <- colData(tse2)[ + which(tse2[["subject"]] == "843"), "time_from_baseline"] + expect_true( all(time2 == time_diff_2) ) + # Test divergences + inds0 <- which(tse2[["subject"]] == "843") + inds <- which(tse2[["subject"]] == "843") + original.divergence <- as.matrix( + vegan::vegdist(t(assay(tse[, inds0], "counts"))))[, 1] + calculated.divergence <- colData(tse2)[inds, "divergence_from_baseline"] + expect_true( all(original.divergence == calculated.divergence) ) +}) + +# Test that the result is correct when baseline time point is not 0 +test_that("Divergence in baseline other than 0", { + data(hitchip1006) + tse <- hitchip1006 + # Should also work when baseline is not 0 + inds <- which(tse[["subject"]] == "843")[2:5] + tse2 <- addBaselineDivergence( + tse[, inds], group = "subject", time.col = "time", + name = "divergence_from_baseline", name.time = "time_from_baseline") + time2 <- tse[, inds][["time"]] - min(tse[, inds][["time"]]) + time_diff_2 <- tse2[["time_from_baseline"]] + expect_true( all(time2 == time_diff_2) ) +}) + +# Test that the reference work +test_that("addBaselineDivergence reference", { + data(hitchip1006) + tse <- hitchip1006 + # Just pick 1 subject with many time points + # The baseline time point 0 is Sample-843 + tse <- tse[, tse[["subject"]] == "843"] + tse2 <- addBaselineDivergence(tse, group = "subject", time.col = "time") + # Define the baseline sample manually + tse3 <- addBaselineDivergence( + tse, time.col = "time", group = "subject", reference = "Sample-843", + name.time = "time_from_baseline", name = "divergence_from_baseline") + tse4 <- addBaselineDivergence( + tse, time.col = "time", group = "subject", reference = "Sample-1075", + name.time = "time_from_baseline", name = "divergence_from_baseline") + # Now the times from baseline should be shifted and dissimilarities differ + # Sample baseline when the zero time baseline is automatically checked or + # manually set + expect_true(all(tse2$time_from_baseline==tse3$time_from_baseline)) + # The shifted case (different, middle sample as baseline) + expect_true(all(tse3$time_from_baseline == tse4$time_from_baseline + 0.7)) + + tse5 <- addBaselineDivergence( + tse[, tse[["subject"]] == "843"], group = "subject", + time.col = "time", name.time = "time_from_baseline", + name = "divergence_from_baseline") + tse6 <- addBaselineDivergence( + tse, group = "subject", time.col = "time", + name.time = "time_from_baseline", name = "divergence_from_baseline") + tse7 <- addBaselineDivergence( + tse, group = "subject", time.col = "time", reference = "Sample-1075", + name.time = "time_from_baseline", name = "divergence_from_baseline") + expect_identical( + colData(tse5)["Sample-843", "time_from_baseline"], + colData(tse6)["Sample-843", "time_from_baseline"]) + expect_identical( + colData(tse5)["Sample-843", "time_from_baseline"] - 0.7, + colData(tse7)["Sample-843", "time_from_baseline"]) + + tse <- hitchip1006 + subjects <- unique(tse$subject) + # Test with full baseline list + baselines <- sample(colnames(tse), length(subjects)) + names(baselines) <- subjects + baselines[names(baselines) == tse[, "Sample-843"][["subject"]]] <- + "Sample-1075" + tse8 <- addBaselineDivergence( + tse, group = "subject", time.col = "time", reference = baselines, + name.time = "time_from_baseline", name = "divergence_from_baseline") + expect_identical( + colData(tse7)["Sample-843", "time_from_baseline"], + colData(tse8)["Sample-843", "time_from_baseline"]) + tse[["reference_sam"]] <- baselines[ match(tse$subject, names(baselines)) ] + res <- addBaselineDivergence( + tse, group = "subject", time.col = "time", reference = "reference_sam", + name.time = "time_from_baseline", name = "divergence_from_baseline") + ref <- getDivergence(tse, reference = "reference_sam") + expect_equal(res[["divergence_from_baseline"]], ref) +}) + +# Test that altExp works +test_that("Test altExp", { + data(hitchip1006) + tse <- hitchip1006 + altExp(tse, "Family") <- agglomerateByRank(tse, rank = "Family") + tse <- addBaselineDivergence( + tse, group = "subject", time.col = "time", altexp = "Family") + altExp(tse, "Family_test") <- addBaselineDivergence( + altExp(tse, "Family"), group = "subject", time.col = "time", + name = "val", name.time = "time_val") + # Time differences should still match + expect_equal( + altExp(tse, "Family")$divergence, altExp(tse, "Family_test")$val) +}) + +# Test that get* and add* gives same result +test_that(".get_reference_samples with different time intervals", { + data(hitchip1006) + tse <- hitchip1006 + tse <- addBaselineDivergence( + tse, group = "subject", time.col = "time", + assay.type = "counts", method = "euclidean") + res <- getBaselineDivergence( + tse, group = "subject", time.col = "time", + assay.type = "counts", method = "euclidean") + expect_equal(colData(tse)[, c("divergence", "time_diff")], res) +}) + +# Basic SummarizedExperiment for testing +col_data <- DataFrame( + time = c(0, 1, 2, 1, 2, 0), + group = c("A", "A", "A", "B", "B", "B"), + row.names = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6")) +count_data <- matrix(c(10, 20, 30, 40, 50, 60), ncol = 6, byrow = TRUE) +se <- SummarizedExperiment(assays = list(counts = count_data), colData = col_data) + +# Input validation for getBaselineDivergence +test_that("getBaselineDivergence input validations", { + expect_error(getBaselineDivergence(se, time.col = "nonexistent")) + expect_error(getBaselineDivergence(se, time.col = "time", assay.type = "unknown")) + expect_error(getBaselineDivergence(se, group = "nonexistent")) + expect_error(getBaselineDivergence(se, reference = "nonexistent")) + expect_error(getBaselineDivergence(se, name = "nonexistent")) + expect_error(getBaselineDivergence(se, name.time = "nonexistent")) +}) + +# Dissimilarity calculation test +test_that("getBaselineDivergence dissimilarity calculation", { + result <- getBaselineDivergence(se, time.col = "time", method = "bray") + expect_s4_class(result, "DataFrame") + expect_true(all(c("divergence", "time_diff") %in% colnames(result))) +}) + +# Correct time difference calculation test +test_that("getBaselineDivergence correct time difference calculation", { + result <- getBaselineDivergence(se, time.col = "time", method = "bray") + expect_true(all(result$time_diff >= 0)) +}) + +# addBaselineDivergence column addition test +test_that("addBaselineDivergence adds columns to colData", { + se_result <- addBaselineDivergence(se, time.col = "time", method = "bray") + expect_true("divergence" %in% colnames(colData(se_result))) + expect_true("time_diff" %in% colnames(colData(se_result))) +}) + +# Custom column naming test for addBaselineDivergence +test_that("addBaselineDivergence handles custom column names", { + se_result <- addBaselineDivergence( + se, time.col = "time", name = "custom_div", + name.time = "custom_time_diff") + expect_true("custom_div" %in% colnames(colData(se_result))) + expect_true("custom_time_diff" %in% colnames(colData(se_result))) +}) + +# Helper function: assign correct baselines +test_that(".add_reference_samples_to_coldata assigns correct baselines", { + res <- .add_reference_samples_to_coldata( + se, time.col = "time", group = "group") + expect_true("temporal_reference_for_divergence" %in% colnames(colData(res[[1]]))) +}) + +# Reference sample assignments +test_that(".get_reference_samples baseline", { + stepwise <- .get_reference_samples( + colData(se), time.col = "time", group = "group", + reference.method = "stepwise", time.interval = 1) + expect_equal(stepwise, c( + NA, "Sample1", "Sample2", "Sample6", "Sample4", NA)) +}) + +# Time difference calculation +test_that(".get_time_difference calculates correct time diff", { + reference <- c("Sample2", "Sample1", "Sample1", "Sample3", NA, "Sample4") + se2 <- se + colData(se2)[["ref"]] <- reference + time_diffs <- .get_time_difference( + se2, time.col = "time", reference = "ref") + expect_equal(time_diffs, c(-1, 1, 2, -1, NA, -1)) +}) + +# Convert divergence to DataFrame +test_that(".convert_divergence_to_df formats correctly", { + divergence <- c(0.1, 0.2, 0.3, 0, NA, 2) + time_diff <- c(0, 1, 2, 1, 0, NA) + df <- .convert_divergence_to_df( + se, divergence, time_diff, name = "test_div", + name.time = "test_time_diff") + expect_s4_class(df, "DataFrame") + expect_equal(colnames(df), c("test_div", "test_time_diff")) + expect_equal(df$test_div, divergence) + expect_equal(df$test_time_diff, time_diff) +}) + +# Test that works with different counts table +test_that("addBaselineDivergence with multiple assay types", { + assays(se, withDimnames = FALSE) <- list( + counts = count_data, alt_counts = count_data * 2) + se_result <- addBaselineDivergence( + se, time.col = "time", assay.type = "alt_counts") + expect_true("divergence" %in% colnames(colData(se_result))) +}) + +# Test that error occurs if if method is unsupported +test_that("getBaselineDivergence unsupported method", { + expect_error(getBaselineDivergence( + se, time.col = "time", method = "unsupported")) +}) + +# Test that the divergence is calculated correctly for specific reference sample +test_that("addBaselineDivergence with custom reference sample", { + se_result <- addBaselineDivergence( + se, time.col = "time", reference = "Sample1") + expect_equal(colData(se_result)["Sample1", "divergence"], 0) +}) + +# Test that postprocessing works with NA values +test_that(".convert_divergence_to_df with NA divergence values", { + divergence <- c(0.1, NA, 0.3, NA, 0.5, 0.6) + time_diff <- c(0, 1, 2, 1, 0, NA) + df <- .convert_divergence_to_df( + se, divergence, time_diff, name = "test_div", + name.time = "test_time_diff") + expect_s4_class(df, "DataFrame") + expect_true(all(is.na(df$test_div[is.na(divergence)]))) }) diff --git a/tests/testthat/test-getStepwiseDivergence.R b/tests/testthat/test-getStepwiseDivergence.R new file mode 100644 index 0000000..85c3e1e --- /dev/null +++ b/tests/testthat/test-getStepwiseDivergence.R @@ -0,0 +1,270 @@ +# Test: Basic functionality of addStepwiseDivergence +test_that("Basic functionality of addStepwiseDivergence", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + assay.type="counts", dis.fun = vegan::vegdist, method = "bray", + name.time = "time_difference") + expect_equal(class(tse), class(tse2)) +}) + +# Test: Adding new colData field with existing name generates warning +test_that("Adding new colData field with existing name generates warning", { + data(hitchip1006) + tse <- hitchip1006 + tse[["time_difference"]] <- NA + expect_warning(addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + name.time = "time_difference")) +}) + +# Test: Time difference calculation for a specific subject +test_that("Time difference calculation is correct for a specific subject", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + assay.type="counts", dis.fun = vegan::vegdist, method = "bray", + name.time = "time_difference") + obs_diff <- colData(tse2)[ + which(tse2[["subject"]] == "843"), "time_difference"] + exp_diff <- c(NA, diff(colData(tse)[ + which(tse[["subject"]] == "843"), "time"])) + expect_equal(obs_diff, exp_diff) +}) + +# Test: addStepwiseDivergence with n > 1 +test_that("addStepwiseDivergence with n > 1 calculates divergences correctly", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- addStepwiseDivergence( + tse, group = "subject", time.interval = 2, time.col = "time", + assay.type = "counts", dis.fun = vegan::vegdist, method = "bray", + name.time = "time_difference") + time_interval <- 2 + time <- colData(tse2)[which(tse2[["subject"]] == "843"), "time"] + time_diff <- colData(tse2)[which(tse2[["subject"]] == "843"), "time_difference"] + divergence_number <- length(time) - time_interval + divergence_calculated <- length(which(!is.na(time_diff) == TRUE)) + expect_equal(divergence_number, divergence_calculated) +}) + +# Test: Interval check for divergence calculation +test_that("Interval check for divergence calculation", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- addStepwiseDivergence( + tse, group = "subject", time.interval = 2, time.col = "time", + assay.type = "counts", dis.fun = vegan::vegdist, method = "bray", + name.time = "time_difference") + time <- colData(tse2)[which(tse2[["subject"]] == "843"), "time"] + calculated_diff <- time[(1 + 2):length(time)] - + time[seq_len(length(time) - 2)] + manual_diff <- c(rep( + NA, length(time) - length(calculated_diff)), calculated_diff) + expect_equal(colData(tse2)[ + which(tse2[["subject"]] == "843"), "time_difference"], manual_diff) +}) + +# Test: Single time point results in NA divergence values +test_that("Single time point results in NA divergence values", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- tse[, tse[["subject"]] %in% c("900", "843", "139")] + tse2 <- addStepwiseDivergence( + tse2, group = "subject", time.interval = 1, time.col = "time", + assay.type = "counts", dis.fun = vegan::vegdist, method = "bray", + name = "time_divergence", + name.time = "time_difference") + expect_true(all(is.na(colData(tse2)[ + which(duplicated(tse2[["subject"]]) == FALSE), + "time_divergence"]))) +}) + +# Test: Comparing vegan distances (bray vs euclidean) +test_that("Comparing vegan distances (bray vs euclidean)", { + data(hitchip1006) + tse <- hitchip1006 + tse2 <- addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + assay.type = "counts", dis.fun = vegan::vegdist, method = "bray", + name.time = "timedifference", name = "timedivergence") + tse2 <- addStepwiseDivergence( + tse2, group = "subject", time.interval = 1, time.col = "time", + assay.type = "counts", dis.fun = vegan::vegdist, method = "euclidean", + name.time = "timedifference2", name = "timedivergence2") + expect_true(identical(tse2$timedifference, tse2$timedifference2)) + expect_true(!identical(tse2$timedivergence, tse2$timedivergence2)) +}) + +# Test: AltExp functionality in addStepwiseDivergence +test_that("AltExp functionality in addStepwiseDivergence", { + data(hitchip1006) + tse <- hitchip1006 + altExp(tse, "Family") <- agglomerateByRank(tse, rank = "Family") + tse <- addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + altexp = "Family") + altExp(tse, "Family_test") <- addStepwiseDivergence( + altExp(tse, "Family"), group = "subject", time.interval = 1, + time.col = "time", name.time = "timedifference", name = "timedivergence") + expect_equal( + altExp(tse, "Family")$time_diff, + altExp(tse, "Family_test")$timedifference) + expect_equal( + altExp(tse, "Family")$divergence, + altExp(tse, "Family_test")$timedivergence) +}) + +# Test: getStepwiseDivergence output type +test_that("getStepwiseDivergence output type", { + data(hitchip1006) + tse <- hitchip1006 + divergence_result <- getStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + assay.type = "counts", dis.fun = vegan::vegdist, method = "bray") + expect_s4_class(divergence_result, "DFrame") + expect_true(all(c("time_diff", "divergence") %in% names(divergence_result))) +}) + +# Test: Error if time column is missing +test_that("Error if time column is missing", { + data(hitchip1006) + tse <- hitchip1006 + expect_error(addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "nonexistent_time")) +}) + +# Test: Error if specified assay type does not exist +test_that("Error if specified assay type does not exist", { + data(hitchip1006) + tse <- hitchip1006 + expect_error(addStepwiseDivergence( + tse, group = "subject", time.interval = 1, time.col = "time", + assay.type = "nonexistent_assay")) +}) + +# Test: Error if group column is invalid +test_that("Error if group column is invalid", { + data(hitchip1006) + tse <- hitchip1006 + expect_error(addStepwiseDivergence( + tse, group = "invalid_group", time.interval = 1, time.col = "time")) +}) + +# Test that time intervals calculation work +test_that(".get_reference_samples with different time intervals", { + data(hitchip1006) + tse <- hitchip1006 + interval_1 <- .get_reference_samples( + colData(tse), time.col = "time", group = "subject", + reference.method = "stepwise", time.interval = 1) + interval_2 <- .get_reference_samples( + colData(tse), time.col = "time", group = "subject", + reference.method = "stepwise", time.interval = 2) + expect_false(all(interval_1 == interval_2)) +}) + +# Test that get* and add* gives same result +test_that(".get_reference_samples with different time intervals", { + data(hitchip1006) + tse <- hitchip1006 + tse <- addStepwiseDivergence( + tse, group = "subject", time.interval = 2, time.col = "time", + assay.type = "counts", method = "euclidean") + res <- getStepwiseDivergence( + tse, group = "subject", time.interval = 2, time.col = "time", + assay.type = "counts", method = "euclidean") + expect_equal(colData(tse)[, c("divergence", "time_diff")], res) +}) + +# Basic SummarizedExperiment for testing +col_data <- DataFrame( + time = c(0, 1, 2, 1, 2, 0), + group = c("A", "A", "A", "B", "B", "B"), + row.names = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6")) +count_data <- matrix(c(10, 20, 30, 40, 50, 60), ncol = 6, byrow = TRUE) +se <- SummarizedExperiment(assays = list(counts = count_data), colData = col_data) + +# Input validation for getStepwiseDivergence +test_that("getStepwiseDivergence input validations", { + expect_error(getStepwiseDivergence(se, time.col = "nonexistent")) + expect_error(getStepwiseDivergence(se, time.col = "time", assay.type = "unknown")) + expect_error(getStepwiseDivergence(se, group = "nonexistent")) + expect_error(getStepwiseDivergence(se, reference = "nonexistent")) + expect_error(getStepwiseDivergence(se, name = "nonexistent")) + expect_error(getStepwiseDivergence(se, name.time = "nonexistent")) +}) + +# Dissimilarity calculation test +test_that("getStepwiseDivergence dissimilarity calculation", { + result <- getStepwiseDivergence(se, time.col = "time", method = "bray") + expect_s4_class(result, "DataFrame") + expect_true(all(c("divergence", "time_diff") %in% colnames(result))) +}) + +# Correct time difference calculation test +test_that("getStepwiseDivergence correct time difference calculation", { + result <- getStepwiseDivergence(se, time.col = "time", method = "bray") + expect_true(any(is.na(result$time_diff))) +}) + +# addStepwiseDivergence column addition test +test_that("addStepwiseDivergence adds columns to colData", { + se_result <- addStepwiseDivergence(se, time.col = "time", method = "bray") + expect_true("divergence" %in% colnames(colData(se_result))) + expect_true("time_diff" %in% colnames(colData(se_result))) +}) + +# Custom column naming test for addStepwiseDivergence +test_that("addStepwiseDivergence handles custom column names", { + se_result <- addStepwiseDivergence( + se, time.col = "time", name = "custom_div", + name.time = "custom_time_diff") + expect_true("custom_div" %in% colnames(colData(se_result))) + expect_true("custom_time_diff" %in% colnames(colData(se_result))) +}) + +# Helper function: assign correct baselines +test_that(".add_reference_samples_to_coldata assigns correct baselines", { + res <- .add_reference_samples_to_coldata( + se, time.col = "time", group = "group") + expect_true("temporal_reference_for_divergence" %in% colnames(colData(res[[1]]))) +}) + +# Reference sample assignments +test_that(".get_reference_samples stepwise", { + stepwise <- .get_reference_samples( + colData(se), time.col = "time", group = "group", + reference.method = "stepwise", time.interval = 1) + expect_equal(stepwise, c( + NA, "Sample1", "Sample2", "Sample6", "Sample4", NA)) +}) + + +# Test that works with different counts table +test_that("addStepwiseDivergence with multiple assay types", { + assays(se, withDimnames = FALSE) <- list( + counts = count_data, alt_counts = count_data * 2) + se_result <- addStepwiseDivergence( + se, time.col = "time", assay.type = "alt_counts") + expect_true("divergence" %in% colnames(colData(se_result))) +}) + +# Test that error occurs if if method is unsupported +test_that("getStepwiseDivergence unsupported method", { + expect_error(getStepwiseDivergence( + se, time.col = "time", method = "unsupported")) +}) + +# Test that the divergence is calculated correctly for specific reference sample +test_that("addStepwiseDivergence with custom reference sample", { + res <- getStepwiseDivergence( + se, time.col = "time", group = "group") + se[["reference"]] <- c(NA, "Sample1", "Sample2", "Sample6", "Sample4", NA) + time_diff <- c(NA, 1, 1, 1, 1, NA) + ref <- getDivergence(se, reference = "reference") + expect_equal(res[["divergence"]], ref) + expect_equal(res[["time_diff"]], time_diff) +}) diff --git a/tests/testthat/test-getTimeDivergence.R b/tests/testthat/test-getTimeDivergence.R deleted file mode 100644 index e5fce6f..0000000 --- a/tests/testthat/test-getTimeDivergence.R +++ /dev/null @@ -1,125 +0,0 @@ -test_that("getStepwiseDivergence", { - data(hitchip1006) - tse <- hitchip1006 - # Subset to speed up computing - # Just pick 4 subjects with 1-5 time points - tse <- tse[, colData(tse)$subject %in% c("900", "934", "843", "875")] - tse2 <- getStepwiseDivergence(tse, group = "subject", - time_interval = 1, - time_field = "time") - - # Trying to add new coldata field with the same name - expect_warning(tse2 <- getStepwiseDivergence(tse2, group = "subject", - time_interval = 1, - time_field = "time")) - - # Input and output classes should match - expect_equal(class(tse), class(tse2)) - - # A subject to check time difference calculation - obs_diff <- colData(tse2)[which(colData(tse2)[, "subject"] == "843"), "time_difference"] - exp_diff <- c(NA,diff(colData(tse)[which(colData(tse)[, "subject"] == "843"), "time"])) - expect_equal(obs_diff, exp_diff) - - # n > 1 - tse3 <- getStepwiseDivergence(tse, group = "subject", - time_interval = 2, - time_field = "time") - time_invertal <- 2 - - time3 <- colData(tse3)[, "time"][which(colData(tse3)[, "subject"] == "843")] - - time_dif_3 <- colData(tse3)[, "time_difference"][which(colData(tse3)[, "subject"] == "843")] - - # number of divergences (n-k) check - divergence_number <- length(time3) - time_invertal - - divergence_calculated <- length(which(!is.na(time_dif_3) == TRUE)) - - expect_equal(divergence_number, divergence_calculated) - - # interval check - calculated_diff <- time3[(1+ 2):length(time3)] - time3[seq_len(length(time3)-2)] - - manual_diff <- c(rep(NA, length(time3) - length(calculated_diff)), calculated_diff) - - expect_equal(time_dif_3, manual_diff) - - # object with single time point has NA instead of divergence values - sub_hitchip <- hitchip1006[, colData(hitchip1006)$subject %in% c("900","843", "139")] - subset <- getStepwiseDivergence(sub_hitchip, group = "subject", - time_interval = 1, - time_field = "time") - - expect_true(all(is.na(colData(subset)[, "time_divergence"][which(duplicated(colData(subset)[, "subject"]) == FALSE)]))) - - - # Test vegan distances - tse2 <- getStepwiseDivergence(tse, group = "subject", - time_interval = 1, - time_field = "time", - FUN=vegan::vegdist, - method="bray", - name_timedifference="timedifference", - name_divergence="timedivergence") - # Test vegan distances - tse2 <- getStepwiseDivergence(tse2, group = "subject", - time_interval = 1, - time_field = "time", - FUN=vegan::vegdist, - method="euclidean", - name_timedifference="timedifference2", - name_divergence="timedivergence2") - - # Time differences should still match - expect_true(identical(tse2$timedifference, tse2$timedifference2)) - # ... but divergences should be different (bray vs. euclid) - expect_true(!identical(tse2$timedivergence, tse2$timedivergence2)) - - ## Test with ordination values - tse2 <- scater::runMDS(tse2, FUN = vegan::vegdist, method = "bray", - name = "PCoA_BC", exprs_values = "counts", - na.rm = TRUE, ncomponents=4) - # testing with all ordination components; n_dimred=NULL --> all 4 components - tse2 <- getStepwiseDivergence(tse2, group = "subject", - time_interval = 1, - time_field = "time", - name_timedifference="timedifference_ord_4", - name_divergence="timedivergence_ord_4", - dimred = "PCoA_BC", - FUN=vegan::vegdist, - method="euclidean") - # Time differences should still match - expect_true(identical(tse2$timedifference_ord_4, tse2$timedifference)) - # ordination based divergence values should not be equal to the ones on counts - expect_true(!identical(tse2$timedivergence_ord_4, tse2$timedivergence)) - - # testing with 2 ordination components - tse2 <- getStepwiseDivergence(tse2, group = "subject", - time_interval = 1, - time_field = "time", - name_timedifference="timedifference_ord_2", - name_divergence="timedivergence_ord_2", - dimred = "PCoA_BC", - n_dimred = 2, - FUN=vegan::vegdist, - method="euclidean") - # Time differences should still match - expect_true(identical(tse2$timedifference_ord_2, tse2$timedifference_ord_4)) - # not same values as using 4 components - expect_true(!identical(tse2$timedivergence_ord_2, tse2$timedivergence_ord_4)) - - ## testing with altExp - SingleCellExperiment::altExp(tse2, "Family") <- mia::agglomerateByRank(tse2, rank="Family") - tse2 <- getStepwiseDivergence(tse2, group = "subject", - time_interval = 1, - time_field = "time", - altexp="Family", - name_timedifference="timedifference_Fam", - name_divergence="timedivergence_Fam") - # Time differences should still match - expect_true(identical(tse2$timedifference_Fam, tse2$timedifference)) - # divergence values based on Family rank counts should not be equal to the - # ones with Genus counts - expect_true(!identical(tse2$timedivergence_Fam, tse2$timedivergence)) -}) diff --git a/vignettes/articles/minimalgut.Rmd b/vignettes/articles/minimalgut.Rmd index 1815aee..9e99cde 100644 --- a/vignettes/articles/minimalgut.Rmd +++ b/vignettes/articles/minimalgut.Rmd @@ -82,15 +82,14 @@ here hour zero until the end of the experiment. ```{r} ## Divergence from baseline i.e from hour zero. -tse <- mia::relAbundanceCounts(minimalgut) # get relative abundance -tse <- getBaselineDivergence(tse, +tse <- transformAssay(minimalgut, method = "relabundance") # get relative abundance +tse <- addBaselineDivergence(tse, group = "StudyIdentifier", - time_field = "Time.hr", - name_divergence = "divergence_from_baseline", - name_timedifference = "time_from_baseline", - assay.type="relabundance", - FUN = vegan::vegdist, - method="bray") + time.col = "Time.hr", + name = "divergence_from_baseline", + name.time = "time_from_baseline", + assay.type = "relabundance", + method = "bray") ``` @@ -124,7 +123,7 @@ Now visualize abundance of *Blautia hydrogenotrophica* using the `miaViz::plotSe ```{r fig.height=4, fig.width=8} library(miaViz) -plotSeries(mia::relAbundanceCounts(minimalgut), +plotSeries(transformAssay(minimalgut, method = "relabundance"), x = "Time.hr", y = "Blautia_hydrogenotrophica", colour_by = "Species", @@ -143,26 +142,25 @@ plotSeries(mia::relAbundanceCounts(minimalgut), ## Visualize the rate (slope) of divergence Sample dissimilarity between consecutive time steps(step size n >= 1) within -a group(subject, age, reaction chamber, etc.) can be calculated by `getStepwiseDivergence`. If we normalize this by the time interval, this gives an approximate slope for the change. +a group(subject, age, reaction chamber, etc.) can be calculated by `addStepwiseDivergence`. If we normalize this by the time interval, this gives an approximate slope for the change. -```{r getStepwiseDivergence, fig.height=4, fig.width=8, warning=FALSE} +```{r addStepwiseDivergence, fig.height=4, fig.width=8, warning=FALSE} # Load libraries library(miaTime) library(dplyr) -# Sort samples by time (necessary for getStepwiseDivergence) +# Sort samples by time (necessary for addStepwiseDivergence) tse <- tse[, order(colData(tse)$Time_hr_num)] # Divergence between consecutive time points -tse <- getStepwiseDivergence(tse, group = "StudyIdentifier", +tse <- addStepwiseDivergence(tse, group = "StudyIdentifier", time_interval = 1, - time_field = "Time_hr_num", - name_divergence = "divergence_from_previous_step", - name_timedifference = "time_from_previous_step", - assay.type ="relabundance", - FUN = vegan::vegdist, - method="bray") + time.col = "Time_hr_num", + name = "divergence_from_previous_step", + name.time = "time_from_previous_step", + assay.type = "relabundance", + method = "bray") # We have now new fields added in the colData: # time_from_previous_step, divergence_from_previous_step @@ -182,8 +180,6 @@ p <- tse |> ggplot(aes(x=time_from_baseline, print(p) ``` - - ## Moving average of the slope This shows how to calculate and plot moving average for the variable of interest (here: slope). @@ -193,7 +189,7 @@ This shows how to calculate and plot moving average for the variable of interest colData(tse)$slope <- colData(tse)$divergence_from_previous_step / colData(tse)$time_from_previous_step # Split by group and perform operation -tselist <- mia::splitOn(tse, "StudyIdentifier") +tselist <- splitOn(tse, "StudyIdentifier") # colData(tse)$divergence_from_previous_step addmean <- function (x, k, field, field_name) { @@ -212,7 +208,7 @@ addmean <- function (x, k, field, field_name) { tselist2 <- lapply(tselist, function (x) {addmean(x, k=3, field = "slope", field_name = "sliding_average")}) # Merge back -tse <- mia::mergeSEs(tselist2) +tse <- mergeSEs(tselist2) # Visualize theme_set(theme_bw(10))