diff --git a/R/genSlabLabels.R b/R/genSlabLabels.R index 443b2cd28..f5c234ee8 100644 --- a/R/genSlabLabels.R +++ b/R/genSlabLabels.R @@ -1,54 +1,52 @@ ## TODO: documentation / generalization -# note source data must be normalixed via slice() first, e.g. all share the same number of horizons +# note source data must be "normalized" via dice() first; assumes each profile has the same number of horizons # generate labels for slabs -genSlabLabels <- function(slab.structure=1, max.d, n.profiles) { +genSlabLabels <- function(slab.structure = 1, max.d, n.profiles) { # fixed-size slabs - if(length(slab.structure) == 1) { + if (length(slab.structure) == 1) { # generate sequence of segment labels - seg.label <- rep(1:ceiling(max.d / slab.structure), each=slab.structure, length=max.d) + seg.label <- rep(1:ceiling(max.d / slab.structure), each = slab.structure, length = max.d) + # general segment labels - seg.label.levels <- tapply(1:max.d, seg.label, function(i) {r <- range(i); paste(c(r[1]-1, r[2]), collapse='-') } ) + seg.label.levels <- tapply(1:max.d, seg.label, + function(i) { + r <- range(i, na.rm = TRUE) + paste(c(r[1] - 1, r[2]), collapse = '-') + }) } # user-defined slabs - if(length(slab.structure) > 1) { - # trival case where segments start from 0 - if(slab.structure[1] == 0 & length(slab.structure) > 2) - seg.label <- rep(slab.structure[-1], times=diff(slab.structure))[1:max.d] + if (length(slab.structure) > 1) { + if (slab.structure[1] == 0 & length(slab.structure) > 2) { + # trivial case where segments start from 0 + + seg.label <- rep(slab.structure[-1], times = diff(slab.structure))[1:max.d] - # other case: user defines an arbitrary lower and upper limit - else { - if(length(slab.structure) != 2) + } else { + # other case: user defines an arbitrary lower and upper limit + if (length(slab.structure) != 2) stop('user-defined slab boundaries must either start from 0, or contain two values between 0 and the max soil depth') - # proceed + # calculate thickness of slab slab.thickness <- diff(slab.structure) - # how many slices of NA before the slab? - padding.before <- rep(NA, times=slab.structure[1]) - # how many slices of NA afer the slab - padding.after <- rep(NA, times=abs(max.d - slab.structure[2])) + # make a new label for the slab - new.label <- paste(slab.structure, collapse='-') - # generate an index for the slab - slab.idx <- rep(new.label, times=slab.thickness) - # generate the entire index: padding+slab+padding = total number of slices (max_d) - # seg.label <- c(padding.before, slab.idx, padding.after) - seg.label <- slab.idx + new.label <- paste(slab.structure, collapse = '-') + + slab.idx <- rep(new.label, times = slab.thickness) + seg.label <- slab.idx } # generate segment labels - seg.label.levels <- sapply(1:(length(slab.structure)-1), function(i) paste(c(slab.structure[i], slab.structure[i+1]), collapse='-')) + seg.label.levels <- sapply(1:(length(slab.structure) - 1), function(i) { + paste(c(slab.structure[i], slab.structure[i + 1]), collapse = '-') + }) } # covert into a factor that can be used to split profiles into slabs - res <- factor(rep(seg.label, times=n.profiles), labels=seg.label.levels) + res <- factor(rep(seg.label, times = n.profiles), labels = seg.label.levels) return(res) } - - - - - diff --git a/R/slab.R b/R/slab.R index 5fb326d6f..3b7c1e81a 100644 --- a/R/slab.R +++ b/R/slab.R @@ -1,20 +1,16 @@ - - # default slab function for categorical variables # returns a named vector of results # this type of function is compatible with aggregate() -.slab.fun.factor.default <- function(values, cpm) { +.slab.fun.factor.default <- function(values, cpm, ...) { - # probabilities are relative to number of contributing profiles - if(cpm == 1) { - tb <- table(values, useNA='no') + if (cpm == 1) { + # probabilities are relative to number of contributing profiles + tb <- table(values, useNA = 'no') # convert to proportions pt <- prop.table(tb) - } - - # probabilities are relative to total number of profiles - else if(cpm == 2) { - tb <- table(values, useNA='always') + } else if (cpm == 2) { + # probabilities are relative to total number of profiles + tb <- table(values, useNA = 'always') # convert to proportions, # the last column will be named 'NA', and contains the tally of NAs --> remove it pt <- prop.table(tb) @@ -27,62 +23,53 @@ names(pt) <- make.names(levels(values)) return(pt) - } - - -## TODO: make these exported functions with documentation (https://github.com/ncss-tech/aqp/issues/99) - +} +.slab.fun.factor.weighted <- function(values, w, na.rm = TRUE, ...) { + # TODO +} + # default slab function for continuous variables # returns a named vector of results # this type of function is compatible with aggregate() -.slab.fun.numeric.default <- function(values) { - q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - ## 2019-10-30: dropping Hmisc as a suggested package - # res <- Hmisc::hdquantile(values, probs=q.probs, na.rm=TRUE) - res <- quantile(values, probs=q.probs, na.rm=TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep='') - return(res) +.slab.fun.numeric.default <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE, ...) { + .slab.fun.numeric.fast(values, probs = probs, na.rm = na.rm, ...) +} + +# for a site or horizon level weight column, use Hmisc::wtd.quantile +# note that "frequency weights" are assumed to be most common use case, so normwt=FALSE by default, +# normwt=TRUE can be passed as optional argument for slab.fun from slab() interface +.slab.fun.numeric.weighted <- function(values, w, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), normwt = FALSE, na.rm = TRUE, ...) { + if (!requireNamespace('Hmisc', quietly = TRUE)) + stop('please install the `Hmisc` package to use `wtd.quantile()` method', call. = FALSE) + res <- try(Hmisc::wtd.quantile(values, w, probs = probs, na.rm = na.rm, normwt = normwt), silent = TRUE) + if (inherits(res, 'try-error') && grepl("zero non-NA points", res[1])) { + res <- rep(NA_real_, length(probs)) + } else { + names(res) <- paste('p.q', round(probs * 100), sep = '') + } + return(res) } # easy specification of Hmisc::hdquantile if available -.slab.fun.numeric.HD <- function(values) { +.slab.fun.numeric.HD <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE, ...) { # sanity check, need this for color distance eval - if(!requireNamespace('Hmisc', quietly = TRUE)) - stop('pleast install the `Hmisc` package.', call.=FALSE) - - q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - - res <- Hmisc::hdquantile(values, probs=q.probs, na.rm=TRUE) - - names(res) <- paste('p.q', round(q.probs * 100), sep='') + if (!requireNamespace('Hmisc', quietly = TRUE)) + stop('please install the `Hmisc` package to use `hdquantile()` method', call. = FALSE) + + res <- Hmisc::hdquantile(values, probs = probs, na.rm = na.rm) + + names(res) <- paste('p.q', round(probs * 100), sep = '') return(res) } -# basic quantile evaluation, better for large datasets -.slab.fun.numeric.fast <- function(values) { - q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - res <- quantile(values, probs=q.probs, na.rm=TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep='') - return(res) +# basic quantile evaluation, better for large data sets +.slab.fun.numeric.fast <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE, ...) { + res <- quantile(values, probs = probs, na.rm = na.rm) + names(res) <- paste('p.q', round(probs * 100), sep = '') + return(res) } - -# ## TODO: not yet implemented -# # default slab function for weighted, continuous variables -# # returns a named vector of results -# # this type of function is compatible with aggregate() -# # NOTE: nw (normalize-weights) argument will affect the results! -# .slab.fun.wtd.numeric.default <- function(values, w, nw=TRUE) { -# q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) -# res <- wtd.quantile(values, weights=w, probs=q.probs, na.rm=TRUE, normwt=nw) -# names(res) <- paste('p.q', round(q.probs * 100), sep='') -# return(res) -# } - - -## note: slab() uses slice() to resample to 1cm slices, to max(x) or slab.structure[2] if defined - # SoilProfileCollection method # object: SoilProfileCollection # fm: formula defining aggregation @@ -90,20 +77,23 @@ # progress: plyr-progress display # slab.fun: aggregate function applied to data chunks (must return a single row / chunk) # cpm: class probability normalization mode -# weights: character vector naming column containing weights +# weights: character vector naming column in site slot containing weights # ... : extra arguments passed on to slab.fun - -# this is about 40% slower than the old method -.slab <- function(object, fm, slab.structure=1, strict=FALSE, slab.fun=.slab.fun.numeric.default, cpm=1, weights=NULL, ...){ - # issue a message for now that things have changed - # message('usage of slab() has changed considerably, please see the manual page for details') - +.slab <- function(object, + fm, + slab.structure = 1, + strict = FALSE, + slab.fun = slab_function(method = "numeric"), + cpm = 1, + weights = NULL, + ...) { + + # define keywords for data.table + .SD <- NULL; .N <- NULL; value <- NULL + # get extra arguments: length of 0 if no extra arguments extra.args <- list(...) - ## important: change the default behavior of data.frame and melt - opt.original <- options(stringsAsFactors = FALSE) - # get unique list of names in object object.names <- unique(unlist(names(object))) @@ -120,109 +110,78 @@ original.levels <- NULL # extract components of the formula: - g <- all.vars(update(fm, .~0)) # left-hand side - vars <- all.vars(update(fm, 0~.)) # right-hand side - + g <- all.vars(update(fm, . ~ 0)) # left-hand side + vars <- all.vars(update(fm, 0 ~ .)) # right-hand side + # sanity check: - if(! inherits(fm, "formula")) - stop('must provide a valid formula: groups ~ var1 + var2 + ...', call.=FALSE) - + if (!inherits(fm, "formula")) + stop('must provide a valid formula: groups ~ var1 + var2 + ...', call. = FALSE) + # check for bogus left/right side problems with the formula - if(any(g %in% object.names) == FALSE & g != '.') # bogus grouping column - stop('group name either missing from formula, or does match any columns in dataframe', call.=FALSE) - - if(any(vars %in% object.names) == FALSE) # bogus column names in right-hand side - stop('column names in formula do not match column names in dataframe', call.=FALSE) + if (any(g %in% object.names) == FALSE & g != '.') # bogus grouping column + stop('group name either missing from formula, or does match any columns in dataframe', call. = FALSE) - - ## old slice() version ############################### + if (any(vars %in% object.names) == FALSE) # bogus column names in right-hand side + stop('column names in formula do not match column names in dataframe', call. = FALSE) # make formula for slicing - ## TODO: slice() returns an extra row, so only request slices to max-1 - fm.slice <- formula(paste('0:', max(object)-1, ' ~ ', paste(vars, collapse=' + '), sep='')) - - # short-cut for user-defined slab - ## TODO: slice() returns an extra row, so only request slices to max-1 - if(length(slab.structure) == 2 ) - fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2]-1, ' ~ ', paste(vars, collapse=' + '), sep='')) - - # slice into 1cm increments, result is a data.frame - data <- slice(object, fm.slice, strict=strict, just.the.data=TRUE) - - ####################################################### - - - ## dice() version #################################### - # https://github.com/ncss-tech/aqp/issues/115 - - ## TODO: - # * enforce filling / check assumptions about length - # * consider total re-write vs. adaptation - - # # make formula for slicing - # fm.slice <- formula(paste('0:', max(object), ' ~ ', paste(vars, collapse=' + '), sep='')) - # - # # short-cut for user-defined slab - # if(length(slab.structure) == 2 ) - # fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2], ' ~ ', paste(vars, collapse=' + '), sep='')) - # - # # slice into 1cm increments, result is a data.frame - # data <- dice(x = object, fm = fm.slice, strict = strict, SPC = FALSE, pctMissing = TRUE) - - ####################################################### - - + if (length(slab.structure) == 2) { + # user-defined single slab + fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2], ' ~ ', paste(vars, collapse = ' + '), sep = '')) + } else { + fm.slice <- formula(paste('0:', max(object), ' ~ ', paste(vars, collapse = ' + '), sep = '')) + } + # slice into 1cm increments, result is a data.frame + data <- dice(x = object, fm = fm.slice, strict = strict, SPC = FALSE, pctMissing = TRUE) + + # Note: in this case we need to subtract the extra slice included by slice()/dice() + # do it to sliced result so that the genSlabLabels have the correct length + ldx <- data[[horizonDepths(object)[2]]] > slab.structure[2] + if (length(slab.structure) == 2 && any(ldx, na.rm = TRUE)) { + data <- data[which(!ldx), ] + } # extract site data site.data <- site(object) # check groups for NA, if grouping variable was given - if(g != '.') - if(any(is.na(site.data[, g]))) - stop('grouping variable must not contain NA', call.=FALSE) - - ### !!! this runs out of memory when groups contains NA !!! - ## this is generating a lot of extra objects and possibly wasting memory + if (g != '.' && any(is.na(site.data[, g]))) { + stop('grouping variable must not contain NA', call. = FALSE) + } + # merge site data back into the result, this would include site-level weights - data <- merge(x = data, y = site.data, by=object.ID, all.x=TRUE, sort=FALSE) - - # clean-up - rm(object, site.data) - gc() + data <- merge(x = data, y = site.data, by = object.ID, all.x = TRUE, sort = FALSE) # check variable classes - if(length(vars) > 1) { + if (length(vars) > 1) { vars.numeric.test <- sapply(data[, vars], is.numeric) } else { vars.numeric.test <- is.numeric(data[[vars]]) } - # sanity check: all numeric, or single character/factor - if(any(! vars.numeric.test) & length(vars) > 1) { + if (any(!vars.numeric.test) & length(vars) > 1) { stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call. = FALSE) } - # check for single categorical variable, and convert to factor - if(length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) { + if (length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) { # if we have a character, then convert to factor - if(inherits(data[[vars]],'character')) { + if (inherits(data[[vars]],'character')) { message('Note: converting categorical variable to factor.') data[[vars]] <- factor(data[[vars]]) } # check for weights - if(!missing(weights)) { - stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE) + if (!missing(weights)) { + slab.fun <- .slab.fun.factor.weighted + } else { + # re-set default function, currently no user-supply-able option + slab.fun <- .slab.fun.factor.default } - - # re-set default function, currently no user-supply-able option - slab.fun <- .slab.fun.factor.default - # add extra arguments required by this function # note that we cannot accept additional arguments when processing categorical values extra.args <- list(cpm = cpm) @@ -236,127 +195,128 @@ .factorFlag <- FALSE } - - #### - #### optimization note: use of factor labels could be slowing things down... - #### + ## Note: use of factor labels could be slowing things down... ## Note: this assumes ordering is correct in source data / segment labels - ## TODO: make sure that nrow(data) == genSlabLabels(slab.structure = slab.structure, max.d = max.d, n.profiles = n.profiles) ## TODO: investigate use of split() to speed things up, no need to keep everything in the safe DF: - ## - ## l <- split(data, data$seg.label, drop=FALSE) - ## - ## ... parallel processing with furrr + ## l <- split(data, data$seg.label, drop=FALSE) # add segmenting label to data data$seg.label <- genSlabLabels(slab.structure = slab.structure, max.d = max.d, n.profiles = n.profiles) # if there is no left-hand component in the formula, we are aggregating all data in the collection - if(g == '.') { + if (g == '.') { g <- 'all.profiles' # add new grouping variable to horizons data[, g] <- 1 } - - - ## - ## TODO: adding weighted-aggregate functionality here - ## we can't use aggregate() for this - ## we can use data.table methods - + + if (length(weights) > 1) { + stop("`weights` argument should be a character vector of length one containing (site-level) weight column name", call. = FALSE) + } + + if (!is.null(weights) && !weights %in% siteNames(object)) { + stop("column '", weights, "' not present in site data", call. = FALSE) + } + + # convert wide -> long format + d.long <- data.table::melt( + as.data.table(data[which(!is.na(data$seg.label)), ]), + id.vars = unique(c(object.ID, 'seg.label', g, weights)), + measure.vars = vars + ) + + # make a formula for aggregate() + aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep = '')) + # check for weights - if(!missing(weights)) - stop('weighted aggregation is not yet supported', call.=FALSE) - - ## TODO: why would this ever happen? - # throwing out those rows with an NA segment label - seg.label.is.not.NA <- which(!is.na(data$seg.label)) - - # convert into long format - # d.long.df <- reshape::melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars) - - # convert wide -> long format - # using data.table::melt() - # note that this will not preserve factor levels when 'vars' is categorical - # must call unique() on `id.vars` - d.long <- melt( - as.data.table(data[seg.label.is.not.NA, ]), - id.vars = unique(c(object.ID, 'seg.label', g)), - measure.vars = vars, - ) - - # convert back to data.frame - d.long <- as.data.frame(d.long) - - # reset factor levels in d.long[[value]] - if(.factorFlag) { - d.long[['value']] <- factor(d.long[['value']], levels = original.levels) - } - - # make a formula for aggregate() - aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep='')) - - ## - ## TODO: this might be the place to implement parallel code in furrr - ## 1. split into a list based on number of cores/cpus available - ## 2. aggregate using seg.label + variable in parallel - ## 3. combine results (a list of data.frames) - ## - ## NOPE: use data.table which will automatically scale to multiple threads - ## - - + if (!missing(weights)) { + if (missing(slab.fun)) { + # default _weighted_ aggregation fun is .slab.fun.numeric.weighted + FUN <- .slab.fun.numeric.weighted + # custom weighted aggregation fun should take first argument as values + # second argument is "weights" argument + } else { + FUN <- slab.fun + } + wt <- eval(weights) + d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, .SD[[wt]], ...))), by = c('variable', g, 'seg.label')]) + d.slabbed$contributing_fraction <- d.long[, sum(!is.na(.SD[["value"]])) / .N, by = c('variable', g, 'seg.label')]$V1 - # process chunks according to group -> variable -> segment - # NA values are not explicitly dropped - if(length(extra.args) == 0) - d.slabbed <- aggregate(aggregate.fm, data=d.long, na.action=na.pass, FUN=slab.fun) - - # optionally account for extra arguments - else { - the.args <- c(list(aggregate.fm, data=d.long, na.action=na.pass, FUN=slab.fun), extra.args) - d.slabbed <- do.call(what='aggregate', args=the.args) + } else { + # convert back to data.frame + d.long <- as.data.frame(d.long) + + # reset factor levels in d.long[[value]] + if (.factorFlag) { + d.long[['value']] <- factor(d.long[['value']], levels = original.levels) + } + + # process chunks according to group -> variable -> segment + # NA values are not explicitly dropped + if (length(extra.args) == 0) { + d.slabbed <- aggregate(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun) + } else { + # optionally account for extra arguments + the.args <- c(list(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun), extra.args) + d.slabbed <- do.call(what = 'aggregate', args = the.args) + } + + # if slab.fun returns a vector of length > 1 we must: + # convert the complex data.frame returned by aggregate into a regular data.frame + # the column 'value' is a matrix with the results of slab.fun + if (inherits(d.slabbed$value, 'matrix')) { + d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value) + } + + # ensure that the names returned from slab.fun are legal + names(d.slabbed) <- make.names(names(d.slabbed)) + + # estimate contributing fraction + d.slabbed$contributing_fraction <- aggregate( + aggregate.fm, + data = d.long, + na.action = na.pass, + FUN = function(i) { + length(na.omit(i)) / length(i) + } + )$value + } - - - # if slab.fun returns a vector of length > 1 we must: - # convert the complex data.frame returned by aggregate into a regular data.frame - # the column 'value' is a matrix with the results of slab.fun - if(inherits(d.slabbed$value, 'matrix')) - d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value) - - # ensure that the names returned from slab.fun are legal - names(d.slabbed) <- make.names(names(d.slabbed)) - - # convert the slab.label column into top/bottom as integers - slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-') - d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1])) - d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2])) - - # estimate contributing fraction - d.slabbed$contributing_fraction <- aggregate(aggregate.fm, data=d.long, na.action=na.pass, FUN=function(i) {length(na.omit(i)) / length(i)})$value - - + + # rename default data.table value column names + unn <- grepl("^V\\d+$", colnames(d.slabbed)) + if (any(unn)) { + colnames(d.slabbed)[unn] <- paste0("value", ifelse(sum(unn) == 1, "", 1:sum(unn))) + } + + # convert the slab.label column into top/bottom as integers + slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-') + d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1])) + d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2])) + # remove seg.label from result d.slabbed$seg.label <- NULL # if categorical data have been aggregated, set an attribute with the original levels # this allows one to recover values that are not legal column names # e.g. 2Bt is corrupted to X2Bt - if(! is.null(original.levels)) + if (!is.null(original.levels)) { attr(d.slabbed, 'original.levels') <- original.levels - - # reset options: - options(opt.original) - - # done + } + return(d.slabbed) } # setup generic function -# if (!isGeneric("slab")) - setGeneric("slab", function(object, fm, slab.structure=1, strict=FALSE, slab.fun=.slab.fun.numeric.default, cpm=1, weights=NULL, ...) standardGeneric("slab")) +setGeneric("slab", function(object, + fm, + slab.structure = 1, + strict = FALSE, + slab.fun = slab_function(method = "numeric"), + cpm = 1, + weights = NULL, + ...) + standardGeneric("slab")) -# method dispatch #' Slab-Wise Aggregation of SoilProfileCollection Objects #' #' Aggregate soil properties along user-defined `slabs`, and optionally within @@ -373,6 +333,8 @@ #' \code{stats::quantile} from the Hmisc package, which requires at least 2 #' observations per chunk. Note that if `group` is a factor it must not contain #' NAs. +#' +#' `slab()` uses `dice()` to "resample" profiles to 1cm slices from depth 0 to `max(x)` (or `slab.structure[2]`, if defined). #' #' Sometimes \code{slab} is used to conveniently re-arrange data vs. aggregate. #' This is performed by specifying \code{identity} in \code{slab.fun}. See @@ -416,7 +378,7 @@ #' either: number of profiles with data at the current slice (cpm=1), or by the #' number of profiles in the collection (cpm=2). Mode 1 values will always sum #' to the contributing fraction, while mode 2 values will always sum to 1. -#' @param weights Column name containing weights. NOT YET IMPLEMENTED +#' @param weights Column name containing site-level weights #' @param \dots further arguments passed to \code{slab.fun} #' @return Output is returned in long format, such that slice-wise aggregates #' are returned once for each combination of grouping level (optional), @@ -526,7 +488,6 @@ #' scales=list(x=list(alternating=1)) #' ) #' -#' #' ## #' ## categorical variable example #' ## @@ -780,6 +741,7 @@ #' #' #' +# # TODO: update this when weights are implemented #' ## weighted-aggregation -- NOT YET IMPLEMENTED -- #' # load sample data, upgrade to SoilProfileCollection #' data(sp1) @@ -790,4 +752,24 @@ #' sp1$site.wts <- runif(n=length(sp1), min=20, max=100) #' } #' -setMethod(f='slab', signature='SoilProfileCollection', definition=.slab) +setMethod(f = 'slab', + signature = 'SoilProfileCollection', + definition = .slab) + +#' @param method one of `"numeric"`, `"factor"`, `"hd"`, `"weighted.numeric"`, `"weighted.factor"`, `"fast"` +#' @details `slab_function()`: The default `"numeric"` aggregation method is the `"fast"` numeric (quantile) method. Additional methods include `"factor"` for categorical data, `"hd"` to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "`weighted`" to use a weighted quantile method from the Hmisc package +#' @return `slab_function()`: return an aggregation function based on the `method` argument +#' @rdname slab-methods +#' @export +slab_function <- function(method = c("numeric", "factor", "hd", "weighted.numeric", "weighted.factor", "fast")) { + switch(method, + numeric = .slab.fun.numeric.default, # uses "fast" numeric method + factor = .slab.fun.factor.default, + weighted.factor = .slab.fun.factor.weighted, + hd = .slab.fun.numeric.HD, + weighted.numeric = .slab.fun.numeric.weighted, + weighted.factor = .slab.fun.numeric.weighted, + fast = .slab.fun.numeric.fast, + .slab.fun.numeric.default + ) +} diff --git a/man/slab-methods.Rd b/man/slab-methods.Rd index 305a43998..618c6cde1 100644 --- a/man/slab-methods.Rd +++ b/man/slab-methods.Rd @@ -7,6 +7,7 @@ \alias{slab2} \alias{genSlabLabels} \alias{slab,SoilProfileCollection-method} +\alias{slab_function} \title{Slab-Wise Aggregation of SoilProfileCollection Objects} \usage{ \S4method{slab}{SoilProfileCollection}( @@ -14,11 +15,15 @@ fm, slab.structure = 1, strict = FALSE, - slab.fun = .slab.fun.numeric.default, + slab.fun = slab_function(method = "numeric"), cpm = 1, weights = NULL, ... ) + +slab_function( + method = c("numeric", "factor", "hd", "weighted.numeric", "weighted.factor", "fast") +) } \arguments{ \item{object}{a SoilProfileCollection} @@ -42,9 +47,11 @@ either: number of profiles with data at the current slice (cpm=1), or by the number of profiles in the collection (cpm=2). Mode 1 values will always sum to the contributing fraction, while mode 2 values will always sum to 1.} -\item{weights}{Column name containing weights. NOT YET IMPLEMENTED} +\item{weights}{Column name containing site-level weights} \item{\dots}{further arguments passed to \code{slab.fun}} + +\item{method}{one of \code{"numeric"}, \code{"factor"}, \code{"hd"}, \code{"weighted.numeric"}, \code{"weighted.factor"}, \code{"fast"}} } \value{ Output is returned in long format, such that slice-wise aggregates @@ -74,6 +81,8 @@ fraction of profiles contributing to the aggregate value, ranges from 1/n_profiles to 1.} } + +\code{slab_function()}: return an aggregation function based on the \code{method} argument } \description{ Aggregate soil properties along user-defined \code{slabs}, and optionally within @@ -92,6 +101,8 @@ mean, mean-1SD and mean+1SD. The default slab function wraps observations per chunk. Note that if \code{group} is a factor it must not contain NAs. +\code{slab()} uses \code{dice()} to "resample" profiles to 1cm slices from depth 0 to \code{max(x)} (or \code{slab.structure[2]}, if defined). + Sometimes \code{slab} is used to conveniently re-arrange data vs. aggregate. This is performed by specifying \code{identity} in \code{slab.fun}. See examples beflow for a demonstration of this functionality. @@ -114,6 +125,8 @@ integers}{e.g. c(50, 60): data are aggregated over depths spanning 50--60 units} \item{a vector of 3 or more integers}{e.g. c(0, 5, 10, 50, 100): data are aggregated over the depths spanning 0--5, 5--10, 10--50, 50--100 units} } + +\code{slab_function()}: The default \code{"numeric"} aggregation method is the \code{"fast"} numeric (quantile) method. Additional methods include \code{"factor"} for categorical data, \code{"hd"} to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "\code{weighted}" to use a weighted quantile method from the Hmisc package } \note{ Arguments to \code{slab} have changed with \code{aqp} 1.5 (2012-12-29) @@ -191,7 +204,6 @@ xyplot(top ~ value | id, data=a, ylab='Depth', scales=list(x=list(alternating=1)) ) - ## ## categorical variable example ## diff --git a/tests/testthat/test-slab.R b/tests/testthat/test-slab.R index c45537f79..10a4c5739 100644 --- a/tests/testthat/test-slab.R +++ b/tests/testthat/test-slab.R @@ -1,7 +1,5 @@ context("slab method for SoilProfileCollection objects") - - test_that("basic slab functionality", { data(sp1, package = 'aqp') @@ -35,8 +33,102 @@ test_that("basic slab functionality", { expect_true(any(grepl('p.q', nm))) }) +test_that("slab structure for a single slab", { + data(sp4, package = 'aqp') + depths(sp4) <- id ~ top + bottom + + sp4$group <- c(rep('A',5), rep('B',5)) + a.1 <- slab( + sp4, + fm = ~ sand + silt + clay, + slab.structure = c(0, 10), + slab.fun = mean, + na.rm = TRUE + ) + expect_equal(nrow(a.1), 3) + + # again, this time within groups defined by a site-level attribute: + a.1 <- slab( + sp4, + fm = group ~ sand + silt + clay, + slab.structure = c(0, 10), + slab.fun = mean, + na.rm = TRUE + ) + expect_equal(nrow(a.1), 6) +}) +test_that("extended slab functionality: weighted aggregation", { + + data(sp1, package = 'aqp') + depths(sp1) <- id ~ top + bottom + + sp1$weights <- rep(1:2, length(sp1))[1:length(sp1)] + sp1$wtgrp <- rep(1, length(sp1)) + + # we expect quantile estimates to vary (given weighted v.s. unweighted) + a.0 <- slab(sp1, fm = ~ prop, weights = "weights") + a.1 <- slab(sp1, fm = ~ prop, strict = TRUE, weights = "weights") + a.2 <- slab(sp1, fm = wtgrp ~ prop, strict = TRUE, weights = "weights") + a.3 <- slab(sp1, fm = wtgrp ~ prop, strict = TRUE) + + # expect consistent structure for weighted/unweighted: same column names except for group + ungroupcols <- colnames(a.3) + ungroupcols[2] <- "all.profiles" + expect_equal(colnames(a.0), ungroupcols) + expect_equal(colnames(a.1), ungroupcols) + expect_equal(colnames(a.2), colnames(a.3)) + + # contributing fractions should be identical + expect_true(all(a.1$contributing_fraction == a.2$contributing_fraction)) + expect_true(all(a.2$contributing_fraction == a.3$contributing_fraction)) + + # component weighted averages + # mukey=461268; Doemill-Jokerst, 3 to 8 percent slopes (615) + x <- data.frame(cokey = c(21469960L, 21469960L, 21469960L, 21469960L, + 21469960L, 21469961L, 21469961L, 21469961L), + comppct_r = c(50L, 50L, 50L, 50L, 50L, 40L, 40L, 40L), + hzdept_r = c(0L, 3L, 13L, 23L, 36L, 0L, 3L, 10L), + hzdepb_r = c(3L, 13L, 23L, 36L, 61L, 3L, 10L, 35L), + ph1to1h2o_r = c(6.1, 6.8, 6.7, 6.7, NA, 6.4, 6.5, NA)) + depths(x) <- cokey ~ hzdept_r + hzdepb_r + site(x) <- ~ comppct_r + + # custom function, with actual comppct_r + a.0 <- slab(x, ~ ph1to1h2o_r, + slab.structure = c(0, 10), + weights = "comppct_r", + slab.fun = weighted.mean, + na.rm = TRUE + ) + expect_equal(a.0$value, 6.54, tolerance = 0.005) + + # should match this within the tolerance: + # soilDB::get_SDA_property(property = 'ph1to1h2o_r', method = "weighted average", + # mukeys = 461268, miscellaneous_areas = FALSE, include_minors = TRUE, + # top_depth = 0, bottom_depth = 10) + + # comppct_r adjusted to get higher result more like doemill + x$comppct_r <- c(90, 10) + a.1 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r") + expect_equal(a.1$p.q50, 6.8) + + # comppct_r adjusted to get lower result more like jokerst + x$comppct_r <- c(10, 90) + a.2 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r") + expect_equal(a.2$p.q50, 6.5) + + # comppct_r adjusted to get lower result more like jokerst + x$comppct_r <- c(NA, 90) + na.1 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r", na.rm = TRUE) + expect_equal(na.1$p.q50, 6.5) + + na.2 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r", slab.fun = weighted.mean, na.rm = TRUE) + expect_equal(na.2$value, NA_real_) + +}) + test_that("slab calculations: mean, single profile", { data(sp1, package = 'aqp') @@ -44,7 +136,14 @@ test_that("slab calculations: mean, single profile", { # aggregate single profile # custom slabs - a <- slab(sp1[1, ], fm = ~ prop, strict=TRUE, slab.structure=c(0,5,10,25,100), slab.fun = mean, na.rm=TRUE) + a <- slab( + sp1[1,], + fm = ~ prop, + strict = TRUE, + slab.structure = c(0, 5, 10, 25, 100), + slab.fun = mean, + na.rm = TRUE + ) # using mean and similar functions will cause slab to store single result / slab into 'value' column expect_true(any(grepl('value', names(a)))) @@ -52,13 +151,13 @@ test_that("slab calculations: mean, single profile", { # calculations done by hand (DEB) # weighted mean, single profile # 0-5: 9.400 - expect_equal(a$value[1], 9.4, tolerance=0.0001) + expect_equal(a$value[1], 9.4, tolerance = 0.0001) # 5-10: 7.000 - expect_equal(a$value[2], 7, tolerance=0.0001) + expect_equal(a$value[2], 7, tolerance = 0.0001) # 10-25: 8.466 - expect_equal(a$value[3], 8.4666, tolerance=0.0001) + expect_equal(a$value[3], 8.4666, tolerance = 0.0001) # 25-100: 15.625 - expect_equal(a$value[4], 15.625, tolerance=0.0001) + expect_equal(a$value[4], 15.625, tolerance = 0.0001) })