From 2c5e8e79921010269ad7dd35bff96faa21bb7121 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Wed, 17 Aug 2022 13:01:42 -0700 Subject: [PATCH 1/7] Updates to `dice()` and `slab()` - `slab()`: replace slice() with dice() - `dice()`: address some TODOs --- R/dice.R | 91 +++++++++-------- R/genSlabLabels.R | 56 +++++------ R/slab.R | 250 +++++++++++++++++++--------------------------- 3 files changed, 180 insertions(+), 217 deletions(-) diff --git a/R/dice.R b/R/dice.R index ed7dfffeb..390768623 100644 --- a/R/dice.R +++ b/R/dice.R @@ -36,19 +36,26 @@ #' #' @export #' -dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, strict = TRUE, byhz = FALSE) { - +dice <- function(x, + fm = NULL, + SPC = TRUE, + pctMissing = FALSE, + fill = FALSE, + strict = TRUE, + byhz = FALSE, + verbose = FALSE) { + # sacrifice to R CMD check spirits .pctMissing <- NULL # find / flag / remove invalid profiles or horizons # this will generate an error if there are no valid profiles remaining - if(strict) { - x <- HzDepthLogicSubset(x, byhz = byhz) + if (strict) { + x <- HzDepthLogicSubset(x, byhz = TRUE) ## TODO: this could invoke 2x calls to fillHzGaps # removed horizons will trigger an automatic gap-filling - if(!is.null(metadata(x)$removed.horizons)) { + if (!is.null(metadata(x)$removed.horizons)) { message('filling gaps left by HzDepthLogicSubset') x <- fillHzGaps(x, flag = TRUE, to_top = NULL, to_bottom = NULL) } @@ -67,14 +74,13 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str ids.top.bottom.idx <- match(c(idn, hzidn, htb), hznames) # interpret optional formula - if(!is.null(fm)) { + if (!is.null(fm)) { # something supplied # reasonable? - if(! inherits(fm, "formula")) { + if (!inherits(fm, "formula")) { stop('must provide a valid formula', call. = FALSE) } - # extract components of the formula: fm <- str_c(deparse(fm, 500), collapse = "") elements <- str_split(fm, fixed("~"))[[1]] @@ -92,17 +98,17 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str vars <- fm[[2]] # check for bogus left/right side problems with the formula - if(any(z < 0) | any(is.na(z))) { + if (any(z < 0) | any(is.na(z))) { stop('z-slice must be >= 0', call. = FALSE) } # check for integer / numeric slices - if(! inherits(z, c('numeric','integer'))) { + if (!inherits(z, c('numeric','integer'))) { stop('z-slice must be either numeric or integer', call. = FALSE) } # if z-index is missing set to NULL for later on - if(length(z) == 0) { + if (length(z) == 0) { z <- NULL } else { # z index is specified @@ -111,13 +117,13 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str } # check for '.' --> all variables, minus ID/depths - if(any(vars == '.')) { + if (any(vars == '.')) { # all variables except profile ID, horizon ID, top, bottom vars <- hznames[-ids.top.bottom.idx] } # check for column names that don't exist - if(! any(vars %in% hznames)) { + if (!any(vars %in% hznames)) { stop('names in formula do not match any horizon attributes', call. = FALSE) } @@ -127,7 +133,7 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str # slice to-depth of collection # optionally fill all gaps between min(x) --- [gaps] --- max(x) - if(fill) { + if (fill) { x <- fillHzGaps(x, flag = TRUE, to_top = min(x), to_bottom = max(x)) } @@ -147,11 +153,12 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str ) # convert to DT - if(! inherits(h, 'data.table')) { + if (!inherits(h, 'data.table')) { h <- as.data.table(h) } ## TODO: are keys worth the extra sorting / re-sorting? + ## no, probably not inherently worth it -AGB 2022/08/17 # init keys, sorts the data on hzID (alpha-sort) # setkeyv(h, hzidn) # consider an index, seems to have no effect @@ -163,21 +170,31 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str # safe wrapper to base::seq # NA in from/to (hz depths) not allowed - # TODO: from == to (0-thickness) horizons ?? .seq.safe <- function(from, to, by) { - if(any(is.na(from)) | any(is.na(to))) { + if (any(is.na(from)) | any(is.na(to))) { stop('corrupt horizonation, consider using `strict = TRUE`', call. = FALSE) } - + if (from > to) { + toold <- to + to <- from + from <- toold + } return( - seq(from = from, to = to, by = by) + seq(from = from, to = to, by = abs(by)) ) } + # from == to (0-thickness) horizons -- these should be removed -AGB 2022/08/17 + zthk <- h[[htb[1]]] == h[[htb[2]]] + if (any(zthk, na.rm = TRUE)) { + message("horizons with zero thickness have been omitted from results") + } + h.sub <- h[which(!zthk),] + tops <- mapply( FUN = .seq.safe, - from = h[[htb[1]]], - to = h[[htb[2]]] - 1, + from = h.sub[[htb[1]]], + to = h.sub[[htb[2]]] - 1, by = 1, SIMPLIFY = FALSE ) @@ -188,30 +205,21 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str # expand slice IDs (horizon IDs) # these are used to join with horizons sliceIDs <- rep( - h[[hzidn]], - times = h[[htb[2]]] - h[[htb[1]]] + h.sub[[hzidn]], + times = h.sub[[htb[2]]] - h.sub[[htb[1]]] ) - # assemble slice LUT for JOIN s <- data.table( sliceID = sliceIDs, - .sliceTop = tops, - .sliceBottom = bottoms + .sliceTop = tops[1:length(sliceIDs)], + .sliceBottom = bottoms[1:length(sliceIDs)] ) # re-name for simpler JOIN names(s)[1] <- hzidn - - - ## TODO: are keys worth the extra sorting / re-sorting? - # note: sorts data - # setkeyv(s, hzidn) - # consider an index, seems to have no effect - # setindexv(s, hzidn) - + # FULL JOIN via fast data.table compiled code - # using index (?) res <- merge(h, s, by = hzidn, all = TRUE, sort = FALSE) ## TODO: update to := syntax, but how does it work with with variables? @@ -231,14 +239,13 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str res[['.sliceTop']] <- NULL res[['.sliceBottom']] <- NULL - ## TODO: move z-index subset"up" to original sequence creation to save a lot of time - if(!is.null(z)) { + if (!is.null(z)) { res <- res[which(res[[htb[1]]] %in% z), ] } # slice-wise "percent missing" calculation - if(pctMissing) { + if (pctMissing) { # this is quite slow: DT -> DF -> matrix -> apply (8x longer) # res[['.pctMissing']] <- .pctMissing(res, vars) @@ -257,7 +264,7 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str } # only returning horizons as a data.frame - if(!SPC) { + if (!SPC) { return(as.data.frame(res)) } @@ -276,8 +283,10 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str OBF <- round(f.size / o.size, 1) metadata(x)$OBF <- OBF - # possibly informative - message(sprintf("Object Bloat Factor: %s", OBF)) + if (verbose) { + # turned off by default; possibly informative + message(sprintf("Object Bloat Factor: %s", OBF)) + } return(x) } diff --git a/R/genSlabLabels.R b/R/genSlabLabels.R index 443b2cd28..623f9e1a6 100644 --- a/R/genSlabLabels.R +++ b/R/genSlabLabels.R @@ -1,54 +1,54 @@ ## TODO: documentation / generalization -# note source data must be normalixed via slice() first, e.g. all share the same number of horizons +# note source data must be "normalized" via dice() first; assumes each profile has the same number of horizons # generate labels for slabs -genSlabLabels <- function(slab.structure=1, max.d, n.profiles) { +genSlabLabels <- function(slab.structure = 1, max.d, n.profiles) { # fixed-size slabs - if(length(slab.structure) == 1) { + if (length(slab.structure) == 1) { # generate sequence of segment labels - seg.label <- rep(1:ceiling(max.d / slab.structure), each=slab.structure, length=max.d) + seg.label <- rep(1:ceiling(max.d / slab.structure), each = slab.structure, length = max.d) + # general segment labels - seg.label.levels <- tapply(1:max.d, seg.label, function(i) {r <- range(i); paste(c(r[1]-1, r[2]), collapse='-') } ) + seg.label.levels <- tapply(1:max.d, seg.label, + function(i) { + r <- range(i, na.rm = TRUE) + paste(c(r[1] - 1, r[2]), collapse = '-') + }) } # user-defined slabs - if(length(slab.structure) > 1) { - # trival case where segments start from 0 - if(slab.structure[1] == 0 & length(slab.structure) > 2) - seg.label <- rep(slab.structure[-1], times=diff(slab.structure))[1:max.d] + if (length(slab.structure) > 1) { + if (slab.structure[1] == 0 & length(slab.structure) > 2) { + # trivial case where segments start from 0 + + + seg.label <- rep(slab.structure[-1], times = diff(slab.structure))[1:max.d] - # other case: user defines an arbitrary lower and upper limit - else { - if(length(slab.structure) != 2) + } else { + # other case: user defines an arbitrary lower and upper limit + if (length(slab.structure) != 2) stop('user-defined slab boundaries must either start from 0, or contain two values between 0 and the max soil depth') - # proceed + # calculate thickness of each slab slab.thickness <- diff(slab.structure) - # how many slices of NA before the slab? - padding.before <- rep(NA, times=slab.structure[1]) - # how many slices of NA afer the slab - padding.after <- rep(NA, times=abs(max.d - slab.structure[2])) + # make a new label for the slab - new.label <- paste(slab.structure, collapse='-') + new.label <- paste(slab.structure, collapse = '-') + # generate an index for the slab - slab.idx <- rep(new.label, times=slab.thickness) - # generate the entire index: padding+slab+padding = total number of slices (max_d) - # seg.label <- c(padding.before, slab.idx, padding.after) + slab.idx <- rep(new.label, times = slab.thickness) seg.label <- slab.idx } # generate segment labels - seg.label.levels <- sapply(1:(length(slab.structure)-1), function(i) paste(c(slab.structure[i], slab.structure[i+1]), collapse='-')) + seg.label.levels <- sapply(1:(length(slab.structure) - 1), function(i) { + paste(c(slab.structure[i], slab.structure[i + 1]), collapse = '-') + }) } # covert into a factor that can be used to split profiles into slabs - res <- factor(rep(seg.label, times=n.profiles), labels=seg.label.levels) + res <- factor(rep(seg.label, times = n.profiles), labels = seg.label.levels) return(res) } - - - - - diff --git a/R/slab.R b/R/slab.R index 5fb326d6f..55de576ab 100644 --- a/R/slab.R +++ b/R/slab.R @@ -5,16 +5,14 @@ # this type of function is compatible with aggregate() .slab.fun.factor.default <- function(values, cpm) { - # probabilities are relative to number of contributing profiles - if(cpm == 1) { - tb <- table(values, useNA='no') + if (cpm == 1) { + # probabilities are relative to number of contributing profiles + tb <- table(values, useNA = 'no') # convert to proportions pt <- prop.table(tb) - } - - # probabilities are relative to total number of profiles - else if(cpm == 2) { - tb <- table(values, useNA='always') + } else if (cpm == 2) { + # probabilities are relative to total number of profiles + tb <- table(values, useNA = 'always') # convert to proportions, # the last column will be named 'NA', and contains the tally of NAs --> remove it pt <- prop.table(tb) @@ -27,12 +25,10 @@ names(pt) <- make.names(levels(values)) return(pt) - } - +} ## TODO: make these exported functions with documentation (https://github.com/ncss-tech/aqp/issues/99) - # default slab function for continuous variables # returns a named vector of results # this type of function is compatible with aggregate() @@ -40,34 +36,33 @@ q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) ## 2019-10-30: dropping Hmisc as a suggested package # res <- Hmisc::hdquantile(values, probs=q.probs, na.rm=TRUE) - res <- quantile(values, probs=q.probs, na.rm=TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep='') + res <- quantile(values, probs = q.probs, na.rm = TRUE) + names(res) <- paste('p.q', round(q.probs * 100), sep = '') return(res) } # easy specification of Hmisc::hdquantile if available .slab.fun.numeric.HD <- function(values) { # sanity check, need this for color distance eval - if(!requireNamespace('Hmisc', quietly = TRUE)) - stop('pleast install the `Hmisc` package.', call.=FALSE) - + if (!requireNamespace('Hmisc', quietly = TRUE)) + stop('please install the `Hmisc` package to use `hdquantile()` method', call. = FALSE) + q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - - res <- Hmisc::hdquantile(values, probs=q.probs, na.rm=TRUE) - - names(res) <- paste('p.q', round(q.probs * 100), sep='') + + res <- Hmisc::hdquantile(values, probs = q.probs, na.rm = TRUE) + + names(res) <- paste('p.q', round(q.probs * 100), sep = '') return(res) } # basic quantile evaluation, better for large datasets .slab.fun.numeric.fast <- function(values) { q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - res <- quantile(values, probs=q.probs, na.rm=TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep='') + res <- quantile(values, probs = q.probs, na.rm = TRUE) + names(res) <- paste('p.q', round(q.probs * 100), sep = '') return(res) } - # ## TODO: not yet implemented # # default slab function for weighted, continuous variables # # returns a named vector of results @@ -80,9 +75,6 @@ # return(res) # } - -## note: slab() uses slice() to resample to 1cm slices, to max(x) or slab.structure[2] if defined - # SoilProfileCollection method # object: SoilProfileCollection # fm: formula defining aggregation @@ -92,11 +84,14 @@ # cpm: class probability normalization mode # weights: character vector naming column containing weights # ... : extra arguments passed on to slab.fun - -# this is about 40% slower than the old method -.slab <- function(object, fm, slab.structure=1, strict=FALSE, slab.fun=.slab.fun.numeric.default, cpm=1, weights=NULL, ...){ - # issue a message for now that things have changed - # message('usage of slab() has changed considerably, please see the manual page for details') +.slab <- function(object, + fm, + slab.structure = 1, + strict = FALSE, + slab.fun = .slab.fun.numeric.default, + cpm = 1, + weights = NULL, + ...) { # get extra arguments: length of 0 if no extra arguments extra.args <- list(...) @@ -124,102 +119,70 @@ vars <- all.vars(update(fm, 0~.)) # right-hand side # sanity check: - if(! inherits(fm, "formula")) - stop('must provide a valid formula: groups ~ var1 + var2 + ...', call.=FALSE) - + if (!inherits(fm, "formula")) + stop('must provide a valid formula: groups ~ var1 + var2 + ...', call. = FALSE) + # check for bogus left/right side problems with the formula - if(any(g %in% object.names) == FALSE & g != '.') # bogus grouping column - stop('group name either missing from formula, or does match any columns in dataframe', call.=FALSE) + if (any(g %in% object.names) == FALSE & g != '.') # bogus grouping column + stop('group name either missing from formula, or does match any columns in dataframe', call. = FALSE) - if(any(vars %in% object.names) == FALSE) # bogus column names in right-hand side - stop('column names in formula do not match column names in dataframe', call.=FALSE) - - - ## old slice() version ############################### + if (any(vars %in% object.names) == FALSE) # bogus column names in right-hand side + stop('column names in formula do not match column names in dataframe', call. = FALSE) # make formula for slicing - ## TODO: slice() returns an extra row, so only request slices to max-1 - fm.slice <- formula(paste('0:', max(object)-1, ' ~ ', paste(vars, collapse=' + '), sep='')) + fm.slice <- formula(paste('0:', max(object), ' ~ ', paste(vars, collapse = ' + '), sep = '')) # short-cut for user-defined slab - ## TODO: slice() returns an extra row, so only request slices to max-1 - if(length(slab.structure) == 2 ) - fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2]-1, ' ~ ', paste(vars, collapse=' + '), sep='')) - - # slice into 1cm increments, result is a data.frame - data <- slice(object, fm.slice, strict=strict, just.the.data=TRUE) - - ####################################################### - - - ## dice() version #################################### - # https://github.com/ncss-tech/aqp/issues/115 - - ## TODO: - # * enforce filling / check assumptions about length - # * consider total re-write vs. adaptation - - # # make formula for slicing - # fm.slice <- formula(paste('0:', max(object), ' ~ ', paste(vars, collapse=' + '), sep='')) - # - # # short-cut for user-defined slab - # if(length(slab.structure) == 2 ) - # fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2], ' ~ ', paste(vars, collapse=' + '), sep='')) - # - # # slice into 1cm increments, result is a data.frame - # data <- dice(x = object, fm = fm.slice, strict = strict, SPC = FALSE, pctMissing = TRUE) - - ####################################################### - - + if (length(slab.structure) == 2) { + fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2], ' ~ ', paste(vars, collapse = ' + '), sep = '')) + } + # slice into 1cm increments, result is a data.frame + data <- dice(x = object, fm = fm.slice, strict = strict, SPC = FALSE, pctMissing = TRUE) # extract site data site.data <- site(object) # check groups for NA, if grouping variable was given - if(g != '.') - if(any(is.na(site.data[, g]))) - stop('grouping variable must not contain NA', call.=FALSE) - + if (g != '.' && any(is.na(site.data[, g]))) { + stop('grouping variable must not contain NA', call. = FALSE) + } + ### !!! this runs out of memory when groups contains NA !!! ## this is generating a lot of extra objects and possibly wasting memory # merge site data back into the result, this would include site-level weights - data <- merge(x = data, y = site.data, by=object.ID, all.x=TRUE, sort=FALSE) + data <- merge(x = data, y = site.data, by = object.ID, all.x = TRUE, sort = FALSE) # clean-up rm(object, site.data) gc() # check variable classes - if(length(vars) > 1) { + if (length(vars) > 1) { vars.numeric.test <- sapply(data[, vars], is.numeric) } else { vars.numeric.test <- is.numeric(data[[vars]]) } - # sanity check: all numeric, or single character/factor - if(any(! vars.numeric.test) & length(vars) > 1) { + if (any(!vars.numeric.test) & length(vars) > 1) { stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call. = FALSE) } - # check for single categorical variable, and convert to factor - if(length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) { + if (length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) { # if we have a character, then convert to factor - if(inherits(data[[vars]],'character')) { + if (inherits(data[[vars]],'character')) { message('Note: converting categorical variable to factor.') data[[vars]] <- factor(data[[vars]]) } # check for weights - if(!missing(weights)) { - stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE) + if (!missing(weights)) { + stop('weighted aggregation of categorical variables not yet implemented', call. = FALSE) } - # re-set default function, currently no user-supply-able option slab.fun <- .slab.fun.factor.default @@ -236,94 +199,70 @@ .factorFlag <- FALSE } - - #### - #### optimization note: use of factor labels could be slowing things down... - #### + ## Note: use of factor labels could be slowing things down... ## Note: this assumes ordering is correct in source data / segment labels + ## TODO: make sure that nrow(data) == genSlabLabels(slab.structure = slab.structure, max.d = max.d, n.profiles = n.profiles) + ## - I think the source of problems related to genSlabLabels not conforming with number of rows has been fixed -AGB 2022/08/17 + ## TODO: investigate use of split() to speed things up, no need to keep everything in the safe DF: - ## - ## l <- split(data, data$seg.label, drop=FALSE) - ## - ## ... parallel processing with furrr + ## l <- split(data, data$seg.label, drop=FALSE) # add segmenting label to data data$seg.label <- genSlabLabels(slab.structure = slab.structure, max.d = max.d, n.profiles = n.profiles) # if there is no left-hand component in the formula, we are aggregating all data in the collection - if(g == '.') { + if (g == '.') { g <- 'all.profiles' # add new grouping variable to horizons data[, g] <- 1 } - - ## - ## TODO: adding weighted-aggregate functionality here - ## we can't use aggregate() for this - ## we can use data.table methods - # check for weights - if(!missing(weights)) - stop('weighted aggregation is not yet supported', call.=FALSE) - + if (!missing(weights)) { + ## TODO: adding weighted aggregation via data.table + stop('weighted aggregation is not yet supported', call. = FALSE) + } + ## TODO: why would this ever happen? # throwing out those rows with an NA segment label seg.label.is.not.NA <- which(!is.na(data$seg.label)) - # convert into long format - # d.long.df <- reshape::melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars) - - # convert wide -> long format - # using data.table::melt() - # note that this will not preserve factor levels when 'vars' is categorical - # must call unique() on `id.vars` + # convert wide -> long format w/ data.table::melt() + # will not preserve factor levels when 'vars' is categorical; must call unique() on `id.vars` d.long <- melt( as.data.table(data[seg.label.is.not.NA, ]), id.vars = unique(c(object.ID, 'seg.label', g)), measure.vars = vars, - ) + ) # convert back to data.frame d.long <- as.data.frame(d.long) # reset factor levels in d.long[[value]] - if(.factorFlag) { + if (.factorFlag) { d.long[['value']] <- factor(d.long[['value']], levels = original.levels) } # make a formula for aggregate() - aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep='')) - - ## - ## TODO: this might be the place to implement parallel code in furrr - ## 1. split into a list based on number of cores/cpus available - ## 2. aggregate using seg.label + variable in parallel - ## 3. combine results (a list of data.frames) - ## - ## NOPE: use data.table which will automatically scale to multiple threads - ## - - + aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep = '')) # process chunks according to group -> variable -> segment # NA values are not explicitly dropped - if(length(extra.args) == 0) - d.slabbed <- aggregate(aggregate.fm, data=d.long, na.action=na.pass, FUN=slab.fun) - - # optionally account for extra arguments - else { - the.args <- c(list(aggregate.fm, data=d.long, na.action=na.pass, FUN=slab.fun), extra.args) - d.slabbed <- do.call(what='aggregate', args=the.args) + if (length(extra.args) == 0) { + d.slabbed <- aggregate(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun) + } else { + # optionally account for extra arguments + the.args <- c(list(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun), extra.args) + d.slabbed <- do.call(what = 'aggregate', args = the.args) } - # if slab.fun returns a vector of length > 1 we must: # convert the complex data.frame returned by aggregate into a regular data.frame # the column 'value' is a matrix with the results of slab.fun - if(inherits(d.slabbed$value, 'matrix')) + if (inherits(d.slabbed$value, 'matrix')) { d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value) - + } + # ensure that the names returned from slab.fun are legal names(d.slabbed) <- make.names(names(d.slabbed)) @@ -333,8 +272,14 @@ d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2])) # estimate contributing fraction - d.slabbed$contributing_fraction <- aggregate(aggregate.fm, data=d.long, na.action=na.pass, FUN=function(i) {length(na.omit(i)) / length(i)})$value - + d.slabbed$contributing_fraction <- aggregate( + aggregate.fm, + data = d.long, + na.action = na.pass, + FUN = function(i) { + length(na.omit(i)) / length(i) + } + )$value # remove seg.label from result d.slabbed$seg.label <- NULL @@ -342,21 +287,26 @@ # if categorical data have been aggregated, set an attribute with the original levels # this allows one to recover values that are not legal column names # e.g. 2Bt is corrupted to X2Bt - if(! is.null(original.levels)) + if (!is.null(original.levels)) { attr(d.slabbed, 'original.levels') <- original.levels - - # reset options: + } + + # reset options options(opt.original) - - # done return(d.slabbed) } # setup generic function -# if (!isGeneric("slab")) - setGeneric("slab", function(object, fm, slab.structure=1, strict=FALSE, slab.fun=.slab.fun.numeric.default, cpm=1, weights=NULL, ...) standardGeneric("slab")) +setGeneric("slab", function(object, + fm, + slab.structure = 1, + strict = FALSE, + slab.fun = .slab.fun.numeric.default, + cpm = 1, + weights = NULL, + ...) + standardGeneric("slab")) -# method dispatch #' Slab-Wise Aggregation of SoilProfileCollection Objects #' #' Aggregate soil properties along user-defined `slabs`, and optionally within @@ -373,6 +323,8 @@ #' \code{stats::quantile} from the Hmisc package, which requires at least 2 #' observations per chunk. Note that if `group` is a factor it must not contain #' NAs. +#' +#' `slab()` uses `dice()` to "resample" profiles to 1cm slices from depth 0 to `max(x)` (or `slab.structure[2]`, if defined). #' #' Sometimes \code{slab} is used to conveniently re-arrange data vs. aggregate. #' This is performed by specifying \code{identity} in \code{slab.fun}. See @@ -526,7 +478,6 @@ #' scales=list(x=list(alternating=1)) #' ) #' -#' #' ## #' ## categorical variable example #' ## @@ -780,6 +731,7 @@ #' #' #' +# # TODO: update this when weights are implemented #' ## weighted-aggregation -- NOT YET IMPLEMENTED -- #' # load sample data, upgrade to SoilProfileCollection #' data(sp1) @@ -790,4 +742,6 @@ #' sp1$site.wts <- runif(n=length(sp1), min=20, max=100) #' } #' -setMethod(f='slab', signature='SoilProfileCollection', definition=.slab) +setMethod(f = 'slab', + signature = 'SoilProfileCollection', + definition = .slab) From 05fa8fc10ea8827a1a87967addab062f40277fd8 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Wed, 17 Aug 2022 17:11:48 -0700 Subject: [PATCH 2/7] dice docs --- R/dice.R | 2 ++ man/dice.Rd | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/R/dice.R b/R/dice.R index 390768623..60335e783 100644 --- a/R/dice.R +++ b/R/dice.R @@ -27,6 +27,8 @@ #' #' @param byhz Evaluate horizon depth logic at the horizon level (`TRUE`) or profile level (`FALSE`). Invalid depth logic invokes `HzDepthLogicSubset` which removes offending profiles or horizon records. #' +#' @param verbose Print information about object size/memory usage. Default: `FALSE` +#' #' @details For large and potentially messy collections that include missing horizon bottom depths, or 0-thickness horions, consider using `repairMissingHzDepths()` before `dice()`. #' #' diff --git a/man/dice.Rd b/man/dice.Rd index 6ac60c727..ce17dafe0 100644 --- a/man/dice.Rd +++ b/man/dice.Rd @@ -11,7 +11,8 @@ dice( pctMissing = FALSE, fill = FALSE, strict = TRUE, - byhz = FALSE + byhz = FALSE, + verbose = FALSE ) } \arguments{ @@ -28,6 +29,8 @@ dice( \item{strict}{perform horizon depth logic checking / flagging / removal} \item{byhz}{Evaluate horizon depth logic at the horizon level (\code{TRUE}) or profile level (\code{FALSE}). Invalid depth logic invokes \code{HzDepthLogicSubset} which removes offending profiles or horizon records.} + +\item{verbose}{Print information about object size/memory usage. Default: \code{FALSE}} } \value{ a \code{SoilProfileCollection} object, or \code{data.frame} when \code{SPC = FALSE} From 8cd031179e80c87be11b425d758ed6bea0b4ea36 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Wed, 17 Aug 2022 17:01:43 -0700 Subject: [PATCH 3/7] `slab()`: implement `weights` argument - Add `slab_function()` --- R/slab.R | 236 +++++++++++++++++++------------------ man/slab-methods.Rd | 16 ++- tests/testthat/test-slab.R | 27 ++++- 3 files changed, 163 insertions(+), 116 deletions(-) diff --git a/R/slab.R b/R/slab.R index 55de576ab..2639f2b01 100644 --- a/R/slab.R +++ b/R/slab.R @@ -1,5 +1,3 @@ - - # default slab function for categorical variables # returns a named vector of results # this type of function is compatible with aggregate() @@ -27,54 +25,54 @@ return(pt) } -## TODO: make these exported functions with documentation (https://github.com/ncss-tech/aqp/issues/99) - +.slab.fun.factor.weighted <- function(values, weights) { + # TODO +} + # default slab function for continuous variables # returns a named vector of results # this type of function is compatible with aggregate() -.slab.fun.numeric.default <- function(values) { - q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - ## 2019-10-30: dropping Hmisc as a suggested package - # res <- Hmisc::hdquantile(values, probs=q.probs, na.rm=TRUE) - res <- quantile(values, probs = q.probs, na.rm = TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep = '') - return(res) +.slab.fun.numeric.default <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) { + .slab.fun.numeric.fast(values, probs = probs) +} + +# for a site or horizon level weight column, use Hmisc::wtd.quantile +# note that "frequency weights" are assumed to be most common use case, so normwt=FALSE by default, +# normwt=TRUE can be passed as optional argument for slab.fun from slab() interface +.slab.fun.numeric.weighted <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), wt, normwt = FALSE) { + if (!requireNamespace('Hmisc', quietly = TRUE)) + stop('please install the `Hmisc` package to use `wtd.quantile()` method', call. = FALSE) + res <- try(Hmisc::wtd.quantile(values, weights = wt, probs = probs, na.rm = TRUE, normwt = normwt), silent = TRUE) + if (inherits(res, 'try-error') && grepl("zero non-NA points", res[1])) { + res <- rep(NA_real_, length(probs)) + } else if (inherits(res, 'try-error')) { + print(res) + return(NULL) + } else { + names(res) <- paste('p.q', round(probs * 100), sep = '') + } + return(res) } # easy specification of Hmisc::hdquantile if available -.slab.fun.numeric.HD <- function(values) { +.slab.fun.numeric.HD <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) { # sanity check, need this for color distance eval if (!requireNamespace('Hmisc', quietly = TRUE)) stop('please install the `Hmisc` package to use `hdquantile()` method', call. = FALSE) - q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - - res <- Hmisc::hdquantile(values, probs = q.probs, na.rm = TRUE) + res <- Hmisc::hdquantile(values, probs = probs, na.rm = TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep = '') + names(res) <- paste('p.q', round(probs * 100), sep = '') return(res) } -# basic quantile evaluation, better for large datasets -.slab.fun.numeric.fast <- function(values) { - q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) - res <- quantile(values, probs = q.probs, na.rm = TRUE) - names(res) <- paste('p.q', round(q.probs * 100), sep = '') - return(res) +# basic quantile evaluation, better for large data sets +.slab.fun.numeric.fast <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) { + res <- quantile(values, probs = probs, na.rm = TRUE) + names(res) <- paste('p.q', round(probs * 100), sep = '') + return(res) } -# ## TODO: not yet implemented -# # default slab function for weighted, continuous variables -# # returns a named vector of results -# # this type of function is compatible with aggregate() -# # NOTE: nw (normalize-weights) argument will affect the results! -# .slab.fun.wtd.numeric.default <- function(values, w, nw=TRUE) { -# q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95) -# res <- wtd.quantile(values, weights=w, probs=q.probs, na.rm=TRUE, normwt=nw) -# names(res) <- paste('p.q', round(q.probs * 100), sep='') -# return(res) -# } - # SoilProfileCollection method # object: SoilProfileCollection # fm: formula defining aggregation @@ -82,23 +80,23 @@ # progress: plyr-progress display # slab.fun: aggregate function applied to data chunks (must return a single row / chunk) # cpm: class probability normalization mode -# weights: character vector naming column containing weights +# weights: character vector naming column in site slot containing weights # ... : extra arguments passed on to slab.fun .slab <- function(object, fm, slab.structure = 1, strict = FALSE, - slab.fun = .slab.fun.numeric.default, + slab.fun = slab_function(method = "numeric"), cpm = 1, weights = NULL, ...) { + # define keywords for data.table + .SD <- NULL; .N <- NULL + # get extra arguments: length of 0 if no extra arguments extra.args <- list(...) - ## important: change the default behavior of data.frame and melt - opt.original <- options(stringsAsFactors = FALSE) - # get unique list of names in object object.names <- unique(unlist(names(object))) @@ -117,7 +115,7 @@ # extract components of the formula: g <- all.vars(update(fm, .~0)) # left-hand side vars <- all.vars(update(fm, 0~.)) # right-hand side - + # sanity check: if (!inherits(fm, "formula")) stop('must provide a valid formula: groups ~ var1 + var2 + ...', call. = FALSE) @@ -145,18 +143,12 @@ # check groups for NA, if grouping variable was given if (g != '.' && any(is.na(site.data[, g]))) { - stop('grouping variable must not contain NA', call. = FALSE) + stop('grouping variable must not contain NA', call. = FALSE) } - ### !!! this runs out of memory when groups contains NA !!! - ## this is generating a lot of extra objects and possibly wasting memory # merge site data back into the result, this would include site-level weights data <- merge(x = data, y = site.data, by = object.ID, all.x = TRUE, sort = FALSE) - # clean-up - rm(object, site.data) - gc() - # check variable classes if (length(vars) > 1) { vars.numeric.test <- sapply(data[, vars], is.numeric) @@ -180,7 +172,7 @@ # check for weights if (!missing(weights)) { - stop('weighted aggregation of categorical variables not yet implemented', call. = FALSE) + slab.fun <- .slab.fun.factor.weighted } # re-set default function, currently no user-supply-able option @@ -216,71 +208,75 @@ g <- 'all.profiles' # add new grouping variable to horizons data[, g] <- 1 } - + + # convert wide -> long format + d.long <- data.table::melt( + as.data.table(data[which(!is.na(data$seg.label)), ]), + id.vars = unique(c(object.ID, 'seg.label', g, weights)), + measure.vars = vars + ) + + # make a formula for aggregate() + aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep = '')) + # check for weights if (!missing(weights)) { - ## TODO: adding weighted aggregation via data.table - stop('weighted aggregation is not yet supported', call. = FALSE) - } - - ## TODO: why would this ever happen? - # throwing out those rows with an NA segment label - seg.label.is.not.NA <- which(!is.na(data$seg.label)) - - # convert wide -> long format w/ data.table::melt() - # will not preserve factor levels when 'vars' is categorical; must call unique() on `id.vars` - d.long <- melt( - as.data.table(data[seg.label.is.not.NA, ]), - id.vars = unique(c(object.ID, 'seg.label', g)), - measure.vars = vars, - ) - - # convert back to data.frame - d.long <- as.data.frame(d.long) - - # reset factor levels in d.long[[value]] - if (.factorFlag) { - d.long[['value']] <- factor(d.long[['value']], levels = original.levels) - } - - # make a formula for aggregate() - aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep = '')) + if (missing(slab.fun)) { + # default _weighted_ aggregation fun is .slab.fun.numeric.weighted + FUN <- .slab.fun.numeric.weighted + # custom weighted aggregation fun should take first argument as values w. named "weights" argument + } else FUN <- slab.fun + + d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, wt = weights))), by = c('variable', g, 'seg.label')]) + d.slabbed$variable + d.slabbed$contributing_fraction <- d.long[, sum(!is.na(.SD[["value"]])) / .N, by = c('variable', g, 'seg.label')]$V1 - # process chunks according to group -> variable -> segment - # NA values are not explicitly dropped - if (length(extra.args) == 0) { - d.slabbed <- aggregate(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun) } else { - # optionally account for extra arguments - the.args <- c(list(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun), extra.args) - d.slabbed <- do.call(what = 'aggregate', args = the.args) + # convert back to data.frame + d.long <- as.data.frame(d.long) + + # reset factor levels in d.long[[value]] + if (.factorFlag) { + d.long[['value']] <- factor(d.long[['value']], levels = original.levels) + } + + # process chunks according to group -> variable -> segment + # NA values are not explicitly dropped + if (length(extra.args) == 0) { + d.slabbed <- aggregate(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun) + } else { + # optionally account for extra arguments + the.args <- c(list(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun), extra.args) + d.slabbed <- do.call(what = 'aggregate', args = the.args) + } + + # if slab.fun returns a vector of length > 1 we must: + # convert the complex data.frame returned by aggregate into a regular data.frame + # the column 'value' is a matrix with the results of slab.fun + if (inherits(d.slabbed$value, 'matrix')) { + d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value) + } + + # ensure that the names returned from slab.fun are legal + names(d.slabbed) <- make.names(names(d.slabbed)) + + # estimate contributing fraction + d.slabbed$contributing_fraction <- aggregate( + aggregate.fm, + data = d.long, + na.action = na.pass, + FUN = function(i) { + length(na.omit(i)) / length(i) + } + )$value + } - - # if slab.fun returns a vector of length > 1 we must: - # convert the complex data.frame returned by aggregate into a regular data.frame - # the column 'value' is a matrix with the results of slab.fun - if (inherits(d.slabbed$value, 'matrix')) { - d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value) - } - - # ensure that the names returned from slab.fun are legal - names(d.slabbed) <- make.names(names(d.slabbed)) - - # convert the slab.label column into top/bottom as integers - slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-') - d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1])) - d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2])) - - # estimate contributing fraction - d.slabbed$contributing_fraction <- aggregate( - aggregate.fm, - data = d.long, - na.action = na.pass, - FUN = function(i) { - length(na.omit(i)) / length(i) - } - )$value - + + # convert the slab.label column into top/bottom as integers + slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-') + d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1])) + d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2])) + # remove seg.label from result d.slabbed$seg.label <- NULL @@ -291,8 +287,6 @@ attr(d.slabbed, 'original.levels') <- original.levels } - # reset options - options(opt.original) return(d.slabbed) } @@ -301,7 +295,7 @@ setGeneric("slab", function(object, fm, slab.structure = 1, strict = FALSE, - slab.fun = .slab.fun.numeric.default, + slab.fun = slab_function(method = "numeric"), cpm = 1, weights = NULL, ...) @@ -368,7 +362,7 @@ setGeneric("slab", function(object, #' either: number of profiles with data at the current slice (cpm=1), or by the #' number of profiles in the collection (cpm=2). Mode 1 values will always sum #' to the contributing fraction, while mode 2 values will always sum to 1. -#' @param weights Column name containing weights. NOT YET IMPLEMENTED +#' @param weights Column name containing site-level weights #' @param \dots further arguments passed to \code{slab.fun} #' @return Output is returned in long format, such that slice-wise aggregates #' are returned once for each combination of grouping level (optional), @@ -745,3 +739,21 @@ setGeneric("slab", function(object, setMethod(f = 'slab', signature = 'SoilProfileCollection', definition = .slab) + +#' @param method one of `"numeric"`, `"factor"`, `"hd"`, `"weighted.numeric"`, `"weighted.factor"`, `"fast"` +#' @details `slab_function()`: The default `"numeric"` aggregation method is the `"fast"` numeric (quantile) method. Additional methods include `"factor"` for categorical data, `"hd"` to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "`weighted`" to use a weighted quantile method from the Hmisc package +#' @return `slab_function()`: return an aggregation function based on the `method` argument +#' @rdname slab-methods +#' @export +slab_function <- function(method = c("numeric", "factor", "hd", "weighted.numeric", "weighted.factor", "fast")) { + switch(method, + numeric = .slab.fun.numeric.default, # uses "fast" numeric method + factor = .slab.fun.factor.default, + weighted.factor = .slab.fun.factor.weighted, + hd = .slab.fun.numeric.HD, + weighted.numeric = .slab.fun.numeric.weighted, + weighted.factor = .slab.fun.numeric.weighted, + fast = .slab.fun.numeric.fast, + .slab.fun.numeric.default + ) +} diff --git a/man/slab-methods.Rd b/man/slab-methods.Rd index 305a43998..d0a626b95 100644 --- a/man/slab-methods.Rd +++ b/man/slab-methods.Rd @@ -7,6 +7,7 @@ \alias{slab2} \alias{genSlabLabels} \alias{slab,SoilProfileCollection-method} +\alias{slab_function} \title{Slab-Wise Aggregation of SoilProfileCollection Objects} \usage{ \S4method{slab}{SoilProfileCollection}( @@ -14,11 +15,13 @@ fm, slab.structure = 1, strict = FALSE, - slab.fun = .slab.fun.numeric.default, + slab.fun = slab_function(method = "default"), cpm = 1, weights = NULL, ... ) + +slab_function(method = c("default", "factor", "hd", "weighted", "fast")) } \arguments{ \item{object}{a SoilProfileCollection} @@ -42,9 +45,11 @@ either: number of profiles with data at the current slice (cpm=1), or by the number of profiles in the collection (cpm=2). Mode 1 values will always sum to the contributing fraction, while mode 2 values will always sum to 1.} -\item{weights}{Column name containing weights. NOT YET IMPLEMENTED} +\item{weights}{Column name containing site-level weights} \item{\dots}{further arguments passed to \code{slab.fun}} + +\item{method}{one of \code{"default"}, \code{"factor"}, \code{"hd"}, \code{"weighted"}, \code{"fast"}} } \value{ Output is returned in long format, such that slice-wise aggregates @@ -74,6 +79,8 @@ fraction of profiles contributing to the aggregate value, ranges from 1/n_profiles to 1.} } + +\code{slab_function()}: return an aggregation function based on the \code{method} argument } \description{ Aggregate soil properties along user-defined \code{slabs}, and optionally within @@ -92,6 +99,8 @@ mean, mean-1SD and mean+1SD. The default slab function wraps observations per chunk. Note that if \code{group} is a factor it must not contain NAs. +\code{slab()} uses \code{dice()} to "resample" profiles to 1cm slices from depth 0 to \code{max(x)} (or \code{slab.structure[2]}, if defined). + Sometimes \code{slab} is used to conveniently re-arrange data vs. aggregate. This is performed by specifying \code{identity} in \code{slab.fun}. See examples beflow for a demonstration of this functionality. @@ -114,6 +123,8 @@ integers}{e.g. c(50, 60): data are aggregated over depths spanning 50--60 units} \item{a vector of 3 or more integers}{e.g. c(0, 5, 10, 50, 100): data are aggregated over the depths spanning 0--5, 5--10, 10--50, 50--100 units} } + +\code{slab_function()}: The \code{"default"} aggregation method is the \code{"fast"} numeric (quantile) method. Additional methods include \code{"factor"} for categorical data, \code{"hd"} to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "\code{weighted}" to use a weighted quantile method from the Hmisc package } \note{ Arguments to \code{slab} have changed with \code{aqp} 1.5 (2012-12-29) @@ -191,7 +202,6 @@ xyplot(top ~ value | id, data=a, ylab='Depth', scales=list(x=list(alternating=1)) ) - ## ## categorical variable example ## diff --git a/tests/testthat/test-slab.R b/tests/testthat/test-slab.R index c45537f79..af455668d 100644 --- a/tests/testthat/test-slab.R +++ b/tests/testthat/test-slab.R @@ -35,7 +35,32 @@ test_that("basic slab functionality", { expect_true(any(grepl('p.q', nm))) }) - +test_that("extended slab functionality: weighted aggregation", { + + data(sp1, package = 'aqp') + depths(sp1) <- id ~ top + bottom + + sp1$weights <- rep(1:2, length(sp1))[1:length(sp1)] + sp1$wtgrp <- rep(1, length(sp1)) + + # we expect quantile estimates to vary (given weighted v.s. unweighted) + a.0 <- slab(sp1, fm = ~ prop, weights = "weights") + a.1 <- slab(sp1, fm = ~ prop, strict = TRUE, weights = "weights") + a.2 <- slab(sp1, fm = wtgrp ~ prop, strict = TRUE, weights = "weights") + a.3 <- slab(sp1, fm = wtgrp ~ prop, strict = TRUE) + + # expect consistent structure for weighted/unweighted: same column names except for group + ungroupcols <- colnames(a.3) + ungroupcols[2] <- "all.profiles" + expect_equal(colnames(a.0), ungroupcols) + expect_equal(colnames(a.1), ungroupcols) + expect_equal(colnames(a.2), colnames(a.3)) + + # contributing fractions should be identical + expect_true(all(a.1$contributing_fraction == a.2$contributing_fraction)) + expect_true(all(a.2$contributing_fraction == a.3$contributing_fraction)) + +}) test_that("slab calculations: mean, single profile", { From 2cb89ca409155f48a4df8c103e7c21ca85d1054d Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Thu, 18 Aug 2022 11:42:46 -0700 Subject: [PATCH 4/7] cleanup / fixing edge cases / more tests --- R/genSlabLabels.R | 6 ++---- R/slab.R | 23 +++++++++++++---------- man/slab-methods.Rd | 10 ++++++---- tests/testthat/test-slab.R | 28 ++++++++++++++++++++++++++-- 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/R/genSlabLabels.R b/R/genSlabLabels.R index 623f9e1a6..f5c234ee8 100644 --- a/R/genSlabLabels.R +++ b/R/genSlabLabels.R @@ -22,7 +22,6 @@ genSlabLabels <- function(slab.structure = 1, max.d, n.profiles) { if (slab.structure[1] == 0 & length(slab.structure) > 2) { # trivial case where segments start from 0 - seg.label <- rep(slab.structure[-1], times = diff(slab.structure))[1:max.d] } else { @@ -30,15 +29,14 @@ genSlabLabels <- function(slab.structure = 1, max.d, n.profiles) { if (length(slab.structure) != 2) stop('user-defined slab boundaries must either start from 0, or contain two values between 0 and the max soil depth') - # calculate thickness of each slab + # calculate thickness of slab slab.thickness <- diff(slab.structure) # make a new label for the slab new.label <- paste(slab.structure, collapse = '-') - # generate an index for the slab slab.idx <- rep(new.label, times = slab.thickness) - seg.label <- slab.idx + seg.label <- slab.idx } # generate segment labels diff --git a/R/slab.R b/R/slab.R index 2639f2b01..312a5c8cd 100644 --- a/R/slab.R +++ b/R/slab.R @@ -92,7 +92,7 @@ ...) { # define keywords for data.table - .SD <- NULL; .N <- NULL + .SD <- NULL; .N <- NULL; value <- NULL # get extra arguments: length of 0 if no extra arguments extra.args <- list(...) @@ -113,8 +113,8 @@ original.levels <- NULL # extract components of the formula: - g <- all.vars(update(fm, .~0)) # left-hand side - vars <- all.vars(update(fm, 0~.)) # right-hand side + g <- all.vars(update(fm, . ~ 0)) # left-hand side + vars <- all.vars(update(fm, 0 ~ .)) # right-hand side # sanity check: if (!inherits(fm, "formula")) @@ -128,15 +128,22 @@ stop('column names in formula do not match column names in dataframe', call. = FALSE) # make formula for slicing - fm.slice <- formula(paste('0:', max(object), ' ~ ', paste(vars, collapse = ' + '), sep = '')) - - # short-cut for user-defined slab if (length(slab.structure) == 2) { + # user-defined single slab fm.slice <- formula(paste(slab.structure[1], ':', slab.structure[2], ' ~ ', paste(vars, collapse = ' + '), sep = '')) + } else { + fm.slice <- formula(paste('0:', max(object), ' ~ ', paste(vars, collapse = ' + '), sep = '')) } # slice into 1cm increments, result is a data.frame data <- dice(x = object, fm = fm.slice, strict = strict, SPC = FALSE, pctMissing = TRUE) + + # Note: in this case we need to subtract the extra slice included by slice()/dice() + # do it to sliced result so that the genSlabLabels have the correct length + ldx <- data[[horizonDepths(object)[2]]] > slab.structure[2] + if (length(slab.structure) == 2 && any(ldx, na.rm = TRUE)) { + data <- data[which(!ldx), ] + } # extract site data site.data <- site(object) @@ -193,10 +200,6 @@ ## Note: use of factor labels could be slowing things down... ## Note: this assumes ordering is correct in source data / segment labels - - ## TODO: make sure that nrow(data) == genSlabLabels(slab.structure = slab.structure, max.d = max.d, n.profiles = n.profiles) - ## - I think the source of problems related to genSlabLabels not conforming with number of rows has been fixed -AGB 2022/08/17 - ## TODO: investigate use of split() to speed things up, no need to keep everything in the safe DF: ## l <- split(data, data$seg.label, drop=FALSE) diff --git a/man/slab-methods.Rd b/man/slab-methods.Rd index d0a626b95..618c6cde1 100644 --- a/man/slab-methods.Rd +++ b/man/slab-methods.Rd @@ -15,13 +15,15 @@ fm, slab.structure = 1, strict = FALSE, - slab.fun = slab_function(method = "default"), + slab.fun = slab_function(method = "numeric"), cpm = 1, weights = NULL, ... ) -slab_function(method = c("default", "factor", "hd", "weighted", "fast")) +slab_function( + method = c("numeric", "factor", "hd", "weighted.numeric", "weighted.factor", "fast") +) } \arguments{ \item{object}{a SoilProfileCollection} @@ -49,7 +51,7 @@ to the contributing fraction, while mode 2 values will always sum to 1.} \item{\dots}{further arguments passed to \code{slab.fun}} -\item{method}{one of \code{"default"}, \code{"factor"}, \code{"hd"}, \code{"weighted"}, \code{"fast"}} +\item{method}{one of \code{"numeric"}, \code{"factor"}, \code{"hd"}, \code{"weighted.numeric"}, \code{"weighted.factor"}, \code{"fast"}} } \value{ Output is returned in long format, such that slice-wise aggregates @@ -124,7 +126,7 @@ units} \item{a vector of 3 or more integers}{e.g. c(0, 5, 10, 50, 100): data are aggregated over the depths spanning 0--5, 5--10, 10--50, 50--100 units} } -\code{slab_function()}: The \code{"default"} aggregation method is the \code{"fast"} numeric (quantile) method. Additional methods include \code{"factor"} for categorical data, \code{"hd"} to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "\code{weighted}" to use a weighted quantile method from the Hmisc package +\code{slab_function()}: The default \code{"numeric"} aggregation method is the \code{"fast"} numeric (quantile) method. Additional methods include \code{"factor"} for categorical data, \code{"hd"} to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "\code{weighted}" to use a weighted quantile method from the Hmisc package } \note{ Arguments to \code{slab} have changed with \code{aqp} 1.5 (2012-12-29) diff --git a/tests/testthat/test-slab.R b/tests/testthat/test-slab.R index af455668d..fc2993e9f 100644 --- a/tests/testthat/test-slab.R +++ b/tests/testthat/test-slab.R @@ -1,7 +1,5 @@ context("slab method for SoilProfileCollection objects") - - test_that("basic slab functionality", { data(sp1, package = 'aqp') @@ -35,6 +33,32 @@ test_that("basic slab functionality", { expect_true(any(grepl('p.q', nm))) }) +test_that("slab structure for a single slab", { + data(sp4, package = 'aqp') + depths(sp4) <- id ~ top + bottom + + sp4$group <- c(rep('A',5), rep('B',5)) + a.1 <- slab( + sp4, + fm = ~ sand + silt + clay, + slab.structure = c(0, 10), + slab.fun = mean, + na.rm = TRUE + ) + expect_equal(nrow(a.1), 3) + + # again, this time within groups defined by a site-level attribute: + a.1 <- slab( + sp4, + fm = group ~ sand + silt + clay, + slab.structure = c(0, 10), + slab.fun = mean, + na.rm = TRUE + ) + expect_equal(nrow(a.1), 6) +}) + + test_that("extended slab functionality: weighted aggregation", { data(sp1, package = 'aqp') From 669fc9a239065db86fd3ca813386b34de4178513 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Thu, 18 Aug 2022 14:56:28 -0700 Subject: [PATCH 5/7] fixweighted average/quantile methods - add test of "component weighted averages" --- R/slab.R | 41 ++++++++++++++++++++---------- tests/testthat/test-slab.R | 52 ++++++++++++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 19 deletions(-) diff --git a/R/slab.R b/R/slab.R index 312a5c8cd..600984d56 100644 --- a/R/slab.R +++ b/R/slab.R @@ -25,7 +25,7 @@ return(pt) } -.slab.fun.factor.weighted <- function(values, weights) { +.slab.fun.factor.weighted <- function(values, w) { # TODO } @@ -39,15 +39,12 @@ # for a site or horizon level weight column, use Hmisc::wtd.quantile # note that "frequency weights" are assumed to be most common use case, so normwt=FALSE by default, # normwt=TRUE can be passed as optional argument for slab.fun from slab() interface -.slab.fun.numeric.weighted <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), wt, normwt = FALSE) { +.slab.fun.numeric.weighted <- function(values, w, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), normwt = FALSE) { if (!requireNamespace('Hmisc', quietly = TRUE)) stop('please install the `Hmisc` package to use `wtd.quantile()` method', call. = FALSE) - res <- try(Hmisc::wtd.quantile(values, weights = wt, probs = probs, na.rm = TRUE, normwt = normwt), silent = TRUE) + res <- try(Hmisc::wtd.quantile(values, w, probs = probs, na.rm = TRUE, normwt = normwt), silent = TRUE) if (inherits(res, 'try-error') && grepl("zero non-NA points", res[1])) { res <- rep(NA_real_, length(probs)) - } else if (inherits(res, 'try-error')) { - print(res) - return(NULL) } else { names(res) <- paste('p.q', round(probs * 100), sep = '') } @@ -180,11 +177,11 @@ # check for weights if (!missing(weights)) { slab.fun <- .slab.fun.factor.weighted + } else { + # re-set default function, currently no user-supply-able option + slab.fun <- .slab.fun.factor.default } - # re-set default function, currently no user-supply-able option - slab.fun <- .slab.fun.factor.default - # add extra arguments required by this function # note that we cannot accept additional arguments when processing categorical values extra.args <- list(cpm = cpm) @@ -212,6 +209,14 @@ data[, g] <- 1 } + if (length(weights) > 1) { + stop("`weights` argument should be a character vector of length one containing (site-level) weight column name", call. = FALSE) + } + + if (!is.null(weights) && !weights %in% siteNames(object)) { + stop("column '", weights, "' not present in site data", call. = FALSE) + } + # convert wide -> long format d.long <- data.table::melt( as.data.table(data[which(!is.na(data$seg.label)), ]), @@ -227,11 +232,13 @@ if (missing(slab.fun)) { # default _weighted_ aggregation fun is .slab.fun.numeric.weighted FUN <- .slab.fun.numeric.weighted - # custom weighted aggregation fun should take first argument as values w. named "weights" argument - } else FUN <- slab.fun - - d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, wt = weights))), by = c('variable', g, 'seg.label')]) - d.slabbed$variable + # custom weighted aggregation fun should take first argument as values + # second argument is "weights" argument + } else { + FUN <- slab.fun + } + wt <- eval(weights) + d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, .SD[[wt]]))), by = c('variable', g, 'seg.label')]) d.slabbed$contributing_fraction <- d.long[, sum(!is.na(.SD[["value"]])) / .N, by = c('variable', g, 'seg.label')]$V1 } else { @@ -275,6 +282,12 @@ } + # rename default data.table value column names + unn <- grepl("^V\\d+$", colnames(d.slabbed)) + if (any(unn)) { + colnames(d.slabbed)[unn] <- paste0("value", ifelse(sum(unn) == 1, "", 1:sum(unn))) + } + # convert the slab.label column into top/bottom as integers slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-') d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1])) diff --git a/tests/testthat/test-slab.R b/tests/testthat/test-slab.R index fc2993e9f..1391f63f6 100644 --- a/tests/testthat/test-slab.R +++ b/tests/testthat/test-slab.R @@ -84,6 +84,41 @@ test_that("extended slab functionality: weighted aggregation", { expect_true(all(a.1$contributing_fraction == a.2$contributing_fraction)) expect_true(all(a.2$contributing_fraction == a.3$contributing_fraction)) + # component weighted averages + # mukey=461268; Doemill-Jokerst, 3 to 8 percent slopes (615) + x <- data.frame(cokey = c(21469960L, 21469960L, 21469960L, 21469960L, + 21469960L, 21469961L, 21469961L, 21469961L), + comppct_r = c(50L, 50L, 50L, 50L, 50L, 40L, 40L, 40L), + hzdept_r = c(0L, 3L, 13L, 23L, 36L, 0L, 3L, 10L), + hzdepb_r = c(3L, 13L, 23L, 36L, 61L, 3L, 10L, 35L), + ph1to1h2o_r = c(6.1, 6.8, 6.7, 6.7, NA, 6.4, 6.5, NA)) + depths(x) <- cokey ~ hzdept_r + hzdepb_r + site(x) <- ~ comppct_r + + # custom function, with actual comppct_r + a.0 <- slab(x, ~ ph1to1h2o_r, + slab.structure = c(0, 10), + weights = "comppct_r", + slab.fun = weighted.mean, + na.rm = TRUE + ) + expect_equal(a.0$value, 6.54, tolerance = 0.005) + + # should match this within the tolerance: + # soilDB::get_SDA_property(property = 'ph1to1h2o_r', method = "weighted average", + # mukeys = 461268, miscellaneous_areas = FALSE, include_minors = TRUE, + # top_depth = 0, bottom_depth = 10) + + # comppct_r adjusted to get higher result more like doemill + x$comppct_r <- c(90, 10) + a.1 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r") + expect_equal(a.1$p.q50, 6.8) + + # comppct_r adjusted to get lower result more like jokerst + x$comppct_r <- c(10, 90) + a.2 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r") + expect_equal(a.2$p.q50, 6.5) + }) test_that("slab calculations: mean, single profile", { @@ -93,7 +128,14 @@ test_that("slab calculations: mean, single profile", { # aggregate single profile # custom slabs - a <- slab(sp1[1, ], fm = ~ prop, strict=TRUE, slab.structure=c(0,5,10,25,100), slab.fun = mean, na.rm=TRUE) + a <- slab( + sp1[1,], + fm = ~ prop, + strict = TRUE, + slab.structure = c(0, 5, 10, 25, 100), + slab.fun = mean, + na.rm = TRUE + ) # using mean and similar functions will cause slab to store single result / slab into 'value' column expect_true(any(grepl('value', names(a)))) @@ -101,13 +143,13 @@ test_that("slab calculations: mean, single profile", { # calculations done by hand (DEB) # weighted mean, single profile # 0-5: 9.400 - expect_equal(a$value[1], 9.4, tolerance=0.0001) + expect_equal(a$value[1], 9.4, tolerance = 0.0001) # 5-10: 7.000 - expect_equal(a$value[2], 7, tolerance=0.0001) + expect_equal(a$value[2], 7, tolerance = 0.0001) # 10-25: 8.466 - expect_equal(a$value[3], 8.4666, tolerance=0.0001) + expect_equal(a$value[3], 8.4666, tolerance = 0.0001) # 25-100: 15.625 - expect_equal(a$value[4], 15.625, tolerance=0.0001) + expect_equal(a$value[4], 15.625, tolerance = 0.0001) }) From 20666b1f2f7dfe102c8e20f7fc957a73e57ef155 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Thu, 18 Aug 2022 15:17:25 -0700 Subject: [PATCH 6/7] internal slab functions: Allow for (possibly unused) additional arguments to avoid errors - explicitly support `na.rm` where appropriate (defaults to true) --- R/slab.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/slab.R b/R/slab.R index 600984d56..3b7c1e81a 100644 --- a/R/slab.R +++ b/R/slab.R @@ -1,7 +1,7 @@ # default slab function for categorical variables # returns a named vector of results # this type of function is compatible with aggregate() -.slab.fun.factor.default <- function(values, cpm) { +.slab.fun.factor.default <- function(values, cpm, ...) { if (cpm == 1) { # probabilities are relative to number of contributing profiles @@ -25,24 +25,24 @@ return(pt) } -.slab.fun.factor.weighted <- function(values, w) { +.slab.fun.factor.weighted <- function(values, w, na.rm = TRUE, ...) { # TODO } # default slab function for continuous variables # returns a named vector of results # this type of function is compatible with aggregate() -.slab.fun.numeric.default <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) { - .slab.fun.numeric.fast(values, probs = probs) +.slab.fun.numeric.default <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE, ...) { + .slab.fun.numeric.fast(values, probs = probs, na.rm = na.rm, ...) } # for a site or horizon level weight column, use Hmisc::wtd.quantile # note that "frequency weights" are assumed to be most common use case, so normwt=FALSE by default, # normwt=TRUE can be passed as optional argument for slab.fun from slab() interface -.slab.fun.numeric.weighted <- function(values, w, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), normwt = FALSE) { +.slab.fun.numeric.weighted <- function(values, w, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), normwt = FALSE, na.rm = TRUE, ...) { if (!requireNamespace('Hmisc', quietly = TRUE)) stop('please install the `Hmisc` package to use `wtd.quantile()` method', call. = FALSE) - res <- try(Hmisc::wtd.quantile(values, w, probs = probs, na.rm = TRUE, normwt = normwt), silent = TRUE) + res <- try(Hmisc::wtd.quantile(values, w, probs = probs, na.rm = na.rm, normwt = normwt), silent = TRUE) if (inherits(res, 'try-error') && grepl("zero non-NA points", res[1])) { res <- rep(NA_real_, length(probs)) } else { @@ -52,20 +52,20 @@ } # easy specification of Hmisc::hdquantile if available -.slab.fun.numeric.HD <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) { +.slab.fun.numeric.HD <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE, ...) { # sanity check, need this for color distance eval if (!requireNamespace('Hmisc', quietly = TRUE)) stop('please install the `Hmisc` package to use `hdquantile()` method', call. = FALSE) - res <- Hmisc::hdquantile(values, probs = probs, na.rm = TRUE) + res <- Hmisc::hdquantile(values, probs = probs, na.rm = na.rm) names(res) <- paste('p.q', round(probs * 100), sep = '') return(res) } # basic quantile evaluation, better for large data sets -.slab.fun.numeric.fast <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) { - res <- quantile(values, probs = probs, na.rm = TRUE) +.slab.fun.numeric.fast <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE, ...) { + res <- quantile(values, probs = probs, na.rm = na.rm) names(res) <- paste('p.q', round(probs * 100), sep = '') return(res) } @@ -238,7 +238,7 @@ FUN <- slab.fun } wt <- eval(weights) - d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, .SD[[wt]]))), by = c('variable', g, 'seg.label')]) + d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, .SD[[wt]], ...))), by = c('variable', g, 'seg.label')]) d.slabbed$contributing_fraction <- d.long[, sum(!is.na(.SD[["value"]])) / .N, by = c('variable', g, 'seg.label')]$V1 } else { From ea832fcea59feace2bc901d87e72eb4f7642ac49 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Fri, 19 Aug 2022 09:01:06 -0700 Subject: [PATCH 7/7] Add tests with missing weights --- tests/testthat/test-slab.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/testthat/test-slab.R b/tests/testthat/test-slab.R index 1391f63f6..8b1584f4f 100644 --- a/tests/testthat/test-slab.R +++ b/tests/testthat/test-slab.R @@ -119,6 +119,14 @@ test_that("extended slab functionality: weighted aggregation", { a.2 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r") expect_equal(a.2$p.q50, 6.5) + # comppct_r adjusted to get lower result more like jokerst + x$comppct_r <- c(NA, 90) + na.1 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r", na.rm = TRUE) + expect_equal(na.1$p.q50, 6.5) + + na.2 <- slab(x, ~ ph1to1h2o_r, slab.structure = c(0, 10), weights = "comppct_r", slab.fun = weighted.mean, na.rm = TRUE) + expect_equal(na.2$value, NA) + }) test_that("slab calculations: mean, single profile", {