Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to dice(), slab(), genSlabLabels() #258

Closed
wants to merge 7 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
slab(): implement weights argument
 - Add `slab_function()`
  • Loading branch information
brownag committed Aug 19, 2022
commit 8cd031179e80c87be11b425d758ed6bea0b4ea36
236 changes: 124 additions & 112 deletions R/slab.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


# default slab function for categorical variables
# returns a named vector of results
# this type of function is compatible with aggregate()
@@ -27,78 +25,78 @@
return(pt)
}

## TODO: make these exported functions with documentation (https://github.com/ncss-tech/aqp/issues/99)

.slab.fun.factor.weighted <- function(values, weights) {
# TODO
}

# default slab function for continuous variables
# returns a named vector of results
# this type of function is compatible with aggregate()
.slab.fun.numeric.default <- function(values) {
q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
## 2019-10-30: dropping Hmisc as a suggested package
# res <- Hmisc::hdquantile(values, probs=q.probs, na.rm=TRUE)
res <- quantile(values, probs = q.probs, na.rm = TRUE)
names(res) <- paste('p.q', round(q.probs * 100), sep = '')
return(res)
.slab.fun.numeric.default <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) {
.slab.fun.numeric.fast(values, probs = probs)
}

# for a site or horizon level weight column, use Hmisc::wtd.quantile
# note that "frequency weights" are assumed to be most common use case, so normwt=FALSE by default,
# normwt=TRUE can be passed as optional argument for slab.fun from slab() interface
.slab.fun.numeric.weighted <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), wt, normwt = FALSE) {
if (!requireNamespace('Hmisc', quietly = TRUE))
stop('please install the `Hmisc` package to use `wtd.quantile()` method', call. = FALSE)
res <- try(Hmisc::wtd.quantile(values, weights = wt, probs = probs, na.rm = TRUE, normwt = normwt), silent = TRUE)
if (inherits(res, 'try-error') && grepl("zero non-NA points", res[1])) {
res <- rep(NA_real_, length(probs))
} else if (inherits(res, 'try-error')) {
print(res)
return(NULL)
} else {
names(res) <- paste('p.q', round(probs * 100), sep = '')
}
return(res)
}

# easy specification of Hmisc::hdquantile if available
.slab.fun.numeric.HD <- function(values) {
.slab.fun.numeric.HD <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) {
# sanity check, need this for color distance eval
if (!requireNamespace('Hmisc', quietly = TRUE))
stop('please install the `Hmisc` package to use `hdquantile()` method', call. = FALSE)

q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)

res <- Hmisc::hdquantile(values, probs = q.probs, na.rm = TRUE)
res <- Hmisc::hdquantile(values, probs = probs, na.rm = TRUE)

names(res) <- paste('p.q', round(q.probs * 100), sep = '')
names(res) <- paste('p.q', round(probs * 100), sep = '')
return(res)
}

# basic quantile evaluation, better for large datasets
.slab.fun.numeric.fast <- function(values) {
q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
res <- quantile(values, probs = q.probs, na.rm = TRUE)
names(res) <- paste('p.q', round(q.probs * 100), sep = '')
return(res)
# basic quantile evaluation, better for large data sets
.slab.fun.numeric.fast <- function(values, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) {
res <- quantile(values, probs = probs, na.rm = TRUE)
names(res) <- paste('p.q', round(probs * 100), sep = '')
return(res)
}

# ## TODO: not yet implemented
# # default slab function for weighted, continuous variables
# # returns a named vector of results
# # this type of function is compatible with aggregate()
# # NOTE: nw (normalize-weights) argument will affect the results!
# .slab.fun.wtd.numeric.default <- function(values, w, nw=TRUE) {
# q.probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
# res <- wtd.quantile(values, weights=w, probs=q.probs, na.rm=TRUE, normwt=nw)
# names(res) <- paste('p.q', round(q.probs * 100), sep='')
# return(res)
# }

# SoilProfileCollection method
# object: SoilProfileCollection
# fm: formula defining aggregation
# slab.structure: either regular segment interval, or user-defined segment boundaries {starting from 0, or length of 2}
# progress: plyr-progress display
# slab.fun: aggregate function applied to data chunks (must return a single row / chunk)
# cpm: class probability normalization mode
# weights: character vector naming column containing weights
# weights: character vector naming column in site slot containing weights
# ... : extra arguments passed on to slab.fun
.slab <- function(object,
fm,
slab.structure = 1,
strict = FALSE,
slab.fun = .slab.fun.numeric.default,
slab.fun = slab_function(method = "numeric"),
cpm = 1,
weights = NULL,
...) {

# define keywords for data.table
.SD <- NULL; .N <- NULL

# get extra arguments: length of 0 if no extra arguments
extra.args <- list(...)

## important: change the default behavior of data.frame and melt
opt.original <- options(stringsAsFactors = FALSE)

# get unique list of names in object
object.names <- unique(unlist(names(object)))

@@ -117,7 +115,7 @@
# extract components of the formula:
g <- all.vars(update(fm, .~0)) # left-hand side
vars <- all.vars(update(fm, 0~.)) # right-hand side

# sanity check:
if (!inherits(fm, "formula"))
stop('must provide a valid formula: groups ~ var1 + var2 + ...', call. = FALSE)
@@ -145,18 +143,12 @@

# check groups for NA, if grouping variable was given
if (g != '.' && any(is.na(site.data[, g]))) {
stop('grouping variable must not contain NA', call. = FALSE)
stop('grouping variable must not contain NA', call. = FALSE)
}

### !!! this runs out of memory when groups contains NA !!!
## this is generating a lot of extra objects and possibly wasting memory
# merge site data back into the result, this would include site-level weights
data <- merge(x = data, y = site.data, by = object.ID, all.x = TRUE, sort = FALSE)

# clean-up
rm(object, site.data)
gc()

# check variable classes
if (length(vars) > 1) {
vars.numeric.test <- sapply(data[, vars], is.numeric)
@@ -180,7 +172,7 @@

# check for weights
if (!missing(weights)) {
stop('weighted aggregation of categorical variables not yet implemented', call. = FALSE)
slab.fun <- .slab.fun.factor.weighted
}

# re-set default function, currently no user-supply-able option
@@ -216,71 +208,75 @@
g <- 'all.profiles' # add new grouping variable to horizons
data[, g] <- 1
}


# convert wide -> long format
d.long <- data.table::melt(
as.data.table(data[which(!is.na(data$seg.label)), ]),
id.vars = unique(c(object.ID, 'seg.label', g, weights)),
measure.vars = vars
)

# make a formula for aggregate()
aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep = ''))

# check for weights
if (!missing(weights)) {
## TODO: adding weighted aggregation via data.table
stop('weighted aggregation is not yet supported', call. = FALSE)
}

## TODO: why would this ever happen?
# throwing out those rows with an NA segment label
seg.label.is.not.NA <- which(!is.na(data$seg.label))

# convert wide -> long format w/ data.table::melt()
# will not preserve factor levels when 'vars' is categorical; must call unique() on `id.vars`
d.long <- melt(
as.data.table(data[seg.label.is.not.NA, ]),
id.vars = unique(c(object.ID, 'seg.label', g)),
measure.vars = vars,
)

# convert back to data.frame
d.long <- as.data.frame(d.long)

# reset factor levels in d.long[[value]]
if (.factorFlag) {
d.long[['value']] <- factor(d.long[['value']], levels = original.levels)
}

# make a formula for aggregate()
aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep = ''))
if (missing(slab.fun)) {
# default _weighted_ aggregation fun is .slab.fun.numeric.weighted
FUN <- .slab.fun.numeric.weighted
# custom weighted aggregation fun should take first argument as values w. named "weights" argument
} else FUN <- slab.fun

d.slabbed <- as.data.frame(d.long[, as.data.frame(t(FUN(value, wt = weights))), by = c('variable', g, 'seg.label')])
d.slabbed$variable
d.slabbed$contributing_fraction <- d.long[, sum(!is.na(.SD[["value"]])) / .N, by = c('variable', g, 'seg.label')]$V1

# process chunks according to group -> variable -> segment
# NA values are not explicitly dropped
if (length(extra.args) == 0) {
d.slabbed <- aggregate(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun)
} else {
# optionally account for extra arguments
the.args <- c(list(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun), extra.args)
d.slabbed <- do.call(what = 'aggregate', args = the.args)
# convert back to data.frame
d.long <- as.data.frame(d.long)

# reset factor levels in d.long[[value]]
if (.factorFlag) {
d.long[['value']] <- factor(d.long[['value']], levels = original.levels)
}

# process chunks according to group -> variable -> segment
# NA values are not explicitly dropped
if (length(extra.args) == 0) {
d.slabbed <- aggregate(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun)
} else {
# optionally account for extra arguments
the.args <- c(list(aggregate.fm, data = d.long, na.action = na.pass, FUN = slab.fun), extra.args)
d.slabbed <- do.call(what = 'aggregate', args = the.args)
}

# if slab.fun returns a vector of length > 1 we must:
# convert the complex data.frame returned by aggregate into a regular data.frame
# the column 'value' is a matrix with the results of slab.fun
if (inherits(d.slabbed$value, 'matrix')) {
d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value)
}

# ensure that the names returned from slab.fun are legal
names(d.slabbed) <- make.names(names(d.slabbed))

# estimate contributing fraction
d.slabbed$contributing_fraction <- aggregate(
aggregate.fm,
data = d.long,
na.action = na.pass,
FUN = function(i) {
length(na.omit(i)) / length(i)
}
)$value

}

# if slab.fun returns a vector of length > 1 we must:
# convert the complex data.frame returned by aggregate into a regular data.frame
# the column 'value' is a matrix with the results of slab.fun
if (inherits(d.slabbed$value, 'matrix')) {
d.slabbed <- cbind(d.slabbed[, 1:3], d.slabbed$value)
}

# ensure that the names returned from slab.fun are legal
names(d.slabbed) <- make.names(names(d.slabbed))

# convert the slab.label column into top/bottom as integers
slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-')
d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1]))
d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2]))

# estimate contributing fraction
d.slabbed$contributing_fraction <- aggregate(
aggregate.fm,
data = d.long,
na.action = na.pass,
FUN = function(i) {
length(na.omit(i)) / length(i)
}
)$value


# convert the slab.label column into top/bottom as integers
slab.depths <- strsplit(as.character(d.slabbed$seg.label), '-')
d.slabbed$top <- as.integer(lapply(slab.depths, function(i) i[1]))
d.slabbed$bottom <- as.integer(lapply(slab.depths, function(i) i[2]))

# remove seg.label from result
d.slabbed$seg.label <- NULL

@@ -291,8 +287,6 @@
attr(d.slabbed, 'original.levels') <- original.levels
}

# reset options
options(opt.original)
return(d.slabbed)
}

@@ -301,7 +295,7 @@ setGeneric("slab", function(object,
fm,
slab.structure = 1,
strict = FALSE,
slab.fun = .slab.fun.numeric.default,
slab.fun = slab_function(method = "numeric"),
cpm = 1,
weights = NULL,
...)
@@ -368,7 +362,7 @@ setGeneric("slab", function(object,
#' either: number of profiles with data at the current slice (cpm=1), or by the
#' number of profiles in the collection (cpm=2). Mode 1 values will always sum
#' to the contributing fraction, while mode 2 values will always sum to 1.
#' @param weights Column name containing weights. NOT YET IMPLEMENTED
#' @param weights Column name containing site-level weights
#' @param \dots further arguments passed to \code{slab.fun}
#' @return Output is returned in long format, such that slice-wise aggregates
#' are returned once for each combination of grouping level (optional),
@@ -745,3 +739,21 @@ setGeneric("slab", function(object,
setMethod(f = 'slab',
signature = 'SoilProfileCollection',
definition = .slab)

#' @param method one of `"numeric"`, `"factor"`, `"hd"`, `"weighted.numeric"`, `"weighted.factor"`, `"fast"`
#' @details `slab_function()`: The default `"numeric"` aggregation method is the `"fast"` numeric (quantile) method. Additional methods include `"factor"` for categorical data, `"hd"` to use the Harrell-Davis Distribution-Free Quantile Estimator from the Hmisc package, and "`weighted`" to use a weighted quantile method from the Hmisc package
#' @return `slab_function()`: return an aggregation function based on the `method` argument
#' @rdname slab-methods
#' @export
slab_function <- function(method = c("numeric", "factor", "hd", "weighted.numeric", "weighted.factor", "fast")) {
switch(method,
numeric = .slab.fun.numeric.default, # uses "fast" numeric method
factor = .slab.fun.factor.default,
weighted.factor = .slab.fun.factor.weighted,
hd = .slab.fun.numeric.HD,
weighted.numeric = .slab.fun.numeric.weighted,
weighted.factor = .slab.fun.numeric.weighted,
fast = .slab.fun.numeric.fast,
.slab.fun.numeric.default
)
}
Loading