Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to dice(), slab(), genSlabLabels() #258

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 52 additions & 41 deletions R/dice.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#'
#' @param byhz Evaluate horizon depth logic at the horizon level (`TRUE`) or profile level (`FALSE`). Invalid depth logic invokes `HzDepthLogicSubset` which removes offending profiles or horizon records.
#'
#' @param verbose Print information about object size/memory usage. Default: `FALSE`
#'
#' @details For large and potentially messy collections that include missing horizon bottom depths, or 0-thickness horions, consider using `repairMissingHzDepths()` before `dice()`.
#'
#'
Expand All @@ -36,19 +38,26 @@
#'
#' @export
#'
dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, strict = TRUE, byhz = FALSE) {

dice <- function(x,
fm = NULL,
SPC = TRUE,
pctMissing = FALSE,
fill = FALSE,
strict = TRUE,
byhz = FALSE,
verbose = FALSE) {

# sacrifice to R CMD check spirits
.pctMissing <- NULL

# find / flag / remove invalid profiles or horizons
# this will generate an error if there are no valid profiles remaining
if(strict) {
x <- HzDepthLogicSubset(x, byhz = byhz)
if (strict) {
x <- HzDepthLogicSubset(x, byhz = TRUE)

## TODO: this could invoke 2x calls to fillHzGaps
# removed horizons will trigger an automatic gap-filling
if(!is.null(metadata(x)$removed.horizons)) {
if (!is.null(metadata(x)$removed.horizons)) {
message('filling gaps left by HzDepthLogicSubset')
x <- fillHzGaps(x, flag = TRUE, to_top = NULL, to_bottom = NULL)
}
Expand All @@ -67,14 +76,13 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
ids.top.bottom.idx <- match(c(idn, hzidn, htb), hznames)

# interpret optional formula
if(!is.null(fm)) {
if (!is.null(fm)) {
# something supplied
# reasonable?
if(! inherits(fm, "formula")) {
if (!inherits(fm, "formula")) {
stop('must provide a valid formula', call. = FALSE)
}


# extract components of the formula:
fm <- str_c(deparse(fm, 500), collapse = "")
elements <- str_split(fm, fixed("~"))[[1]]
Expand All @@ -92,17 +100,17 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
vars <- fm[[2]]

# check for bogus left/right side problems with the formula
if(any(z < 0) | any(is.na(z))) {
if (any(z < 0) | any(is.na(z))) {
stop('z-slice must be >= 0', call. = FALSE)
}

# check for integer / numeric slices
if(! inherits(z, c('numeric','integer'))) {
if (!inherits(z, c('numeric','integer'))) {
stop('z-slice must be either numeric or integer', call. = FALSE)
}

# if z-index is missing set to NULL for later on
if(length(z) == 0) {
if (length(z) == 0) {
z <- NULL
} else {
# z index is specified
Expand All @@ -111,13 +119,13 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
}

# check for '.' --> all variables, minus ID/depths
if(any(vars == '.')) {
if (any(vars == '.')) {
# all variables except profile ID, horizon ID, top, bottom
vars <- hznames[-ids.top.bottom.idx]
}

# check for column names that don't exist
if(! any(vars %in% hznames)) {
if (!any(vars %in% hznames)) {
stop('names in formula do not match any horizon attributes', call. = FALSE)
}

Expand All @@ -127,7 +135,7 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
# slice to-depth of collection

# optionally fill all gaps between min(x) --- [gaps] --- max(x)
if(fill) {
if (fill) {
x <- fillHzGaps(x, flag = TRUE, to_top = min(x), to_bottom = max(x))
}

Expand All @@ -147,11 +155,12 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
)

# convert to DT
if(! inherits(h, 'data.table')) {
if (!inherits(h, 'data.table')) {
h <- as.data.table(h)
}

## TODO: are keys worth the extra sorting / re-sorting?
## no, probably not inherently worth it -AGB 2022/08/17
# init keys, sorts the data on hzID (alpha-sort)
# setkeyv(h, hzidn)
# consider an index, seems to have no effect
Expand All @@ -163,21 +172,31 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str

# safe wrapper to base::seq
# NA in from/to (hz depths) not allowed
# TODO: from == to (0-thickness) horizons ??
.seq.safe <- function(from, to, by) {
if(any(is.na(from)) | any(is.na(to))) {
if (any(is.na(from)) | any(is.na(to))) {
stop('corrupt horizonation, consider using `strict = TRUE`', call. = FALSE)
}

if (from > to) {
toold <- to
to <- from
from <- toold
}
return(
seq(from = from, to = to, by = by)
seq(from = from, to = to, by = abs(by))
)
}

# from == to (0-thickness) horizons -- these should be removed -AGB 2022/08/17
zthk <- h[[htb[1]]] == h[[htb[2]]]
if (any(zthk, na.rm = TRUE)) {
message("horizons with zero thickness have been omitted from results")
}
h.sub <- h[which(!zthk),]

tops <- mapply(
FUN = .seq.safe,
from = h[[htb[1]]],
to = h[[htb[2]]] - 1,
from = h.sub[[htb[1]]],
to = h.sub[[htb[2]]] - 1,
by = 1,
SIMPLIFY = FALSE
)
Expand All @@ -188,30 +207,21 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
# expand slice IDs (horizon IDs)
# these are used to join with horizons
sliceIDs <- rep(
h[[hzidn]],
times = h[[htb[2]]] - h[[htb[1]]]
h.sub[[hzidn]],
times = h.sub[[htb[2]]] - h.sub[[htb[1]]]
)


# assemble slice LUT for JOIN
s <- data.table(
sliceID = sliceIDs,
.sliceTop = tops,
.sliceBottom = bottoms
.sliceTop = tops[1:length(sliceIDs)],
.sliceBottom = bottoms[1:length(sliceIDs)]
)

# re-name for simpler JOIN
names(s)[1] <- hzidn


## TODO: are keys worth the extra sorting / re-sorting?
# note: sorts data
# setkeyv(s, hzidn)
# consider an index, seems to have no effect
# setindexv(s, hzidn)


# FULL JOIN via fast data.table compiled code
# using index (?)
res <- merge(h, s, by = hzidn, all = TRUE, sort = FALSE)

## TODO: update to := syntax, but how does it work with with variables?
Expand All @@ -231,14 +241,13 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
res[['.sliceTop']] <- NULL
res[['.sliceBottom']] <- NULL


## TODO: move z-index subset"up" to original sequence creation to save a lot of time
if(!is.null(z)) {
if (!is.null(z)) {
res <- res[which(res[[htb[1]]] %in% z), ]
}

# slice-wise "percent missing" calculation
if(pctMissing) {
if (pctMissing) {

# this is quite slow: DT -> DF -> matrix -> apply (8x longer)
# res[['.pctMissing']] <- .pctMissing(res, vars)
Expand All @@ -257,7 +266,7 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
}

# only returning horizons as a data.frame
if(!SPC) {
if (!SPC) {
return(as.data.frame(res))
}

Expand All @@ -276,8 +285,10 @@ dice <- function(x, fm = NULL, SPC = TRUE, pctMissing = FALSE, fill = FALSE, str
OBF <- round(f.size / o.size, 1)
metadata(x)$OBF <- OBF

# possibly informative
message(sprintf("Object Bloat Factor: %s", OBF))
if (verbose) {
# turned off by default; possibly informative
message(sprintf("Object Bloat Factor: %s", OBF))
}

return(x)
}
Expand Down
58 changes: 28 additions & 30 deletions R/genSlabLabels.R
Original file line number Diff line number Diff line change
@@ -1,54 +1,52 @@
## TODO: documentation / generalization
# note source data must be normalixed via slice() first, e.g. all share the same number of horizons
# note source data must be "normalized" via dice() first; assumes each profile has the same number of horizons

# generate labels for slabs
genSlabLabels <- function(slab.structure=1, max.d, n.profiles) {
genSlabLabels <- function(slab.structure = 1, max.d, n.profiles) {

# fixed-size slabs
if(length(slab.structure) == 1) {
if (length(slab.structure) == 1) {
# generate sequence of segment labels
seg.label <- rep(1:ceiling(max.d / slab.structure), each=slab.structure, length=max.d)
seg.label <- rep(1:ceiling(max.d / slab.structure), each = slab.structure, length = max.d)

# general segment labels
seg.label.levels <- tapply(1:max.d, seg.label, function(i) {r <- range(i); paste(c(r[1]-1, r[2]), collapse='-') } )
seg.label.levels <- tapply(1:max.d, seg.label,
function(i) {
r <- range(i, na.rm = TRUE)
paste(c(r[1] - 1, r[2]), collapse = '-')
})
}

# user-defined slabs
if(length(slab.structure) > 1) {
# trival case where segments start from 0
if(slab.structure[1] == 0 & length(slab.structure) > 2)
seg.label <- rep(slab.structure[-1], times=diff(slab.structure))[1:max.d]
if (length(slab.structure) > 1) {
if (slab.structure[1] == 0 & length(slab.structure) > 2) {
# trivial case where segments start from 0

seg.label <- rep(slab.structure[-1], times = diff(slab.structure))[1:max.d]

# other case: user defines an arbitrary lower and upper limit
else {
if(length(slab.structure) != 2)
} else {
# other case: user defines an arbitrary lower and upper limit
if (length(slab.structure) != 2)
stop('user-defined slab boundaries must either start from 0, or contain two values between 0 and the max soil depth')

# proceed
# calculate thickness of slab
slab.thickness <- diff(slab.structure)
# how many slices of NA before the slab?
padding.before <- rep(NA, times=slab.structure[1])
# how many slices of NA afer the slab
padding.after <- rep(NA, times=abs(max.d - slab.structure[2]))

# make a new label for the slab
new.label <- paste(slab.structure, collapse='-')
# generate an index for the slab
slab.idx <- rep(new.label, times=slab.thickness)
# generate the entire index: padding+slab+padding = total number of slices (max_d)
# seg.label <- c(padding.before, slab.idx, padding.after)
seg.label <- slab.idx
new.label <- paste(slab.structure, collapse = '-')

slab.idx <- rep(new.label, times = slab.thickness)
seg.label <- slab.idx
}

# generate segment labels
seg.label.levels <- sapply(1:(length(slab.structure)-1), function(i) paste(c(slab.structure[i], slab.structure[i+1]), collapse='-'))
seg.label.levels <- sapply(1:(length(slab.structure) - 1), function(i) {
paste(c(slab.structure[i], slab.structure[i + 1]), collapse = '-')
})
}

# covert into a factor that can be used to split profiles into slabs
res <- factor(rep(seg.label, times=n.profiles), labels=seg.label.levels)
res <- factor(rep(seg.label, times = n.profiles), labels = seg.label.levels)

return(res)
}





Loading