diff --git a/DESCRIPTION b/DESCRIPTION index 521d049..3d7a26a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: StageWise Title: Two-stage analysis of multi-environment trials for genomic selection and GWAS -Version: 1.09 +Version: 1.10 Author: Jeffrey B. Endelman Maintainer: Jeffrey Endelman Description: Fully efficient, two stage analysis of multi-environment trials, including directional dominance and multi-trait genomic selection @@ -8,7 +8,7 @@ Depends: R (>= 4.0) License: GPL-3 RoxygenNote: 7.2.3 Encoding: UTF-8 -Imports: Matrix (>= 1.5-0), ggplot2, methods, ggrepel, rlang, ggpubr, SpATS, spam, AGHmatrix, MASS, CVXR, ggforce +Imports: Matrix (>= 1.5-0), ggplot2, methods, ggrepel, rlang, ggpubr, SpATS, spam, AGHmatrix, MASS, CVXR, ggforce, data.table Suggests: knitr, rmarkdown, asreml Collate: 'Stage1.R' diff --git a/NEWS b/NEWS index 8febdea..afb1a17 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Changes in 1.10 +* Added support for multi-population/multi-ploidy analysis +* Fixed error in Stage1 related to multiple experiments per env + Changes in 1.09 * changed read.csv to fread in read_geno diff --git a/R/Stage1.R b/R/Stage1.R index 5d055a3..a0659bf 100644 --- a/R/Stage1.R +++ b/R/Stage1.R @@ -30,7 +30,7 @@ #' @return List containing #' \describe{ #' \item{blues}{data frame of BLUEs} -#' \item{vcov}{list of variance-covariance matrices for the BLUEs, one per experiment (env)} +#' \item{vcov}{list of variance-covariance matrices for the BLUEs, one per env} #' \item{fit}{data frame with broad-sense H2 (plot basis) and/or AIC} #' \item{resid}{For single trait, list of diagnostic plots and data frame of residuals. For multi-trait, list of resid var-cov matrices.} #' } @@ -312,55 +312,64 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml", blue.out$env <- data$env[match(blue.out$expt,data$expt)] nee <- sapply(expt.in.env,length) - iu <- which(nee==1) - if (length(iu) > 0) { - tmp <- names(vcov)[iu] - names(vcov)[iu] <- blue.out$env[match(tmp,blue.out$expt)] - } + #iu <- which(nee==1) + #if (length(iu) > 0) { + # tmp <- names(vcov)[iu] + # names(vcov)[iu] <- blue.out$env[match(tmp,blue.out$expt)] + #} - for (j in which(nee > 1)) { - omega.list <- vcov[expt.in.env[[j]]] - vcov2 <- mapply(FUN=function(x,y){ - tmp <- paste(y,rownames(x),sep=":") - dimnames(x) <- list(tmp,tmp) - return(x)},x=omega.list,y=as.list(expt.in.env[[j]])) + vcov.env <- vector("list",length(nee)) + names(vcov.env) <- names(nee) + iu <- which(nee==1) + if (length(iu) > 0) + vcov.env[names(iu)] <- vcov[unlist(expt.in.env[iu])] - .GlobalEnv$asremlOmega <- direct_sum(lapply(vcov2,solve)) - dname <- lapply(vcov2,rownames) - dimnames(.GlobalEnv$asremlOmega) <- list(unlist(dname),unlist(dname)) - attr(.GlobalEnv$asremlOmega,"INVERSE") <- TRUE + iu <- which(nee > 1) + if (length(iu) > 0) + for (j in iu) { + omega.list <- vcov[expt.in.env[[j]]] + vcov2 <- mapply(FUN=function(x,y){ + tmp <- paste(y,rownames(x),sep=":") + dimnames(x) <- list(tmp,tmp) + return(x)},x=omega.list,y=as.list(expt.in.env[[j]])) - ix <- which(blue.out$env==envs[j]) - data2 <- blue.out[ix,] - data2$expt.id <- factor(paste(data2$expt,data2$id,sep=":")) - data2$id <- factor(data2$id) - data2$expt <- factor(data2$expt) - start.table <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id, - random=~vm(expt.id,source=asremlOmega), - residual=~idv(units),start.values=TRUE)$vparameters.table - k <- grep("Omega",start.table$Component,fixed=T) - start.table$Value[k] <- 1 - start.table$Constraint[k] <- "F" - ans3 <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id, - random=~vm(expt.id,source=asremlOmega), - residual=~idv(units),G.param=start.table) + .GlobalEnv$asremlOmega <- direct_sum(lapply(vcov2,solve)) + dname <- lapply(vcov2,rownames) + dimnames(.GlobalEnv$asremlOmega) <- list(unlist(dname),unlist(dname)) + attr(.GlobalEnv$asremlOmega,"INVERSE") <- TRUE - predans3 <- asreml::predict.asreml(ans3,classify="id",vcov = TRUE) - blue3 <- data.frame(expt=NA,predans3$pvals[,c("id","predicted.value")],env=envs[j]) - colnames(blue3) <- c("expt","id","BLUE","env") - vcov3 <- predans3$vcov - dimnames(vcov3) <- list(blue3$id,blue3$id) - vcov2 <- vcov[-match(expt.in.env[[j]],names(vcov))] - vcov <- c(vcov2,vcov3) - names(vcov) <- c(names(vcov2),envs[j]) - blue.out <- rbind(blue.out[-ix,],blue3) - rm("asremlOmega",envir = .GlobalEnv) - } + ix <- which(blue.out$env==envs[j]) + data2 <- blue.out[ix,] + data2$expt.id <- factor(paste(data2$expt,data2$id,sep=":")) + data2$id <- factor(data2$id) + data2$expt <- factor(data2$expt) + start.table <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id, + random=~vm(expt.id,source=asremlOmega), + residual=~idv(units),start.values=TRUE)$vparameters.table + k <- grep("Omega",start.table$Component,fixed=T) + start.table$Value[k] <- 1 + start.table$Constraint[k] <- "F" + ans3 <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id, + random=~vm(expt.id,source=asremlOmega), + residual=~idv(units),G.param=start.table) + + predans3 <- asreml::predict.asreml(ans3,classify="id",vcov = TRUE) + blue3 <- data.frame(expt=NA,predans3$pvals[,c("id","predicted.value")],env=envs[j]) + colnames(blue3) <- c("expt","id","BLUE","env") + vcov3 <- predans3$vcov + dimnames(vcov3) <- list(blue3$id,blue3$id) + #vcov2 <- vcov[-match(expt.in.env[[j]],names(vcov))] + #vcov <- c(vcov2,vcov3) + #names(vcov) <- c(names(vcov2),envs[j]) + vcov.env[[names(iu)[j]]] <- vcov3 + blue.out <- rbind(blue.out[-ix,],blue3) + rm("asremlOmega",envir = .GlobalEnv) + } } if (n.trait > 1) { - vcov <- vector("list",n.env) - names(vcov) <- envs + vcov.env <- vector("list",n.env) + names(vcov.env) <- envs expt.in.env <- lapply(split(data$expt,factor(data$env,levels=envs)),unique) nexpt.per.env <- sapply(expt.in.env,length) @@ -404,10 +413,10 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml", predans <- asreml::predict.asreml(ans,classify="id:trait",vcov = TRUE) tmp <- predans$pvals[,c("id","trait","predicted.value")] colnames(tmp) <- c("id","trait","BLUE") - vcov[[j]] <- predans$vcov + vcov.env[[j]] <- predans$vcov tmp$id <- as.character(tmp$id) id.trait <- apply(tmp[,1:2],1,paste,collapse=":") - dimnames(vcov[[j]]) <- list(id.trait,id.trait) + dimnames(vcov.env[[j]]) <- list(id.trait,id.trait) blue.out <- rbind(blue.out,data.frame(env=envs[j],tmp)) } } @@ -428,13 +437,13 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml", stat_boxplot(outlier.color="red") + theme_bw() + theme(axis.text.x=element_text(angle=90,vjust=0.5)) p2 <- ggplot(data=blup.resid,aes(sample=.data$resid)) + stat_qq() + stat_qq_line() + facet_wrap(~expt) + theme_bw() + xlab("Expected") + ylab("Observed") if (solver=="ASREML") - return(list(blues=blue.out,vcov=vcov,fit=fit, + return(list(blues=blue.out,vcov=vcov.env,fit=fit, resid=list(boxplot=p1,qqplot=p2,table=blup.resid))) if (solver=="SPATS") - return(list(blues=blue.out,vcov=vcov,fit=fit, + return(list(blues=blue.out,vcov=vcov.env,fit=fit, resid=list(boxplot=p1,qqplot=p2,spatial=spatial.plot,table=blup.resid))) } else { #Multi-trait - return(list(blues=blue.out,vcov=vcov,fit=fit,resid=resid.vc)) + return(list(blues=blue.out,vcov=vcov.env,fit=fit,resid=resid.vc)) } } diff --git a/R/Stage2.R b/R/Stage2.R index 467a68f..e48f6ef 100644 --- a/R/Stage2.R +++ b/R/Stage2.R @@ -106,7 +106,8 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL, diag(resid.vc) <- diag(resid.vc)/(n.trait-1) resid.vc <- fu(resid.vc) diag(B) <- diag(B)/(n.trait-1) - B <- fu(B) + if (max(abs(diag(B))) > .Machine$double.eps*10) + B <- fu(B) if (non.add!="none") { diag(geno2) <- diag(geno2)/(n.trait-1) @@ -138,6 +139,9 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL, if (!is.null(geno)) { stopifnot(is(geno,"class_geno")) id <- sort(intersect(data$id,rownames(geno@G))) + if (length(geno@ploidy) > 1) + geno@ploidy <- geno@ploidy[id] + n <- length(id) data <- data[data$id %in% id,] id.weights <- table(factor(data$id,levels=id)) @@ -150,7 +154,7 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL, meanD <- as.numeric(pvar(V=.GlobalEnv$asremlD,weights=id.weights)) dom.covariate <- as.numeric(geno@coeff.D[id,] %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1)) - dom.covariate <- dom.covariate/(geno@scale*(geno@ploidy-1)) + dom.covariate <- dom.covariate/(geno@scale[id]*(geno@ploidy-1)) names(dom.covariate) <- id data$dom <- dom.covariate[data$id] @@ -184,7 +188,7 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL, data$env.id <- factor(data$env.id,levels=env.id) out <- vector("list",4) - names(out) <- c("aic","vars","fixed","random") + names(out) <- c("aic","vars","params","random") n.loc <- 1 n.trait <- 1 @@ -243,8 +247,9 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL, model <- sub("FIX",paste(c("env",covariates,fix.eff.marker,dom),collapse="+"),model,fixed=T) } else { model <- "asreml(data=data,fixed=BLUE~FIX-1,random=~RANDOM,residual=~dsum(~idv(units)|loc)" - tmp <- gsub("+",":loc+",paste(c("env",fix.eff.marker,dom),collapse="+"),fixed=T) - model <- sub("FIX",paste(tmp,"loc",sep=":"),model,fixed=T) + tmp <- gsub("+",":loc+",paste(c(fix.eff.marker,dom,"env"),collapse="+"),fixed=T) + #model <- sub("FIX",paste(tmp,"loc",sep=":"),model,fixed=T) + model <- sub("FIX",tmp,model,fixed=T) } } else { model <- "asreml(data=data,fixed=BLUE~FIX-1,random=~RANDOM,residual=~id(env.id):us(Trait)" @@ -477,12 +482,11 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL, } if (non.add=="dom") { - gamma <- (geno@ploidy/2 - 1)/(geno@ploidy - 1) ix <- grep("dom",rownames(beta),fixed=T) out$params$heterosis <- data.frame(trait=traits, estimate=as.numeric(beta[ix,1]), SE=as.numeric(beta[ix,2])) - + gamma <- mean((geno@ploidy/2 - 1)/(geno@ploidy - 1)) weights <- id.weights/sum(id.weights) for (i in 1:n.trait) for (j in i:n.trait) { diff --git a/R/blup.R b/R/blup.R index ad9f6d7..5483c54 100644 --- a/R/blup.R +++ b/R/blup.R @@ -46,12 +46,18 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) { if (substr(what,1,2)=="DM" & (!(is(geno,"class_genoD")) | data[[1]]@model < 3L)) stop("Dominance model is required in geno and data") + n.id <- length(data[[1]]@id) gamma <- 0 if (!is.null(geno)) { if (substr(what,1,2)=="GV") gamma <- 1 if (substr(what,1,2)=="BV" & data[[1]]@model==3L) gamma <- (geno@ploidy/2 - 1)/(geno@ploidy - 1) + + if (length(gamma) > 1) { + tmp <- tapply(gamma,attr(geno@ploidy,"pop"),mean) + gamma <- sum(tmp[names(index.coeff)]*index.coeff)/sum(index.coeff) + } } if (data[[1]]@model < 2L) { @@ -60,8 +66,6 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) { index.scale <- sqrt(unlist(sapply(data,function(z){diag(as.matrix(z@geno1.var))})) + gamma^2*unlist(sapply(data,function(z){diag(as.matrix(z@geno2.var))}))) } - - n.id <- length(data[[1]]@id) if (!multi.prep) { data <- data2 @@ -176,12 +180,12 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) { if (what=="AM") { G <- kron(geno@eigen.G,1) - M <- kronecker(crossprod(geno@coeff,crossprod(G$inv)),matrix(index.coeff,nrow=1))/geno@scale + M <- kronecker(crossprod((geno@coeff/geno@scale),crossprod(G$inv)),matrix(index.coeff,nrow=1)) effect <- as.numeric(M %*% matrix(data@random[1:n.random],ncol=1)) V <- data@var.uhat[1:n.random,1:n.random] #used for GWAS } else { D <- kron(geno@eigen.D,1) - M <- kronecker(crossprod(geno@coeff.D,crossprod(D$inv)),matrix(index.coeff,nrow=1))/geno@scale.D + M <- kronecker(crossprod((geno@coeff.D/geno@scale.D),crossprod(D$inv)),matrix(index.coeff,nrow=1)) dhat <- -kronecker(matrix(geno@Fg[data@id],ncol=1),matrix(data@heterosis,ncol=1)) + matrix(data@random[n.random + 1:n.random],ncol=1) effect <- as.numeric(M %*% dhat) diff --git a/R/blup_prep.R b/R/blup_prep.R index 238ffd6..a802e53 100644 --- a/R/blup_prep.R +++ b/R/blup_prep.R @@ -4,13 +4,13 @@ #' #' The \code{method} argument can be used to control how the linear system is solved. "MME" leads to inversion of the MME coefficient matrix, while "Vinv" leads to inversion of the overall var-cov matrix for the response vector. If NULL, the software uses whichever method involves inverting the smaller matrix. If the number of random effects (m) is less than the number of BLUEs (n), "MME" is used. #' -#' For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computed average fixed effects. +#' For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computing average fixed effects. #' #' @param data data frame of BLUEs from Stage 1 #' @param vcov list of variance-covariance matrices for the BLUEs #' @param geno object of \code{\link{class_geno}} from \code{\link{read_geno}} #' @param vars object of \code{\link{class_var}} from \code{\link{Stage2}} -#' @param mask (optional) data frame with possible columns "id","env","trait" +#' @param mask (optional) data frame with possible columns "id","env","loc","trait" #' @param method (optional) "MME", "Vinv", NULL (defaut). see Details #' #' @return Object of \code{\link{class_prep}} @@ -50,7 +50,7 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) { data <- data[!is.na(data$BLUE),] if (!is.null(mask)) { - tmp <- intersect(colnames(mask),c("id","env","trait")) + tmp <- intersect(colnames(mask),c("id","env","trait","loc")) stopifnot(length(tmp)>0) if (length(tmp) > 1) { tmp2 <- apply(mask[,tmp],1,paste,collapse=":") @@ -189,10 +189,10 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) { Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2) #add to X matrix - dom.covariate <- Z %*% kronecker(geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1), - diag(n.loc)) + dom.covariate <- Z %*% kronecker((geno@coeff.D/(geno@scale*(ploidy-1))) %*% + matrix(1,nrow=ncol(geno@coeff.D),ncol=1),diag(n.loc)) colnames(dom.covariate) <- paste("heterosis",locations,sep=":") - X <- cbind(X,dom.covariate/(geno@scale*(ploidy-1))) + X <- cbind(X,dom.covariate) } else { Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2) } @@ -213,8 +213,9 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) { Gmat1 <- kron(eigen.A = geno@eigen.G, B=vars@geno1) if (vars@model==3L) { Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2) - dom.covariate <- as.numeric(Z %*% geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1)) - X <- cbind(X,heterosis=dom.covariate/(geno@scale*(ploidy-1))) + dom.covariate <- as.numeric(Z %*% (geno@coeff.D/(geno@scale*(ploidy-1))) %*% + matrix(1,nrow=ncol(geno@coeff.D),ncol=1)) + X <- cbind(X,heterosis=dom.covariate) } else { Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2) } @@ -254,10 +255,10 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) { Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2) #add to X matrix - dom.covariate <- Z %*% kronecker(geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1), - diag(n.trait)) + dom.covariate <- Z %*% kronecker((geno@coeff.D/(geno@scale*(ploidy-1))) %*% + matrix(1,nrow=ncol(geno@coeff.D),ncol=1),diag(n.trait)) colnames(dom.covariate) <- paste("heterosis",traits,sep=":") - X <- cbind(X,dom.covariate/(geno@scale*(ploidy-1))) + X <- cbind(X,dom.covariate) } else { Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2) } diff --git a/R/class_geno.R b/R/class_geno.R index 02d2af6..c277232 100644 --- a/R/class_geno.R +++ b/R/class_geno.R @@ -1,9 +1,9 @@ #' S4 class for marker genotype data #' -#' @slot ploidy ploidy +#' @slot ploidy If mixed ploidy, then a vector equal to pop size; otherwise a single integer #' @slot map Marker map positions #' @slot coeff Coefficients of the marker effects (dim: indiv x marker) -#' @slot scale Scaling factor between markers and indiv +#' @slot scale Scaling factor between markers and indiv, vector of length equal to pop size #' @slot G Additive relationship matrix (from markers and potentially also pedigree) #' @slot eigen.G list of eigenvalues and eigenvectors #' diff --git a/R/dominance.R b/R/dominance.R index a99f596..aafa44c 100644 --- a/R/dominance.R +++ b/R/dominance.R @@ -2,7 +2,7 @@ #' #' Report dominance parameters #' -#' The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}). +#' The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait/loc model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}). #' #' @param params list returned by \code{\link{Stage2}} #' @param digits number of digits for rounding @@ -20,22 +20,26 @@ dominance <- function(params, digits=2, index.coeff=NULL, gamma=0) { id <- grep("dominance",params$vc$name,fixed=T) ia <- grep("additive",params$vc$name,fixed=T) + vc <- params$vc[,-1] + rownames(vc) <- params$vc$name - n.trait <- nrow(params$heterosis) - if (n.trait > 1) { + nhet <- nrow(params$heterosis) + if (nhet > 1) { stopifnot(!is.null(index.coeff)) - traits <- params$heterosis$trait - Vd <- Va <- matrix(0,nrow=n.trait,ncol=n.trait,dimnames=list(traits,traits)) - - tmp <- strsplit(params$vc$name[ia],split=":") - ix <- cbind(sapply(tmp,"[[",2),sapply(tmp,"[[",3)) - Va[ix] <- params$vc$estimate[ia] - Va[upper.tri(Va)] <- Va[lower.tri(Va)] - - tmp <- strsplit(params$vc$name[id],split=":") - ix <- cbind(sapply(tmp,"[[",2),sapply(tmp,"[[",3)) - Vd[ix] <- params$vc$estimate[id] - Vd[upper.tri(Vd)] <- Vd[lower.tri(Vd)] + if (colnames(params$heterosis)[1]=="trait") { + traits <- params$heterosis$trait + #Vd <- Va <- matrix(0,nrow=nhet,ncol=nhet,dimnames=list(traits,traits)) + Va <- f.cov.trait(vc[ia,],traits,us=(nhet>2)) + Vd <- f.cov.trait(vc[id,],traits,us=(nhet>2)) + } else { + #multi-location + traits <- params$heterosis$loc + add <- f.cov.loc(vc[ia,], traits) + Va <- add$cov.mat + tmp <- matrix(vc[id,1],ncol=1) + rownames(tmp) <- traits + Vd <- tcrossprod(sqrt(tmp)) + } stopifnot(sort(names(index.coeff))==sort(traits)) index.coeff <- index.coeff[match(traits,names(index.coeff))] diff --git a/R/inbreeding.R b/R/inbreeding.R index 8f2bbfe..5ead215 100644 --- a/R/inbreeding.R +++ b/R/inbreeding.R @@ -16,7 +16,12 @@ inbreeding <- function(geno) { x <- data.frame(F.G=(diag(geno@G)-1)/(geno@ploidy-1)) if (inherits(geno,"class_genoD")) x$F.D <- geno@Fg - + + pops <- attr(geno@scale,"pop") + np <- length(unique(pops)) + if (np > 1) { + x$pop <- pops + } rownames(x) <- rownames(geno@G) return(x) } \ No newline at end of file diff --git a/R/read_geno.R b/R/read_geno.R index 47eb7ea..027a4ee 100644 --- a/R/read_geno.R +++ b/R/read_geno.R @@ -11,6 +11,8 @@ #' When \code{dominance=FALSE}, non-additive effects are captured using a residual genetic effect, with zero covariance. If \code{dominance=TRUE}, a (digenic) dominance covariance matrix is used instead. #' #' The argument \code{min.minor.allele} specifies the minimum number of individuals that must contain the minor allele. Markers that do not meet this threshold are discarded. +#' +#' Optional argument \code{pop.file} gives the name of a CSV file with two columns: id,pop. If the populations have different ploidy, this is indicated using a named vector for \code{ploidy}. #' #' @param filename Name of CSV file with marker allele dosage #' @param ploidy 2,4,6,etc. (even numbers) @@ -19,6 +21,7 @@ #' @param w blending parameter (see Details) #' @param ped optional, pedigree data frame with 3 or 4 columns (see Details) #' @param dominance TRUE/FALSE whether to include dominance covariance (see Details) +#' @param pop.file CSV file defining populations #' #' @return Variable of class \code{\link{class_geno}}. #' @@ -29,7 +32,7 @@ #' @importFrom data.table fread read_geno <- function(filename, ploidy, map, min.minor.allele=5, - w=1e-5, ped=NULL, dominance=FALSE) { + w=1e-5, ped=NULL, dominance=FALSE, pop.file=NULL) { if (any(w < 1e-5)) stop("Blending parameter should not be smaller than 1e-5") @@ -38,49 +41,107 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5, map <- data.frame(data[,1:3]) colnames(map) <- c("marker","chrom","position") geno <- as.matrix(data[,-(1:3)]) + rownames(geno) <- as.character(map$marker) } else { map <- data.frame(marker=character(0), chrom=character(0), position=numeric(0)) geno <- as.matrix(data[,-1]) + tmp <- data.frame(data[,1]) + colnames(tmp) <- "marker" + rownames(geno) <- as.character(tmp$marker) } - rownames(geno) <- as.character(data$V1) + m <- nrow(geno) id <- colnames(geno) - #check for recoding -1,0,1 to 0,1,2 - if ((min(geno[1:min(m,1000),],na.rm=T)==-1) & (ploidy==2)) { - geno <- geno + 1 - } + n.ploidy <- length(unique(ploidy)) - p <- apply(geno,1,mean,na.rm=T)/ploidy - n.minor <- apply(geno,1,function(z){ - p <- mean(z,na.rm=T)/ploidy - tab <- table(factor(round(z),levels=0:ploidy)) - if (p > 0.5) { - sum(tab) - tab[ploidy+1] + if (!is.null(pop.file)) { + pdat <- read.csv(pop.file,colClasses = c("character","character")) + colnames(pdat) <- c("id","pop") + pops <- unique(pdat$pop) + np <- length(pops) + ix <- match(id,pdat$id,nomatch = 0) + stopifnot(ix > 0) + pdat <- pdat[ix,] + if (n.ploidy > 1) { + iu <- match(pops,names(ploidy),nomatch=0) + stopifnot(iu>0) + ploidy <- ploidy[pops] } else { - sum(tab) - tab[1] + ploidy <- rep(ploidy,np) + names(ploidy) <- pops + } + pdat$ploidy <- ploidy[pdat$pop] + + } else { + stopifnot(length(ploidy)==1) + #check for recoding -1,0,1 to 0,1,2 + if ((min(geno[1:min(m,1000),],na.rm=T)==-1) & (ploidy==2)) { + geno <- geno + 1 } - }) + + np <- 1 + pops <- "1" + pdat <- data.frame(id=id,pop="1",ploidy=ploidy) + } + + pmat <- t(geno)/pdat$ploidy + p2 <- matrix(as.numeric(NA),nrow=m,ncol=np) + colnames(p2) <- pops + + for (i in 1:np) { + p2[,i] <- apply(pmat[pdat$pop==pops[i],],2,mean,na.rm=T) + } + popn <- as.numeric(table(pdat$pop)[pops]) + p <- apply(t(p2)*popn,2,sum)/sum(popn) + flip <- which(p > 0.5) + pmat[,flip] <- 1-pmat[,flip] + n.minor <- apply(pmat,2,function(z){sum(z > 0.1,na.rm=T)}) + + # n.minor <- apply(geno,1,function(z){ + # p <- mean(z,na.rm=T)/ploidy + # tab <- table(factor(round(z),levels=0:ploidy)) + # if (p > 0.5) { + # sum(tab) - tab[ploidy+1] + # } else { + # sum(tab) - tab[1] + # } + # }) ix <- which(n.minor >= min.minor.allele) m <- length(ix) stopifnot(m > 0) cat(sub("X",min.minor.allele,"Minor allele threshold = X genotypes\n")) cat(sub("X",m,"Number of markers = X\n")) geno <- geno[ix,] + p2 <- p2[ix,,drop=F] n <- ncol(geno) cat(sub("X",n,"Number of genotypes = X\n")) - p <- p[ix] + + #p <- apply(geno,1,mean,na.rm=T)/ploidy + #p <- p[ix] if (nrow(map)>0) map <- map[ix,] - coeff <- Matrix(scale(t(geno),scale=F), + coeff <- matrix(0,nrow=n,ncol=m, dimnames=list(id,rownames(geno))) + scale.param <- numeric(n) + names(scale.param) <- id + attr(scale.param,"pop") <- pdat$pop + + for (i in 1:np) { + ix <- which(pdat$pop==pops[i]) + pmat <- matrix(1,ncol=1,nrow=popn[i]) %*% matrix(p2[,i],nrow=1) + scale.param[ix] <- ploidy[i]*sum(p2[,i]*(1-p2[,i])) + coeff[ix,] <- t(geno[,ix])-ploidy[i]*pmat + } coeff[which(is.na(coeff))] <- 0 + coeff <- Matrix(coeff) - scale <- ploidy*sum(p*(1-p)) - G <- tcrossprod(coeff)/scale + #scale <- ploidy*sum(p*(1-p)) + #G <- tcrossprod(coeff)/scale + G <- tcrossprod(coeff/sqrt(scale.param)) nw <- length(w) H <- vector("list",nw) @@ -91,6 +152,8 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5, H[[i]] <- (1-w[i])*G + w[i]*mean(diag(G))*Diagonal(n=nrow(G)) } } else { + if (n.ploidy > 1) + stop("Pedigree option not available for mixed ploidy") if (ncol(ped)==4) { if (dominance) @@ -110,7 +173,7 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5, ped2 <- data.frame(id=1:nrow(ped), parent1=match(ped$parent1,ped$id,nomatch=0), parent2=match(ped$parent2,ped$id,nomatch=0)) - invisible(capture.output(A <- as(Amatrix(ped2,ploidy=ploidy),"symmetricMatrix"))) + invisible(capture.output(A <- as(Amatrix(ped2,ploidy=ploidy[1]),"symmetricMatrix"))) rownames(A) <- ped$id colnames(A) <- ped$id @@ -141,6 +204,15 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5, id <- c(id,id2) } + if (n.ploidy > 1) { + ploidy <- as.integer(pdat$ploidy) + names(ploidy) <- pdat$id + attr(ploidy,"pop") <- pdat$pop + } else { + ploidy <- as.integer(pdat$ploidy[1]) + names(ploidy) <- NULL + } + G.list <- vector("list",nw) for (i in 1:nw) { if (!is.null(H[[i]])) { @@ -153,25 +225,38 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5, H[[i]] <- tcrossprod(eigen.G$vectors%*%Diagonal(n=nrow(Hinv[[i]]),x=sqrt(eigen.G$values))) } class(eigen.G) <- "list" - G.list[[i]] <- new(Class="class_geno",ploidy=as.integer(ploidy),map=map,coeff=coeff,scale=scale, - G=H[[i]], eigen.G=eigen.G) + + G.list[[i]] <- new(Class="class_geno",ploidy=ploidy,map=map,coeff=coeff, + scale=scale.param,G=H[[i]], eigen.G=eigen.G) } if (dominance) { - Pmat <- kronecker(matrix(p,nrow=1,ncol=m),matrix(1,ncol=1,nrow=n)) - coeff.D <- Matrix(-2*choose(ploidy,2)*Pmat^2 + 2*(ploidy-1)*Pmat*t(geno) - t(geno)*(t(geno)-1)) + coeff.D <- matrix(0,nrow=n,ncol=m,dimnames=list(id,rownames(geno))) + scaleD.param <- numeric(n) + names(scaleD.param) <- id + for (i in 1:np) { + ix <- which(pdat$pop==pops[i]) + pmat <- matrix(1,ncol=1,nrow=popn[i]) %*% matrix(p2[,i],nrow=1) + scaleD.param[ix] <- 4*choose(pdat$ploidy[ix],2)*sum(p2[,i]^2*(1-p2[,i])^2) + #Pmat <- kronecker(matrix(p2[,i],nrow=1,ncol=m),matrix(1,ncol=1,nrow=popn[i])) + coeff.D[ix,] <- -2*choose(pdat$ploidy[ix],2)*pmat^2 + 2*(pdat$ploidy[ix]-1)*pmat*t(geno[,ix]) - + t(geno[,ix])*(t(geno[,ix])-1) + } + coeff.D[is.na(coeff.D)] <- 0 - scale.D <- 4*choose(ploidy,2)*sum(p^2*(1-p)^2) - D <- tcrossprod(coeff.D)/scale.D + coeff.D <- Matrix(coeff.D) + #scale.D <- 4*choose(ploidy,2)*sum(p^2*(1-p)^2) + D <- tcrossprod(coeff.D/sqrt(scaleD.param)) D <- (1-1e-5)*D + (1e-5)*mean(diag(D))*Diagonal(n=nrow(D)) eigen.D <- eigen(D,symmetric=TRUE) eigen.D$vectors <- Matrix(eigen.D$vectors,dimnames=list(id,id)) class(eigen.D) <- "list" - Fg <- apply(coeff.D,1,sum)/(-scale*(ploidy-1)) + Fg <- apply(coeff.D,1,sum)/(-scale.param*(pdat$ploidy-1)) names(Fg) <- id output <- lapply(G.list,function(x){new(Class="class_genoD",ploidy=x@ploidy,map=x@map, coeff=x@coeff,scale=x@scale,G=x@G,eigen.G=x@eigen.G, - coeff.D=coeff.D,scale.D=scale.D,D=D,eigen.D=eigen.D,Fg=Fg)}) + coeff.D=coeff.D,scale.D=scaleD.param,D=D,eigen.D=eigen.D, + Fg=Fg)}) } else { output <- G.list } diff --git a/docs/Vignette2.html b/docs/Vignette2.html index b431a07..e630be1 100644 --- a/docs/Vignette2.html +++ b/docs/Vignette2.html @@ -372,7 +372,7 @@

Vignette 2: Single trait analysis at explicitly used by Stage1 but is automatically detected and retained in the output for use in Stage2.

ans1 <- Stage1(filename=pheno.file, traits='Yield.Mg.ha')
-
## Online License checked out Fri Feb 16 15:27:34 2024
+
## Offline License checked out Thu Jul 18 03:16:21 2024
head(ans1$blues)
##       env          id    BLUE loc
 ## 1 CA_2011   A01143-3C  93.050  CA
@@ -401,7 +401,7 @@ 

Vignette 2: Single trait analysis at
summary(ans2a$vars)
## $var
 ##              Variance   PVE
-## env               307    NA
+## env               286    NA
 ## genotype           53 0.355
 ## g x loc            15 0.098
 ## g x env            22 0.150
@@ -428,7 +428,7 @@ 

Vignette 2: Single trait analysis at Stage2 can be used with the function uniplot to visualize model sufficiency and correlation structure (Cullis et al. 2010).

uniplot(ans2a$loadings)
-

+

The squared radius for each location is the proportion of variance explained (PVE) by the latent factors. With the exception of FL, the FA2 model appears to provide a good representation of the covariance @@ -456,12 +456,13 @@

Vignette 2: Single trait analysis at
summary(ans2c$vars)
## $var
 ##              Variance   PVE
-## env               267    NA
-## additive           35 0.214
-## add x loc           8 0.047
-## dominance          36 0.217
-## g x env            20 0.120
-## Stage1.error       67 0.403
+## env               311    NA
+## additive           35 0.208
+## add x loc           8 0.046
+## dominance          36 0.211
+## heterosis           4 0.024
+## g x env            20 0.117
+## Stage1.error       67 0.393
 ## 
 ## $cor
 ##       MI    MO    NC    NY    OR    WI
@@ -472,7 +473,7 @@ 

Vignette 2: Single trait analysis at ## OR 0.890 0.976 0.831 0.945 1.000 0.961 ## WI 0.908 0.939 0.707 0.967 0.961 1.000

uniplot(ans2c$loadings)
-

+

The correlation matrix and uniplot, which are based on the additive values, show that yield in NC was somewhat different compared to the other five sites.

@@ -494,7 +495,7 @@

Vignette 2: Single trait analysis at ggplot(pred,aes(x=r2.NC,y=r2.WI)) + geom_point() + coord_fixed(ratio=1) + xlim(0.3,0.8) + ylim(0.3,0.8) + geom_line(data=data.frame(x=c(0.3,0.8),y=c(0.3,0.8)),mapping=aes(x=x,y=y),linetype=2) + theme_bw() + xlab("NC") + ylab("WI") + ggtitle("BV Reliability")

-

+

The above figure shows that breeding values were predicted with higher reliability in WI than NC, which is to be expected from the high correlation between WI and the other four sites (OR,MO,MI,NY) compared @@ -516,7 +517,7 @@

Vignette 2: Single trait analysis at ggplot(plot.data,aes(x=without.WI.pheno,y=with.WI.pheno)) + geom_point() + coord_fixed(ratio=1) + xlim(0.3,0.8) + ylim(0.3,0.8) + geom_line(data=data.frame(x=c(0.3,0.8),y=c(0.3,0.8)),mapping=aes(x=x,y=y),linetype=2) + theme_bw() + xlab("Without WI phenotypes") + ylab("With WI phenotypes") + ggtitle("WI BV Reliability")

-

+

As expected, the reliability was higher with WI phenotypes.

diff --git a/docs/manual.pdf b/docs/manual.pdf index 6a30904..1b55e10 100644 Binary files a/docs/manual.pdf and b/docs/manual.pdf differ diff --git a/man/Stage1.Rd b/man/Stage1.Rd index f8750b4..61a0b5b 100644 --- a/man/Stage1.Rd +++ b/man/Stage1.Rd @@ -36,7 +36,7 @@ Stage1( List containing \describe{ \item{blues}{data frame of BLUEs} -\item{vcov}{list of variance-covariance matrices for the BLUEs, one per experiment (env)} +\item{vcov}{list of variance-covariance matrices for the BLUEs, one per env} \item{fit}{data frame with broad-sense H2 (plot basis) and/or AIC} \item{resid}{For single trait, list of diagnostic plots and data frame of residuals. For multi-trait, list of resid var-cov matrices.} } diff --git a/man/blup_prep.Rd b/man/blup_prep.Rd index 273c70b..386656f 100644 --- a/man/blup_prep.Rd +++ b/man/blup_prep.Rd @@ -15,7 +15,7 @@ blup_prep(data, vcov = NULL, geno = NULL, vars, mask = NULL, method = NULL) \item{vars}{object of \code{\link{class_var}} from \code{\link{Stage2}}} -\item{mask}{(optional) data frame with possible columns "id","env","trait"} +\item{mask}{(optional) data frame with possible columns "id","env","loc","trait"} \item{method}{(optional) "MME", "Vinv", NULL (defaut). see Details} } @@ -28,5 +28,5 @@ Prepare data for BLUP \details{ The \code{method} argument can be used to control how the linear system is solved. "MME" leads to inversion of the MME coefficient matrix, while "Vinv" leads to inversion of the overall var-cov matrix for the response vector. If NULL, the software uses whichever method involves inverting the smaller matrix. If the number of random effects (m) is less than the number of BLUEs (n), "MME" is used. -For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computed average fixed effects. +For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computing average fixed effects. } diff --git a/man/class_geno-class.Rd b/man/class_geno-class.Rd index bdd1ad1..25b8580 100644 --- a/man/class_geno-class.Rd +++ b/man/class_geno-class.Rd @@ -11,13 +11,13 @@ S4 class for marker genotype data \section{Slots}{ \describe{ -\item{\code{ploidy}}{ploidy} +\item{\code{ploidy}}{If mixed ploidy, then a vector equal to pop size; otherwise a single integer} \item{\code{map}}{Marker map positions} \item{\code{coeff}}{Coefficients of the marker effects (dim: indiv x marker)} -\item{\code{scale}}{Scaling factor between markers and indiv} +\item{\code{scale}}{Scaling factor between markers and indiv, vector of length equal to pop size} \item{\code{G}}{Additive relationship matrix (from markers and potentially also pedigree)} diff --git a/man/dominance.Rd b/man/dominance.Rd index 65847b8..5fbfbcd 100644 --- a/man/dominance.Rd +++ b/man/dominance.Rd @@ -22,7 +22,7 @@ data frame with estimates and SE Report dominance parameters } \details{ -The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}). +The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait/loc model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}). } \references{ Rice JA (2007) Mathematical Statistics and Data Analysis, 3rd ed. Duxbury, Pacific Grove. diff --git a/man/read_geno.Rd b/man/read_geno.Rd index e158925..c7c864f 100644 --- a/man/read_geno.Rd +++ b/man/read_geno.Rd @@ -11,7 +11,8 @@ read_geno( min.minor.allele = 5, w = 1e-05, ped = NULL, - dominance = FALSE + dominance = FALSE, + pop.file = NULL ) } \arguments{ @@ -28,6 +29,8 @@ read_geno( \item{ped}{optional, pedigree data frame with 3 or 4 columns (see Details)} \item{dominance}{TRUE/FALSE whether to include dominance covariance (see Details)} + +\item{pop.file}{CSV file defining populations} } \value{ Variable of class \code{\link{class_geno}}. @@ -45,4 +48,6 @@ If the A matrix is not used, then G is blended with the identity matrix (times t When \code{dominance=FALSE}, non-additive effects are captured using a residual genetic effect, with zero covariance. If \code{dominance=TRUE}, a (digenic) dominance covariance matrix is used instead. The argument \code{min.minor.allele} specifies the minimum number of individuals that must contain the minor allele. Markers that do not meet this threshold are discarded. + +Optional argument \code{pop.file} gives the name of a CSV file with two columns: id,pop. If the populations have different ploidy, this is indicated using a named vector for \code{ploidy}. } diff --git a/vignettes/Vignette1.Rmd b/vignettes/Vignette1.Rmd index fa8e7a0..01278db 100644 --- a/vignettes/Vignette1.Rmd +++ b/vignettes/Vignette1.Rmd @@ -135,7 +135,7 @@ geno.file <- system.file("vignette_data", "geno1.csv", package = "StageWise") geno <- read.csv(geno.file,check.names=F) geno[1:4,1:6] -geno <- read_geno(filename=geno.file, ploidy=4, map=TRUE, min.minor.allele=5, +geno <- read_geno(filename=geno.file, ploidy=4L, map=TRUE, min.minor.allele=5, dominance=T) ```