diff --git a/DESCRIPTION b/DESCRIPTION
index 521d049..3d7a26a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: StageWise
Title: Two-stage analysis of multi-environment trials for genomic selection and GWAS
-Version: 1.09
+Version: 1.10
Author: Jeffrey B. Endelman
Maintainer: Jeffrey Endelman
Description: Fully efficient, two stage analysis of multi-environment trials, including directional dominance and multi-trait genomic selection
@@ -8,7 +8,7 @@ Depends: R (>= 4.0)
License: GPL-3
RoxygenNote: 7.2.3
Encoding: UTF-8
-Imports: Matrix (>= 1.5-0), ggplot2, methods, ggrepel, rlang, ggpubr, SpATS, spam, AGHmatrix, MASS, CVXR, ggforce
+Imports: Matrix (>= 1.5-0), ggplot2, methods, ggrepel, rlang, ggpubr, SpATS, spam, AGHmatrix, MASS, CVXR, ggforce, data.table
Suggests: knitr, rmarkdown, asreml
Collate:
'Stage1.R'
diff --git a/NEWS b/NEWS
index 8febdea..afb1a17 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,7 @@
+Changes in 1.10
+* Added support for multi-population/multi-ploidy analysis
+* Fixed error in Stage1 related to multiple experiments per env
+
Changes in 1.09
* changed read.csv to fread in read_geno
diff --git a/R/Stage1.R b/R/Stage1.R
index 5d055a3..a0659bf 100644
--- a/R/Stage1.R
+++ b/R/Stage1.R
@@ -30,7 +30,7 @@
#' @return List containing
#' \describe{
#' \item{blues}{data frame of BLUEs}
-#' \item{vcov}{list of variance-covariance matrices for the BLUEs, one per experiment (env)}
+#' \item{vcov}{list of variance-covariance matrices for the BLUEs, one per env}
#' \item{fit}{data frame with broad-sense H2 (plot basis) and/or AIC}
#' \item{resid}{For single trait, list of diagnostic plots and data frame of residuals. For multi-trait, list of resid var-cov matrices.}
#' }
@@ -312,55 +312,64 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml",
blue.out$env <- data$env[match(blue.out$expt,data$expt)]
nee <- sapply(expt.in.env,length)
- iu <- which(nee==1)
- if (length(iu) > 0) {
- tmp <- names(vcov)[iu]
- names(vcov)[iu] <- blue.out$env[match(tmp,blue.out$expt)]
- }
+ #iu <- which(nee==1)
+ #if (length(iu) > 0) {
+ # tmp <- names(vcov)[iu]
+ # names(vcov)[iu] <- blue.out$env[match(tmp,blue.out$expt)]
+ #}
- for (j in which(nee > 1)) {
- omega.list <- vcov[expt.in.env[[j]]]
- vcov2 <- mapply(FUN=function(x,y){
- tmp <- paste(y,rownames(x),sep=":")
- dimnames(x) <- list(tmp,tmp)
- return(x)},x=omega.list,y=as.list(expt.in.env[[j]]))
+ vcov.env <- vector("list",length(nee))
+ names(vcov.env) <- names(nee)
+ iu <- which(nee==1)
+ if (length(iu) > 0)
+ vcov.env[names(iu)] <- vcov[unlist(expt.in.env[iu])]
- .GlobalEnv$asremlOmega <- direct_sum(lapply(vcov2,solve))
- dname <- lapply(vcov2,rownames)
- dimnames(.GlobalEnv$asremlOmega) <- list(unlist(dname),unlist(dname))
- attr(.GlobalEnv$asremlOmega,"INVERSE") <- TRUE
+ iu <- which(nee > 1)
+ if (length(iu) > 0)
+ for (j in iu) {
+ omega.list <- vcov[expt.in.env[[j]]]
+ vcov2 <- mapply(FUN=function(x,y){
+ tmp <- paste(y,rownames(x),sep=":")
+ dimnames(x) <- list(tmp,tmp)
+ return(x)},x=omega.list,y=as.list(expt.in.env[[j]]))
- ix <- which(blue.out$env==envs[j])
- data2 <- blue.out[ix,]
- data2$expt.id <- factor(paste(data2$expt,data2$id,sep=":"))
- data2$id <- factor(data2$id)
- data2$expt <- factor(data2$expt)
- start.table <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
- random=~vm(expt.id,source=asremlOmega),
- residual=~idv(units),start.values=TRUE)$vparameters.table
- k <- grep("Omega",start.table$Component,fixed=T)
- start.table$Value[k] <- 1
- start.table$Constraint[k] <- "F"
- ans3 <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
- random=~vm(expt.id,source=asremlOmega),
- residual=~idv(units),G.param=start.table)
+ .GlobalEnv$asremlOmega <- direct_sum(lapply(vcov2,solve))
+ dname <- lapply(vcov2,rownames)
+ dimnames(.GlobalEnv$asremlOmega) <- list(unlist(dname),unlist(dname))
+ attr(.GlobalEnv$asremlOmega,"INVERSE") <- TRUE
- predans3 <- asreml::predict.asreml(ans3,classify="id",vcov = TRUE)
- blue3 <- data.frame(expt=NA,predans3$pvals[,c("id","predicted.value")],env=envs[j])
- colnames(blue3) <- c("expt","id","BLUE","env")
- vcov3 <- predans3$vcov
- dimnames(vcov3) <- list(blue3$id,blue3$id)
- vcov2 <- vcov[-match(expt.in.env[[j]],names(vcov))]
- vcov <- c(vcov2,vcov3)
- names(vcov) <- c(names(vcov2),envs[j])
- blue.out <- rbind(blue.out[-ix,],blue3)
- rm("asremlOmega",envir = .GlobalEnv)
- }
+ ix <- which(blue.out$env==envs[j])
+ data2 <- blue.out[ix,]
+ data2$expt.id <- factor(paste(data2$expt,data2$id,sep=":"))
+ data2$id <- factor(data2$id)
+ data2$expt <- factor(data2$expt)
+ start.table <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
+ random=~vm(expt.id,source=asremlOmega),
+ residual=~idv(units),start.values=TRUE)$vparameters.table
+ k <- grep("Omega",start.table$Component,fixed=T)
+ start.table$Value[k] <- 1
+ start.table$Constraint[k] <- "F"
+ ans3 <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
+ random=~vm(expt.id,source=asremlOmega),
+ residual=~idv(units),G.param=start.table)
+
+ predans3 <- asreml::predict.asreml(ans3,classify="id",vcov = TRUE)
+ blue3 <- data.frame(expt=NA,predans3$pvals[,c("id","predicted.value")],env=envs[j])
+ colnames(blue3) <- c("expt","id","BLUE","env")
+ vcov3 <- predans3$vcov
+ dimnames(vcov3) <- list(blue3$id,blue3$id)
+ #vcov2 <- vcov[-match(expt.in.env[[j]],names(vcov))]
+ #vcov <- c(vcov2,vcov3)
+ #names(vcov) <- c(names(vcov2),envs[j])
+ vcov.env[[names(iu)[j]]] <- vcov3
+ blue.out <- rbind(blue.out[-ix,],blue3)
+ rm("asremlOmega",envir = .GlobalEnv)
+ }
}
if (n.trait > 1) {
- vcov <- vector("list",n.env)
- names(vcov) <- envs
+ vcov.env <- vector("list",n.env)
+ names(vcov.env) <- envs
expt.in.env <- lapply(split(data$expt,factor(data$env,levels=envs)),unique)
nexpt.per.env <- sapply(expt.in.env,length)
@@ -404,10 +413,10 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml",
predans <- asreml::predict.asreml(ans,classify="id:trait",vcov = TRUE)
tmp <- predans$pvals[,c("id","trait","predicted.value")]
colnames(tmp) <- c("id","trait","BLUE")
- vcov[[j]] <- predans$vcov
+ vcov.env[[j]] <- predans$vcov
tmp$id <- as.character(tmp$id)
id.trait <- apply(tmp[,1:2],1,paste,collapse=":")
- dimnames(vcov[[j]]) <- list(id.trait,id.trait)
+ dimnames(vcov.env[[j]]) <- list(id.trait,id.trait)
blue.out <- rbind(blue.out,data.frame(env=envs[j],tmp))
}
}
@@ -428,13 +437,13 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml",
stat_boxplot(outlier.color="red") + theme_bw() + theme(axis.text.x=element_text(angle=90,vjust=0.5))
p2 <- ggplot(data=blup.resid,aes(sample=.data$resid)) + stat_qq() + stat_qq_line() + facet_wrap(~expt) + theme_bw() + xlab("Expected") + ylab("Observed")
if (solver=="ASREML")
- return(list(blues=blue.out,vcov=vcov,fit=fit,
+ return(list(blues=blue.out,vcov=vcov.env,fit=fit,
resid=list(boxplot=p1,qqplot=p2,table=blup.resid)))
if (solver=="SPATS")
- return(list(blues=blue.out,vcov=vcov,fit=fit,
+ return(list(blues=blue.out,vcov=vcov.env,fit=fit,
resid=list(boxplot=p1,qqplot=p2,spatial=spatial.plot,table=blup.resid)))
} else {
#Multi-trait
- return(list(blues=blue.out,vcov=vcov,fit=fit,resid=resid.vc))
+ return(list(blues=blue.out,vcov=vcov.env,fit=fit,resid=resid.vc))
}
}
diff --git a/R/Stage2.R b/R/Stage2.R
index 467a68f..e48f6ef 100644
--- a/R/Stage2.R
+++ b/R/Stage2.R
@@ -106,7 +106,8 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
diag(resid.vc) <- diag(resid.vc)/(n.trait-1)
resid.vc <- fu(resid.vc)
diag(B) <- diag(B)/(n.trait-1)
- B <- fu(B)
+ if (max(abs(diag(B))) > .Machine$double.eps*10)
+ B <- fu(B)
if (non.add!="none") {
diag(geno2) <- diag(geno2)/(n.trait-1)
@@ -138,6 +139,9 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
if (!is.null(geno)) {
stopifnot(is(geno,"class_geno"))
id <- sort(intersect(data$id,rownames(geno@G)))
+ if (length(geno@ploidy) > 1)
+ geno@ploidy <- geno@ploidy[id]
+
n <- length(id)
data <- data[data$id %in% id,]
id.weights <- table(factor(data$id,levels=id))
@@ -150,7 +154,7 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
meanD <- as.numeric(pvar(V=.GlobalEnv$asremlD,weights=id.weights))
dom.covariate <- as.numeric(geno@coeff.D[id,] %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1))
- dom.covariate <- dom.covariate/(geno@scale*(geno@ploidy-1))
+ dom.covariate <- dom.covariate/(geno@scale[id]*(geno@ploidy-1))
names(dom.covariate) <- id
data$dom <- dom.covariate[data$id]
@@ -184,7 +188,7 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
data$env.id <- factor(data$env.id,levels=env.id)
out <- vector("list",4)
- names(out) <- c("aic","vars","fixed","random")
+ names(out) <- c("aic","vars","params","random")
n.loc <- 1
n.trait <- 1
@@ -243,8 +247,9 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
model <- sub("FIX",paste(c("env",covariates,fix.eff.marker,dom),collapse="+"),model,fixed=T)
} else {
model <- "asreml(data=data,fixed=BLUE~FIX-1,random=~RANDOM,residual=~dsum(~idv(units)|loc)"
- tmp <- gsub("+",":loc+",paste(c("env",fix.eff.marker,dom),collapse="+"),fixed=T)
- model <- sub("FIX",paste(tmp,"loc",sep=":"),model,fixed=T)
+ tmp <- gsub("+",":loc+",paste(c(fix.eff.marker,dom,"env"),collapse="+"),fixed=T)
+ #model <- sub("FIX",paste(tmp,"loc",sep=":"),model,fixed=T)
+ model <- sub("FIX",tmp,model,fixed=T)
}
} else {
model <- "asreml(data=data,fixed=BLUE~FIX-1,random=~RANDOM,residual=~id(env.id):us(Trait)"
@@ -477,12 +482,11 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
}
if (non.add=="dom") {
- gamma <- (geno@ploidy/2 - 1)/(geno@ploidy - 1)
ix <- grep("dom",rownames(beta),fixed=T)
out$params$heterosis <- data.frame(trait=traits,
estimate=as.numeric(beta[ix,1]),
SE=as.numeric(beta[ix,2]))
-
+ gamma <- mean((geno@ploidy/2 - 1)/(geno@ploidy - 1))
weights <- id.weights/sum(id.weights)
for (i in 1:n.trait)
for (j in i:n.trait) {
diff --git a/R/blup.R b/R/blup.R
index ad9f6d7..5483c54 100644
--- a/R/blup.R
+++ b/R/blup.R
@@ -46,12 +46,18 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) {
if (substr(what,1,2)=="DM" & (!(is(geno,"class_genoD")) | data[[1]]@model < 3L))
stop("Dominance model is required in geno and data")
+ n.id <- length(data[[1]]@id)
gamma <- 0
if (!is.null(geno)) {
if (substr(what,1,2)=="GV")
gamma <- 1
if (substr(what,1,2)=="BV" & data[[1]]@model==3L)
gamma <- (geno@ploidy/2 - 1)/(geno@ploidy - 1)
+
+ if (length(gamma) > 1) {
+ tmp <- tapply(gamma,attr(geno@ploidy,"pop"),mean)
+ gamma <- sum(tmp[names(index.coeff)]*index.coeff)/sum(index.coeff)
+ }
}
if (data[[1]]@model < 2L) {
@@ -60,8 +66,6 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) {
index.scale <- sqrt(unlist(sapply(data,function(z){diag(as.matrix(z@geno1.var))})) +
gamma^2*unlist(sapply(data,function(z){diag(as.matrix(z@geno2.var))})))
}
-
- n.id <- length(data[[1]]@id)
if (!multi.prep) {
data <- data2
@@ -176,12 +180,12 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) {
if (what=="AM") {
G <- kron(geno@eigen.G,1)
- M <- kronecker(crossprod(geno@coeff,crossprod(G$inv)),matrix(index.coeff,nrow=1))/geno@scale
+ M <- kronecker(crossprod((geno@coeff/geno@scale),crossprod(G$inv)),matrix(index.coeff,nrow=1))
effect <- as.numeric(M %*% matrix(data@random[1:n.random],ncol=1))
V <- data@var.uhat[1:n.random,1:n.random] #used for GWAS
} else {
D <- kron(geno@eigen.D,1)
- M <- kronecker(crossprod(geno@coeff.D,crossprod(D$inv)),matrix(index.coeff,nrow=1))/geno@scale.D
+ M <- kronecker(crossprod((geno@coeff.D/geno@scale.D),crossprod(D$inv)),matrix(index.coeff,nrow=1))
dhat <- -kronecker(matrix(geno@Fg[data@id],ncol=1),matrix(data@heterosis,ncol=1)) +
matrix(data@random[n.random + 1:n.random],ncol=1)
effect <- as.numeric(M %*% dhat)
diff --git a/R/blup_prep.R b/R/blup_prep.R
index 238ffd6..a802e53 100644
--- a/R/blup_prep.R
+++ b/R/blup_prep.R
@@ -4,13 +4,13 @@
#'
#' The \code{method} argument can be used to control how the linear system is solved. "MME" leads to inversion of the MME coefficient matrix, while "Vinv" leads to inversion of the overall var-cov matrix for the response vector. If NULL, the software uses whichever method involves inverting the smaller matrix. If the number of random effects (m) is less than the number of BLUEs (n), "MME" is used.
#'
-#' For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computed average fixed effects.
+#' For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computing average fixed effects.
#'
#' @param data data frame of BLUEs from Stage 1
#' @param vcov list of variance-covariance matrices for the BLUEs
#' @param geno object of \code{\link{class_geno}} from \code{\link{read_geno}}
#' @param vars object of \code{\link{class_var}} from \code{\link{Stage2}}
-#' @param mask (optional) data frame with possible columns "id","env","trait"
+#' @param mask (optional) data frame with possible columns "id","env","loc","trait"
#' @param method (optional) "MME", "Vinv", NULL (defaut). see Details
#'
#' @return Object of \code{\link{class_prep}}
@@ -50,7 +50,7 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
data <- data[!is.na(data$BLUE),]
if (!is.null(mask)) {
- tmp <- intersect(colnames(mask),c("id","env","trait"))
+ tmp <- intersect(colnames(mask),c("id","env","trait","loc"))
stopifnot(length(tmp)>0)
if (length(tmp) > 1) {
tmp2 <- apply(mask[,tmp],1,paste,collapse=":")
@@ -189,10 +189,10 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2)
#add to X matrix
- dom.covariate <- Z %*% kronecker(geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1),
- diag(n.loc))
+ dom.covariate <- Z %*% kronecker((geno@coeff.D/(geno@scale*(ploidy-1))) %*%
+ matrix(1,nrow=ncol(geno@coeff.D),ncol=1),diag(n.loc))
colnames(dom.covariate) <- paste("heterosis",locations,sep=":")
- X <- cbind(X,dom.covariate/(geno@scale*(ploidy-1)))
+ X <- cbind(X,dom.covariate)
} else {
Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2)
}
@@ -213,8 +213,9 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
Gmat1 <- kron(eigen.A = geno@eigen.G, B=vars@geno1)
if (vars@model==3L) {
Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2)
- dom.covariate <- as.numeric(Z %*% geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1))
- X <- cbind(X,heterosis=dom.covariate/(geno@scale*(ploidy-1)))
+ dom.covariate <- as.numeric(Z %*% (geno@coeff.D/(geno@scale*(ploidy-1))) %*%
+ matrix(1,nrow=ncol(geno@coeff.D),ncol=1))
+ X <- cbind(X,heterosis=dom.covariate)
} else {
Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2)
}
@@ -254,10 +255,10 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2)
#add to X matrix
- dom.covariate <- Z %*% kronecker(geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1),
- diag(n.trait))
+ dom.covariate <- Z %*% kronecker((geno@coeff.D/(geno@scale*(ploidy-1))) %*%
+ matrix(1,nrow=ncol(geno@coeff.D),ncol=1),diag(n.trait))
colnames(dom.covariate) <- paste("heterosis",traits,sep=":")
- X <- cbind(X,dom.covariate/(geno@scale*(ploidy-1)))
+ X <- cbind(X,dom.covariate)
} else {
Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2)
}
diff --git a/R/class_geno.R b/R/class_geno.R
index 02d2af6..c277232 100644
--- a/R/class_geno.R
+++ b/R/class_geno.R
@@ -1,9 +1,9 @@
#' S4 class for marker genotype data
#'
-#' @slot ploidy ploidy
+#' @slot ploidy If mixed ploidy, then a vector equal to pop size; otherwise a single integer
#' @slot map Marker map positions
#' @slot coeff Coefficients of the marker effects (dim: indiv x marker)
-#' @slot scale Scaling factor between markers and indiv
+#' @slot scale Scaling factor between markers and indiv, vector of length equal to pop size
#' @slot G Additive relationship matrix (from markers and potentially also pedigree)
#' @slot eigen.G list of eigenvalues and eigenvectors
#'
diff --git a/R/dominance.R b/R/dominance.R
index a99f596..aafa44c 100644
--- a/R/dominance.R
+++ b/R/dominance.R
@@ -2,7 +2,7 @@
#'
#' Report dominance parameters
#'
-#' The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}).
+#' The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait/loc model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}).
#'
#' @param params list returned by \code{\link{Stage2}}
#' @param digits number of digits for rounding
@@ -20,22 +20,26 @@ dominance <- function(params, digits=2, index.coeff=NULL, gamma=0) {
id <- grep("dominance",params$vc$name,fixed=T)
ia <- grep("additive",params$vc$name,fixed=T)
+ vc <- params$vc[,-1]
+ rownames(vc) <- params$vc$name
- n.trait <- nrow(params$heterosis)
- if (n.trait > 1) {
+ nhet <- nrow(params$heterosis)
+ if (nhet > 1) {
stopifnot(!is.null(index.coeff))
- traits <- params$heterosis$trait
- Vd <- Va <- matrix(0,nrow=n.trait,ncol=n.trait,dimnames=list(traits,traits))
-
- tmp <- strsplit(params$vc$name[ia],split=":")
- ix <- cbind(sapply(tmp,"[[",2),sapply(tmp,"[[",3))
- Va[ix] <- params$vc$estimate[ia]
- Va[upper.tri(Va)] <- Va[lower.tri(Va)]
-
- tmp <- strsplit(params$vc$name[id],split=":")
- ix <- cbind(sapply(tmp,"[[",2),sapply(tmp,"[[",3))
- Vd[ix] <- params$vc$estimate[id]
- Vd[upper.tri(Vd)] <- Vd[lower.tri(Vd)]
+ if (colnames(params$heterosis)[1]=="trait") {
+ traits <- params$heterosis$trait
+ #Vd <- Va <- matrix(0,nrow=nhet,ncol=nhet,dimnames=list(traits,traits))
+ Va <- f.cov.trait(vc[ia,],traits,us=(nhet>2))
+ Vd <- f.cov.trait(vc[id,],traits,us=(nhet>2))
+ } else {
+ #multi-location
+ traits <- params$heterosis$loc
+ add <- f.cov.loc(vc[ia,], traits)
+ Va <- add$cov.mat
+ tmp <- matrix(vc[id,1],ncol=1)
+ rownames(tmp) <- traits
+ Vd <- tcrossprod(sqrt(tmp))
+ }
stopifnot(sort(names(index.coeff))==sort(traits))
index.coeff <- index.coeff[match(traits,names(index.coeff))]
diff --git a/R/inbreeding.R b/R/inbreeding.R
index 8f2bbfe..5ead215 100644
--- a/R/inbreeding.R
+++ b/R/inbreeding.R
@@ -16,7 +16,12 @@ inbreeding <- function(geno) {
x <- data.frame(F.G=(diag(geno@G)-1)/(geno@ploidy-1))
if (inherits(geno,"class_genoD"))
x$F.D <- geno@Fg
-
+
+ pops <- attr(geno@scale,"pop")
+ np <- length(unique(pops))
+ if (np > 1) {
+ x$pop <- pops
+ }
rownames(x) <- rownames(geno@G)
return(x)
}
\ No newline at end of file
diff --git a/R/read_geno.R b/R/read_geno.R
index 47eb7ea..027a4ee 100644
--- a/R/read_geno.R
+++ b/R/read_geno.R
@@ -11,6 +11,8 @@
#' When \code{dominance=FALSE}, non-additive effects are captured using a residual genetic effect, with zero covariance. If \code{dominance=TRUE}, a (digenic) dominance covariance matrix is used instead.
#'
#' The argument \code{min.minor.allele} specifies the minimum number of individuals that must contain the minor allele. Markers that do not meet this threshold are discarded.
+#'
+#' Optional argument \code{pop.file} gives the name of a CSV file with two columns: id,pop. If the populations have different ploidy, this is indicated using a named vector for \code{ploidy}.
#'
#' @param filename Name of CSV file with marker allele dosage
#' @param ploidy 2,4,6,etc. (even numbers)
@@ -19,6 +21,7 @@
#' @param w blending parameter (see Details)
#' @param ped optional, pedigree data frame with 3 or 4 columns (see Details)
#' @param dominance TRUE/FALSE whether to include dominance covariance (see Details)
+#' @param pop.file CSV file defining populations
#'
#' @return Variable of class \code{\link{class_geno}}.
#'
@@ -29,7 +32,7 @@
#' @importFrom data.table fread
read_geno <- function(filename, ploidy, map, min.minor.allele=5,
- w=1e-5, ped=NULL, dominance=FALSE) {
+ w=1e-5, ped=NULL, dominance=FALSE, pop.file=NULL) {
if (any(w < 1e-5))
stop("Blending parameter should not be smaller than 1e-5")
@@ -38,49 +41,107 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5,
map <- data.frame(data[,1:3])
colnames(map) <- c("marker","chrom","position")
geno <- as.matrix(data[,-(1:3)])
+ rownames(geno) <- as.character(map$marker)
} else {
map <- data.frame(marker=character(0),
chrom=character(0),
position=numeric(0))
geno <- as.matrix(data[,-1])
+ tmp <- data.frame(data[,1])
+ colnames(tmp) <- "marker"
+ rownames(geno) <- as.character(tmp$marker)
}
- rownames(geno) <- as.character(data$V1)
+
m <- nrow(geno)
id <- colnames(geno)
- #check for recoding -1,0,1 to 0,1,2
- if ((min(geno[1:min(m,1000),],na.rm=T)==-1) & (ploidy==2)) {
- geno <- geno + 1
- }
+ n.ploidy <- length(unique(ploidy))
- p <- apply(geno,1,mean,na.rm=T)/ploidy
- n.minor <- apply(geno,1,function(z){
- p <- mean(z,na.rm=T)/ploidy
- tab <- table(factor(round(z),levels=0:ploidy))
- if (p > 0.5) {
- sum(tab) - tab[ploidy+1]
+ if (!is.null(pop.file)) {
+ pdat <- read.csv(pop.file,colClasses = c("character","character"))
+ colnames(pdat) <- c("id","pop")
+ pops <- unique(pdat$pop)
+ np <- length(pops)
+ ix <- match(id,pdat$id,nomatch = 0)
+ stopifnot(ix > 0)
+ pdat <- pdat[ix,]
+ if (n.ploidy > 1) {
+ iu <- match(pops,names(ploidy),nomatch=0)
+ stopifnot(iu>0)
+ ploidy <- ploidy[pops]
} else {
- sum(tab) - tab[1]
+ ploidy <- rep(ploidy,np)
+ names(ploidy) <- pops
+ }
+ pdat$ploidy <- ploidy[pdat$pop]
+
+ } else {
+ stopifnot(length(ploidy)==1)
+ #check for recoding -1,0,1 to 0,1,2
+ if ((min(geno[1:min(m,1000),],na.rm=T)==-1) & (ploidy==2)) {
+ geno <- geno + 1
}
- })
+
+ np <- 1
+ pops <- "1"
+ pdat <- data.frame(id=id,pop="1",ploidy=ploidy)
+ }
+
+ pmat <- t(geno)/pdat$ploidy
+ p2 <- matrix(as.numeric(NA),nrow=m,ncol=np)
+ colnames(p2) <- pops
+
+ for (i in 1:np) {
+ p2[,i] <- apply(pmat[pdat$pop==pops[i],],2,mean,na.rm=T)
+ }
+ popn <- as.numeric(table(pdat$pop)[pops])
+ p <- apply(t(p2)*popn,2,sum)/sum(popn)
+ flip <- which(p > 0.5)
+ pmat[,flip] <- 1-pmat[,flip]
+ n.minor <- apply(pmat,2,function(z){sum(z > 0.1,na.rm=T)})
+
+ # n.minor <- apply(geno,1,function(z){
+ # p <- mean(z,na.rm=T)/ploidy
+ # tab <- table(factor(round(z),levels=0:ploidy))
+ # if (p > 0.5) {
+ # sum(tab) - tab[ploidy+1]
+ # } else {
+ # sum(tab) - tab[1]
+ # }
+ # })
ix <- which(n.minor >= min.minor.allele)
m <- length(ix)
stopifnot(m > 0)
cat(sub("X",min.minor.allele,"Minor allele threshold = X genotypes\n"))
cat(sub("X",m,"Number of markers = X\n"))
geno <- geno[ix,]
+ p2 <- p2[ix,,drop=F]
n <- ncol(geno)
cat(sub("X",n,"Number of genotypes = X\n"))
- p <- p[ix]
+
+ #p <- apply(geno,1,mean,na.rm=T)/ploidy
+ #p <- p[ix]
if (nrow(map)>0)
map <- map[ix,]
- coeff <- Matrix(scale(t(geno),scale=F),
+ coeff <- matrix(0,nrow=n,ncol=m,
dimnames=list(id,rownames(geno)))
+ scale.param <- numeric(n)
+ names(scale.param) <- id
+ attr(scale.param,"pop") <- pdat$pop
+
+ for (i in 1:np) {
+ ix <- which(pdat$pop==pops[i])
+ pmat <- matrix(1,ncol=1,nrow=popn[i]) %*% matrix(p2[,i],nrow=1)
+ scale.param[ix] <- ploidy[i]*sum(p2[,i]*(1-p2[,i]))
+ coeff[ix,] <- t(geno[,ix])-ploidy[i]*pmat
+ }
coeff[which(is.na(coeff))] <- 0
+ coeff <- Matrix(coeff)
- scale <- ploidy*sum(p*(1-p))
- G <- tcrossprod(coeff)/scale
+ #scale <- ploidy*sum(p*(1-p))
+ #G <- tcrossprod(coeff)/scale
+ G <- tcrossprod(coeff/sqrt(scale.param))
nw <- length(w)
H <- vector("list",nw)
@@ -91,6 +152,8 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5,
H[[i]] <- (1-w[i])*G + w[i]*mean(diag(G))*Diagonal(n=nrow(G))
}
} else {
+ if (n.ploidy > 1)
+ stop("Pedigree option not available for mixed ploidy")
if (ncol(ped)==4) {
if (dominance)
@@ -110,7 +173,7 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5,
ped2 <- data.frame(id=1:nrow(ped),
parent1=match(ped$parent1,ped$id,nomatch=0),
parent2=match(ped$parent2,ped$id,nomatch=0))
- invisible(capture.output(A <- as(Amatrix(ped2,ploidy=ploidy),"symmetricMatrix")))
+ invisible(capture.output(A <- as(Amatrix(ped2,ploidy=ploidy[1]),"symmetricMatrix")))
rownames(A) <- ped$id
colnames(A) <- ped$id
@@ -141,6 +204,15 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5,
id <- c(id,id2)
}
+ if (n.ploidy > 1) {
+ ploidy <- as.integer(pdat$ploidy)
+ names(ploidy) <- pdat$id
+ attr(ploidy,"pop") <- pdat$pop
+ } else {
+ ploidy <- as.integer(pdat$ploidy[1])
+ names(ploidy) <- NULL
+ }
+
G.list <- vector("list",nw)
for (i in 1:nw) {
if (!is.null(H[[i]])) {
@@ -153,25 +225,38 @@ read_geno <- function(filename, ploidy, map, min.minor.allele=5,
H[[i]] <- tcrossprod(eigen.G$vectors%*%Diagonal(n=nrow(Hinv[[i]]),x=sqrt(eigen.G$values)))
}
class(eigen.G) <- "list"
- G.list[[i]] <- new(Class="class_geno",ploidy=as.integer(ploidy),map=map,coeff=coeff,scale=scale,
- G=H[[i]], eigen.G=eigen.G)
+
+ G.list[[i]] <- new(Class="class_geno",ploidy=ploidy,map=map,coeff=coeff,
+ scale=scale.param,G=H[[i]], eigen.G=eigen.G)
}
if (dominance) {
- Pmat <- kronecker(matrix(p,nrow=1,ncol=m),matrix(1,ncol=1,nrow=n))
- coeff.D <- Matrix(-2*choose(ploidy,2)*Pmat^2 + 2*(ploidy-1)*Pmat*t(geno) - t(geno)*(t(geno)-1))
+ coeff.D <- matrix(0,nrow=n,ncol=m,dimnames=list(id,rownames(geno)))
+ scaleD.param <- numeric(n)
+ names(scaleD.param) <- id
+ for (i in 1:np) {
+ ix <- which(pdat$pop==pops[i])
+ pmat <- matrix(1,ncol=1,nrow=popn[i]) %*% matrix(p2[,i],nrow=1)
+ scaleD.param[ix] <- 4*choose(pdat$ploidy[ix],2)*sum(p2[,i]^2*(1-p2[,i])^2)
+ #Pmat <- kronecker(matrix(p2[,i],nrow=1,ncol=m),matrix(1,ncol=1,nrow=popn[i]))
+ coeff.D[ix,] <- -2*choose(pdat$ploidy[ix],2)*pmat^2 + 2*(pdat$ploidy[ix]-1)*pmat*t(geno[,ix]) -
+ t(geno[,ix])*(t(geno[,ix])-1)
+ }
+
coeff.D[is.na(coeff.D)] <- 0
- scale.D <- 4*choose(ploidy,2)*sum(p^2*(1-p)^2)
- D <- tcrossprod(coeff.D)/scale.D
+ coeff.D <- Matrix(coeff.D)
+ #scale.D <- 4*choose(ploidy,2)*sum(p^2*(1-p)^2)
+ D <- tcrossprod(coeff.D/sqrt(scaleD.param))
D <- (1-1e-5)*D + (1e-5)*mean(diag(D))*Diagonal(n=nrow(D))
eigen.D <- eigen(D,symmetric=TRUE)
eigen.D$vectors <- Matrix(eigen.D$vectors,dimnames=list(id,id))
class(eigen.D) <- "list"
- Fg <- apply(coeff.D,1,sum)/(-scale*(ploidy-1))
+ Fg <- apply(coeff.D,1,sum)/(-scale.param*(pdat$ploidy-1))
names(Fg) <- id
output <- lapply(G.list,function(x){new(Class="class_genoD",ploidy=x@ploidy,map=x@map,
coeff=x@coeff,scale=x@scale,G=x@G,eigen.G=x@eigen.G,
- coeff.D=coeff.D,scale.D=scale.D,D=D,eigen.D=eigen.D,Fg=Fg)})
+ coeff.D=coeff.D,scale.D=scaleD.param,D=D,eigen.D=eigen.D,
+ Fg=Fg)})
} else {
output <- G.list
}
diff --git a/docs/Vignette2.html b/docs/Vignette2.html
index b431a07..e630be1 100644
--- a/docs/Vignette2.html
+++ b/docs/Vignette2.html
@@ -372,7 +372,7 @@ Vignette 2: Single trait analysis at
explicitly used by Stage1
but is automatically detected and
retained in the output for use in Stage2
.
ans1 <- Stage1(filename=pheno.file, traits='Yield.Mg.ha')
-## Online License checked out Fri Feb 16 15:27:34 2024
+## Offline License checked out Thu Jul 18 03:16:21 2024
## env id BLUE loc
## 1 CA_2011 A01143-3C 93.050 CA
@@ -401,7 +401,7 @@ Vignette 2: Single trait analysis at
## $var
## Variance PVE
-## env 307 NA
+## env 286 NA
## genotype 53 0.355
## g x loc 15 0.098
## g x env 22 0.150
@@ -428,7 +428,7 @@ Vignette 2: Single trait analysis at
Stage2
can be used with the function uniplot
to visualize model sufficiency and correlation structure (Cullis et al. 2010).
-
+
The squared radius for each location is the proportion of variance
explained (PVE) by the latent factors. With the exception of FL, the FA2
model appears to provide a good representation of the covariance
@@ -456,12 +456,13 @@
Vignette 2: Single trait analysis at
## $var
## Variance PVE
-## env 267 NA
-## additive 35 0.214
-## add x loc 8 0.047
-## dominance 36 0.217
-## g x env 20 0.120
-## Stage1.error 67 0.403
+## env 311 NA
+## additive 35 0.208
+## add x loc 8 0.046
+## dominance 36 0.211
+## heterosis 4 0.024
+## g x env 20 0.117
+## Stage1.error 67 0.393
##
## $cor
## MI MO NC NY OR WI
@@ -472,7 +473,7 @@ Vignette 2: Single trait analysis at
## OR 0.890 0.976 0.831 0.945 1.000 0.961
## WI 0.908 0.939 0.707 0.967 0.961 1.000
-
+
The correlation matrix and uniplot, which are based on the additive
values, show that yield in NC was somewhat different compared to the
other five sites.
@@ -494,7 +495,7 @@ Vignette 2: Single trait analysis at
ggplot(pred,aes(x=r2.NC,y=r2.WI)) + geom_point() + coord_fixed(ratio=1) + xlim(0.3,0.8) + ylim(0.3,0.8) + geom_line(data=data.frame(x=c(0.3,0.8),y=c(0.3,0.8)),mapping=aes(x=x,y=y),linetype=2) + theme_bw() +
xlab("NC") + ylab("WI") + ggtitle("BV Reliability")
-
+
The above figure shows that breeding values were predicted with
higher reliability in WI than NC, which is to be expected from the high
correlation between WI and the other four sites (OR,MO,MI,NY) compared
@@ -516,7 +517,7 @@
Vignette 2: Single trait analysis at
ggplot(plot.data,aes(x=without.WI.pheno,y=with.WI.pheno)) + geom_point() + coord_fixed(ratio=1) + xlim(0.3,0.8) + ylim(0.3,0.8) + geom_line(data=data.frame(x=c(0.3,0.8),y=c(0.3,0.8)),mapping=aes(x=x,y=y),linetype=2) + theme_bw() +
xlab("Without WI phenotypes") + ylab("With WI phenotypes") + ggtitle("WI BV Reliability")
-
+
As expected, the reliability was higher with WI phenotypes.
diff --git a/docs/manual.pdf b/docs/manual.pdf
index 6a30904..1b55e10 100644
Binary files a/docs/manual.pdf and b/docs/manual.pdf differ
diff --git a/man/Stage1.Rd b/man/Stage1.Rd
index f8750b4..61a0b5b 100644
--- a/man/Stage1.Rd
+++ b/man/Stage1.Rd
@@ -36,7 +36,7 @@ Stage1(
List containing
\describe{
\item{blues}{data frame of BLUEs}
-\item{vcov}{list of variance-covariance matrices for the BLUEs, one per experiment (env)}
+\item{vcov}{list of variance-covariance matrices for the BLUEs, one per env}
\item{fit}{data frame with broad-sense H2 (plot basis) and/or AIC}
\item{resid}{For single trait, list of diagnostic plots and data frame of residuals. For multi-trait, list of resid var-cov matrices.}
}
diff --git a/man/blup_prep.Rd b/man/blup_prep.Rd
index 273c70b..386656f 100644
--- a/man/blup_prep.Rd
+++ b/man/blup_prep.Rd
@@ -15,7 +15,7 @@ blup_prep(data, vcov = NULL, geno = NULL, vars, mask = NULL, method = NULL)
\item{vars}{object of \code{\link{class_var}} from \code{\link{Stage2}}}
-\item{mask}{(optional) data frame with possible columns "id","env","trait"}
+\item{mask}{(optional) data frame with possible columns "id","env","loc","trait"}
\item{method}{(optional) "MME", "Vinv", NULL (defaut). see Details}
}
@@ -28,5 +28,5 @@ Prepare data for BLUP
\details{
The \code{method} argument can be used to control how the linear system is solved. "MME" leads to inversion of the MME coefficient matrix, while "Vinv" leads to inversion of the overall var-cov matrix for the response vector. If NULL, the software uses whichever method involves inverting the smaller matrix. If the number of random effects (m) is less than the number of BLUEs (n), "MME" is used.
-For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computed average fixed effects.
+For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computing average fixed effects.
}
diff --git a/man/class_geno-class.Rd b/man/class_geno-class.Rd
index bdd1ad1..25b8580 100644
--- a/man/class_geno-class.Rd
+++ b/man/class_geno-class.Rd
@@ -11,13 +11,13 @@ S4 class for marker genotype data
\section{Slots}{
\describe{
-\item{\code{ploidy}}{ploidy}
+\item{\code{ploidy}}{If mixed ploidy, then a vector equal to pop size; otherwise a single integer}
\item{\code{map}}{Marker map positions}
\item{\code{coeff}}{Coefficients of the marker effects (dim: indiv x marker)}
-\item{\code{scale}}{Scaling factor between markers and indiv}
+\item{\code{scale}}{Scaling factor between markers and indiv, vector of length equal to pop size}
\item{\code{G}}{Additive relationship matrix (from markers and potentially also pedigree)}
diff --git a/man/dominance.Rd b/man/dominance.Rd
index 65847b8..5fbfbcd 100644
--- a/man/dominance.Rd
+++ b/man/dominance.Rd
@@ -22,7 +22,7 @@ data frame with estimates and SE
Report dominance parameters
}
\details{
-The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}).
+The dominance variance (Vd) and baseline heterosis (b) are quantified relative to additive variance (Va) and std. dev. (SDa), respectively. For single traits, the estimate and SE of the ratios are calculated based on the delta method (Rice 2007, p. 162-166). For a multi-trait/loc model, the result is just the ratio of the estimates, with \code{index.coeff} specifying the coefficients of the standardized true values (see also \code{\link{blup}}) and \code{gamma} specifying the relative weight for non-additive to additive genetic merit (see also \code{\link{gain}}).
}
\references{
Rice JA (2007) Mathematical Statistics and Data Analysis, 3rd ed. Duxbury, Pacific Grove.
diff --git a/man/read_geno.Rd b/man/read_geno.Rd
index e158925..c7c864f 100644
--- a/man/read_geno.Rd
+++ b/man/read_geno.Rd
@@ -11,7 +11,8 @@ read_geno(
min.minor.allele = 5,
w = 1e-05,
ped = NULL,
- dominance = FALSE
+ dominance = FALSE,
+ pop.file = NULL
)
}
\arguments{
@@ -28,6 +29,8 @@ read_geno(
\item{ped}{optional, pedigree data frame with 3 or 4 columns (see Details)}
\item{dominance}{TRUE/FALSE whether to include dominance covariance (see Details)}
+
+\item{pop.file}{CSV file defining populations}
}
\value{
Variable of class \code{\link{class_geno}}.
@@ -45,4 +48,6 @@ If the A matrix is not used, then G is blended with the identity matrix (times t
When \code{dominance=FALSE}, non-additive effects are captured using a residual genetic effect, with zero covariance. If \code{dominance=TRUE}, a (digenic) dominance covariance matrix is used instead.
The argument \code{min.minor.allele} specifies the minimum number of individuals that must contain the minor allele. Markers that do not meet this threshold are discarded.
+
+Optional argument \code{pop.file} gives the name of a CSV file with two columns: id,pop. If the populations have different ploidy, this is indicated using a named vector for \code{ploidy}.
}
diff --git a/vignettes/Vignette1.Rmd b/vignettes/Vignette1.Rmd
index fa8e7a0..01278db 100644
--- a/vignettes/Vignette1.Rmd
+++ b/vignettes/Vignette1.Rmd
@@ -135,7 +135,7 @@ geno.file <- system.file("vignette_data", "geno1.csv", package = "StageWise")
geno <- read.csv(geno.file,check.names=F)
geno[1:4,1:6]
-geno <- read_geno(filename=geno.file, ploidy=4, map=TRUE, min.minor.allele=5,
+geno <- read_geno(filename=geno.file, ploidy=4L, map=TRUE, min.minor.allele=5,
dominance=T)
```