v1.10

jendelman · Jul 18, 2024 · 3c1e618 · 3c1e618
1 parent bc26ef7
commit 3c1e618
Show file tree

Hide file tree

Showing 18 changed files with 259 additions and 137 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,14 @@
 Package: StageWise
 Title: Two-stage analysis of multi-environment trials for genomic selection and GWAS
-Version: 1.09
+Version: 1.10
 Author: Jeffrey B. Endelman
 Maintainer: Jeffrey Endelman <[email protected]>
 Description: Fully efficient, two stage analysis of multi-environment trials, including directional dominance and multi-trait genomic selection
 Depends: R (>= 4.0)
 License: GPL-3
 RoxygenNote: 7.2.3
 Encoding: UTF-8
-Imports: Matrix (>= 1.5-0), ggplot2, methods, ggrepel, rlang, ggpubr, SpATS, spam, AGHmatrix, MASS, CVXR, ggforce
+Imports: Matrix (>= 1.5-0), ggplot2, methods, ggrepel, rlang, ggpubr, SpATS, spam, AGHmatrix, MASS, CVXR, ggforce, data.table
 Suggests: knitr, rmarkdown, asreml
 Collate: 
     'Stage1.R'

diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+Changes in 1.10
+* Added support for multi-population/multi-ploidy analysis
+* Fixed error in Stage1 related to multiple experiments per env
+
 Changes in 1.09
 * changed read.csv to fread in read_geno
 

diff --git a/R/Stage1.R b/R/Stage1.R
@@ -30,7 +30,7 @@
 #' @return List containing
 #' \describe{
 #' \item{blues}{data frame of BLUEs}
-#' \item{vcov}{list of variance-covariance matrices for the BLUEs, one per experiment (env)}
+#' \item{vcov}{list of variance-covariance matrices for the BLUEs, one per env}
 #' \item{fit}{data frame with broad-sense H2 (plot basis) and/or AIC}
 #' \item{resid}{For single trait, list of diagnostic plots and data frame of residuals. For multi-trait, list of resid var-cov matrices.}
 #' }
@@ -312,55 +312,64 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml",
     blue.out$env <- data$env[match(blue.out$expt,data$expt)]
 
     nee <- sapply(expt.in.env,length)
-    iu <- which(nee==1)
-    if (length(iu) > 0) {
-      tmp <- names(vcov)[iu]
-      names(vcov)[iu] <- blue.out$env[match(tmp,blue.out$expt)]
-    }
+    #iu <- which(nee==1)
+    #if (length(iu) > 0) {
+    #  tmp <- names(vcov)[iu]
+    #  names(vcov)[iu] <- blue.out$env[match(tmp,blue.out$expt)]
+    #}
 
-    for (j in which(nee > 1)) {
-      omega.list <- vcov[expt.in.env[[j]]]
-      vcov2 <- mapply(FUN=function(x,y){
-        tmp <- paste(y,rownames(x),sep=":")
-        dimnames(x) <- list(tmp,tmp)
-        return(x)},x=omega.list,y=as.list(expt.in.env[[j]]))
+    vcov.env <- vector("list",length(nee))
+    names(vcov.env) <- names(nee)
+    iu <- which(nee==1)
+    if (length(iu) > 0)
+      vcov.env[names(iu)] <- vcov[unlist(expt.in.env[iu])]
 
-      .GlobalEnv$asremlOmega <- direct_sum(lapply(vcov2,solve))
-      dname <- lapply(vcov2,rownames)
-      dimnames(.GlobalEnv$asremlOmega) <- list(unlist(dname),unlist(dname))
-      attr(.GlobalEnv$asremlOmega,"INVERSE") <- TRUE
+    iu <- which(nee > 1)
+    if (length(iu) > 0)
+      for (j in iu) {
+        omega.list <- vcov[expt.in.env[[j]]]
+        vcov2 <- mapply(FUN=function(x,y){
+          tmp <- paste(y,rownames(x),sep=":")
+          dimnames(x) <- list(tmp,tmp)
+          return(x)},x=omega.list,y=as.list(expt.in.env[[j]]))
 
-      ix <- which(blue.out$env==envs[j])
-      data2 <- blue.out[ix,]
-      data2$expt.id <- factor(paste(data2$expt,data2$id,sep=":"))
-      data2$id <- factor(data2$id)
-      data2$expt <- factor(data2$expt)
-      start.table <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
-                            random=~vm(expt.id,source=asremlOmega),
-                            residual=~idv(units),start.values=TRUE)$vparameters.table
-      k <- grep("Omega",start.table$Component,fixed=T)
-      start.table$Value[k] <- 1
-      start.table$Constraint[k] <- "F"
-      ans3 <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
-                     random=~vm(expt.id,source=asremlOmega),
-                     residual=~idv(units),G.param=start.table)
+        .GlobalEnv$asremlOmega <- direct_sum(lapply(vcov2,solve))
+        dname <- lapply(vcov2,rownames)
+        dimnames(.GlobalEnv$asremlOmega) <- list(unlist(dname),unlist(dname))
+        attr(.GlobalEnv$asremlOmega,"INVERSE") <- TRUE
 
-      predans3 <- asreml::predict.asreml(ans3,classify="id",vcov = TRUE)
-      blue3 <- data.frame(expt=NA,predans3$pvals[,c("id","predicted.value")],env=envs[j])
-      colnames(blue3) <- c("expt","id","BLUE","env")
-      vcov3 <- predans3$vcov
-      dimnames(vcov3) <- list(blue3$id,blue3$id)
-      vcov2 <- vcov[-match(expt.in.env[[j]],names(vcov))]
-      vcov <- c(vcov2,vcov3)
-      names(vcov) <- c(names(vcov2),envs[j])
-      blue.out <- rbind(blue.out[-ix,],blue3)
-      rm("asremlOmega",envir = .GlobalEnv)
-    }
+        ix <- which(blue.out$env==envs[j])
+        data2 <- blue.out[ix,]
+        data2$expt.id <- factor(paste(data2$expt,data2$id,sep=":"))
+        data2$id <- factor(data2$id)
+        data2$expt <- factor(data2$expt)
+        start.table <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
+                              random=~vm(expt.id,source=asremlOmega),
+                              residual=~idv(units),start.values=TRUE)$vparameters.table
+        k <- grep("Omega",start.table$Component,fixed=T)
+        start.table$Value[k] <- 1
+        start.table$Constraint[k] <- "F"
+        ans3 <- asreml::asreml(data=data2,fixed=BLUE~expt-1+id,
+                       random=~vm(expt.id,source=asremlOmega),
+                       residual=~idv(units),G.param=start.table)
+
+        predans3 <- asreml::predict.asreml(ans3,classify="id",vcov = TRUE)
+        blue3 <- data.frame(expt=NA,predans3$pvals[,c("id","predicted.value")],env=envs[j])
+        colnames(blue3) <- c("expt","id","BLUE","env")
+        vcov3 <- predans3$vcov
+        dimnames(vcov3) <- list(blue3$id,blue3$id)
+        #vcov2 <- vcov[-match(expt.in.env[[j]],names(vcov))]
+        #vcov <- c(vcov2,vcov3)
+        #names(vcov) <- c(names(vcov2),envs[j])
+        vcov.env[[names(iu)[j]]] <- vcov3
+        blue.out <- rbind(blue.out[-ix,],blue3)
+        rm("asremlOmega",envir = .GlobalEnv)
+      }
   }
 
   if (n.trait > 1) {
-    vcov <- vector("list",n.env)
-    names(vcov) <- envs
+    vcov.env <- vector("list",n.env)
+    names(vcov.env) <- envs
 
     expt.in.env <- lapply(split(data$expt,factor(data$env,levels=envs)),unique)
     nexpt.per.env <- sapply(expt.in.env,length)
@@ -404,10 +413,10 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml",
         predans <- asreml::predict.asreml(ans,classify="id:trait",vcov = TRUE)
         tmp <- predans$pvals[,c("id","trait","predicted.value")]
         colnames(tmp) <- c("id","trait","BLUE")
-        vcov[[j]] <- predans$vcov
+        vcov.env[[j]] <- predans$vcov
         tmp$id <- as.character(tmp$id)
         id.trait <- apply(tmp[,1:2],1,paste,collapse=":")
-        dimnames(vcov[[j]]) <- list(id.trait,id.trait)
+        dimnames(vcov.env[[j]]) <- list(id.trait,id.trait)
         blue.out <- rbind(blue.out,data.frame(env=envs[j],tmp))
       }
     }
@@ -428,13 +437,13 @@ Stage1 <- function(filename,traits,effects=NULL,solver="asreml",
       stat_boxplot(outlier.color="red") + theme_bw() + theme(axis.text.x=element_text(angle=90,vjust=0.5))
     p2 <- ggplot(data=blup.resid,aes(sample=.data$resid)) + stat_qq() + stat_qq_line() + facet_wrap(~expt) + theme_bw() + xlab("Expected") + ylab("Observed")
     if (solver=="ASREML")
-      return(list(blues=blue.out,vcov=vcov,fit=fit,
+      return(list(blues=blue.out,vcov=vcov.env,fit=fit,
                   resid=list(boxplot=p1,qqplot=p2,table=blup.resid)))
     if (solver=="SPATS")
-      return(list(blues=blue.out,vcov=vcov,fit=fit,
+      return(list(blues=blue.out,vcov=vcov.env,fit=fit,
                   resid=list(boxplot=p1,qqplot=p2,spatial=spatial.plot,table=blup.resid)))
   } else {
     #Multi-trait
-    return(list(blues=blue.out,vcov=vcov,fit=fit,resid=resid.vc))
+    return(list(blues=blue.out,vcov=vcov.env,fit=fit,resid=resid.vc))
   }
 }
diff --git a/R/Stage2.R b/R/Stage2.R
@@ -106,7 +106,8 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
       diag(resid.vc) <- diag(resid.vc)/(n.trait-1)
       resid.vc <- fu(resid.vc)
       diag(B) <- diag(B)/(n.trait-1)
-      B <- fu(B)
+      if (max(abs(diag(B))) > .Machine$double.eps*10)
+        B <- fu(B)
 
       if (non.add!="none") {
         diag(geno2) <- diag(geno2)/(n.trait-1)
@@ -138,6 +139,9 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
   if (!is.null(geno)) {
     stopifnot(is(geno,"class_geno"))
     id <- sort(intersect(data$id,rownames(geno@G)))
+    if (length(geno@ploidy) > 1)
+      geno@ploidy <- geno@ploidy[id]
+
     n <- length(id)
     data <- data[data$id %in% id,]
     id.weights <- table(factor(data$id,levels=id))
@@ -150,7 +154,7 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
       meanD <- as.numeric(pvar(V=.GlobalEnv$asremlD,weights=id.weights))
 
       dom.covariate <- as.numeric(geno@coeff.D[id,] %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1))
-      dom.covariate <- dom.covariate/(geno@scale*(geno@ploidy-1))
+      dom.covariate <- dom.covariate/(geno@scale[id]*(geno@ploidy-1))
       names(dom.covariate) <- id
       data$dom <- dom.covariate[data$id]
 
@@ -184,7 +188,7 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
   data$env.id <- factor(data$env.id,levels=env.id)
 
   out <- vector("list",4)
-  names(out) <- c("aic","vars","fixed","random")
+  names(out) <- c("aic","vars","params","random")
   n.loc <- 1
   n.trait <- 1
 
@@ -243,8 +247,9 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
       model <- sub("FIX",paste(c("env",covariates,fix.eff.marker,dom),collapse="+"),model,fixed=T)
     } else {
       model <- "asreml(data=data,fixed=BLUE~FIX-1,random=~RANDOM,residual=~dsum(~idv(units)|loc)"
-      tmp <- gsub("+",":loc+",paste(c("env",fix.eff.marker,dom),collapse="+"),fixed=T)
-      model <- sub("FIX",paste(tmp,"loc",sep=":"),model,fixed=T)
+      tmp <- gsub("+",":loc+",paste(c(fix.eff.marker,dom,"env"),collapse="+"),fixed=T)
+      #model <- sub("FIX",paste(tmp,"loc",sep=":"),model,fixed=T)
+      model <- sub("FIX",tmp,model,fixed=T)
     }
   } else {
     model <- "asreml(data=data,fixed=BLUE~FIX-1,random=~RANDOM,residual=~id(env.id):us(Trait)"
@@ -477,12 +482,11 @@ Stage2 <- function(data,vcov=NULL,geno=NULL,fix.eff.marker=NULL,
     }
 
     if (non.add=="dom") {
-      gamma <- (geno@ploidy/2 - 1)/(geno@ploidy - 1)
       ix <- grep("dom",rownames(beta),fixed=T)
       out$params$heterosis <- data.frame(trait=traits,
                                          estimate=as.numeric(beta[ix,1]),
                                          SE=as.numeric(beta[ix,2]))
-
+      gamma <- mean((geno@ploidy/2 - 1)/(geno@ploidy - 1))
       weights <- id.weights/sum(id.weights)
       for (i in 1:n.trait) 
         for (j in i:n.trait) {

diff --git a/R/blup.R b/R/blup.R
@@ -46,12 +46,18 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) {
   if (substr(what,1,2)=="DM" & (!(is(geno,"class_genoD")) | data[[1]]@model < 3L))
     stop("Dominance model is required in geno and data")
 
+  n.id <- length(data[[1]]@id)
   gamma <- 0
   if (!is.null(geno)) {
     if (substr(what,1,2)=="GV")  
       gamma <- 1
     if (substr(what,1,2)=="BV" & data[[1]]@model==3L)
       gamma <- (geno@ploidy/2 - 1)/(geno@ploidy - 1)
+
+    if (length(gamma) > 1) {
+      tmp <- tapply(gamma,attr(geno@ploidy,"pop"),mean)
+      gamma <- sum(tmp[names(index.coeff)]*index.coeff)/sum(index.coeff)
+    }
   }
 
   if (data[[1]]@model < 2L) {
@@ -60,8 +66,6 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) {
     index.scale <- sqrt(unlist(sapply(data,function(z){diag(as.matrix(z@geno1.var))})) + 
                           gamma^2*unlist(sapply(data,function(z){diag(as.matrix(z@geno2.var))})))
   } 
-
-  n.id <- length(data[[1]]@id)
 
   if (!multi.prep) {
     data <- data2
@@ -176,12 +180,12 @@ blup <- function(data, geno=NULL, what, index.coeff=NULL, gwas.ncore=0L) {
 
   if (what=="AM") {
     G <- kron(geno@eigen.G,1)
-    M <- kronecker(crossprod(geno@coeff,crossprod(G$inv)),matrix(index.coeff,nrow=1))/geno@scale
+    M <- kronecker(crossprod((geno@coeff/geno@scale),crossprod(G$inv)),matrix(index.coeff,nrow=1))
     effect <- as.numeric(M %*% matrix(data@random[1:n.random],ncol=1))
     V <- data@var.uhat[1:n.random,1:n.random] #used for GWAS
   } else {
     D <- kron(geno@eigen.D,1)
-    M <- kronecker(crossprod(geno@coeff.D,crossprod(D$inv)),matrix(index.coeff,nrow=1))/geno@scale.D
+    M <- kronecker(crossprod((geno@coeff.D/geno@scale.D),crossprod(D$inv)),matrix(index.coeff,nrow=1))
     dhat <- -kronecker(matrix(geno@Fg[data@id],ncol=1),matrix(data@heterosis,ncol=1)) + 
       matrix(data@random[n.random + 1:n.random],ncol=1)
     effect <- as.numeric(M %*% dhat)

diff --git a/R/blup_prep.R b/R/blup_prep.R
@@ -4,13 +4,13 @@
 #' 
 #' The \code{method} argument can be used to control how the linear system is solved. "MME" leads to inversion of the MME coefficient matrix, while "Vinv" leads to inversion of the overall var-cov matrix for the response vector. If NULL, the software uses whichever method involves inverting the smaller matrix. If the number of random effects (m) is less than the number of BLUEs (n), "MME" is used.
 #' 
-#' For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computed average fixed effects.
+#' For the multi-location model, if all of the environments for a location are masked, the average of the other locations is used when computing average fixed effects.
 #' 
 #' @param data data frame of BLUEs from Stage 1 
 #' @param vcov list of variance-covariance matrices for the BLUEs
 #' @param geno object of \code{\link{class_geno}} from \code{\link{read_geno}}
 #' @param vars object of \code{\link{class_var}} from \code{\link{Stage2}}
-#' @param mask (optional) data frame with possible columns "id","env","trait"
+#' @param mask (optional) data frame with possible columns "id","env","loc","trait"
 #' @param method (optional) "MME", "Vinv", NULL (defaut). see Details
 #' 
 #' @return Object of \code{\link{class_prep}}
@@ -50,7 +50,7 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
   data <- data[!is.na(data$BLUE),]
 
   if (!is.null(mask)) {
-    tmp <- intersect(colnames(mask),c("id","env","trait"))
+    tmp <- intersect(colnames(mask),c("id","env","trait","loc"))
     stopifnot(length(tmp)>0)
     if (length(tmp) > 1) {
       tmp2 <- apply(mask[,tmp],1,paste,collapse=":")
@@ -189,10 +189,10 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
             Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2)
 
             #add to X matrix
-            dom.covariate <- Z %*% kronecker(geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1),
-                                                      diag(n.loc))
+            dom.covariate <- Z %*% kronecker((geno@coeff.D/(geno@scale*(ploidy-1))) %*% 
+                                               matrix(1,nrow=ncol(geno@coeff.D),ncol=1),diag(n.loc))
             colnames(dom.covariate) <- paste("heterosis",locations,sep=":")
-            X <- cbind(X,dom.covariate/(geno@scale*(ploidy-1)))
+            X <- cbind(X,dom.covariate)
           } else {
             Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2)
           }
@@ -213,8 +213,9 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
           Gmat1 <- kron(eigen.A = geno@eigen.G, B=vars@geno1)
           if (vars@model==3L) {
             Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2)
-            dom.covariate <- as.numeric(Z %*% geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1))
-            X <- cbind(X,heterosis=dom.covariate/(geno@scale*(ploidy-1)))
+            dom.covariate <- as.numeric(Z %*% (geno@coeff.D/(geno@scale*(ploidy-1))) %*% 
+                                          matrix(1,nrow=ncol(geno@coeff.D),ncol=1))
+            X <- cbind(X,heterosis=dom.covariate)
           } else {
             Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2)
           }
@@ -254,10 +255,10 @@ blup_prep <- function(data,vcov=NULL,geno=NULL,vars,mask=NULL,method=NULL) {
           Gmat2 <- kron(eigen.A=geno@eigen.D, B=vars@geno2)
 
           #add to X matrix
-          dom.covariate <- Z %*% kronecker(geno@coeff.D %*% matrix(1,nrow=ncol(geno@coeff.D),ncol=1),
-                                         diag(n.trait))
+          dom.covariate <- Z %*% kronecker((geno@coeff.D/(geno@scale*(ploidy-1))) %*%
+                                             matrix(1,nrow=ncol(geno@coeff.D),ncol=1),diag(n.trait))
           colnames(dom.covariate) <- paste("heterosis",traits,sep=":")
-          X <- cbind(X,dom.covariate/(geno@scale*(ploidy-1)))
+          X <- cbind(X,dom.covariate)
         } else {
           Gmat2 <- kron(eigen.A=eigen.I, B=vars@geno2)
         }

diff --git a/R/class_geno.R b/R/class_geno.R
@@ -1,9 +1,9 @@
 #' S4 class for marker genotype data
 #' 
-#' @slot ploidy ploidy
+#' @slot ploidy If mixed ploidy, then a vector equal to pop size; otherwise a single integer
 #' @slot map Marker map positions
 #' @slot coeff Coefficients of the marker effects (dim: indiv x marker)
-#' @slot scale Scaling factor between markers and indiv
+#' @slot scale Scaling factor between markers and indiv, vector of length equal to pop size
 #' @slot G Additive relationship matrix (from markers and potentially also pedigree)
 #' @slot eigen.G list of eigenvalues and eigenvectors
 #'