v0.37

jendelman · Jan 2, 2024 · 86aabc7 · 86aabc7
1 parent e4395e7
commit 86aabc7
Show file tree

Hide file tree

Showing 32 changed files with 30,397 additions and 13,326 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: polyBreedR
 Title: Genomics-assisted breeding for polyploids (and diploids)
-Version: 0.36
+Version: 0.37
 Author: Jeffrey B. Endelman
 Maintainer: Jeffrey Endelman <[email protected]>
 Description: Genomics-assisted breeding for polyploids (and diploids)

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(ADsplit)
 export(A_mat)
 export(GT2DS)
 export(G_mat)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,6 @@
+Changes in 0.37
+* Vignette 3 
+
 Changes in 0.36
 * PolyOrigin option moved to impute_PO
 * GT2DS has multicore option

diff --git a/R/ADsplit.R b/R/ADsplit.R
@@ -0,0 +1,39 @@
+#' Extract read counts from AD string
+#' 
+#' Extract read counts from AD string
+#' 
+#' @param AD array of AD strings
+#' @param ALT TRUE or FALSE (= REF)
+#' @param n.core number of cores
+#' 
+#' @return integer data with same dimensions as AD
+#' @export
+#' @importFrom parallel makeCluster stopCluster parApply clusterExport
+
+ADsplit <- function(AD, ALT, n.core=1) {
+
+  stopifnot(inherits(ALT,"logical"))
+
+  f <- function(AD,k) {
+    x <- strsplit(AD,split=",",fixed=T)
+    as.integer(sapply(x,"[[",k))
+  }
+
+  k <- as.integer(ALT)+1
+
+  if (inherits(AD,"matrix")) {
+    if (n.core > 1) {
+      cl <- makeCluster(n.core)
+      clusterExport(cl=cl,varlist=NULL)
+      out <- parApply(cl=cl,X=AD,MARGIN=2,FUN=f,k=k)
+      stopCluster(cl)
+    } else {
+      out <- apply(AD,2,f,k=k)
+    }
+    dimnames(out) <- dimnames(AD)
+  } else {
+    out <- f(AD,k)
+    names(out) <- names(AD)
+  }
+  return(out)
+}
diff --git a/R/array2vcf.R b/R/array2vcf.R
@@ -21,7 +21,13 @@
 
 array2vcf <- function(array.file, map.file, model.file=NULL, ploidy, vcf.file) {
 
-  con <- file(array.file,open="r")
+  nc <- nchar(array.file)
+  if (substr(array.file,nc-1,nc)==".gz") {
+    con <- gzfile(array.file,open="r")
+  } else {
+    con <- file(array.file,open="r")
+  }
+
   tmp <- readLines(con,n=1)
   close(con)
   if (tmp=="[Header]") {

diff --git a/R/dart2vcf.R b/R/dart2vcf.R
@@ -8,7 +8,7 @@
 #' 
 #' @param counts.file DArTag collapsed counts file
 #' @param dosage.file DArTag dosage file
-#' @param vcf.file name of VCF output file (uncompressed)
+#' @param vcf.file name of VCF output file 
 #' @param ploidy ploidy
 #' @param first.data.row default is 9 for DArTag format
 #' 

diff --git a/R/gbs.R b/R/gbs.R
@@ -2,37 +2,49 @@
 #' 
 #' Genotype calls for genotype-by-sequencing (GBS) data
 #' 
-#' VCF input file must contain AD field. Posterior mode and mean genotypes are added as GT and DS fields. GQ is also added based on probability of posterior mode. Binomial calculation uses R/updog package (Gerard et al. 2018). Previous INFO is discarded; adds NS, DP.AVG, AF.GT, AB, OD, SE.
+#' VCF input file must contain AD field. Posterior mode and mean genotypes are added as GT and DS fields. GQ is also added based on probability of posterior mode. Binomial calculation uses R/updog package (Gerard et al. 2018) with "norm" prior. Previous INFO is discarded; adds NS, DP.AVG, AF.GT, AB, OD, SE.
 #' 
 #' @param in.file VCF input file
 #' @param out.file VCF output file
 #' @param ploidy ploidy
-#' @param prior model for prior (see Details)
 #' @param bias TRUE/FALSE, whether to estimate allelic bias
 #' @param n.core number of cores
+#' @param silent TRUE/FALSE
 #' 
-#' @return marker x indiv matrix of read depths
+#' @return nothing
 #'
 #' @export
 #' @importFrom updog flexdog
 #' @importFrom parallel makeCluster clusterExport parLapply stopCluster
 #' @importFrom stats anova lm chisq.test
 
-gbs <- function(in.file, out.file, ploidy, prior="norm", 
-                bias=TRUE, n.core=1) {
+gbs <- function(in.file, out.file, ploidy, bias=TRUE, n.core=1,
+                silent=FALSE) {
 
+  prior <- "norm"
   if (bias) {
     bias_init <- exp(c(-1, -0.5, 0, 0.5, 1))
   } else {
     bias_init <- 1
   }
   prep <- vcf_prep(in.file)
 
-  con.out <- file(out.file,"w")
+  nc <- nchar(out.file)
+  if (substr(out.file,nc-1,nc)==".gz") {
+    con.out <- gzfile(out.file,open="w")
+  } else {
+    con.out <- file(out.file,open="w")
+  }
+
   for (i in 1:length(prep$new.meta))
     writeLines(con.out,text=prep$new.meta[i])
 
-  con.in <- file(in.file,"r")
+  nc <- nchar(in.file)
+  if (substr(in.file,nc-1,nc)==".gz") {
+    con.in <- gzfile(in.file,open="r")
+  } else {
+    con.in <- file(in.file,open="r")
+  }
   tmp <- readLines(con.in,n=prep$old.meta)
 
   if (n.core > 1) {
@@ -97,7 +109,8 @@ gbs <- function(in.file, out.file, ploidy, prior="norm",
   for (i in 1:nb) {
     tmp <- readLines(con.in,block.size)
     m <- length(tmp)
-    cat(sub("X",(i-1)*block.size + m,"Progress: X markers\n"))
+    if (!silent)
+      cat(sub("X",(i-1)*block.size + m,"Progress: X markers\n"))
     tmp2 <- strsplit(tmp,split="\t",fixed=T)
     x <- lapply(tmp2,function(x){vcf_extract(x[-(1:8)],"AD")})
     if (n.core > 1) {

diff --git a/R/impute_PO.R b/R/impute_PO.R
@@ -8,32 +8,42 @@
 #' 
 #' VCF is assumed for the low-density file. The pedigree file must follow PolyOrigin format.
 #' 
-#' Two output files are generated, corresponding to the posterior maximum and mean genotypes.
+#' The output file contains the posterior maximum geontypes.
+#'
+#' A temporary directory "tmp" is created to store intermediate files and then deleted.
 #' 
+#' @param ped.file pedigree file for progeny (must follow PO format)
 #' @param high.file name of high density file with phased parents
 #' @param low.file name of low density VCF file with progeny
 #' @param low.format either "GT" (default) or "AD"
-#' @param ped.file pedigree file for progeny (must follow PO format)
-#' @param out.prefix prefix for output files
+#' @param out.file name of CSV output file
 #' 
 #' @return NULL
 #' 
 #' @import vcfR
-#' @importFrom data.table fwrite
+#' @importFrom data.table fwrite fread
 #' @export
 
-impute_PO <- function(high.file, low.file, low.format="GT", ped.file, out.prefix) {
+impute_PO <- function(ped.file, high.file, low.file, low.format="GT",
+                      out.file, n.thread=1) {
 
   stopifnot(low.format %in% c("GT","AD"))
 
   high <- fread(high.file,check.names=F)
   low <- read.vcfR(low.file,verbose=F)
+  low.id <- colnames(low@gt)[-1]
+
+  ped <- read.csv(ped.file)
+  id <- ped$id[ped$pop > 0]
+  if (length(setdiff(id,low.id)) > 0)
+    stop("Low density file is missing individuals in the pedigree file.")
+
   meta <- queryMETA(low)
   if (low.format=="GT") {
-    geno <- GT2DS(extract.gt(low,element="GT"))
+    geno <- GT2DS(extract.gt(low,element="GT")[,id])
   } else {
     stopifnot("FORMAT=ID=AD" %in% meta)
-    geno <- extract.gt(low,element="AD")
+    geno <- extract.gt(low,element="AD")[,id]
     geno <- gsub(",","|",geno)
   }
 
@@ -54,12 +64,9 @@ impute_PO <- function(high.file, low.file, low.format="GT", ped.file, out.prefix
   if (low.format=="AD")
     combo[is.na(combo)] <- "0|0"
 
-  ans <- try(setwd("tmp"),silent=T)
-  if (inherits(ans,"try-error")) {
+  if (!dir.exists("tmp")) 
     dir.create("tmp")
-  } else {
-    setwd("..")
-  }
+
   fwrite(combo,"tmp/PO_geno.csv",na = "NA")
 
   #construct Julia script
@@ -69,20 +76,20 @@ impute_PO <- function(high.file, low.file, low.format="GT", ped.file, out.prefix
   writeLines(sub("X",ped.file,"pedfile=\"X\";"),con)
   writeLines("polyOrigin(genofile,pedfile,isphysmap=false,refineorder=false,refinemap=false,outstem=\"tmp/imputed\");",con)
   close(con)
-  system("julia tmp/po.jl")
+  system("julia -t auto tmp/po.jl")
 
   imputed <- read.csv("tmp/imputed_postdoseprob.csv",check.names=F,as.is=T)
   cols <- colnames(geno)
 
-  post.mean <- apply(imputed[,cols],2,function(z){
-    u <- strsplit(z,split="|",fixed=T)
-    sapply(u,function(x){
-      x <- as.numeric(x)
-      ploidy <- length(x)-1
-      round(sum(x*(0:ploidy)),2)
-      })
-    })
-  fwrite(cbind(map,post.mean),file=sub("Q",out.prefix,"Q_post_mean.csv"))
+  # post.mean <- apply(imputed[,cols],2,function(z){
+  #   u <- strsplit(z,split="|",fixed=T)
+  #   sapply(u,function(x){
+  #     x <- as.numeric(x)
+  #     ploidy <- length(x)-1
+  #     round(sum(x*(0:ploidy)),2)
+  #     })
+  #   })
+  # fwrite(cbind(map,post.mean),file=sub("Q",out.prefix,"Q_post_mean.csv"))
 
   post.max <- apply(imputed[,cols],2,function(z){
     u <- strsplit(z,split="|",fixed=T)
@@ -91,5 +98,6 @@ impute_PO <- function(high.file, low.file, low.format="GT", ped.file, out.prefix
     })
   })
 
-  fwrite(cbind(map,post.max),file=sub("Q",out.prefix,"Q_post_max.csv"))
+  fwrite(cbind(map,post.max),file=out.file)
+  unlink("tmp",recursive=TRUE)
 }
diff --git a/R/vcf_functions.R b/R/vcf_functions.R
@@ -2,7 +2,13 @@ vcf_prep <- function(vcf.file) {
 
   metadata <- character(1000)
 
-  con <- file(vcf.file,"r")
+  nc <- nchar(vcf.file)
+  if (substr(vcf.file,nc-1,nc)==".gz") {
+    con <- gzfile(vcf.file,open="r")
+  } else {
+    con <- file(vcf.file,open="r")
+  }
+
   temp <- readLines(con,1)
   h <- 0
   while(substr(temp,1,2)=="##") {

diff --git a/R/write_vcf.R b/R/write_vcf.R
@@ -36,7 +36,13 @@ write_vcf <- function(filename, fixed, geno, other.meta=NULL) {
   stopifnot(colnames(fixed)==c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))
   fixed[is.na(fixed)] <- "."
 
-  con <- file(filename,open="w")
+  nc <- nchar(filename)
+  if (substr(filename,nc-1,nc)==".gz") {
+    con <- gzfile(filename,open="w")
+  } else {
+    con <- file(filename,open="w")
+  }
+
   writeLines(con=con, text="##fileformat=VCFv4.3")
 
   if (!is.null(other.meta))

diff --git a/README.Rmd b/README.Rmd
@@ -4,7 +4,7 @@ author: Jeffrey Endelman
 output: github_document
 ---
 
-This R package was designed to facilitate the use of genome-wide markers for breeding autotetraploid (4x) species, but it also functionality for diploids. The software has been developed and tested using data from the University of Wisconsin-Madison [potato breeding program](http://potatobreeding.cals.wisc.edu).
+This R package was designed to facilitate the use of genome-wide markers for breeding autotetraploid (4x) species, but it is also works for diploids. The software has been developed and tested using data from the University of Wisconsin-Madison [potato breeding program](http://potatobreeding.cals.wisc.edu).
 
 To install and load the package:
 ```R
@@ -20,6 +20,8 @@ library(polyBreedR)
 
 [Vignette 2](https://jendelman.github.io/polyBreedR/polyBreedR_Vignette2.html) covers the analysis of marker data for dihaploids, including checking ploidy and parentage. Financial support has come from USDA NIFA Awards 2014-67013-22434 (AFRI) and 2019-51181-30021 (SCRI).
 
+[Vignette 3](https://jendelman.github.io/polyBreedR/polyBreedR_Vignette2.html) introduces functions for representing DArTag GBS and SNP array data in Variant Call Format, and functions for imputing from low to high density markers. Financial support has come from USDA NIFA Awards 2019-51181-30021, 2020-51181-32156, and 2021-34141-35447. 
+
 For a complete specification of the package functions, consult the [reference manual.](https://jendelman.github.io/polyBreedR/polyBreedR_Manual.pdf)
 
 

diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@ polyBreedR
 Jeffrey Endelman
 
 This R package was designed to facilitate the use of genome-wide markers
-for breeding autotetraploid (4x) species, but it also functionality for
+for breeding autotetraploid (4x) species, but it is also works for
 diploids. The software has been developed and tested using data from the
 University of Wisconsin-Madison [potato breeding
 program](http://potatobreeding.cals.wisc.edu).
@@ -34,6 +34,13 @@ covers the analysis of marker data for dihaploids, including checking
 ploidy and parentage. Financial support has come from USDA NIFA Awards
 2014-67013-22434 (AFRI) and 2019-51181-30021 (SCRI).
 
+[Vignette
+3](https://jendelman.github.io/polyBreedR/polyBreedR_Vignette2.html)
+introduces functions for representing DArTag GBS and SNP array data in
+Variant Call Format, and functions for imputing from low to high density
+markers. Financial support has come from USDA NIFA Awards
+2019-51181-30021, 2020-51181-32156, and 2021-34141-35447.
+
 For a complete specification of the package functions, consult the
 [reference
 manual.](https://jendelman.github.io/polyBreedR/polyBreedR_Manual.pdf)
diff --git a/docs/polyBreedR_Manual.pdf b/docs/polyBreedR_Manual.pdf