v0.39

jendelman · Jan 12, 2024 · 0fc3d8e · 0fc3d8e
1 parent d3a5e97
commit 0fc3d8e
Show file tree

Hide file tree

Showing 11 changed files with 10,820 additions and 10,778 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: polyBreedR
 Title: Genomics-assisted breeding for polyploids (and diploids)
-Version: 0.38
+Version: 0.39
 Author: Jeffrey B. Endelman
 Maintainer: Jeffrey Endelman <[email protected]>
 Description: Genomics-assisted breeding for polyploids (and diploids)

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,7 +14,7 @@ export(geno_call)
 export(get_pedigree)
 export(impute)
 export(impute_L2H)
-export(impute_PO)
+export(impute_LA)
 export(madc)
 export(merge_impute)
 export(readXY)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,6 @@
+Changes in 0.39
+* Changed impute_PO to impute_LA to allow for other methods eventually (MAPpoly)
+
 Changes in 0.38
 * Added chunk.size option to gbs
 * exception handling in impute_L2H

diff --git a/R/impute_L2H.R b/R/impute_L2H.R
@@ -2,7 +2,7 @@
 #' 
 #' Impute from low to high density markers by Random Forest
 #' 
-#' Argument \code{params} is a list with three named elements: format, n.tree, n.mark. \code{format} can have values "GT" (integer dosage) or "DS" (real numbers between 0 and ploidy). Classification trees are used for GT and regression trees for DS. \code{n.tree} is the number of trees (default = 100). \code{n.mark} is the number of markers to use as predictors (default = 100), chosen based on minimum distance to the target. 
+#' Argument \code{params} is a list with three named elements: format, n.tree, n.mark. \code{format} can have values "GT" (integer dosage) or "DS" (real numbers between 0 and ploidy). Classification trees are used for GT and regression trees for DS. \code{n.tree} is the number of trees (default = 100). \code{n.mark} is the number of markers to use as predictors (default = 200), chosen based on minimum distance to the target. 
 #' 
 #' The \code{exclude} argument is useful for cross-validation. 
 #' 
@@ -83,7 +83,7 @@ impute_L2H <- function(high.file, low.file, out.file, params=list(),
   if (!("n.tree" %in% np))
     params$n.tree <- 100
   if (!("n.mark" %in% np))
-    params$n.mark <- 100
+    params$n.mark <- 200
 
   vcf1 <- length(grep("VCF",toupper(high.file),fixed=T)) > 0
   if (vcf1) {

diff --git a/R/impute_PO.R → R/impute_LA.R b/R/impute_PO.R → R/impute_LA.R
@@ -1,20 +1,20 @@
-#' Impute from low to high density markers with PolyOrigin
+#' Impute from low to high density markers by Linkage Analysis (LA)
 #' 
-#' Impute from low to high density markers by linkage analysis with PolyOrigin
+#' Impute from low to high density markers by Linkage Analysis 
 #' 
 #' You must have separately installed PolyOrigin and Julia for this function to work.
 #'
-#' The high density file contains phased parental genotypes in PolyOrigin format. The first 3 columns are the genetic map in cM: marker, chrom, position. To output imputed data with physical rather than genetic map positions, including a fourth column named "bp". Subsequent columns are the phased parental genotypes. 
+#' The high density file contains phased parental genotypes using 0|1 format. The first 3 columns are the genetic map in cM: marker, chrom, position. To output imputed data with physical rather than genetic map positions, including a fourth column named "bp". Subsequent columns are the phased parental genotypes. 
 #' 
-#' VCF is assumed for the low-density file. The pedigree file must follow PolyOrigin format.
+#' VCF is assumed for the low-density file. The pedigree file has four columns: id, pop, mother, father, ploidy.
 #' 
-#' The output file contains the posterior maximum geontypes.
+#' The output file contains the posterior maximum genotypes.
 #'
 #' A temporary directory "tmp" is created to store intermediate files and then deleted.
 #' 
-#' @param ped.file pedigree file for progeny (must follow PO format)
-#' @param high.file name of high density file with phased parents
-#' @param low.file name of low density VCF file with progeny
+#' @param ped.file pedigree file for progeny
+#' @param high.file name of file with phased parental genotypes
+#' @param low.file name of VCF file with progeny
 #' @param low.format either "GT" (default) or "AD"
 #' @param out.file name of CSV output file
 #' 
@@ -24,7 +24,7 @@
 #' @importFrom data.table fwrite fread
 #' @export
 
-impute_PO <- function(ped.file, high.file, low.file, low.format="GT",
+impute_LA <- function(ped.file, high.file, low.file, low.format="GT",
                       out.file, n.thread=1) {
 
   stopifnot(low.format %in% c("GT","AD"))
@@ -57,7 +57,13 @@ impute_PO <- function(ped.file, high.file, low.file, low.format="GT",
   colnames(high) <- replace(colnames(high),1:3,c("marker","chrom","pos"))
   colnames(map) <- c("marker","chrom","pos")
   geno <- geno[rownames(geno) %in% high$marker,] 
-
+
+  #For PO, change from 0|1 to 1|2
+  f1 <- function(x){
+    gsub("0","1",gsub("1","2",x))
+  }
+  high <- cbind(high[,1:3],apply(high[,4:ncol(high),drop=F],2,f1))
+
   combo <- merge(high,data.frame(marker=rownames(geno),geno,check.names = F),
                  by="marker",all.x=T)
   combo <- combo[match(map$marker,combo$marker),]

diff --git a/docs/polyBreedR_Vignette3.html b/docs/polyBreedR_Vignette3.html
@@ -453,7 +453,7 @@ <h2>Variant Call Format (VCF)</h2>
 ## 16 32 31  6</code></pre>
 <p>As shown above, the <code>GT2DS</code> function in polyBreedR can be
 used to convert VCF GT character strings to integer allele dosages (DS).
-The code below compares the genotype calls between DArT (data1) updog
+The code below compares the genotype calls between DArT (data1) and updog
 (data2) for the first marker, which were identical except for one
 sample.</p>
 <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>data2 <span class="ot">&lt;-</span> <span class="fu">read.vcfR</span>(<span class="st">&quot;DArTag_gbs.vcf.gz&quot;</span>,<span class="at">verbose=</span>F)</span>
@@ -517,16 +517,16 @@ <h2>Imputation</h2>
 <code>impute_L2H</code> uses the Random Forest method based on a
 training population of individuals genotyped under both platforms. In
 general, a larger training population leads to better the imputation
-accuracy. The default behavior is to use the 100 closest markers and 100
+accuracy. The default behavior is to use the 200 closest markers and 100
 classification trees, but these are adjustable parameters. Consult the
 help page for more details.</p>
-<p>The other function, <code>impute_PO</code>, uses the <a href="https://github.com/chaozhi/PolyOrigin.jl">software PolyOrigin</a>
-to impute markers in the F1 progeny of a partial diallel population
-based on linkage analysis. To use <code>impute_PO</code>, you will need
-to install Julia and PolyOrigin and prepare for <a href="https://julialang.org/downloads/platform/">command line
-execution</a>. The high-density marker file for <code>impute_PO</code>
-contains the phased parental genotypes, which can be generated with
-PolyOrigin.</p>
+<p>The other function, <code>impute_LA</code>, was designed to impute
+markers in F1 populations based on linkage analysis (LA). The current
+implementation requires the <a href="https://github.com/chaozhi/PolyOrigin.jl">software PolyOrigin</a>,
+for which you will need to install Julia and PolyOrigin separately and
+prepare for <a href="https://julialang.org/downloads/platform/">command
+line execution</a>. For <code>impute_LA</code>, the high-density marker
+file contains the phased parental genotypes in 0|1 format.</p>
 <p>To illustrate, the low and high density marker files are provided for
 a half-diallel population of 85 clones, derived from 5 parents. The high
 density file contains 10,695 phased SNPs based on array genotyping of
@@ -540,23 +540,50 @@ <h2>Imputation</h2>
 <span id="cb15-5"><a href="#cb15-5" tabindex="-1"></a>low.file <span class="ot">&lt;-</span> <span class="fu">system.file</span>(<span class="st">&quot;vignette_data&quot;</span>, <span class="st">&quot;diallel_DArTag.vcf.gz&quot;</span>, </span>
 <span id="cb15-6"><a href="#cb15-6" tabindex="-1"></a>                        <span class="at">package =</span> <span class="st">&quot;polyBreedR&quot;</span>)</span>
 <span id="cb15-7"><a href="#cb15-7" tabindex="-1"></a></span>
-<span id="cb15-8"><a href="#cb15-8" tabindex="-1"></a><span class="fu">impute_PO</span>(<span class="at">ped.file =</span> ped.file, <span class="at">high.file =</span> high.file,</span>
-<span id="cb15-9"><a href="#cb15-9" tabindex="-1"></a>          <span class="at">low.file =</span> low.file, <span class="at">low.format =</span> <span class="st">&quot;GT&quot;</span>, <span class="at">out.file=</span><span class="st">&quot;imputed.csv.gz&quot;</span>)</span></code></pre></div>
+<span id="cb15-8"><a href="#cb15-8" tabindex="-1"></a><span class="co">#peek at phased parental genotype file</span></span>
+<span id="cb15-9"><a href="#cb15-9" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">read.csv</span>(high.file, <span class="at">check.names=</span>F))</span></code></pre></div>
+<pre><code>##                marker chromosome position     bp W6609-3 W12078-76 W13NYP102-7
+## 1 solcap_snp_c2_36608      chr01        0 508800 0|0|1|1   1|1|0|1     0|1|0|1
+## 2 solcap_snp_c2_36658      chr01        0 527068 0|1|0|0   0|0|1|0     0|0|0|0
+## 3 solcap_snp_c1_10930      chr01        0 566972 0|1|0|0   0|0|1|0     0|0|0|0
+## 4       PotVar0120126      chr01        0 603013 0|1|1|1   1|1|1|1     1|1|1|1
+## 5       PotVar0120085      chr01        0 681193 0|0|0|0   0|0|0|0     0|0|0|0
+## 6       PotVar0120080      chr01        0 681379 1|1|1|1   1|1|1|1     1|1|1|1
+##   W14NYQ4-1 W14NYQ9-2
+## 1   0|1|0|0   0|0|0|0
+## 2   0|0|0|0   1|0|0|1
+## 3   0|0|0|0   1|0|0|1
+## 4   0|1|1|0   1|0|0|1
+## 5   0|0|0|0   0|0|0|0
+## 6   1|1|1|1   1|1|1|1</code></pre>
+<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a><span class="co">#peek at pedigree file</span></span>
+<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a><span class="fu">read.csv</span>(ped.file, <span class="at">check.names=</span>F)[<span class="dv">1</span><span class="sc">:</span><span class="dv">8</span>,]</span></code></pre></div>
+<pre><code>##            id pop    mother      father ploidy
+## 1     W6609-3   0         0           0      4
+## 2   W12078-76   0         0           0      4
+## 3 W13NYP102-7   0         0           0      4
+## 4   W14NYQ4-1   0         0           0      4
+## 5   W14NYQ9-2   0         0           0      4
+## 6   W19012-12   1 W12078-76 W13NYP102-7      4
+## 7   W19012-13   1 W12078-76 W13NYP102-7      4
+## 8   W19012-14   1 W12078-76 W13NYP102-7      4</code></pre>
+<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a><span class="fu">impute_LA</span>(<span class="at">ped.file =</span> ped.file, <span class="at">high.file =</span> high.file,</span>
+<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a>          <span class="at">low.file =</span> low.file, <span class="at">low.format =</span> <span class="st">&quot;GT&quot;</span>, <span class="at">out.file=</span><span class="st">&quot;imputed.csv.gz&quot;</span>)</span></code></pre></div>
 <p>The following code computes the root-mean-squared-error of the
 imputation.</p>
-<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a>array.file <span class="ot">&lt;-</span> <span class="fu">system.file</span>(<span class="st">&quot;vignette_data&quot;</span>, <span class="st">&quot;diallel_array.vcf.gz&quot;</span>, </span>
-<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a>                        <span class="at">package =</span> <span class="st">&quot;polyBreedR&quot;</span>)</span>
-<span id="cb16-3"><a href="#cb16-3" tabindex="-1"></a>array <span class="ot">&lt;-</span> <span class="fu">read.vcfR</span>(array.file,<span class="at">verbose=</span>F)</span>
-<span id="cb16-4"><a href="#cb16-4" tabindex="-1"></a>geno.array <span class="ot">&lt;-</span> <span class="fu">GT2DS</span>(<span class="fu">extract.gt</span>(array))</span>
-<span id="cb16-5"><a href="#cb16-5" tabindex="-1"></a></span>
-<span id="cb16-6"><a href="#cb16-6" tabindex="-1"></a>imputed <span class="ot">&lt;-</span> <span class="fu">read.csv</span>(<span class="st">&quot;imputed.csv.gz&quot;</span>, <span class="at">check.names=</span>F)</span>
-<span id="cb16-7"><a href="#cb16-7" tabindex="-1"></a>geno.imputed <span class="ot">&lt;-</span> <span class="fu">as.matrix</span>(imputed[,<span class="sc">-</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>)]) <span class="co">#remove map</span></span>
-<span id="cb16-8"><a href="#cb16-8" tabindex="-1"></a><span class="fu">rownames</span>(geno.imputed) <span class="ot">&lt;-</span> imputed<span class="sc">$</span>marker</span>
-<span id="cb16-9"><a href="#cb16-9" tabindex="-1"></a>marks <span class="ot">&lt;-</span> <span class="fu">intersect</span>(<span class="fu">rownames</span>(geno.imputed),<span class="fu">rownames</span>(geno.array))</span>
-<span id="cb16-10"><a href="#cb16-10" tabindex="-1"></a>id <span class="ot">&lt;-</span> <span class="fu">intersect</span>(<span class="fu">colnames</span>(geno.imputed),<span class="fu">colnames</span>(geno.array))</span>
-<span id="cb16-11"><a href="#cb16-11" tabindex="-1"></a></span>
-<span id="cb16-12"><a href="#cb16-12" tabindex="-1"></a><span class="co">#RMSE</span></span>
-<span id="cb16-13"><a href="#cb16-13" tabindex="-1"></a><span class="fu">sqrt</span>(<span class="fu">mean</span>((geno.imputed[marks,id] <span class="sc">-</span> geno.array[marks,id])<span class="sc">^</span><span class="dv">2</span>))</span></code></pre></div>
+<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a>array.file <span class="ot">&lt;-</span> <span class="fu">system.file</span>(<span class="st">&quot;vignette_data&quot;</span>, <span class="st">&quot;diallel_array.vcf.gz&quot;</span>, </span>
+<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a>                        <span class="at">package =</span> <span class="st">&quot;polyBreedR&quot;</span>)</span>
+<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a>array <span class="ot">&lt;-</span> <span class="fu">read.vcfR</span>(array.file,<span class="at">verbose=</span>F)</span>
+<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a>geno.array <span class="ot">&lt;-</span> <span class="fu">GT2DS</span>(<span class="fu">extract.gt</span>(array))</span>
+<span id="cb20-5"><a href="#cb20-5" tabindex="-1"></a></span>
+<span id="cb20-6"><a href="#cb20-6" tabindex="-1"></a>imputed <span class="ot">&lt;-</span> <span class="fu">read.csv</span>(<span class="st">&quot;imputed.csv.gz&quot;</span>, <span class="at">check.names=</span>F)</span>
+<span id="cb20-7"><a href="#cb20-7" tabindex="-1"></a>geno.imputed <span class="ot">&lt;-</span> <span class="fu">as.matrix</span>(imputed[,<span class="sc">-</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>)]) <span class="co">#remove map</span></span>
+<span id="cb20-8"><a href="#cb20-8" tabindex="-1"></a><span class="fu">rownames</span>(geno.imputed) <span class="ot">&lt;-</span> imputed<span class="sc">$</span>marker</span>
+<span id="cb20-9"><a href="#cb20-9" tabindex="-1"></a>marks <span class="ot">&lt;-</span> <span class="fu">intersect</span>(<span class="fu">rownames</span>(geno.imputed),<span class="fu">rownames</span>(geno.array))</span>
+<span id="cb20-10"><a href="#cb20-10" tabindex="-1"></a>id <span class="ot">&lt;-</span> <span class="fu">intersect</span>(<span class="fu">colnames</span>(geno.imputed),<span class="fu">colnames</span>(geno.array))</span>
+<span id="cb20-11"><a href="#cb20-11" tabindex="-1"></a></span>
+<span id="cb20-12"><a href="#cb20-12" tabindex="-1"></a><span class="co">#RMSE</span></span>
+<span id="cb20-13"><a href="#cb20-13" tabindex="-1"></a><span class="fu">sqrt</span>(<span class="fu">mean</span>((geno.imputed[marks,id] <span class="sc">-</span> geno.array[marks,id])<span class="sc">^</span><span class="dv">2</span>))</span></code></pre></div>
 <pre><code>## [1] 0.1340864</code></pre>
 </div>