diff --git a/R/MyFun_BIC_Meng.R b/R/MyFun_BIC_Meng.R
index 084b51b..aae20da 100644
--- a/R/MyFun_BIC_Meng.R
+++ b/R/MyFun_BIC_Meng.R
@@ -43,9 +43,9 @@
#' function for BIC calculation
#'
-#' @param y describe documentation
-#' @param PC describe documentation
-#' @param K describe documentation
+#' @param y length N vector
+#' @param PC matrix of principal components with N rows and P columns
+#' @param K kinship matrix with N rows and N columns
#'
#' @import rrBLUP
#' @importFrom MASS ginv
diff --git a/R/mod_help.R b/R/mod_help.R
index 19960f0..52b7206 100644
--- a/R/mod_help.R
+++ b/R/mod_help.R
@@ -14,16 +14,16 @@ mod_help_ui <- function(id){
column(width=12),
column(width=12,
box(title="DArT Report2VCF", id = "DArT_Report2VCF_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
- "**Draft**This tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**",
+ "This tab converts the processed genotype and counts files from DArT into a VCF file (v4.3). This file can then be used as the genotype input for the analyses within BIGapp or used with other genomics applications.",
br(), br(),
bs4Dash::tabsetPanel(id = "DArT_Report2VCF_tabset",
- tabPanel("Parameters description", value = "DArT_Report2VCF_par",
+ tabPanel("Parameters description", value = "DArT_Report2VCF_par", br(),
includeMarkdown(system.file("help_files/DArT_Report2VCF_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "DArT_Report2VCF_results",
+ tabPanel("Results description", value = "DArT_Report2VCF_results", br(),
includeMarkdown(system.file("help_files/DArT_Report2VCF_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "DArT_Report2VCF_cite",
+ tabPanel("How to cite", value = "DArT_Report2VCF_cite", br(),
includeMarkdown(system.file("help_files/DArT_Report2VCF_cite.Rmd", package = "BIGapp"))
))
),
@@ -42,52 +42,58 @@ mod_help_ui <- function(id){
))
),
box(title="VCF Filtering", id = "VCF_Filtering_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+ "Filter SNPs and samples in a VCF file based on missing data, minor allele frequency, read depth, and Updog dosage calling metrics",
+ br(), br(),
bs4Dash::tabsetPanel(id = "VCF_Filtering_tabset",
- tabPanel("Parameters description", value = "VCF_Filtering_par",
+ tabPanel("Parameters description", value = "VCF_Filtering_par", br(),
includeMarkdown(system.file("help_files/VCF_Filtering_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "VCF_Filtering_results",
- includeMarkdown(system.file("help_files/VCF_Filtering_par.Rmd", package = "BIGapp"))
+ tabPanel("Results description", value = "VCF_Filtering_results", br(),
+ includeMarkdown(system.file("help_files/VCF_Filtering_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "Updog_Dosage_Calling_cite",
+ tabPanel("How to cite", value = "Updog_Dosage_Calling_cite", br(),
includeMarkdown(system.file("help_files/VCF_Filtering_cite.Rmd", package = "BIGapp"))
))
),
box(title="PCA", id = "PCA_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+ "This tab is used to perform a PCA to visualize the genomic relationships between samples (population structure)",
+ br(), br(),
bs4Dash::tabsetPanel(id = "PCA_tabset",
- tabPanel("Parameters description", value = "PCA_par",
+ tabPanel("Parameters description", value = "PCA_par", br(),
includeMarkdown(system.file("help_files/PCA_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "PCA_results",
+ tabPanel("Results description", value = "PCA_results", br(),
includeMarkdown(system.file("help_files/PCA_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "PCA_cite",
+ tabPanel("How to cite", value = "PCA_cite", br(),
includeMarkdown(system.file("help_files/PCA_cite.Rmd", package = "BIGapp"))
))
),
box(title="DAPC", id = "DAPC_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+ "This tab group estimates the number of distinct groups that are present within the genomic dataset, and classifies each sample into a distinct group.",
+ br(), br(),
bs4Dash::tabsetPanel(id = "DAPC_tabset",
- tabPanel("Parameters description", value = "DAPC_par",
+ tabPanel("Parameters description", value = "DAPC_par", br(),
includeMarkdown(system.file("help_files/DAPC_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "DAPC_results",
+ tabPanel("Results description", value = "DAPC_results", br(),
includeMarkdown(system.file("help_files/DAPC_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "DAPC_cite",
+ tabPanel("How to cite", value = "DAPC_cite", br(),
includeMarkdown(system.file("help_files/DAPC_cite.Rmd", package = "BIGapp"))
))
),
box(title="Genomic Diversity", id = "Genomic_Diversity_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
- "**Draft**This tab is dedicated to analyzing genomic diversity within the population. It calculates various diversity metrics such as heterozygosity and minor allele frequency (MAF). The app includes functionalities to visualize these metrics through histograms and other plots. Users can download the calculated diversity metrics as CSV files. This tab helps in understanding the genetic variability and distribution of alleles within the population.",
+ "This tab estimates summary metrics for the samples and SNPs within a genomic dataset and produces figures and tables.",
br(), br(),
bs4Dash::tabsetPanel(id = "Genomic_Diversity_tabset",
- tabPanel("Parameters description", value = "Genomic_Diversity_par",
+ tabPanel("Parameters description", value = "Genomic_Diversity_par", br(),
includeMarkdown(system.file("help_files/Genomic_Diversity_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "Genomic_Diversity_results",
+ tabPanel("Results description", value = "Genomic_Diversity_results", br(),
includeMarkdown(system.file("help_files/Genomic_Diversity_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "Genomic_Diversity_cite",
+ tabPanel("How to cite", value = "Genomic_Diversity_cite", br(),
includeMarkdown(system.file("help_files/Genomic_Diversity_cite.Rmd", package = "BIGapp"))
))
),
@@ -106,26 +112,30 @@ mod_help_ui <- function(id){
))
),
box(title="Predictive Ability", id = "Predictive_Ability_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+ "This tab provides the predictive ability of a GBLUP model for each trait across all samples within a genomic dataset",
+ br(), br(),
bs4Dash::tabsetPanel(id = "Predictive_Ability_tabset",
- tabPanel("Parameters description", value = "Predictive_Ability_par",
+ tabPanel("Parameters description", value = "Predictive_Ability_par", br(),
includeMarkdown(system.file("help_files/Predictive_Ability_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "Predictive_Ability_results",
+ tabPanel("Results description", value = "Predictive_Ability_results", br(),
includeMarkdown(system.file("help_files/Predictive_Ability_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "Predictive_Ability_cite",
+ tabPanel("How to cite", value = "Predictive_Ability_cite", br(),
includeMarkdown(system.file("help_files/Predictive_Ability_cite.Rmd", package = "BIGapp"))
))
),
box(title="Genomic Prediction", id = "Genomic_Prediction_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+ "This tab estimates the trait and estimated-breeding-values (EBVs) for either all individuals in a genomic dataset, or by training the model with one genomic dataset to predict the values in another.",
+ br(), br(),
bs4Dash::tabsetPanel(id = "Genomic_Prediction_tabset",
- tabPanel("Parameters description", value = "Genomic_Prediction_par",
+ tabPanel("Parameters description", value = "Genomic_Prediction_par", br(),
includeMarkdown(system.file("help_files/Genomic_Prediction_par.Rmd", package = "BIGapp"))
),
- tabPanel("Results description", value = "Genomic_Prediction_results",
+ tabPanel("Results description", value = "Genomic_Prediction_results", br(),
includeMarkdown(system.file("help_files/Genomic_Prediction_res.Rmd", package = "BIGapp"))
),
- tabPanel("How to cite", value = "Genomic_Prediction_cite",
+ tabPanel("How to cite", value = "Genomic_Prediction_cite", br(),
includeMarkdown(system.file("help_files/Genomic_Prediction_cite.Rmd", package = "BIGapp"))
))
),
diff --git a/R/utils.R b/R/utils.R
index 975e339..9e78b21 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -4,7 +4,11 @@ get_counts <- function(madc_file, output_name) {
# Note: This assumes that the first 7 rows are not useful here like in the Strawberry DSt23-8501_MADC file
# Read the madc file
- madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+ madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE, header = FALSE)
+ header <- grep("AlleleID", madc_df[,1])
+ if(header > 1) madc_df <- madc_df[-c(1:(grep("AlleleID", madc_df[,1]))-1),]
+ colnames(madc_df) <- madc_df[1,]
+ madc_df <- madc_df[-1,]
# Retain only the Ref and Alt haplotypes
filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
@@ -21,23 +25,23 @@ get_counts <- function(madc_file, output_name) {
#Add functionality here to stop the script if indentical() is False
get_matrices <- function(result_df) {
#This function takes the dataframe of ref and alt counts for each sample, and converts them to ref, alt, and size(total count) matrices for Updog
-
+
update_df <- result_df
-
+
# Filter rows where 'AlleleID' ends with 'Ref'
ref_df <- subset(update_df, grepl("Ref$", AlleleID))
-
+
# Filter rows where 'AlleleID' ends with 'Alt'
alt_df <- subset(update_df, grepl("Alt$", AlleleID))
-
+
#Ensure that each has the same SNPs and that they are in the same order
same <- identical(alt_df$CloneID,ref_df$CloneID)
-
+
###Convert the ref and alt counts into matrices with the CloneID as the index
#Set SNP names as index
row.names(ref_df) <- ref_df$CloneID
row.names(alt_df) <- alt_df$CloneID
-
+
#Retain only the rows in common if they are not identical and provide warning
if (same == FALSE) {
warning("Mismatch between Ref and Alt Markers. MADC likely altered. Markers without a Ref or Alt match removed.")
@@ -47,26 +51,30 @@ get_matrices <- function(result_df) {
ref_df <- ref_df[common_ids, ]
alt_df <- alt_df[common_ids, ]
}
-
+
#Remove unwanted columns and convert to matrix
- ref_matrix <- as.matrix(ref_df[, -c(1:16)])
- alt_matrix <- as.matrix(alt_df[, -c(1:16)])
-
+ rm.col <- c("AlleleID", "CloneID", "AlleleSequence", "ClusterConsensusSequence",
+ "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp",
+ "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp")
+
+ ref_matrix <- as.matrix(ref_df[, -which(colnames(ref_df) %in% rm.col)])
+ alt_matrix <- as.matrix(alt_df[, -which(colnames(alt_df) %in% rm.col)])
+
#Convert elements to numeric
class(ref_matrix) <- "numeric"
class(alt_matrix) <- "numeric"
-
+
#Make the size matrix by combining the two matrices
size_matrix <- (ref_matrix + alt_matrix)
-
+
#Count the number of cells with 0 count to estimate missing data
# Count the number of cells with the value 0
count_zeros <- sum(size_matrix == 0)
-
+
# Print the result
ratio_missing_data <- count_zeros / length(size_matrix)
cat("Ratio of missing data =", ratio_missing_data, "\n")
-
+
# Return the ref and alt matrices as a list
matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix)
return(matrices_list)
diff --git a/inst/help_files/DAPC_cite.Rmd b/inst/help_files/DAPC_cite.Rmd
index 1458b7c..c590227 100644
--- a/inst/help_files/DAPC_cite.Rmd
+++ b/inst/help_files/DAPC_cite.Rmd
@@ -4,3 +4,16 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+* **BIGr**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
+
+* **adegenet**
+
+Jombart, T. (2008). adegenet: a R package for the multivariate analysis of genetic markers. Bioinformatics, 24(11), 1403–1405.
diff --git a/inst/help_files/DAPC_par.Rmd b/inst/help_files/DAPC_par.Rmd
index b566e42..ce4577a 100644
--- a/inst/help_files/DAPC_par.Rmd
+++ b/inst/help_files/DAPC_par.Rmd
@@ -4,3 +4,30 @@ output: html_document
date: "2024-08-29"
---
+* **VCF file**
+Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
+
+* **Passport file**
+A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. Example:
+
+
+
+|Sample_ID | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species |
+|:---------:|:------------:|:-----------:|:------------:|:-----------:|:-------:|
+|Sample_1 | 5.1| 3.5| 1.4| 0.2|versicolor |
+|Sample_2 | 4.9| 3.0| 1.4| 0.2|setosa |
+|Sample_3 | 4.7| 3.2| 1.3| 0.2|setosa |
+|Sample_4 | 4.6| 3.1| 1.5| 0.2|setosa |
+|Sample_5 | 5.0| 3.6| 1.4| 0.2|setosa |
+|Sample_6 | 5.4| 3.9| 1.7| 0.4|setosa |
+
+
+
+
+
+* **Species Ploidy**
+Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
+
+* **Maximum K**
+
+* **Number of Clusters (K)**
diff --git a/inst/help_files/DAPC_res.Rmd b/inst/help_files/DAPC_res.Rmd
index 0932956..99bce83 100644
--- a/inst/help_files/DAPC_res.Rmd
+++ b/inst/help_files/DAPC_res.Rmd
@@ -4,3 +4,22 @@ output: html_document
date: "2024-08-29"
---
+* **Cluster assignments table**
+
+
+
+|K | BIC |
+|:---------:|:------------:|
+|Sample_1 | 5.1|
+|Sample_2 | 4.9|
+|Sample_3 | 4.7|
+|Sample_4 | 4.6|
+|Sample_5 | 5.0|
+|Sample_6 | 5.4|
+
+
+
+
+* **BIC plot**
+
+* **DAPC plot**
diff --git a/inst/help_files/DArT_Report2VCF_cite.Rmd b/inst/help_files/DArT_Report2VCF_cite.Rmd
index d3be88a..7ff8c10 100644
--- a/inst/help_files/DArT_Report2VCF_cite.Rmd
+++ b/inst/help_files/DArT_Report2VCF_cite.Rmd
@@ -4,3 +4,13 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+
+* **BIGr**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
diff --git a/inst/help_files/DArT_Report2VCF_par.Rmd b/inst/help_files/DArT_Report2VCF_par.Rmd
index ccf8d3c..4e6447f 100644
--- a/inst/help_files/DArT_Report2VCF_par.Rmd
+++ b/inst/help_files/DArT_Report2VCF_par.Rmd
@@ -4,3 +4,14 @@ output: html_document
date: "2024-08-29"
---
+* **DArTag Dosage Report**
+
+The DArT Dosage Report is a tab-separated file provided by DArT from a sequencing project. It contains the genotype information for each of the target markers for all samples in the sequencing project. The markers are in rows and the samples are in the columns. There are several summary metric columns that precede the sample genotype columns. The genotype calls are the count of the reference allele, where 0 is equal to homozygous alternate.
+
+* **DArTag Counts File**
+
+The DArT counts file is a tab-separated file provided by DArT from a sequencing project. It contains the read count information for the reference and alternate allele at each target marker. The marker information is in the rows and the samples are in the columns. There are several information columns that precede the sample columns. There are two versions of this file. The “collapsed counts” version contains the target markers and includes their multiallelic read counts in their total counts. The “Counts” file contains the read counts for the target markers only (excluding the multiallelic read count information).
+
+* **Species Ploidy**
+
+Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
diff --git a/inst/help_files/DArT_Report2VCF_res.Rmd b/inst/help_files/DArT_Report2VCF_res.Rmd
index 8f71982..0c70d3f 100644
--- a/inst/help_files/DArT_Report2VCF_res.Rmd
+++ b/inst/help_files/DArT_Report2VCF_res.Rmd
@@ -4,3 +4,6 @@ output: html_document
date: "2024-08-29"
---
+* **VCF file (v4.3)**
+
+Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is converted from the numeric dosage call information. Also included are the read counts for each marker/sample and the numeric dosage call (UD) data. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
diff --git a/inst/help_files/GWAS_par.Rmd b/inst/help_files/GWAS_par.Rmd
index 8a8aaff..7082696 100644
--- a/inst/help_files/GWAS_par.Rmd
+++ b/inst/help_files/GWAS_par.Rmd
@@ -35,3 +35,7 @@ date: "2024-08-29"
* **Number of CPU Cores**: Defines the number of CPU cores to be used for the GWAS analysis, enabling faster processing by splitting the workload across multiple cores.
+BIGapp uses GWASpoly random polygenic effect to control for population structure. By default, all markers are used to calculate a single covariance matrix (parameter LOCO = FALSE in GWASpoly set.k function).
+
+BIGapp tests the inclusion of principal components as fixed effects (P + K model). For that, the BIC is calculated for models including 1 to 10 of the first principal components and the kinship matrix. In this step, the mixed model for GWAS is fitted using the mixed.solve function of rrBLUP. Then, using the estimated parameters, the log-likelihood is calculated using equation (2) in Kang et al., 2008. Finally, BIC is calculated using the standard formula (BIC = K * log(N) - 2 * LL), where, in this formula, K is the number of estimated model parameters (not the kinship matrix), N is the number of observations, and LL is the log-likelihood.
+
diff --git a/inst/help_files/GWAS_res.Rmd b/inst/help_files/GWAS_res.Rmd
index 80003a4..8d11df5 100644
--- a/inst/help_files/GWAS_res.Rmd
+++ b/inst/help_files/GWAS_res.Rmd
@@ -4,16 +4,23 @@ output: html_document
date: "2024-08-29"
---
-* BIC plot
-
-* BIC Table
-
-* LD plot
-
-* Manhattan Plot
-
-* QQ Plot
-
-* QTL - significant markers
-
-* Multiple QTL model results table
+* **BIC plot**
+Plot of the BIC of the tested models including PCs and kinship. The model using the number of PCs that resulted in the lowest BIC is the one used by BIGapp.
+
+* **BIC Table**
+Table with the BIC for the tested models including PCs and kinship. The model using the number of PCs that resulted in the lowest BIC is the one used by BIGapp.
+
+* **LD plot**
+Plot of LD versus distance. A monotone decreasing, convex spline is fit using the R package scam.
+
+* **Manhattan Plot**
+From GWASpoly documentation: Results for the ref and alt versions of the dominance model are combined. If data is the output from set.threshold, then the threshold is displayed as a horizontal dashed line when models contains a single model. Because the threshold varies between models, it is not drawn when multiple models are included. Although the ref and alt versions of each dominance model are slightly different (as seen with qq.plot), they are treated as a single model for the Manhattan plot, and the average threshold is shown.
+
+* **QQ Plot**
+From GWASpoly documentation: One of the standard diagnostics in GWAS is to check the inflation of the -log10(p) values (aka “scores”). This can be done using a quantile-quantile plot of the observed vs. expected values under the null hypothesis, which follows a uniform distribution and is shown with a dotted line.
+
+* **QTL - significant markers**
+Describes significant markers after screening with GWASpoly function
+
+* **Multiple QTL model results table**
+Results after fit.QTL function
diff --git a/inst/help_files/Genomic_Diversity_cite.Rmd b/inst/help_files/Genomic_Diversity_cite.Rmd
index b85b481..1870dbc 100644
--- a/inst/help_files/Genomic_Diversity_cite.Rmd
+++ b/inst/help_files/Genomic_Diversity_cite.Rmd
@@ -4,3 +4,12 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+* **BIGr**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
diff --git a/inst/help_files/Genomic_Diversity_par.Rmd b/inst/help_files/Genomic_Diversity_par.Rmd
index b236539..c4581aa 100644
--- a/inst/help_files/Genomic_Diversity_par.Rmd
+++ b/inst/help_files/Genomic_Diversity_par.Rmd
@@ -4,3 +4,6 @@ output: html_document
date: "2024-08-29"
---
+* **VCF file**: Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
+
+* **Species Ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
diff --git a/inst/help_files/Genomic_Diversity_res.Rmd b/inst/help_files/Genomic_Diversity_res.Rmd
index 752af92..2046b00 100644
--- a/inst/help_files/Genomic_Diversity_res.Rmd
+++ b/inst/help_files/Genomic_Diversity_res.Rmd
@@ -4,3 +4,14 @@ output: html_document
date: "2024-08-29"
---
+* **MAF plot**
+
+* **OHet plot**
+
+* **Dosage Ratio plot**
+
+* **Marker distribution plot**
+
+* **MAF table**
+
+* **OHet table**
diff --git a/inst/help_files/Genomic_Prediction_cite.Rmd b/inst/help_files/Genomic_Prediction_cite.Rmd
index 55097f3..1e59726 100644
--- a/inst/help_files/Genomic_Prediction_cite.Rmd
+++ b/inst/help_files/Genomic_Prediction_cite.Rmd
@@ -4,3 +4,18 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
+
+* **rrBLUP**
+
+Endelman JB (2011). “Ridge regression and other kernels for genomic selection with R package rrBLUP.” Plant Genome, 4, 250-255.
+
+* **AGHmatrix**
+
+R Amadeu R, Franco Garcia A, Munoz P, V Ferrao L (2023). “AGHmatrix: genetic relationship matrices in R.” Bioinformatics, 39(7).
diff --git a/inst/help_files/Genomic_Prediction_par.Rmd b/inst/help_files/Genomic_Prediction_par.Rmd
index b4b18cb..b411335 100644
--- a/inst/help_files/Genomic_Prediction_par.Rmd
+++ b/inst/help_files/Genomic_Prediction_par.Rmd
@@ -4,3 +4,32 @@ output: html_document
date: "2024-08-29"
---
+This tab estimates the trait and estimated-breeding-values (EBVs) for either all individuals in a genomic dataset, or by training the model with one genomic dataset to predict the values in another. The trait and EBV information can then be used to make selections for the next breeding cycle.
+
+* **VCF file**: Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
+
+* **Passport file**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. Example:
+
+
+
+|Sample_ID | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species |
+|:---------:|:------------:|:-----------:|:------------:|:-----------:|:-------:|
+|Sample_1 | 5.1| 3.5| 1.4| 0.2|versicolor |
+|Sample_2 | 4.9| 3.0| 1.4| 0.2|setosa |
+|Sample_3 | 4.7| 3.2| 1.3| 0.2|setosa |
+|Sample_4 | 4.6| 3.1| 1.5| 0.2|setosa |
+|Sample_5 | 5.0| 3.6| 1.4| 0.2|setosa |
+|Sample_6 | 5.4| 3.9| 1.7| 0.4|setosa |
+
+
+
+
+
+* **Prediction VCF file**
+
+* **Species Ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
+
+* **Matrix type**: Specifies the matrix type to use for the GBLUP prediction model. The choices are:
+ * **Gmatrix**: An additive relationship matrix between all samples is created using the genotype information from the VCF file
+ * **Amatrix**: An additive relationship matrix between all samples is created using a user supplied pedigree file
+ * **Hmatrix**: An additive relationship matrix between all samples is created by using the information from both the VCF file and the pedigree file
diff --git a/inst/help_files/Genomic_Prediction_res.Rmd b/inst/help_files/Genomic_Prediction_res.Rmd
index c48e1ed..d467256 100644
--- a/inst/help_files/Genomic_Prediction_res.Rmd
+++ b/inst/help_files/Genomic_Prediction_res.Rmd
@@ -4,3 +4,27 @@ output: html_document
date: "2024-08-29"
---
+* **Predicted Trait table**: The trait values are predicted for all samples in either the input VCF file (if only one is provided), or for all of the samples in the predictive VCF file. It is in the format of sample IDs in the first column, and each subsequent column being the information for the traits selected by the user.
+
+| Sample ID | Sepal Length | Sepal Width |
+|------------|--------------|-------------|
+| Sample_1 | 4.8 | 3.5 |
+| Sample_2 | 4.9 | 3.0 |
+| Sample_3 | 4.7 | 3.2 |
+| Sample_4 | 4.6 | 3.1 |
+| Sample_5 | 5.0 | 3.6 |
+| Sample_6 | 5.4 | 3.9 |
+
+
+
+
+* **EBV table**: Estimated Breeding Values (EBVs) from genomic prediction are statistical estimates of an individual's genetic potential for a specific trait, calculated by combining genomic information with phenotypic and pedigree data. These values help predict an organism's ability to pass on desirable traits to its offspring, allowing for more accurate selection in breeding programs. The EBVs are predicted for all samples in either the input VCF file (if only one is provided), or for all of the samples in the predictive VCF file. It is in the format of sample IDs in the first column, and each subsequent column being the information for the traits selected by the user.
+
+| Sample ID | Sepal Length | Sepal Width |
+|------------|--------------|-------------|
+| Sample_1 | 0.32 | 0.48 |
+| Sample_2 | -0.12 | -0.28 |
+| Sample_3 | 0.14 | 0.31 |
+| Sample_4 | 1.21 | 1.03 |
+| Sample_5 | 0.43 | 0.33 |
+| Sample_6 | 0.03 | 0.91 |
diff --git a/inst/help_files/PCA_cite.Rmd b/inst/help_files/PCA_cite.Rmd
index 45d5f7f..f65632a 100644
--- a/inst/help_files/PCA_cite.Rmd
+++ b/inst/help_files/PCA_cite.Rmd
@@ -4,3 +4,16 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+* **BIGr**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
+
+* **AGHmatrix**
+
+R Amadeu R, Franco Garcia A, Munoz P, V Ferrao L (2023). “AGHmatrix: genetic relationship matrices in R.” Bioinformatics, 39(7).
diff --git a/inst/help_files/PCA_par.Rmd b/inst/help_files/PCA_par.Rmd
index 61a0f27..61f859d 100644
--- a/inst/help_files/PCA_par.Rmd
+++ b/inst/help_files/PCA_par.Rmd
@@ -4,3 +4,39 @@ output: html_document
date: "2024-08-29"
---
+
+* **VCF file**
+Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
+
+* **Passport file**
+A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. Example:
+
+
+
+|Sample_ID | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species |
+|:---------:|:------------:|:-----------:|:------------:|:-----------:|:-------:|
+|Sample_1 | 5.1| 3.5| 1.4| 0.2|versicolor |
+|Sample_2 | 4.9| 3.0| 1.4| 0.2|setosa |
+|Sample_3 | 4.7| 3.2| 1.3| 0.2|setosa |
+|Sample_4 | 4.6| 3.1| 1.5| 0.2|setosa |
+|Sample_5 | 5.0| 3.6| 1.4| 0.2|setosa |
+|Sample_6 | 5.4| 3.9| 1.7| 0.4|setosa |
+
+
+
+
+
+* **Species Ploidy**
+Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
+
+* **Variable to color**
+Specifies which column to use in the passport data file to color the samples in the PCA plots by.
+
+* **Category to color**
+Specifies which specific item(s) within the selected column to color the samples in the PCA plots. All other samples will be a shade of grey.
+
+* **Color palette**
+Select which color palette to use for coloring the sample points. The different color palettes are separated by those that are color-blind friendly and those that are not.
+
+* **Axes to visualize**
+Choose which axes to display on the 2D PCA plot.
diff --git a/inst/help_files/PCA_res.Rmd b/inst/help_files/PCA_res.Rmd
index 1f2f534..07c678d 100644
--- a/inst/help_files/PCA_res.Rmd
+++ b/inst/help_files/PCA_res.Rmd
@@ -4,3 +4,8 @@ output: html_document
date: "2024-08-29"
---
+* **3D PCA plot**
+
+* **2D PCA plot**
+
+* **Scree plot**
diff --git a/inst/help_files/Predictive_Ability_cite.Rmd b/inst/help_files/Predictive_Ability_cite.Rmd
index 92d6654..2bdeda1 100644
--- a/inst/help_files/Predictive_Ability_cite.Rmd
+++ b/inst/help_files/Predictive_Ability_cite.Rmd
@@ -4,3 +4,18 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
+
+* **rrBLUP**
+
+Endelman JB (2011). “Ridge regression and other kernels for genomic selection with R package rrBLUP.” Plant Genome, 4, 250-255.
+
+* **AGHmatrix**
+
+R Amadeu R, Franco Garcia A, Munoz P, V Ferrao L (2023). “AGHmatrix: genetic relationship matrices in R.” Bioinformatics, 39(7).
diff --git a/inst/help_files/Predictive_Ability_par.Rmd b/inst/help_files/Predictive_Ability_par.Rmd
index c3eab0f..ce189f2 100644
--- a/inst/help_files/Predictive_Ability_par.Rmd
+++ b/inst/help_files/Predictive_Ability_par.Rmd
@@ -4,3 +4,21 @@ output: html_document
date: "2024-08-29"
---
+This tab provides the predictive ability of a GBLUP model for each trait across all samples within a genomic dataset. The model is based on a 5-fold cross-validation, where the samples are evenly divided into 5 groups, and 4 of the groups are used to train the GBLUP model, while the trait is predicted for the 5th group. This continues until each group has had its trait information predicted, and the predictive ability is the Pearson correlation between the known trait values and the predicted values. Each 5-fold cross-validation can be performed multiple times (iterations) to get a more confident estimate of the predictive ability of the model. This tab supports the use of genomic and pedigree information.
+
+
+* **VCF file**: Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
+
+* **Passport file**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. Example:
+
+* **Species ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
+
+* **Iterations**: This is the number of runs of five-fold cross-validation that you would like to perform to estimate predictive ability. The accuracy results are averaged over all iterations. The more iterations that are performed, the higher the confidence in the final predictive ability estimates.
+
+* **Matrix type**: Specifies the matrix type to use for the GBLUP prediction model. The choices are:
+
+ * **Gmatrix**: An additive relationship matrix between all samples is created using the genotype information from the VCF file
+
+ * **Amatrix**: An additive relationship matrix between all samples is created using a user supplied pedigree file
+
+ * **Hmatrix**: An additive relationship matrix between all samples is created by using the information from both the VCF file and the pedigree file
diff --git a/inst/help_files/Predictive_Ability_res.Rmd b/inst/help_files/Predictive_Ability_res.Rmd
index 14e1952..bfb1db6 100644
--- a/inst/help_files/Predictive_Ability_res.Rmd
+++ b/inst/help_files/Predictive_Ability_res.Rmd
@@ -4,3 +4,19 @@ output: html_document
date: "2024-08-29"
---
+* **Violin plot**
+
+* **Box plot**
+
+* **Predictive ability table**
+
+
+|Iter | Sepal.Length | Sepal.Width |
+|:---------:|:------------:|:-----------:|
+|1 | 0.728| 0.571|
+|2 | 0.721| 0.568|
+|3 | 0.724| 0.543|
+
+
+
+
diff --git a/inst/help_files/Updog_Dosage_Calling_par.Rmd b/inst/help_files/Updog_Dosage_Calling_par.Rmd
index 40305c8..f80e6ec 100644
--- a/inst/help_files/Updog_Dosage_Calling_par.Rmd
+++ b/inst/help_files/Updog_Dosage_Calling_par.Rmd
@@ -11,7 +11,8 @@ date: "2024-08-29"
* **VCF file**:
Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
-* **Passport File**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID.
+* **Passport File**:
+A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID.
* **Select Category Subset**: After loading the passport file, this option will be available. You can select the column name to base the subsetting for the samples
@@ -45,6 +46,7 @@ The following information is from the Updog manual. Possible values of the genot
`f1` This prior assumes the individuals are all full-siblings resulting from one generation of a bi-parental cross. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. `f1pp` This prior allows for double reduction and preferential pairing in an F1 population of tretraploids. `s1pp` This prior allows for double reduction and preferential pairing in an S1 population of tretraploids. `flex` Generically any categorical distribution. Theoretically, this works well if you have a lot of individuals. In practice, it seems to be much less robust to violations in modeling assumptions.`uniform` A discrete uniform distribution. This should never be used in practice."
* **Parent**: If “s1” or “s1pp” model is selected you must define which sample is correspondent to the parent including the sample ID in this box. The input sample ID must match to the sample ID in the input genotype file
+
* **Parent1 and Parent2**: if “f1” or “f1pp” model is selected you must define which samples correspondent to the parent1 and parent2 including the samples ID in the respective boxes. The input sample ID must match to the sample ID in the input genotype file
* **Number of CPU Cores**: Number of cores to be used in the multidog function paralelization
diff --git a/inst/help_files/VCF_Filtering_cite.Rmd b/inst/help_files/VCF_Filtering_cite.Rmd
index 0699277..1958c9d 100644
--- a/inst/help_files/VCF_Filtering_cite.Rmd
+++ b/inst/help_files/VCF_Filtering_cite.Rmd
@@ -4,3 +4,16 @@ output: html_document
date: "2024-08-29"
---
+* **BIGapp**
+
+* **BIGr**
+
+* **Updog** (if filtering parameters used)
+
+Gerard, D., Ferrão, L. F. V., Garcia, A. A. F., & Stephens, M. (2018). Genotyping Polyploids from Messy Sequencing Data. Genetics, 210(3), 789-807. doi: 10.1534/genetics.118.301468.
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
diff --git a/inst/help_files/VCF_Filtering_par.Rmd b/inst/help_files/VCF_Filtering_par.Rmd
index 2abbd30..d845367 100644
--- a/inst/help_files/VCF_Filtering_par.Rmd
+++ b/inst/help_files/VCF_Filtering_par.Rmd
@@ -3,3 +3,23 @@ title: "VCF_Filtering_par"
output: html_document
date: "2024-08-29"
---
+
+
+* **VCF file**: Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf.
+
+* **Species Ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids.
+
+* **Minor-Allele-Frequency**: The frequency of the minor allele within the population for each SNP. SNPs with a very low MAF (MAF < 0.01) are typically removed since they could be due to sequencing errors and could bias the GWAS and PCA analyses.
+
+* **Read Depth per marker/sample**: This is the read depth for each marker at each sample. A low read depth suggests that a given marker at a sample had poor genotyping performance, and should be assigned as missing. Typical read depth thresholds are set so that genotypes with a read depth per marker/sample of less than 10 are assigned as missing data.
+
+* **SNP missing data**: The ratio of missing data across all samples for each SNP. Low missing data (at a minimum, <= 50%) is necessary to avoid biasing the downstream results and to have confidence in them.
+
+* **Sample missing data**: The ratio of missing data across all SNPs for each sample. Low missing data (at a minimum, <= 50%) is necessary to avoid biasing the downstream results and to have confidence in them.
+
+* **Updog parameters**:
+
+ * **OD**: The estimated overdispersion parameter of the SNP from updog
+ * **Bias**: The estimated allele bias of the SNP from updog
+ * **Prop_mis**: The estimated proportion of individuals misclassified in the SNP from updog
+ * **Maxpostprob**: Maximum posterior probability for that dosage call from updog
diff --git a/inst/help_files/VCF_Filtering_res.Rmd b/inst/help_files/VCF_Filtering_res.Rmd
index e3430b6..8243c09 100644
--- a/inst/help_files/VCF_Filtering_res.Rmd
+++ b/inst/help_files/VCF_Filtering_res.Rmd
@@ -4,3 +4,6 @@ output: html_document
date: "2024-08-29"
---
+* **VCF file (v4.3)**
+
+Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.3.pdf.
diff --git a/tests/testthat/test-DosageCall.R b/tests/testthat/test-DosageCall.R
index b49b881..a3ac1d0 100644
--- a/tests/testthat/test-DosageCall.R
+++ b/tests/testthat/test-DosageCall.R
@@ -6,6 +6,7 @@ context("Dosage Calling")
test_that("Dosage Calling from MADC file",{
madc_file <- system.file("iris_DArT_MADC.csv", package="BIGapp")
+
output_name <- "output"
ploidy <- 2
cores <- 2