Breeding-Insight · alex-sandercock · Dec 20, 2024 · Sep 26, 2024 · Dec 6, 2024 · Dec 8, 2024
diff --git a/R/MyFun_BIC_Meng.R b/R/MyFun_BIC_Meng.R
@@ -43,9 +43,9 @@
 
 #' function for BIC calculation
 #'
-#' @param y describe documentation
-#' @param PC describe documentation
-#' @param K describe documentation
+#' @param y length N vector
+#' @param PC matrix of principal components with N rows and P columns
+#' @param K kinship matrix with N rows and N columns
 #'
 #' @import rrBLUP
 #' @importFrom MASS ginv

diff --git a/R/mod_help.R b/R/mod_help.R
@@ -14,16 +14,16 @@ mod_help_ui <- function(id){
       column(width=12),
       column(width=12,
              box(title="DArT Report2VCF", id = "DArT_Report2VCF_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
-                 "**Draft**This tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**",
+                 "This tab converts the processed genotype and counts files from DArT into a VCF file (v4.3). This file can then be used as the genotype input for the analyses within BIGapp or used with other genomics applications.",
                  br(), br(),
                  bs4Dash::tabsetPanel(id = "DArT_Report2VCF_tabset",
-                                      tabPanel("Parameters description", value = "DArT_Report2VCF_par",
+                                      tabPanel("Parameters description", value = "DArT_Report2VCF_par", br(),
                                                includeMarkdown(system.file("help_files/DArT_Report2VCF_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "DArT_Report2VCF_results",
+                                      tabPanel("Results description", value = "DArT_Report2VCF_results", br(),
                                                includeMarkdown(system.file("help_files/DArT_Report2VCF_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "DArT_Report2VCF_cite",
+                                      tabPanel("How to cite", value = "DArT_Report2VCF_cite", br(),
                                                includeMarkdown(system.file("help_files/DArT_Report2VCF_cite.Rmd", package = "BIGapp"))
                                       ))
              ),
@@ -42,52 +42,58 @@ mod_help_ui <- function(id){
                                       ))
              ),
              box(title="VCF Filtering", id = "VCF_Filtering_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+                 "Filter SNPs and samples in a VCF file based on missing data, minor allele frequency, read depth, and Updog dosage calling metrics",
+                 br(), br(),
                  bs4Dash::tabsetPanel(id = "VCF_Filtering_tabset",
-                                      tabPanel("Parameters description", value = "VCF_Filtering_par",
+                                      tabPanel("Parameters description", value = "VCF_Filtering_par", br(),
                                                includeMarkdown(system.file("help_files/VCF_Filtering_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "VCF_Filtering_results",
-                                               includeMarkdown(system.file("help_files/VCF_Filtering_par.Rmd", package = "BIGapp"))
+                                      tabPanel("Results description", value = "VCF_Filtering_results", br(),
+                                               includeMarkdown(system.file("help_files/VCF_Filtering_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "Updog_Dosage_Calling_cite",
+                                      tabPanel("How to cite", value = "Updog_Dosage_Calling_cite", br(),
                                                includeMarkdown(system.file("help_files/VCF_Filtering_cite.Rmd", package = "BIGapp"))
                                       ))
              ),
              box(title="PCA", id = "PCA_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+                 "This tab is used to perform a PCA to visualize the genomic relationships between samples (population structure)",
+                 br(), br(),
                  bs4Dash::tabsetPanel(id = "PCA_tabset",
-                                      tabPanel("Parameters description", value = "PCA_par",
+                                      tabPanel("Parameters description", value = "PCA_par", br(),
                                                includeMarkdown(system.file("help_files/PCA_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "PCA_results",
+                                      tabPanel("Results description", value = "PCA_results", br(),
                                                includeMarkdown(system.file("help_files/PCA_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "PCA_cite",
+                                      tabPanel("How to cite", value = "PCA_cite", br(),
                                                includeMarkdown(system.file("help_files/PCA_cite.Rmd", package = "BIGapp"))
                                       ))
              ),
              box(title="DAPC", id = "DAPC_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+                 "This tab group estimates the number of distinct groups that are present within the genomic dataset, and classifies each sample into a distinct group.",
+                 br(), br(),
                  bs4Dash::tabsetPanel(id = "DAPC_tabset",
-                                      tabPanel("Parameters description", value = "DAPC_par",
+                                      tabPanel("Parameters description", value = "DAPC_par", br(),
                                                includeMarkdown(system.file("help_files/DAPC_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "DAPC_results",
+                                      tabPanel("Results description", value = "DAPC_results", br(),
                                                includeMarkdown(system.file("help_files/DAPC_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "DAPC_cite",
+                                      tabPanel("How to cite", value = "DAPC_cite", br(),
                                                includeMarkdown(system.file("help_files/DAPC_cite.Rmd", package = "BIGapp"))
                                       ))
              ),
              box(title="Genomic Diversity", id = "Genomic_Diversity_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
-                 "**Draft**This tab is dedicated to analyzing genomic diversity within the population. It calculates various diversity metrics such as heterozygosity and minor allele frequency (MAF). The app includes functionalities to visualize these metrics through histograms and other plots. Users can download the calculated diversity metrics as CSV files. This tab helps in understanding the genetic variability and distribution of alleles within the population.",
+                 "This tab estimates summary metrics for the samples and SNPs within a genomic dataset and produces figures and tables.",
                  br(), br(),
                  bs4Dash::tabsetPanel(id = "Genomic_Diversity_tabset",
-                                      tabPanel("Parameters description", value = "Genomic_Diversity_par",
+                                      tabPanel("Parameters description", value = "Genomic_Diversity_par", br(),
                                                includeMarkdown(system.file("help_files/Genomic_Diversity_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "Genomic_Diversity_results",
+                                      tabPanel("Results description", value = "Genomic_Diversity_results", br(),
                                                includeMarkdown(system.file("help_files/Genomic_Diversity_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "Genomic_Diversity_cite",
+                                      tabPanel("How to cite", value = "Genomic_Diversity_cite", br(),
                                                includeMarkdown(system.file("help_files/Genomic_Diversity_cite.Rmd", package = "BIGapp"))
                                       ))
              ),
@@ -106,26 +112,30 @@ mod_help_ui <- function(id){
                                       ))
              ),
              box(title="Predictive Ability", id = "Predictive_Ability_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+                 "This tab provides the predictive ability of a GBLUP model for each trait across all samples within a genomic dataset",
+                 br(), br(),
                  bs4Dash::tabsetPanel(id = "Predictive_Ability_tabset",
-                                      tabPanel("Parameters description", value = "Predictive_Ability_par",
+                                      tabPanel("Parameters description", value = "Predictive_Ability_par", br(),
                                                includeMarkdown(system.file("help_files/Predictive_Ability_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "Predictive_Ability_results",
+                                      tabPanel("Results description", value = "Predictive_Ability_results", br(),
                                                includeMarkdown(system.file("help_files/Predictive_Ability_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "Predictive_Ability_cite",
+                                      tabPanel("How to cite", value = "Predictive_Ability_cite", br(),
                                                includeMarkdown(system.file("help_files/Predictive_Ability_cite.Rmd", package = "BIGapp"))
                                       ))
              ),
              box(title="Genomic Prediction", id = "Genomic_Prediction_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE,
+                 "This tab estimates the trait and estimated-breeding-values (EBVs) for either all individuals in a genomic dataset, or by training the model with one genomic dataset to predict the values in another.",
+                 br(), br(),
                  bs4Dash::tabsetPanel(id = "Genomic_Prediction_tabset",
-                                      tabPanel("Parameters description", value = "Genomic_Prediction_par",
+                                      tabPanel("Parameters description", value = "Genomic_Prediction_par", br(),
                                                includeMarkdown(system.file("help_files/Genomic_Prediction_par.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("Results description", value = "Genomic_Prediction_results",
+                                      tabPanel("Results description", value = "Genomic_Prediction_results", br(),
                                                includeMarkdown(system.file("help_files/Genomic_Prediction_res.Rmd", package = "BIGapp"))
                                       ),
-                                      tabPanel("How to cite", value = "Genomic_Prediction_cite",
+                                      tabPanel("How to cite", value = "Genomic_Prediction_cite", br(),
                                                includeMarkdown(system.file("help_files/Genomic_Prediction_cite.Rmd", package = "BIGapp"))
                                       ))
              ),

diff --git a/R/utils.R b/R/utils.R
@@ -4,7 +4,11 @@ get_counts <- function(madc_file, output_name) {
   # Note: This assumes that the first 7 rows are not useful here like in the Strawberry DSt23-8501_MADC file
 
   # Read the madc file
-  madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+  madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE, header = FALSE)
+  header <- grep("AlleleID", madc_df[,1])
+  if(header > 1) madc_df <- madc_df[-c(1:(grep("AlleleID", madc_df[,1]))-1),]
+  colnames(madc_df) <- madc_df[1,]
+  madc_df <- madc_df[-1,]
 
   # Retain only the Ref and Alt haplotypes
   filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
@@ -21,23 +25,23 @@ get_counts <- function(madc_file, output_name) {
 #Add functionality here to stop the script if indentical() is False
 get_matrices <- function(result_df) {
   #This function takes the dataframe of ref and alt counts for each sample, and converts them to ref, alt, and size(total count) matrices for Updog
-  
+
   update_df <- result_df
-  
+
   # Filter rows where 'AlleleID' ends with 'Ref'
   ref_df <- subset(update_df, grepl("Ref$", AlleleID))
-  
+
   # Filter rows where 'AlleleID' ends with 'Alt'
   alt_df <- subset(update_df, grepl("Alt$", AlleleID))
-  
+
   #Ensure that each has the same SNPs and that they are in the same order
   same <- identical(alt_df$CloneID,ref_df$CloneID)
-  
+
   ###Convert the ref and alt counts into matrices with the CloneID as the index
   #Set SNP names as index
   row.names(ref_df) <- ref_df$CloneID
   row.names(alt_df) <- alt_df$CloneID
-  
+
   #Retain only the rows in common if they are not identical and provide warning
   if (same == FALSE) {
     warning("Mismatch between Ref and Alt Markers. MADC likely altered. Markers without a Ref or Alt match removed.")
@@ -47,26 +51,30 @@ get_matrices <- function(result_df) {
     ref_df <- ref_df[common_ids, ]
     alt_df <- alt_df[common_ids, ]
   }
-  
+
   #Remove unwanted columns and convert to matrix
-  ref_matrix <- as.matrix(ref_df[, -c(1:16)])
-  alt_matrix <- as.matrix(alt_df[, -c(1:16)])
-
+  rm.col <- c("AlleleID", "CloneID", "AlleleSequence", "ClusterConsensusSequence",
+              "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp",
+              "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp")
+
+  ref_matrix <- as.matrix(ref_df[, -which(colnames(ref_df) %in% rm.col)])
+  alt_matrix <- as.matrix(alt_df[, -which(colnames(alt_df) %in% rm.col)])
+
   #Convert elements to numeric
   class(ref_matrix) <- "numeric"
   class(alt_matrix) <- "numeric"
-  
+
   #Make the size matrix by combining the two matrices
   size_matrix <- (ref_matrix + alt_matrix)
-  
+
   #Count the number of cells with 0 count to estimate missing data
   # Count the number of cells with the value 0
   count_zeros <- sum(size_matrix == 0)
-  
+
   # Print the result
   ratio_missing_data <- count_zeros / length(size_matrix)
   cat("Ratio of missing data =", ratio_missing_data, "\n")
-  
+
   # Return the ref and alt matrices as a list
   matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix)
   return(matrices_list)

diff --git a/inst/help_files/DAPC_cite.Rmd b/inst/help_files/DAPC_cite.Rmd
@@ -4,3 +4,16 @@ output: html_document
 date: "2024-08-29"
 ---
 
+* **BIGapp**
+
+* **BIGr**
+
+* **vcfR**
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
+
+* **adegenet**
+
+Jombart, T. (2008). adegenet: a R package for the multivariate analysis of genetic markers. Bioinformatics, 24(11), 1403–1405.
diff --git a/inst/help_files/DAPC_par.Rmd b/inst/help_files/DAPC_par.Rmd
@@ -4,3 +4,30 @@ output: html_document
 date: "2024-08-29"
 ---
 
+* **VCF file**
+Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf. 
+
+* **Passport file**
+A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. Example: 
+
+<center>
+
+|Sample_ID | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species |
+|:---------:|:------------:|:-----------:|:------------:|:-----------:|:-------:|
+|Sample_1  |          5.1|         3.5|          1.4|         0.2|versicolor   |
+|Sample_2  |          4.9|         3.0|          1.4|         0.2|setosa  |
+|Sample_3  |          4.7|         3.2|          1.3|         0.2|setosa  |
+|Sample_4  |          4.6|         3.1|          1.5|         0.2|setosa  |
+|Sample_5  |          5.0|         3.6|          1.4|         0.2|setosa  |
+|Sample_6  |          5.4|         3.9|          1.7|         0.4|setosa  |
+
+</center>
+
+&nbsp;
+
+* **Species Ploidy**
+Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids. 
+
+* **Maximum K**
+
+* **Number of Clusters (K)** 
diff --git a/inst/help_files/DAPC_res.Rmd b/inst/help_files/DAPC_res.Rmd
@@ -4,3 +4,22 @@ output: html_document
 date: "2024-08-29"
 ---
 
+* **Cluster assignments table**
+
+
+
+|K          |       BIC    | 
+|:---------:|:------------:|
+|Sample_1   |           5.1|  
+|Sample_2   |           4.9|  
+|Sample_3   |           4.7|  
+|Sample_4   |           4.6|  
+|Sample_5   |           5.0|  
+|Sample_6   |           5.4|  
+
+
+&nbsp;
+
+* **BIC plot**
+
+* **DAPC plot**
diff --git a/inst/help_files/DArT_Report2VCF_cite.Rmd b/inst/help_files/DArT_Report2VCF_cite.Rmd
@@ -4,3 +4,13 @@ output: html_document
 date: "2024-08-29"
 ---
 
+* **BIGapp** 
+
+
+* **BIGr**
+
+* **vcfR** 
+
+Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549.
+
+Knaus BJ, Grünwald NJ (2016). “VcfR: an R package to manipulate and visualize VCF format data.” BioRxiv. https://dx.doi.org/10.1101/041277.
diff --git a/inst/help_files/DArT_Report2VCF_par.Rmd b/inst/help_files/DArT_Report2VCF_par.Rmd
@@ -4,3 +4,14 @@ output: html_document
 date: "2024-08-29"
 ---
 
+* **DArTag Dosage Report**
+
+The DArT Dosage Report is a tab-separated file provided by DArT from a sequencing project. It contains the genotype information for each of the target markers for all samples in the sequencing project. The markers are in rows and the samples are in the columns. There are several summary metric columns that precede the sample genotype columns. The genotype calls are the count of the reference allele, where 0 is equal to homozygous alternate. 
+
+* **DArTag Counts File**
+
+The DArT counts file is a tab-separated file provided by DArT from a sequencing project. It contains the read count information for the reference and alternate allele at each target marker. The marker information is in the rows and the samples are in the columns. There are several information columns that precede the sample columns. There are two versions of this file. The “collapsed counts” version contains the target markers with their multiallelic read counts included in their total counts. The “Counts” file contains the read counts for the target markers only (excluding the multiallelic read count information). 
+
+* **Species Ploidy**
+
+Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids. 
diff --git a/inst/help_files/DArT_Report2VCF_res.Rmd b/inst/help_files/DArT_Report2VCF_res.Rmd
@@ -4,3 +4,6 @@ output: html_document
 date: "2024-08-29"
 ---
 
+* **VCF file (v4.3)**
+
+Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is converted from the numeric dosage call information. Also included are the read counts for each marker/sample and the numeric dosage call (UD) data. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.3.pdf.