Merge branch 'dev' into main

smped · Jun 12, 2022 · 9a783ce · 9a783ce
2 parents feca5f9 + fc43483
commit 9a783ce
Show file tree

Hide file tree

Showing 21 changed files with 1,838 additions and 895 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Full documentation can be found [here](https://steveped.github.io/GRAVI/)
 
 ## Snakemake Implementation
 
-The basic workflow is written `snakemake` and can be called using the following steps.
+The basic workflow is written `snakemake`, requiring at least v7.7, and can be called using the following steps.
 
 Firstly, setup the required conda environments
 
@@ -49,3 +49,7 @@ snakemake \
 Note that this creates common environments able to be called by other workflows and is dependent on the user.
 For me, my global conda environments are stored in `/home/steveped/mambaforge/envs/`.
 For other users, this path will need to be modified.
+
+If wishing to tidy the directory after a successful run, you can check which non-essential files can be deleted using `snakemake -n --delete-temp-output --cores 1`.
+If the files earmarked for deletion are considered to be non-essential, they can be deleted by removing the `-n` flag from the above code: `snakemake --delete-temp-output --cores 1`.
+As the bedgraph files produced by `macs2 callpeak` are typically very large, hence their conversion to bigwig files during the workflow, this step can free a considerable amount of disk space.
diff --git a/analysis/references.bib b/analysis/references.bib
@@ -0,0 +1,174 @@
+@article{Landt01092012,
+author = {Landt, Stephen G. and Marinov, Georgi K. and Kundaje, Anshul and Kheradpour, Pouya and Pauli, Florencia and Batzoglou, Serafim and Bernstein, Bradley E. and Bickel, Peter and Brown, James B. and Cayting, Philip and Chen, Yiwen and DeSalvo, Gilberto and Epstein, Charles and Fisher-Aylor, Katherine I. and Euskirchen, Ghia and Gerstein, Mark and Gertz, Jason and Hartemink, Alexander J. and Hoffman, Michael M. and Iyer, Vishwanath R. and Jung, Youngsook L. and Karmakar, Subhradip and Kellis, Manolis and Kharchenko, Peter V. and Li, Qunhua and Liu, Tao and Liu, X. Shirley and Ma, Lijia and Milosavljevic, Aleksandar and Myers, Richard M. and Park, Peter J. and Pazin, Michael J. and Perry, Marc D. and Raha, Debasish and Reddy, Timothy E. and Rozowsky, Joel and Shoresh, Noam and Sidow, Arend and Slattery, Matthew and Stamatoyannopoulos, John A. and Tolstorukov, Michael Y. and White, Kevin P. and Xi, Simon and Farnham, Peggy J. and Lieb, Jason D. and Wold, Barbara J. and Snyder, Michael}, 
+title = {ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia},
+volume = {22}, 
+number = {9}, 
+pages = {1813-1831}, 
+year = {2012}, 
+doi = {10.1101/gr.136184.111}, 
+abstract ={Chromatin immunoprecipitation (ChIP) followed by high-throughput DNA sequencing (ChIP-seq) has become a valuable and widely used approach for mapping the genomic location of transcription-factor binding and histone modifications in living cells. Despite its widespread use, there are considerable differences in how these experiments are conducted, how the results are scored and evaluated for quality, and how the data and metadata are archived for public use. These practices affect the quality and utility of any global ChIP experiment. Through our experience in performing ChIP-seq experiments, the ENCODE and modENCODE consortia have developed a set of working standards and guidelines for ChIP experiments that are updated routinely. The current guidelines address antibody validation, experimental replication, sequencing depth, data and metadata reporting, and data quality assessment. We discuss how ChIP quality, assessed in these ways, affects different uses of ChIP-seq data. All data sets used in the analysis have been deposited for public viewing and downloading at the ENCODE (http://encodeproject.org/ENCODE/) and modENCODE (http://www.modencode.org/) portals.}, 
+URL = {http://genome.cshlp.org/content/22/9/1813.abstract}, 
+eprint = {http://genome.cshlp.org/content/22/9/1813.full.pdf+html}, 
+journal = {Genome Research} 
+}
+
+% 18798982 
+@Article{Zhang18798982,
+   Author="Zhang, Y.  and Liu, T.  and Meyer, C. A.  and Eeckhoute, J.  and Johnson, D. S.  and Bernstein, B. E.  and Nusbaum, C.  and Myers, R. M.  and Brown, M.  and Li, W.  and Liu, X. S. ",
+   Title="{{M}odel-based analysis of {C}h{I}{P}-{S}eq ({M}{A}{C}{S})}",
+   Journal="Genome Biol",
+   Year="2008",
+   Volume="9",
+   Number="9",
+   Pages="R137"
+}
+
+@article{EwelsMultiQC2016,
+    author = {Ewels, Philip and Magnusson, Måns and Lundin, Sverker and Käller, Max},
+    title = "{MultiQC: summarize analysis results for multiple tools and samples in a single report}",
+    journal = {Bioinformatics},
+    volume = {32},
+    number = {19},
+    pages = {3047-3048},
+    year = {2016},
+    month = {06},
+    abstract = "{Motivation: Fast and accurate quality control is essential for studies involving next-generation sequencing data. Whilst numerous tools exist to quantify QC metrics, there is no common approach to flexibly integrate these across tools and large sample sets. Assessing analysis results across an entire project can be time consuming and error prone; batch effects and outlier samples can easily be missed in the early stages of analysis.Results: We present MultiQC, a tool to create a single report visualising output from multiple tools across many samples, enabling global trends and biases to be quickly identified. MultiQC can plot data from many common bioinformatics tools and is built to allow easy extension and customization.Availability and implementation: MultiQC is available with an GNU GPLv3 license on GitHub, the Python Package Index and Bioconda. Documentation and example reports are available at http://multiqc.infoContact:[email protected]}",
+    issn = {1367-4803},
+    doi = {10.1093/bioinformatics/btw354},
+    url = {https://doi.org/10.1093/bioinformatics/btw354},
+    eprint = {https://academic.oup.com/bioinformatics/article-pdf/32/19/3047/25072524/btw354.pdf},
+}
+
+@article{WardNgsReports2019,
+    author = {Ward, Christopher M and To, Thu-Hien and Pederson, Stephen M},
+    title = "{ngsReports: a Bioconductor package for managing FastQC reports and other NGS related log files}",
+    journal = {Bioinformatics},
+    volume = {36},
+    number = {8},
+    pages = {2587-2588},
+    year = {2019},
+    month = {12},
+    abstract = "{High throughput next generation sequencing (NGS) has become exceedingly cheap, facilitating studies to be undertaken containing large sample numbers. Quality control (QC) is an essential stage during analytic pipelines and the outputs of popular bioinformatics tools such as FastQC and Picard can provide information on individual samples. Although these tools provide considerable power when carrying out QC, large sample numbers can make inspection of all samples and identification of systemic bias a challenge.We present ngsReports, an R package designed for the management and visualization of NGS reports from within an R environment. The available methods allow direct import into R of FastQC reports along with outputs from other tools. Visualization can be carried out across many samples using default, highly customizable plots with options to perform hierarchical clustering to quickly identify outlier libraries. Moreover, these can be displayed in an interactive shiny app or HTML report for ease of analysis.The ngsReports package is available on Bioconductor and the GUI shiny app is available at https://github.com/UofABioinformaticsHub/shinyNgsreports.Supplementary data are available at Bioinformatics online.}",
+    issn = {1367-4803},
+    doi = {10.1093/bioinformatics/btz937},
+    url = {https://doi.org/10.1093/bioinformatics/btz937},
+    eprint = {https://academic.oup.com/bioinformatics/article-pdf/36/8/2587/33116802/btz937.pdf},
+}
+
+@Article{LunSmythCsaw2014,
+    author = {Aaron T L Lun and Gordon K Smyth},
+    title = {{D}e novo detection of differentially bound regions for {C}h{I}{P}-seq data using peaks and windows: controlling error rates correctly},
+    journal = {Nucleic Acids Res.},
+    year = {2014},
+    volume = {42},
+    number = {11},
+    pages = {e95},
+  }
+
+
+@Article{LunSmythF1000,
+   Author="Lun, A. T.  and Smyth, G. K. ",
+   Title="{{F}rom reads to regions: a {B}ioconductor workflow to detect differential binding in {C}h{I}{P}-seq data}",
+   Journal="F1000Res",
+   Year="2015",
+   Volume="4",
+   Pages="1080"
+}
+
+@article{HicksSQN2017,
+    author = {Hicks, Stephanie C and Okrah, Kwame and Paulson, Joseph N and Quackenbush, John and Irizarry, Rafael A and Bravo, Héctor Corrada},
+    title = "{Smooth quantile normalization}",
+    journal = {Biostatistics},
+    volume = {19},
+    number = {2},
+    pages = {185-198},
+    year = {2017},
+    month = {07},
+    abstract = "{Between-sample normalization is a critical step in genomic data analysis to remove systematic bias and unwanted technical variation in high-throughput data. Global normalization methods are based on the assumption that observed variability in global properties is due to technical reasons and are unrelated to the biology of interest. For example, some methods correct for differences in sequencing read counts by scaling features to have similar median values across samples, but these fail to reduce other forms of unwanted technical variation. Methods such as quantile normalization transform the statistical distributions across samples to be the same and assume global differences in the distribution are induced by only technical variation. However, it remains unclear how to proceed with normalization if these assumptions are violated, for example, if there are global differences in the statistical distributions between biological conditions or groups, and external information, such as negative or control features, is not available. Here, we introduce a generalization of quantile normalization, referred to as smooth quantile normalization (qsmooth), which is based on the assumption that the statistical distribution of each sample should be the same (or have the same distributional shape) within biological groups or conditions, but allowing that they may differ between groups. We illustrate the advantages of our method on several high-throughput datasets with global differences in distributions corresponding to different biological conditions. We also perform a Monte Carlo simulation study to illustrate the bias-variance tradeoff and root mean squared error of qsmooth compared to other global normalization methods. A software implementation is available from https://github.com/stephaniehicks/qsmooth.}",
+    issn = {1465-4644},
+    doi = {10.1093/biostatistics/kxx028},
+    url = {https://doi.org/10.1093/biostatistics/kxx028},
+    eprint = {https://academic.oup.com/biostatistics/article-pdf/19/2/185/24268721/kxx028.pdf},
+}
+
+@Article{LawVoom2014,
+   Author="Law, C. W.  and Chen, Y.  and Shi, W.  and Smyth, G. K. ",
+   Title="{voom: {P}recision weights unlock linear model analysis tools for {R}{N}{A}-seq read counts}",
+   Journal="Genome Biol",
+   Year="2014",
+   Volume="15",
+   Number="2",
+   Pages="R29",
+   Month="Feb"
+}
+
+@article{McCarthyTreat2009,
+    author = {McCarthy, Davis J. and Smyth, Gordon K.},
+    title = "{Testing significance relative to a fold-change threshold is a TREAT}",
+    journal = {Bioinformatics},
+    volume = {25},
+    number = {6},
+    pages = {765-771},
+    year = {2009},
+    month = {01},
+    abstract = "{Motivation: Statistical methods are used to test for the differential expression of genes in microarray experiments. The most widely used methods successfully test whether the true differential expression is different from zero, but give no assurance that the differences found are large enough to be biologically meaningful.Results: We present a method, t-tests relative to a threshold (TREAT), that allows researchers to test formally the hypothesis (with associated p-values) that the differential expression in a microarray experiment is greater than a given (biologically meaningful) threshold. We have evaluated the method using simulated data, a dataset from a quality control experiment for microarrays and data from a biological experiment investigating histone deacetylase inhibitors. When the magnitude of differential expression is taken into account, TREAT improves upon the false discovery rate of existing methods and identifies more biologically relevant genes.Availability: R code implementing our methods is contributed to the software package limma available at http://www.bioconductor.org.Contact:[email protected]}",
+    issn = {1367-4803},
+    doi = {10.1093/bioinformatics/btp053},
+    url = {https://doi.org/10.1093/bioinformatics/btp053},
+    eprint = {https://academic.oup.com/bioinformatics/article-pdf/25/6/765/698211/btp053.pdf},
+}
+
+@Article{IgnatiadisIHW2016,
+   Author="Ignatiadis, N.  and Klaus, B.  and Zaugg, J. B.  and Huber, W. ",
+   Title="{{D}ata-driven hypothesis weighting increases detection power in genome-scale multiple testing}",
+   Journal="Nat Methods",
+   Year="2016",
+   Volume="13",
+   Number="7",
+   Pages="577--580",
+   Month="07"
+}
+
+@Article{HicksQuantro2015,
+   Author="Hicks, S. C.  and Irizarry, R. A. ",
+   Title="{quantro: a data-driven approach to guide the choice of an appropriate normalization method}",
+   Journal="Genome Biol",
+   Year="2015",
+   Volume="16",
+   Pages="117",
+   Month="Jun"
+}
+
+@Article{YoungGoseq2010,
+   Author="Young, M. D.  and Wakefield, M. J.  and Smyth, G. K.  and Oshlack, A. ",
+   Title="{{G}ene ontology analysis for {R}{N}{A}-seq: accounting for selection bias}",
+   Journal="Genome Biol",
+   Year="2010",
+   Volume="11",
+   Number="2",
+   Pages="R14"
+}
+
+@article{SubramanianGsea2005,
+author = {Aravind Subramanian  and Pablo Tamayo  and Vamsi K. Mootha  and Sayan Mukherjee  and Benjamin L. Ebert  and Michael A. Gillette  and Amanda Paulovich  and Scott L. Pomeroy  and Todd R. Golub  and Eric S. Lander  and Jill P. Mesirov },
+title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles},
+journal = {Proceedings of the National Academy of Sciences},
+volume = {102},
+number = {43},
+pages = {15545-15550},
+year = {2005},
+doi = {10.1073/pnas.0506580102},
+URL = {https://www.pnas.org/doi/abs/10.1073/pnas.0506580102},
+eprint = {https://www.pnas.org/doi/pdf/10.1073/pnas.0506580102},
+abstract = {Although genomewide RNA expression analysis has become a routine tool in biomedical research, extracting biological insight from such information remains a major challenge. Here, we describe a powerful analytical method called Gene Set Enrichment Analysis (GSEA) for interpreting gene expression data. The method derives its power by focusing on gene sets, that is, groups of genes that share common biological function, chromosomal location, or regulation. We demonstrate how GSEA yields insights into several cancer-related data sets, including leukemia and lung cancer. Notably, where single-gene analysis finds little similarity between two independent studies of patient survival in lung cancer, GSEA reveals many biological pathways in common. The GSEA method is embodied in a freely available software package, together with an initial database of 1,325 biologically defined gene sets.}
+}
+
+@Article{KorotkevichFgsea2019,
+    author = {Gennady Korotkevich and Vladimir Sukhov and Alexey Sergushichev},
+    title = {Fast gene set enrichment analysis},
+    year = {2019},
+    doi = {10.1101/060012},
+    publisher = {Cold Spring Harbor Labs Journals},
+    url = {http://biorxiv.org/content/early/2016/06/20/060012},
+    journal = {bioRxiv},
+  }
diff --git a/config/params.yml b/config/params.yml
@@ -9,12 +9,34 @@ gene_regions:
   upstream: 5000
   intergenic: 10000
 
-## The categories to use from MSigDB. These are passed to msigdbr in the
-## columns of the same name
-msigdb:
-  gs_cat: "H"
-  gs_subcat:
-    - "CP:KEGG"
-    - "CP:REACTOME"
-    - "CP:WIKIPATHWAYS"
-    - "TFT:GTRD"
+## The values used when mapping peaks to genes.
+## Passed to `extraChIPs::mapByFeature()`
+## If including H3K27ac HiChIP for long range-interactions, it is advised to
+## set `enh2gene` as zero, given that long range interactions in this case
+## will more accurately map long-range enhancer interactions
+mapping:
+  gr2gene: 100000
+  prom2gene: 0
+  enh2gene: 100000
+  gi2gene: 0
+
+enrichment:
+  ## P-value adjustment methods & cutoff (alpha)
+  adj: "fdr"
+  alpha: 0.05
+  ## Only perform goseq enrichment if the number of genes mapped is between
+  ## these two proportions of either the total genes (no RNA-Seq) or detected
+  ## genes (with RNA-Seq). Seeting these avoids imtermittent crashes of the
+  ## underlying sampling algorithms using in the Wallenius Non-Central
+  ## Hypergeometric distribution at extreme values
+  min_prop_goseq: 0.001
+  max_prop_goseq: 0.99
+  ## The categories to use from MSigDB. These are passed to msigdbr in the
+  ## columns of the same name
+  msigdb:
+    gs_cat: "H"
+    gs_subcat:
+      - "CP:KEGG"
+      - "CP:REACTOME"
+      - "CP:WIKIPATHWAYS"
+      - "TFT:GTRD"
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -100,6 +100,7 @@ def make_pairwise(x):
 ## Check whether Git is installed & the directory has a repo ##
 ###############################################################
 git_add = check_git(".")
+git_tries = 100
 # git_add = False
 
 ####################
@@ -119,9 +120,9 @@ pairs=make_pairwise(config['comparisons']['contrasts'])
 here_file = get_here_file()
 bam_path = config['paths']['bam']
 rmd_path = "analysis"
-# macs2_path = config['paths']['macs2']
-macs2_path = os.path.join("output", "macs2")
 annotation_path = os.path.join("output", "annotations")
+diff_path = os.path.join("output", "differential_binding")
+macs2_path = os.path.join("output", "macs2")
 log_path = os.path.join("workflow", "logs")
 
 ###############################
@@ -238,7 +239,7 @@ ALL_OUTPUTS.extend(INDIV_BW)
 MERGED_BW = expand(
 	os.path.join(macs2_path, "{path}_merged_treat_pileup.{suffix}"),
 	path = merged_pre,
-	suffix = ['bw']#, 'summary']
+	suffix = ['bw', 'summary']
 )
 ALL_OUTPUTS.extend(MERGED_BW)
 

diff --git a/workflow/modules/annotation_description.Rmd b/workflow/modules/annotation_description.Rmd
@@ -1,8 +1,6 @@
 ---
 title: "Annotations"
 date: "`r format(Sys.Date(), '%d %B, %Y')`"
-editor_options:
-  chunk_output_type: console
 ---
 
 ```{r set-knitr-opts, echo=FALSE, child = here::here('analysis/setup_chunk.Rmd')}
@@ -73,6 +71,14 @@ samples <- here::here(config$samples$file) %>%
     target = as.factor(target)
   )
 treat_levels <- levels(samples$treat)
+has_external <- length(config$external$coverage) > 0
+if (has_external) {
+  treat_levels <- c(
+    treat_levels,
+    lapply(config$external$coverage, names) %>% unlist()
+  ) %>% 
+    unique()
+}
 ```
 
 
@@ -673,7 +679,6 @@ hic %>%
 # Colour Schemes {.tabset}
 
 ```{r set_colours}
-
 ## qc_colours need to have `Pass` and `Fail`
 missing_qc_cols <- setdiff(c("pass", "fail"), names(colours$qc))
 if ("pass" %in% missing_qc_cols) colours$qc$pass <- "#0571B0" # Blue