Merged dev into main

smped · May 30, 2022 · feca5f9 · feca5f9
2 parents cda61ce + a064a28
commit feca5f9
Show file tree

Hide file tree

Showing 21 changed files with 442 additions and 268 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,10 +4,12 @@
 .Ruserdata
 .Rapp.history
 .DS_Store
+.vscode
 *.Rproj
 *_cache
 workflow/logs/*
 .snakemake/*
+data/*
 *bam
 *bam.bai
 *bdg

diff --git a/config/config.yml b/config/config.yml
@@ -4,8 +4,6 @@ samples:
 
 paths:
   bam: "data/aligned"
-  macs2: "data/macs2"
-  bigwig: "data/bigwig"
 
 comparisons:
   fc: 1.2

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -10,4 +10,4 @@ AR_Veh_2	AR	Veh	2	AR_Input
 AR_Veh_3	AR	Veh	3	AR_Input
 AR_DHT_1	AR	DHT	1	AR_Input
 AR_DHT_2	AR	DHT	2	AR_Input
-AR_DHT_3	AR	DHT	3	AR_Input
+AR_DHT_3	AR	DHT	3	AR_Input
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -12,6 +12,34 @@ configfile: "config/config.yml"
 ## Figure out how to run check_yaml from here if possible
 ## python scripts/check_yaml.py
 
+## Set the here file as defined by here::here() for Rmarkdown files
+## In a valid Rproj file is found, use that. Otherwise use .here
+def get_here_file():
+	wd = os.getcwd()
+	rproj = os.path.basename(wd) + ".Rproj"
+	rproj_path = os.path.join(wd, rproj)
+	here_file = os.path.join(wd, ".here")
+	if os.path.isfile(rproj_path):
+		## Check contents of file for Version in line 1
+		with open(rproj_path) as f:
+			ln = f.readline().rstrip()
+		if 'Version:' in ln:
+			here_file = rproj_path
+	return(here_file)
+
+####################################
+## Check all external files exist ##
+####################################
+all_exist=True
+# if config['external']['rnaseq'] != '':
+# 	if not os.path.isfile(config['external']['rnaseq']):
+# 		all_exist=False
+# 		print(config['external']['rnaseq'] + " does not exist")
+
+
+if not all_exist:
+	sys.exit(1)
+
 def get_ucsc_genome(x):
 	map = pd.Series(['hg19', 'hg38'], index = ['GRCh37', 'GRCh38'])
 	if not x in map.keys():
@@ -88,13 +116,13 @@ pairs=make_pairwise(config['comparisons']['contrasts'])
 ###############
 ## Key Paths ##
 ###############
+here_file = get_here_file()
 bam_path = config['paths']['bam']
-bw_path = config['paths']['bigwig']
-macs2_path = config['paths']['macs2']
-# ext_path = os.path.join("data", "external")
 rmd_path = "analysis"
-## This still needs to be changed internally in the RMD files
+# macs2_path = config['paths']['macs2']
+macs2_path = os.path.join("output", "macs2")
 annotation_path = os.path.join("output", "annotations")
+log_path = os.path.join("workflow", "logs")
 
 ###############################
 ## External Annotation Files ##
@@ -168,7 +196,7 @@ ALL_OUTPUTS.extend([os.path.join("docs", "index.html")])
 
 ## Peaks generated from the Rmd files
 CONS_PEAKS = expand(
-	os.path.join("output", "{target}", "{file}"),
+	os.path.join(macs2_path, "{target}", "{file}"),
 	target = targets,
 	file = ['consensus_peaks.bed', 'oracle_peaks.rds']
 )
@@ -202,13 +230,13 @@ ALL_OUTPUTS.extend(MERGED_PEAKS)
 ## BigWig Files ##
 ##################
 INDIV_BW = expand(
-	os.path.join(bw_path, "{path}_treat_pileup.{suffix}"),
+	os.path.join(macs2_path, "{path}_treat_pileup.{suffix}"),
 	path = indiv_pre,
 	suffix = ['bw']#, 'summary']
 )
 ALL_OUTPUTS.extend(INDIV_BW)
 MERGED_BW = expand(
-	os.path.join(bw_path, "{path}_merged_treat_pileup.{suffix}"),
+	os.path.join(macs2_path, "{path}_merged_treat_pileup.{suffix}"),
 	path = merged_pre,
 	suffix = ['bw']#, 'summary']
 )

diff --git a/workflow/modules/annotation_description.Rmd b/workflow/modules/annotation_description.Rmd
@@ -5,7 +5,7 @@ editor_options:
   chunk_output_type: console
 ---
 
-```{r set-knitr-opts, echo=FALSE, child = here::here('workflow/modules/setup_chunk.Rmd')}
+```{r set-knitr-opts, echo=FALSE, child = here::here('analysis/setup_chunk.Rmd')}
 ```
 
 ```{r packages}

diff --git a/workflow/modules/differential_binding.Rmd b/workflow/modules/differential_binding.Rmd
@@ -60,7 +60,8 @@ source(here::here("workflow", "scripts", "custom_functions.R"))
 ```{r load-config}
 params <- read_yaml(here::here("config", "params.yml"))
 config <- read_yaml(here::here("config", "config.yml"))
-samples <- here::here("output", target, "qc_samples.tsv") %>% 
+macs2_path <- here::here("output", "macs2", target)
+samples <- file.path(macs2_path, "qc_samples.tsv") %>% 
   read_tsv() %>% 
   dplyr::filter(
     qc == "pass",
@@ -81,8 +82,6 @@ bam_path <- here::here(config$paths$bam)
 stopifnot(dir.exists(bam_path))
 annotation_path <- here::here("output", "annotations")
 stopifnot(dir.exists(annotation_path))
-bw_path <- here::here(config$paths$bigwig, target)
-stopifnot(dir.exists(bw_path))
 out_path <- here::here("output", target)
 if (!dir.exists(out_path)) dir.create(out_path)
 ```
@@ -286,7 +285,7 @@ all_out <- list(
 # Data Preparation
 
 ```{r import-peaks}
-consensus_peaks <- here::here("output", target, "consensus_peaks.bed") %>%
+consensus_peaks <- here::here(macs2_path, "consensus_peaks.bed") %>%
 	import.bed(seqinfo = sq)
 ```
 
@@ -370,8 +369,8 @@ Taking the first `r comma(ys)` alignments, a brief inspection of the Bam Files r
 ## Sliding Window Counts
 
 ```{r window-counts}
-macs2_merged_logs <- here::here(
-	"data", "macs2", target, glue("{treat_levels}_merged_callpeak.log")
+macs2_merged_logs <- file.path(
+  macs2_path, glue("{treat_levels}_merged_callpeak.log")
 	) %>%
 	importNgsLogs()
 fl <- max(macs2_merged_logs$fragment_length)
@@ -874,7 +873,7 @@ For visualisation purposes, only genes which were considered as detected in any
 ```{r make-heatmaps}
 profile_width <- 5e3
 n_bins <- 100
-bwfl <- file.path(bw_path, glue("{treat_levels}_merged_treat_pileup.bw")) %>% 
+bwfl <- file.path(macs2_path, glue("{treat_levels}_merged_treat_pileup.bw")) %>% 
   BigWigFileList() %>% 
   setNames(treat_levels) 
 sig_ranges <- merged_results %>% 
@@ -1655,7 +1654,7 @@ grl_to_plot <- GRangesList(
 ## The coverage
 bwfl <- list2(
   "{target}" := file.path(
-    bw_path, glue("{treat_levels}_merged_treat_pileup.bw")
+    macs2_path, glue("{treat_levels}_merged_treat_pileup.bw")
   ) %>% 
     BigWigFileList() %>% 
     setNames(treat_levels)
@@ -2677,12 +2676,18 @@ file.create(all_out$de_genes)
 if (any_detected) write_csv(de_genes_db_regions, all_out$de_genes)
 
 file.create(all_out$up_regions)
-if (length(sig_ranges$Up) > 0) 
-  export.bed(granges(sig_ranges$Up), all_out$up_regions)
+if (length(sig_ranges$Up) > 0) {
+  sig_ranges$Up %>% 
+    granges() %>% 
+    export.bed(all_out$up_regions)
+}
 
 file.create(all_out$down_regions)
-if (length(sig_ranges$Down) > 0) 
-  export.bed(granges(sig_ranges$Down), all_out$down_regions)
+if (length(sig_ranges$Down) > 0) {
+  sig_ranges$Down %>% 
+    granges() %>% 
+    export.bed(all_out$down_regions)
+}
 
 save.image(all_out$renv)
 ```

diff --git a/workflow/modules/ihw_targets.Rmd b/workflow/modules/ihw_targets.Rmd
@@ -7,7 +7,7 @@ other_targets <- here::here(config$samples$file) %>%
 ihw_all <- other_targets %>% 
   sapply(
     function(x) {
-      here::here("output", x, "consensus_peaks.bed") %>%
+      here::here(dirname(macs2_path), x, "consensus_peaks.bed") %>%
         import.bed(seqinfo = sq)
     },
     simplify = FALSE

diff --git a/analysis/index.Rmd → workflow/modules/index.Rmd b/analysis/index.Rmd → workflow/modules/index.Rmd
@@ -34,14 +34,14 @@ Treatment groups and targets are specified using `config/config.yml`.
 
 ## Workflow
 
-```{r plot-workflow, fig.height = 10, fig.width = 10, fig.cap = "*Summary of processing workflow. The primary data pipeline is shown in red, preparatory steps are shown in blue whilst collation of final output is in green.*"}
+```{r plot-workflow, fig.height = 10, fig.width = 10, fig.cap = "*Summary of processing workflow. The primary data pipeline producing key outputs is shown in red, preparatory steps are shown in blue whilst steps involved in the collation of final output are in green.*"}
 here::here("workflow", "rules", "rulegraph.dot") %>%
   readLines() %>%
   rm_dot_node(node = "\"all\"") %>%
   add_input_node(node = "Alignments\n(Bam Files)", col = "red", ignore = "(download|define|macs2|create|install)") %>%
-  change_node_colour("(setup|index|macs2|bedgraph|identify|binding|install)", "red") %>%
+  change_node_colour("(index|macs2|annotations|bedgraph|compile)", "red") %>%
+  change_node_colour("(create|build)", "forestgreen") %>%
   change_node_colour("(download|write|copy|install)", "blue") %>%
-  change_node_colour("(site|create|build|differential|pairwise)", "forestgreen") %>%
   str_replace_all("_", "\n") %>%
   str_replace_all("snakemake\ndag", "snakemake_dag") %>%
   str_replace_all("fontsize=[0-9]+", "fontsize=16") %>%

diff --git a/workflow/modules/macs2_summary.Rmd b/workflow/modules/macs2_summary.Rmd
@@ -46,6 +46,7 @@ register(MulticoreParam(workers = threads))
 source(here::here("workflow/scripts/custom_functions.R"))
 ## This has been pushed to the main repo of ggside, but is not yet
 ## incorporated into the packge. The functions are geom_*sidelabel
+## Once ggside 0.2.1 is released, this can be deleted
 source(here::here("workflow/scripts/geom_sidelabel.R"))
 ```
 
@@ -55,11 +56,7 @@ config <- read_yaml(
   here::here("config", "config.yml")
 )
 bam_path <- here::here(config$paths$bam, target)
-stopifnot(dir.exists(bam_path))
-macs2_path <- here::here(config$paths$macs2, target)
-stopifnot(dir.exists(macs2_path))
-out_path <- here::here("output", target)
-if (!dir.exists(out_path)) dir.create(out_path)
+macs2_path <- here::here("output", "macs2", target)
 annotation_path <- here::here("output", "annotations")
 colours <- read_rds(
   file.path(annotation_path, "colours.rds")
@@ -68,7 +65,7 @@ colours <- read_rds(
 ```
 
 ```{r read-samples}
-samples <-read_tsv(file.path(out_path, "qc_samples.tsv"))
+samples <-read_tsv(file.path(macs2_path, "qc_samples.tsv"))
 treat_levels <- unique(samples$treat)
 if (!is.null(config$comparisons$contrasts)) {
   ## Ensure levels respect those provided in contrasts
@@ -325,7 +322,7 @@ suppressWarnings(
 ### Cross Correlations
 
 ```{r plot_correlation, fig.height=6, fig.cap = glue("*Cross Correlaton between alignments up to 1kb apart. The dashed, grey, vertical line is the fragment length estimated by `macs2 callpeak` for each sample. For speed, only the first 5 chromosomes were used for sample-specific estimates.*")}
-file.path(out_path, "cross_correlations.tsv") %>% 
+file.path(macs2_path, "cross_correlations.tsv") %>% 
   read_tsv() %>% 
   left_join(samples, by = "sample") %>% 
   ggplot(aes(fl, correlation, colour = treat)) +
@@ -680,8 +677,8 @@ consensus_peaks %>%
 
 ```{r export}
 all_out <- list(
-  consensus_peaks = file.path(out_path, "consensus_peaks.bed"),
-  oracle_peaks = file.path(out_path, "oracle_peaks.rds"),
+  consensus_peaks = file.path(macs2_path, "consensus_peaks.bed"),
+  oracle_peaks = file.path(macs2_path, "oracle_peaks.rds"),
   renv = file.path(
     here::here("output/envs"), 
     glue("{target}_macs2_summary.RData")
@@ -693,6 +690,7 @@ oracle_peaks %>%
   lapply(select, -keep) %>%
   GRangesList() %>%
   write_rds(all_out$oracle_peaks, compress = "gz")
+if (!dir.exists(dirname(all_out$renv))) dir.create(dirname(all_out$renv))
 save.image(all_out$renv)
 ```