From 24d8fda38043485ac2018c15b1a4ab3bed512687 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 13 Jul 2020 10:54:39 -0400 Subject: [PATCH 01/10] fix typo --- docs/annotation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/annotation.md b/docs/annotation.md index f8679d15..d463927d 100644 --- a/docs/annotation.md +++ b/docs/annotation.md @@ -2,7 +2,7 @@ The pipeline uses reference data at various stages. If you're using a common genome assembly, these resources are pre-built and can be easily downloaded using `refgenie pull`, as described in the setup instructions. If the resources are not available, you'll have to build them. Read [how to build `refgenie` assets](http://refgenie.databio.org/en/latest/build/) in the `refgenie` docs. You may also [learn about the current buildable assets](http://refgenie.databio.org/en/latest/available_assets/) to which `refgenie` knows the recipe. -##Use a custom `feat_annotation` asset +## Use a custom `feat_annotation` asset The pipeline will calculate the fraction of reads in genomic features using the `refgenie` `feat_annotation` asset, but you can also specify this file yourself at the command line (`--anno-name`). From 7857b719ce84cd56bae9e89e1c607a9d3dceb8e5 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 13 Jul 2020 10:54:54 -0400 Subject: [PATCH 02/10] fix asset reference --- pipelines/pepatac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 943e625f..8ac88615 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -1742,7 +1742,7 @@ def report_peak_count(): black_local = os.path.join(raw_folder, args.genome_assembly + "_blacklist.bed") - cmd = ("ln -sf " + res.feat_annotation + " " + black_local) + cmd = ("ln -sf " + res.blacklist + " " + black_local) pm.run(cmd, black_local) else: print("Skipping peak filtering...") From fd559bf798fe1dbd5e3a716a050da1d218038506 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 22 Jul 2020 09:22:50 -0400 Subject: [PATCH 03/10] fix gsub to be literal --- PEPATACr/R/PEPATACr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index aa73419a..9a722675 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -739,7 +739,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, } else { info <- file.info(file.path(bedFile[i])) name <- basename(tools::file_path_sans_ext(bedFile[i])) - name <- gsub(sample_name, "", name) + name <- gsub(sample_name, "", name, fixed=TRUE) name <- gsub("^.*?_", "", name) numFields <- 2 for(j in 1:numFields) name <- gsub("_[^_]*$", "", name) From 89e359902fa651843c72d97e2c44d9806fc0a51d Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 22 Jul 2020 09:23:05 -0400 Subject: [PATCH 04/10] fix zip file check --- pipelines/pepatac.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 8ac88615..c36af5ce 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -875,8 +875,9 @@ def main(): pm.clean_add(tempdir) # If there are no prealignments, unmap_fq1 will be unzipped - if pypiper.is_gzipped_fastq(unmap_fq1): + if os.path.exists(unmap_fq1 + ".gz"): unmap_fq1 = unmap_fq1 + ".gz" + if os.path.exists(unmap_fq2 + ".gz"): unmap_fq2 = unmap_fq2 + ".gz" bt2_index = os.path.join(rgc.seek(args.genome_assembly, BT2_IDX_KEY)) From 0c36037fd49bf581496e141b7919cad7c706fafd Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 22 Jul 2020 09:24:18 -0400 Subject: [PATCH 05/10] update site info --- mkdocs.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 073e8253..a33aaffb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,10 +2,10 @@ theme: databio site_name: PEPATAC site_author: Jason Smith -site_url: http://code.databio.org/PEPATAC/ +site_url: http://pepatac.databio.org/ site_logo: img/pepatac_logo_white.svg repo_url: https://github.com/databio/pepatac/ -google_analytics: ['UA-127092878-1', 'code.databio.org/PEPATAC'] +google_analytics: ['UA-127092878-1', 'pepatac.databio.org'] markdown_extensions: - fontawesome_markdown # pip install --user fontawesome-markdown @@ -47,6 +47,3 @@ navbar: - text: Software & Data icon: fa-code fa-lg href: http://databio.org/software/ - # - text: Contact us - # icon: fa-envelope - # href: contact \ No newline at end of file From a4bf9f0ca7fd2bbead763de4a90ace469f1d3d3c Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 22 Jul 2020 10:10:05 -0400 Subject: [PATCH 06/10] add try catch in plotAnno for TSS plots --- PEPATACr/R/PEPATACr.R | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 9a722675..065a751e 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1381,7 +1381,6 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), if (tolower(plot) == "chromosome") { # Chromosome distribution plot - # TODO: makes this a try-catch x <- tryCatch( { suppressMessages(calcChromBinsRef(query, genome)) @@ -1413,9 +1412,26 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), } } else if (tolower(plot) == "tss") { # Feature distance distribution plots - TSS_dist <- calcFeatureDistRefTSS(query, genome) + x <- tryCatch( + { + suppressMessages(calcFeatureDistRefTSS(query, genome)) + }, + error=function(e) { + message("calcFeatureDistRefTSS(): ", e) + return(NULL) + }, + warning=function(e) { + message("calcFeatureDistRefTSS(): ", e) + return(NULL) + } + ) + + if (is.null(x)) { + return(ggplot()) + } + if (!is.na(TSS_dist[1])) { - TSS_plot <- plotFeatureDist(TSS_dist, featureName="TSS") + TSS_plot <- plotFeatureDist(x, featureName="TSS") return(TSS_plot) } else { message("Unable to produce TSS distribution plot.") From ab5396fbf232980130f5aa785b738a3d8db6871f Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 4 Aug 2020 13:05:42 -0400 Subject: [PATCH 07/10] change consensus peak to dramatically reduce memory requirements --- PEPATACr/R/PEPATACr.R | 495 ++++++++++++++++++++++++------------------ 1 file changed, 283 insertions(+), 212 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 065a751e..fc21caf0 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -705,7 +705,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, } else { bedCov <- calcFRiF(bed, num_reads, reads) name <- basename(tools::file_path_sans_ext(bedFile[1])) - name <- gsub(sample_name, "", name) + name <- gsub(sample_name, "", name, fixed=TRUE) name <- gsub("^.*?_", "", name) numFields <- 1 for(i in 1:numFields) name <- gsub("_[^_]*$", "", name) @@ -741,7 +741,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, name <- basename(tools::file_path_sans_ext(bedFile[i])) name <- gsub(sample_name, "", name, fixed=TRUE) name <- gsub("^.*?_", "", name) - numFields <- 2 + numFields <- 1 for(j in 1:numFields) name <- gsub("_[^_]*$", "", name) } @@ -1430,7 +1430,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), return(ggplot()) } - if (!is.na(TSS_dist[1])) { + if (!is.na(x[1])) { TSS_plot <- plotFeatureDist(x, featureName="TSS") return(TSS_plot) } else { @@ -1847,7 +1847,8 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { paste(unique(stats$Genome)))) } - align_raw$sample <- factor(align_raw$sample, levels = align_raw$sample) + align_raw$sample <- factor(align_raw$sample, + levels = unique(align_raw$sample)) melt_align_raw <- melt(align_raw, id.vars = "sample") max_reads <- max(rowSums(align_raw[,2:ncol(align_raw)])) @@ -2026,7 +2027,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { } align_percent$sample <- factor(align_percent$sample, - levels=align_percent$sample) + levels=unique(align_percent$sample)) # Warn user if sample has aberrant values aberrant_samples <- data.frame(Sample=character(), @@ -2214,7 +2215,8 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { upper_limit <- roundUpNice(max_TSS) chart_height <- (length(unique(TSS_score$sample))) * 0.75 - TSS_score$sample <- factor(TSS_score$sample, levels=TSS_score$sample) + TSS_score$sample <- factor(TSS_score$sample, + levels=unique(TSS_score$sample)) TSS_plot <- ggplot(TSS_score, aes(x=sample, y=as.numeric(TSS))) + geom_bar(colour="black", size=0.25, width=0.7, stat="identity", @@ -2313,7 +2315,7 @@ plotLibSizes <- function(prj, summary_dir, stats) { chart_height <- (length(unique(picard_lib_size$sample))) * 0.75 picard_lib_size$sample <- factor(picard_lib_size$sample, - levels = picard_lib_size$sample) + levels=unique(picard_lib_size$sample)) lib_size_plot <- ggplot(picard_lib_size, aes(x = sample, y = as.numeric(LibSize))) + @@ -2357,6 +2359,7 @@ summarizer <- function(project, output_dir) { summary_file <- file.path(output_dir, paste0(pepr::config(project)$name, "_stats_summary.tsv")) + write(paste0("Creating summary plots..."), stdout()) # Set the output directory summary_dir <- suppressMessages(file.path(output_dir, "summary")) # Produce output directory (if needed) @@ -2398,21 +2401,21 @@ summarizer <- function(project, output_dir) { #' Internal helper function for \code{collapsePeaks} #' Count the number of times a peak appears across a list of peak files #' -#' @param peakList A list of data.table objects representing -#' narrowPeak files -#' @param peakDT A single data.table of peaks -countReproduciblePeaks <- function(peakList, peakDT) { - setkey(peakDT, chr, start, end) - setkey(peakList, chr, start, end) - hits <- foverlaps(peakList, peakDT, +#' @param peak_list A list of data.table objects representing +#' narrowPeak files +#' @param peak_DT A single data.table of peaks +countReproduciblePeaks <- function(peak_list, peak_DT) { + setkey(peak_DT, chr, start, end) + setkey(peak_list, chr, start, end) + hits <- foverlaps(peak_list, peak_DT, by.x=c("chr", "start", "end"), type="any", which=TRUE, nomatch=0) # track the number of overlaps of final peak set peaks - if (!"count" %in% colnames(peakDT)) { - peakDT[hits$yid, count := 1] - peakDT[is.na(get("count")), ("count") := 0] + if (!"count" %in% colnames(peak_DT)) { + peak_DT[hits$yid, count := 1] + peak_DT[is.na(get("count")), ("count") := 0] } else { - peakDT[hits$yid, count := get("count") + 1] + peak_DT[hits$yid, count := get("count") + 1] } } @@ -2443,19 +2446,29 @@ collapsePeaks <- function(sample_table, chrom_sizes, min_score=5) { "peak") setkey(peaks, chr, start, end) # keep highest scored peaks - hits <- foverlaps(peaks, peaks, - by.x=c("chr", "start", "end"), - type="any", which=TRUE, nomatch=0) - scores <- data.table(index=rep(1:nrow(peaks)), score=peaks$score) - setkey(hits, xid) - setkey(scores, index) - out <- hits[scores, nomatch=0] - keep <- out[out[,.I[which.max(score)],by=yid]$V1] - indices <- unique(keep$xid) - final <- peaks[indices,] - # trim any bad peaks (extend beyond chromosome) - # can't be negative - final[start < 0, start := 0] + # hits <- foverlaps(peaks, peaks, + # by.x=c("chr", "start", "end"), + # type="any", which=TRUE, nomatch=0) + # split by chromosome to minimize memory requirements + peaks_by_chr <- split(peaks, peaks$chr) + hit_aggregator <- function(x) { + peaksGR <- makeGRangesFromDataFrame(x, keep.extra.columns=FALSE) + hitsGR <- suppressWarnings( + findOverlaps(peaksGR, peaksGR, ignore.strand=TRUE)) + hits <- data.table::data.table(xid=queryHits(hitsGR), + yid=subjectHits(hitsGR)) + setkey(hits, xid) + scores <- data.table(index=rep(1:nrow(x)), score=x$score) + setkey(scores, index) + out <- hits[scores, nomatch=0] + keep <- out[out[,.I[which.max(score)],by=yid]$V1] + indices <- unique(keep$xid) + final <- x[indices,] + final[start < 0, start := 0] + return(final) + } + final <- rbindlist(lapply(peaks_by_chr, hit_aggregator)) + # can't extend past chromosome for (i in nrow(chrom_sizes)) { final[chr == chrom_sizes$chr[i] & end > chrom_sizes$size[i], @@ -2463,12 +2476,11 @@ collapsePeaks <- function(sample_table, chrom_sizes, min_score=5) { } # identify reproducible peaks - peakSet <- copy(peaks) - peakSet[,group := gsub("_peak.*","",name)] - peakList <- splitDataTable(peakSet, "group") - #original <- copy(final) - - invisible(sapply(peakList, countReproduciblePeaks, peakDT=final)) + peaks[,group := gsub("_peak.*","",name)] + peak_list <- splitDataTable(peaks, "group") + rm(peaks) + invisible(gc()) + invisible(sapply(peak_list, countReproduciblePeaks, peak_DT=final)) # keep peaks present in 2 or more individual peak sets # keep peaks with score per million >= 5 @@ -2518,25 +2530,27 @@ consensusPeaks <- function(project, output_dir, results_subdir, assets) { if (nrow(files) == 0) { return(consensus_peak_files) } - sample_table <- sample_table[files, .SD, nomatch=0L, - on="peak_files", .SDcols=names(sample_table)] + #sample_table <- sample_table[files, .SD, nomatch=0L, + # on="peak_files", .SDcols=names(sample_table)] + sample_table <- unique( + sample_table[sample_table$peak_files %in% files$peak_files,]) # Need to group by genome, then create a consensus list by genome! st_list = splitDataTable(sample_table, "genome") - for (genome in names(st_list)) { - if (nrow(st_list[[genome]]) == 1) { + for (g in names(st_list)) { + if (nrow(st_list[[g]]) == 1) { err_msg = paste0("Found only a single valid peak file for ", - genome, ".") + g, ".") warning(err_msg) next } - if (nrow(st_list[[genome]]) == 0) { + if (nrow(st_list[[g]]) == 0) { warning("Unable to find any valid peak files.") warning("Confirm peak files exist for your samples.") next } - c_path <- unique(sample_table[genome == genome, c_path]) + c_path <- unique(sample_table[genome == g, c_path]) if (file.exists(c_path)) { c_size <- fread(c_path) colnames(c_size) <- c("chr", "size") @@ -2546,14 +2560,19 @@ consensusPeaks <- function(project, output_dir, results_subdir, assets) { " is present before continuing.")) final <- NULL } - final <- collapsePeaks(st_list[[genome]], c_size) + message(paste0("Calculating ", g, " consensus peak set from ", + nrow(st_list[[g]]), " samples...")) + final <- collapsePeaks(st_list[[g]], c_size) + #} if (!is.null(final)) { # save consensus peak set - file_name <- paste0("_", genome,"_consensusPeaks.narrowPeak") + file_name <- paste0("_", g,"_consensusPeaks.narrowPeak") output_file <- file.path(summary_dir, paste0(project_name, file_name)) fwrite(final, output_file, sep="\t", col.names=FALSE) consensus_peak_files <- c(consensus_peak_files, output_file) + rm(final) + invisible(gc()) } else { warning("Unable to produce a consensus peak file.") warning("Check that individual peak files exist for your samples.") @@ -2626,11 +2645,10 @@ peakCounts <- function(project, output_dir, results_subdir, assets) { # Produce output directory (if needed) dir.create(summary_dir, showWarnings = FALSE) - # TODO: handle multiple genomes in single project sample_names <- pepr::sampleTable(project)$sample_name genomes <- as.list(pepr::sampleTable(project)$genome) names(genomes) <- sample_names - sample_names <- as.character(pepr::sampleTable(project)$sample_name) + sample_names <- unique(as.character(pepr::sampleTable(project)$sample_name)) sample_table <- data.table( sample_name=pepr::sampleTable(project)$sample_name, @@ -2653,138 +2671,46 @@ peakCounts <- function(project, output_dir, results_subdir, assets) { # generate paths to peak coverage files sample_table[,peak_files:=.((file.path( - results_subdir, - sample_table$sample_name, - paste0("peak_calling_", sample_table$genome), - paste0(sample_table$sample_name, peak_file_name))))] - peak_files = sample_table$peak_files - - if (reference) { - peaks_dt = data.table(chr=as.character(), - start=as.numeric(), - end=as.numeric(), - name=as.character(), - score=as.numeric(), - strand=as.character(), - signalValue=as.numeric(), - pValue=as.numeric(), - qValue=as.numeric(), - peak=as.numeric(), - read_count=as.numeric(), - base_count=as.numeric(), - width=as.numeric(), - frac=as.numeric(), - norm=as.numeric(), - group=as.character()) - - # Load peak files - num_files <- 0 - name_list <- as.character() - for (file in peak_files) { - info <- file.info(file.path(file)) - if (file.exists(file.path(file)) && info$size != 0) { - num_files <- num_files + 1 - peaks <- fread(file) - name <- gsub(peak_file_name, "", basename(file)) - name_list <- append(name_list, name) - colnames(peaks) <- c("chr", "start", "end", "name", "score", - "strand", "signalValue", "pValue", - "qValue", "peak", "read_count", - "base_count", "width", "frac", "norm") - peaks$group <- name - setkey(peaks, chr, start, end) - peaks_dt <- rbind(peaks_dt, peaks) - } - } - peak_list <- splitDataTable(peaks_dt, "group") - # confirm identical chr, start, end coordinates exist in all peak files - same <- all(sapply(lapply(peak_list, function(x) {x[,c(1:3)]}), - identical, - lapply(peak_list, function(x) {x[,c(1:3)]})[[1]])) - if (same) { - new_list <- lapply(names(peak_list), function(x){ - colnames(peak_list[[x]]) <- c("chr", "start", "end", "name", - "score", "strand", "signalValue", - "pValue", "qValue", "peak", - "read_count", "base_count", - "width", "frac", x, "group") - peak_list[[x]] - }) - names(new_list) <- names(peak_list) - reduce_dt <- Reduce(function(...) merge(..., all=F), - lapply(new_list, function(x) {x[,c(1:3,15)]})) - } else { - warning("Reference peak set is inconsistent between samples") - warning("Confirm the same reference peak set was passed to `--frip-ref-peaks`") - return(NULL) + results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, peak_file_name))))] + + #Only keep samples with valid peak coverage files + file_list <- sample_table$peak_files + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) } - } else { - peaks_dt = data.table(chr=as.character(), - start=as.numeric(), - end=as.numeric(), - read_count=as.numeric(), - base_count=as.numeric(), - width=as.numeric(), - frac=as.numeric(), - norm=as.numeric()) - # Concatenate peak files - for (file in peak_files) { - info <- file.info(file.path(file)) - if (file.exists(file.path(file)) && info$size != 0) { - peaks <- fread(file) - colnames(peaks) <- c("chr", "start", "end", "read_count", - "base_count", "width", "frac", "norm") - setkey(peaks, chr, start, end) - peaks_dt <- rbind(peaks_dt, peaks) - } + } + files <- data.table(peak_files=file_exists) + consensus_peak_files = list() + if (nrow(files) == 0) { + return(consensus_peak_files) + } + #sample_table <- sample_table[files, .SD, nomatch=0L, + # on="peak_files", .SDcols=names(sample_table)] + sample_table <- unique( + sample_table[sample_table$peak_files %in% files$peak_files,]) + peak_files <- sample_table$peak_files + + # Need to group by genome, then create a counts table by genome + st_list <- splitDataTable(sample_table, "genome") + + for (g in names(st_list)) { + if (nrow(st_list[[g]]) == 1) { + err_msg <- paste0("Found only a single valid peak file for ", + g, ".") + warning(err_msg) + next } - - peaksGR <- makeGRangesFromDataFrame(peaks_dt, keep.extra.columns=T) - reduceGR <- reduce(peaksGR) - - # instead, different column for each sample is the counts columns, plural - reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)), - start=start(reduceGR), - end=end(reduceGR)) - f <- function(x) {list(0)} - reduce_dt[, (sample_names) := f()] - - # for each peak file - for (file in peak_files) { - info <- file.info(file.path(file)) - if (file.exists(file.path(file)) && info$size != 0) { - peaks <- fread(file) - name <- gsub("_peaks_coverage.bed","", basename(file)) - colnames(peaks) <- c("chr", "start", "end", "read_count", - "base_count", "width", "frac", "norm") - setkey(peaks, chr, start, end) - - reduceGR <- makeGRangesFromDataFrame(reduce_dt) - peaksGR <- makeGRangesFromDataFrame(peaks) - hitsGR <- findOverlaps(query=reduceGR, subject=peaksGR) - - # Weight counts by percent overlap - olap <- pintersect(reduceGR[queryHits(hitsGR)], - peaksGR[subjectHits(hitsGR)]) - polap <- width(olap) / width(peaksGR[subjectHits(hitsGR)]) - counts <- data.table(index=rep(1:nrow(peaks)), - counts=peaks$norm*polap) - hits <- data.table(xid=queryHits(hitsGR), - yid=subjectHits(hitsGR)) - setkey(hits, yid) - setkey(counts, index) - out <- hits[counts, nomatch=0] - keep <- out[out[,.I[which.max(counts)],by=yid]$V1] - reduce_dt[keep$xid][[name]] <- keep$counts - } + if (nrow(st_list[[g]]) == 0) { + warning("Unable to find any valid peak files.") + warning("Confirm peak files exist for your samples.") + next } - # trim any bad peaks (extend beyond chromosome) - # can't be negative - reduce_dt[start < 0, start := 0] - # ensure sorted - setorderv(reduce_dt, cols = c("chr", "start")) - # can't extend past chromosome - c_path <- sample_table[peak_files == file, c_path] + c_path <- unique(sample_table[genome == g, c_path]) if (file.exists(c_path)) { c_size <- fread(c_path) colnames(c_size) <- c("chr", "size") @@ -2792,44 +2718,189 @@ peakCounts <- function(project, output_dir, results_subdir, assets) { warning("Unable to load the chromosome sizes file.") warning(paste0("Confirm that ", c_path, " is present before continuing.")) + reduce_dt <- NULL + } + message(paste0("Calculating ", g, " peak counts for ", + nrow(st_list[[g]]), " samples...")) + if (reference) { + peaks_dt <- data.table(chr=as.character(), + start=as.numeric(), + end=as.numeric(), + name=as.character(), + score=as.numeric(), + strand=as.character(), + signalValue=as.numeric(), + pValue=as.numeric(), + qValue=as.numeric(), + peak=as.numeric(), + read_count=as.numeric(), + base_count=as.numeric(), + width=as.numeric(), + frac=as.numeric(), + norm=as.numeric(), + group=as.character()) + + # Load peak files + peaks <- rbindlist(lapply(st_list[[g]]$peak_files, fread)) + colnames(peaks) <- c("chr", "start", "end", "name", "score", + "strand", "signalValue", "pValue", + "qValue", "peak", "read_count", + "base_count", "width", "frac", "norm") + setkey(peaks, chr, start, end) + peaks_dt <- rbind(peaks_dt, peaks) + peak_set <- copy(peaks) + peak_set[,group := gsub(peak_file_name, "", name)] + peak_list <- splitDataTable(peak_set, "group") + + # confirm identical chr, start, end coordinates exist in all peak files + same <- all(sapply(lapply(peak_list, function(x) {x[,c(1:3)]}), + identical, + lapply(peak_list, function(x) {x[,c(1:3)]})[[1]])) + if (same) { + new_list <- lapply(names(peak_list), function(x){ + colnames(peak_list[[x]]) <- c("chr", "start", "end", "name", + "score", "strand", "signalValue", + "pValue", "qValue", "peak", + "read_count", "base_count", + "width", "frac", x, "group") + peak_list[[x]] + }) + names(new_list) <- names(peak_list) + reduce_dt <- Reduce(function(...) merge(..., all=F), + lapply(new_list, function(x) {x[,c(1:3,15)]})) + } else { + warning(strwrap(prefix = " ", initial = "", + "Reference peak set was inconsistent between samples. + Pass a single reference peak set with the + `--frip-ref-peaks` argument.")) + return(NULL) + } + } else { + peaks_dt <- data.table(chr=as.character(), + start=as.numeric(), + end=as.numeric(), + read_count=as.numeric(), + base_count=as.numeric(), + width=as.numeric(), + frac=as.numeric(), + norm=as.numeric()) + + peaks <- rbindlist(lapply(st_list[[g]]$peak_files, fread)) + colnames(peaks) <- c("chr", "start", "end", "read_count", + "base_count", "width", "frac", "norm") + setkey(peaks, chr, start, end) + peaks_dt <- rbind(peaks_dt, peaks) + # Convert to GRanges for more efficient findOverlaps vs data.table + peaksGR <- makeGRangesFromDataFrame(peaks_dt, + keep.extra.columns=TRUE) + reduceGR <- reduce(peaksGR) + + # instead, different column for each sample is the counts columns, plural + reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)), + start=start(reduceGR), + end=end(reduceGR)) + f <- function(x) {list(0)} + #reduce_dt[, (sample_names) := f()] + # Need to make syntactically valid names + valid_names <- make.unique(make.names(st_list[[g]]$sample_name)) + reduce_dt[, (valid_names) := f()] + + # Report name changes + original_invalid <- setdiff( + make.unique(st_list[[g]]$sample_name), valid_names) + new_valid <- setdiff( + valid_names, make.unique(st_list[[g]]$sample_name)) + if(length(new_valid) > 0) { + warning(strwrap(prefix = " ", initial = "", + "Some sample names were changed to be syntactically + valid in R:")) + warning(paste(capture.output(print( + data.frame(Original=original_invalid, + Valid=new_valid))), + collapse = "\n")) + } + st_list[[g]]$sample_name <- make.unique(make.names(st_list[[g]]$sample_name)) + # for each peak file + i <- 1 + for (file in st_list[[g]]$peak_files) { + info <- file.info(file.path(file)) + if (file.exists(file.path(file)) && info$size != 0) { + p <- fread(file) + #name <- gsub("_peaks_coverage.bed","", basename(file)) + name <- make.unique(make.names(st_list[[g]][i]$sample_name)) + #message(paste0("name: ", name)) + i <- i + 1 + #message(paste0("i: ", i)) + colnames(p) <- c("chr", "start", "end", "read_count", + "base_count", "width", "frac", "norm") + setkey(p, chr, start, end) + + reduceGR <- makeGRangesFromDataFrame(reduce_dt) + peaksGR <- makeGRangesFromDataFrame(p) + hitsGR <- findOverlaps(query=reduceGR, subject=peaksGR) + + # Weight counts by percent overlap + olap <- pintersect(reduceGR[queryHits(hitsGR)], + peaksGR[subjectHits(hitsGR)]) + polap <- width(olap) / width(peaksGR[subjectHits(hitsGR)]) + counts <- data.table(index=rep(1:nrow(p)), + counts=p$norm*polap) + hits <- data.table(xid=queryHits(hitsGR), + yid=subjectHits(hitsGR)) + setkey(hits, yid) + setkey(counts, index) + out <- hits[counts, nomatch=0] + keep <- out[out[,.I[which.max(counts)],by=yid]$V1] + reduce_dt[keep$xid][[name]] <- keep$counts + rm(p) + invisible(gc()) + } + } + # trim any bad peaks (extend beyond chromosome) + # can't be negative + reduce_dt[start < 0, start := 0] + # ensure sorted + setorderv(reduce_dt, cols = c("chr", "start")) + # can't extend past chromosome + for (i in nrow(c_size)) { + reduce_dt[chr == c_size$chr[i] & end > c_size$size[i], + end := c_size$size[i]] + } + columnIndices <- c(4:ncol(reduce_dt)) + # Drop regions with no coverage + keepRows <- rowMeans(reduce_dt[, ..columnIndices]) > 0 + reduce_dt <- reduce_dt[keepRows] + # identify reproducible peaks + reduce_dt$count <- apply(reduce_dt[, ..columnIndices], 1, + function(x) sum(x > 0)) + if (length(st_list[[g]]$peak_files) > 1) { + # keep peaks present in 2 or more individual peak sets + reduce_dt <- reduce_dt[count >= 2,] + } + reduce_dt[,count := NULL] + } + # Create matrix rownames + reduce_dt <- cbind(name=paste(reduce_dt$chr, + reduce_dt$start, + reduce_dt$end, sep="_"), + reduce_dt) + reduce_dt <- reduce_dt[,-c("chr","start","end")] + + # save counts table + if (exists("reduce_dt")) { + output_file <- file.path(summary_dir, + paste0(project_name, "_", g, "_peaks_coverage.tsv")) + # save consensus peak set + fwrite(reduce_dt, output_file, sep="\t", col.names=TRUE) + counts_path <- c(counts_path, output_file) + } else { + warning(strwrap(prefix = " ", initial = "", + "Unable to produce a peak coverage file. Check that + individual peak coverage files exist for your samples.")) return(NULL) } - for (i in nrow(c_size)) { - reduce_dt[chr == c_size$chr[i] & end > c_size$size[i], - end := c_size$size[i]] - } - columnIndices <- c(4:ncol(reduce_dt)) - # Drop regions with no coverage - keepRows <- rowMeans(reduce_dt[, ..columnIndices]) > 0 - reduce_dt <- reduce_dt[keepRows] - # identify reproducible peaks - reduce_dt$count <- apply(reduce_dt[, ..columnIndices], 1, - function(x) sum(x > 0)) - if (length(peak_files) > 1) { - # keep peaks present in 2 or more individual peak sets - reduce_dt <- reduce_dt[count >= 2,] - } - reduce_dt[,count := NULL] - } - # Create matrix rownames - reduce_dt <- cbind(name=paste(reduce_dt$chr, - reduce_dt$start, - reduce_dt$end, sep="_"), - reduce_dt) - reduce_dt <- reduce_dt[,-c("chr","start","end")] - - # save counts table - if (exists("reduce_dt")) { - counts_path <- file.path(summary_dir, - paste0(project_name, "_peaks_coverage.tsv")) - # save consensus peak set - fwrite(reduce_dt, counts_path, sep="\t", col.names=TRUE) - return(counts_path) - } else { - warning("Unable to produce a peak coverage file.") - warning("Check that individual peak coverage files exist for your samples.") - return(NULL) } + return(counts_path) } From 4c28c91ecc2576986b363d8d7b1fdc2b592edcd4 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 4 Aug 2020 13:05:59 -0400 Subject: [PATCH 08/10] update ability to run multiple genomes for the peak count tables --- tools/PEPATAC_summarizer.R | 48 +++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R index aaf66a2b..7819f373 100755 --- a/tools/PEPATAC_summarizer.R +++ b/tools/PEPATAC_summarizer.R @@ -110,7 +110,6 @@ if (dir.exists(argv$results)) { quit() } - # Get assets assets <- PEPATACr::createAssetsSummary(prj, argv$output, results_subdir) if (nrow(assets) == 0) { @@ -172,7 +171,7 @@ if (!file.exists(complexity_path) || argv$new_start) { complexity_flag <- TRUE } } else { - warning("Project level library complexity plot already exists.") + message("Project level library complexity plot already exists.") message(paste0("Project library complexity plot: ", complexity_path, "\n")) complexity_flag <- TRUE } @@ -185,7 +184,7 @@ if (summarizer_flag && complexity_flag) { for (genome in unique(genomes)) { file_name <- paste0("_", genome,"_consensusPeaks.narrowPeak") consensus_path <- file.path(summary_dir, paste0(project_name, file_name)) - if (file.exists(consensus_path)) { + if (file.exists(consensus_path) && !argv$new_start) { message(paste0("Consensus peak set (", genome, "): ", consensus_path, "\n")) } @@ -193,7 +192,7 @@ for (genome in unique(genomes)) { # Calculate consensus peaks if (!file.exists(consensus_path) || argv$new_start) { - write(paste0("Creating consensus peak set..."), stdout()) + #write(paste0("Creating consensus peak set..."), stdout()) consensus_paths <- PEPATACr::consensusPeaks(prj, argv$output, argv$results, assets) if (!length(consensus_paths) == 0) { @@ -213,24 +212,35 @@ if (!file.exists(consensus_path) || argv$new_start) { } } +# Report existing counts tables +# TODO: move genome handling out of the called function? +for (genome in unique(genomes)) { + file_name <- paste0("_", genome,"_peaks_coverage.tsv") + counts_path <- file.path(summary_dir, paste0(project_name, file_name)) + if (file.exists(counts_path) && !argv$new_start) { + message(paste0("Peak counts table (", genome, "): ", + counts_path, "\n")) + } +} + # Create count matrix -counts_path <- file.path(summary_dir, - paste0(project_name, "_peaks_coverage.tsv")) if (!file.exists(counts_path) || argv$new_start) { - write(paste0("Creating gene count table..."), stdout()) - counts_path <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets) - if (!is.null(counts_path) && file.exists(counts_path)) { - message("Counts table: ", counts_path, "\n") - icon <- PEPATACr::fileIcon() - output_file <- file.path(summary_dir, - paste0(project_name, "_peaks_coverage.png")) - png(filename = output_file, height = 275, width=275, - bg="transparent") - suppressWarnings(print(icon)) - invisible(dev.off()) + #write(paste0("Creating peak count table(s)..."), stdout()) + counts_paths <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets) + if (!length(counts_paths) == 0) { + for (counts_table in counts_paths) { + if (file.exists(counts_table)) { + message("Counts table: ", counts_table, "\n") + icon <- PEPATACr::fileIcon() + output_file <- file.path(summary_dir, + paste0(project_name, "_", genome, "_peaks_coverage.png")) + png(filename = output_file, height = 275, width=275, + bg="transparent") + suppressWarnings(print(icon)) + invisible(dev.off()) + } + } } -} else { - message("Counts table: ", counts_path, "\n") } ################################################################################ \ No newline at end of file From eb1e9ec6cc728d7f17ed8ee575b44c3ece29706a Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 4 Aug 2020 13:06:10 -0400 Subject: [PATCH 09/10] updates --- pepatac_output_schema.yaml | 11 ++++---- pipelines/pepatac.py | 54 +++++++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index 11f400b8..edd2dc1e 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -52,11 +52,12 @@ properties: consensus_peaks_file: title: "Consensus peaks file" description: "A set of consensus peaks across samples." - thumbnail_path: "summary/{name}_consensusPeaks.png" - path: "summary/{name}_consensusPeaks.narrowPeak" - type: image + thumbnail_path: "summary/{name}_*_consensusPeaks.png" + path: "summary/{name}_*_consensusPeaks.narrowPeak" + type: string counts_table: title: "Project peak coverage file" description: "Project peak coverages: chr_start_end X sample" - path: "summary/{name}_peaks_coverage.tsv" - type: link + thumbnail_path: "summary/{name}_*_peaks_coverage.png" + path: "summary/{name}_*_peaks_coverage.tsv" + type: string diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index c36af5ce..4e80685d 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -123,6 +123,10 @@ def parse_arguments(): help="Only keep minimal, essential output to conserve " "disk space.") + parser.add_argument("--skipqc", dest="skipqc", action='store_true', + help="Skip FastQC. Useful for bugs in FastQC " + "that appear with some sequence read files.") + parser.add_argument("-V", "--version", action="version", version="%(prog)s {v}".format(v=__version__)) @@ -712,6 +716,13 @@ def main(): trimming_prefix = out_fastq_pre trimmed_fastq = trimming_prefix + "_R1_trim.fastq" trimmed_fastq_R2 = trimming_prefix + "_R2_trim.fastq" + fastqc_folder = os.path.join(param.outfolder, "fastqc") + fastqc_report = os.path.join(fastqc_folder, + trimming_prefix + "_R1_trim_fastqc.html") + fastqc_report_R2 = os.path.join(fastqc_folder, + trimming_prefix + "_R2_trim_fastqc.html") + if ngstk.check_command(tools.fastqc): + ngstk.make_dir(fastqc_folder) # Create trimming command(s). if args.trimmer == "pyadapt": @@ -726,7 +737,7 @@ def main(): ("-o", out_fastq_pre), "-u" ] - cmd = build_command(trim_cmd_chunks) + trim_cmd = build_command(trim_cmd_chunks) elif args.trimmer == "skewer": # Create the primary skewer command. @@ -762,7 +773,7 @@ def main(): for old, new in skewer_filename_pairs] # Pypiper submits the commands serially. - cmd = [trimming_command] + trimming_renaming_commands + trim_cmd = [trimming_command] + trimming_renaming_commands else: # Default to trimmomatic. @@ -780,13 +791,42 @@ def main(): trimming_prefix + "_R2_unpaired.fq" if args.paired_end else "", "ILLUMINACLIP:" + res.adapters + ":2:30:10" ] - cmd = build_command(trim_cmd_chunks) + trim_cmd = build_command(trim_cmd_chunks) + + def check_trim(): + pm.info("Evaluating read trimming") + + if args.paired_end and not trimmed_fastq_R2: + pm.warning("Specified paired-end but no R2 file") + + n_trim = float(ngstk.count_reads(trimmed_fastq, args.paired_end)) + pm.report_result("Trimmed_reads", int(n_trim)) + try: + rr = float(pm.get_stat("Raw_reads")) + except: + pm.warning("Can't calculate trim loss rate without raw read result.") + else: + pm.report_result( + "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2)) + + # Also run a fastqc (if installed/requested) + if fastqc_folder and not args.skipqc: + if fastqc_folder and os.path.isabs(fastqc_folder): + ngstk.make_sure_path_exists(fastqc_folder) + cmd = (tools.fastqc + " --noextract --outdir " + + fastqc_folder + " " + trimmed_fastq) + pm.run(cmd, fastqc_report, nofail=False) + pm.report_object("FastQC report r1", fastqc_report) + + if args.paired_end and trimmed_fastq_R2: + cmd = (tools.fastqc + " --noextract --outdir " + + fastqc_folder + " " + trimmed_fastq_R2) + pm.run(cmd, fastqc_report_R2, nofail=False) + pm.report_object("FastQC report r2", fastqc_report_R2) if not os.path.exists(rmdup_bam) or args.new_start: - pm.run(cmd, trimmed_fastq, - follow=ngstk.check_trim( - trimmed_fastq, args.paired_end, trimmed_fastq_R2, - fastqc_folder=os.path.join(param.outfolder, "fastqc"))) + pm.debug("trim_cmd: {}".format(trim_cmd)) + pm.run(trim_cmd, trimmed_fastq, follow=check_trim) pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True) From 21144a2104a4a50e3c30df161d42325af66650ac Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 4 Aug 2020 13:08:47 -0400 Subject: [PATCH 10/10] version bump --- docs/changelog.md | 6 ++++++ docs/usage.md | 6 ++++-- pipelines/pepatac.py | 2 +- usage.txt | 6 ++++-- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 831b7165..36a1356d 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,12 @@ # Change log All notable changes to this project will be documented in this file. +## [0.9.2] -- 2020-08-04 + +### Changed + - Reduce memory requirements of consensus peak generation + - Enable multiple genome projects for peak count tables + ## [0.9.1] -- 2020-07-13 ### Changed diff --git a/docs/usage.md b/docs/usage.md index a1122425..a5fb5244 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -19,9 +19,9 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--peak-type {fixed,variable}] [--extend EXTEND] [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] - [-V] + [--skipqc] [-V] -PEPATAC version 0.9.1 +PEPATAC version 0.9.2 optional arguments: -h, --help show this help message and exit @@ -86,6 +86,8 @@ optional arguments: --noFIFO Do NOT use named pipes during prealignments --lite Only keep minimal, essential output to conserve disk space. + --skipqc Skip FastQC. Useful for bugs in FastQC that appear + with some sequence read files. -V, --version show program's version number and exit required named arguments: diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 4e80685d..0117c505 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.9.1" +__version__ = "0.9.2" from argparse import ArgumentParser diff --git a/usage.txt b/usage.txt index 6c31a986..5b34720b 100644 --- a/usage.txt +++ b/usage.txt @@ -11,9 +11,9 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--peak-type {fixed,variable}] [--extend EXTEND] [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] - [-V] + [--skipqc] [-V] -PEPATAC version 0.9.1 +PEPATAC version 0.9.2 optional arguments: -h, --help show this help message and exit @@ -78,6 +78,8 @@ optional arguments: --noFIFO Do NOT use named pipes during prealignments --lite Only keep minimal, essential output to conserve disk space. + --skipqc Skip FastQC. Useful for bugs in FastQC that appear + with some sequence read files. -V, --version show program's version number and exit required named arguments: