From 24d8fda38043485ac2018c15b1a4ab3bed512687 Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Mon, 13 Jul 2020 10:54:39 -0400
Subject: [PATCH 01/10] fix typo

---
 docs/annotation.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/annotation.md b/docs/annotation.md
index f8679d15..d463927d 100644
--- a/docs/annotation.md
+++ b/docs/annotation.md
@@ -2,7 +2,7 @@
 
 The pipeline uses reference data at various stages. If you're using a common genome assembly, these resources are pre-built and can be easily downloaded using `refgenie pull`, as described in the setup instructions. If the resources are not available, you'll have to build them. Read [how to build `refgenie` assets](http://refgenie.databio.org/en/latest/build/) in the `refgenie` docs.  You may also [learn about the current buildable assets](http://refgenie.databio.org/en/latest/available_assets/) to which `refgenie` knows the recipe.
 
-##Use a custom `feat_annotation` asset
+## Use a custom `feat_annotation` asset
 
 The pipeline will calculate the fraction of reads in genomic features using the `refgenie` `feat_annotation` asset, but you can also specify this file yourself at the command line (`--anno-name`).
 

From 7857b719ce84cd56bae9e89e1c607a9d3dceb8e5 Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Mon, 13 Jul 2020 10:54:54 -0400
Subject: [PATCH 02/10] fix asset reference

---
 pipelines/pepatac.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py
index 943e625f..8ac88615 100755
--- a/pipelines/pepatac.py
+++ b/pipelines/pepatac.py
@@ -1742,7 +1742,7 @@ def report_peak_count():
                     black_local = os.path.join(raw_folder,
                                                args.genome_assembly +
                                                "_blacklist.bed")
-                    cmd = ("ln -sf " + res.feat_annotation + " " + black_local)
+                    cmd = ("ln -sf " + res.blacklist + " " + black_local)
                     pm.run(cmd, black_local)
                 else:
                     print("Skipping peak filtering...")

From fd559bf798fe1dbd5e3a716a050da1d218038506 Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Wed, 22 Jul 2020 09:22:50 -0400
Subject: [PATCH 03/10] fix gsub to be literal

---
 PEPATACr/R/PEPATACr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R
index aa73419a..9a722675 100644
--- a/PEPATACr/R/PEPATACr.R
+++ b/PEPATACr/R/PEPATACr.R
@@ -739,7 +739,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size,
             } else {
                 info       <- file.info(file.path(bedFile[i]))
                 name       <- basename(tools::file_path_sans_ext(bedFile[i]))
-                name       <- gsub(sample_name, "", name)
+                name       <- gsub(sample_name, "", name, fixed=TRUE)
                 name       <- gsub("^.*?_", "", name)
                 numFields  <- 2
                 for(j in 1:numFields) name <- gsub("_[^_]*$", "", name)

From 89e359902fa651843c72d97e2c44d9806fc0a51d Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Wed, 22 Jul 2020 09:23:05 -0400
Subject: [PATCH 04/10] fix zip file check

---
 pipelines/pepatac.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py
index 8ac88615..c36af5ce 100755
--- a/pipelines/pepatac.py
+++ b/pipelines/pepatac.py
@@ -875,8 +875,9 @@ def main():
     pm.clean_add(tempdir)
 
     # If there are no prealignments, unmap_fq1 will be unzipped
-    if pypiper.is_gzipped_fastq(unmap_fq1):
+    if os.path.exists(unmap_fq1 + ".gz"):
         unmap_fq1 = unmap_fq1 + ".gz"
+    if os.path.exists(unmap_fq2 + ".gz"):
         unmap_fq2 = unmap_fq2 + ".gz"
 
     bt2_index = os.path.join(rgc.seek(args.genome_assembly, BT2_IDX_KEY))

From 0c36037fd49bf581496e141b7919cad7c706fafd Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Wed, 22 Jul 2020 09:24:18 -0400
Subject: [PATCH 05/10] update site info

---
 mkdocs.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 073e8253..a33aaffb 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -2,10 +2,10 @@ theme: databio
 
 site_name: PEPATAC
 site_author: Jason Smith
-site_url: http://code.databio.org/PEPATAC/
+site_url: http://pepatac.databio.org/
 site_logo: img/pepatac_logo_white.svg
 repo_url: https://github.com/databio/pepatac/
-google_analytics: ['UA-127092878-1', 'code.databio.org/PEPATAC']
+google_analytics: ['UA-127092878-1', 'pepatac.databio.org']
 
 markdown_extensions:
   - fontawesome_markdown  # pip install --user fontawesome-markdown
@@ -47,6 +47,3 @@ navbar:
   - text: Software & Data
     icon: fa-code fa-lg
     href: http://databio.org/software/
-  # - text: Contact us
-  #   icon: fa-envelope
-  #   href: contact
\ No newline at end of file

From a4bf9f0ca7fd2bbead763de4a90ace469f1d3d3c Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Wed, 22 Jul 2020 10:10:05 -0400
Subject: [PATCH 06/10] add try catch in plotAnno for TSS plots

---
 PEPATACr/R/PEPATACr.R | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R
index 9a722675..065a751e 100644
--- a/PEPATACr/R/PEPATACr.R
+++ b/PEPATACr/R/PEPATACr.R
@@ -1381,7 +1381,6 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"),
 
     if (tolower(plot) == "chromosome") {
         # Chromosome distribution plot
-        # TODO: makes this a try-catch
         x <- tryCatch(
             {
                 suppressMessages(calcChromBinsRef(query, genome))
@@ -1413,9 +1412,26 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"),
         }
     } else if (tolower(plot) == "tss") {
         # Feature distance distribution plots
-        TSS_dist <- calcFeatureDistRefTSS(query, genome)
+        x <- tryCatch(
+            {
+                suppressMessages(calcFeatureDistRefTSS(query, genome))
+            },
+            error=function(e) {
+                message("calcFeatureDistRefTSS(): ", e)
+                return(NULL)
+            },
+            warning=function(e) {
+                message("calcFeatureDistRefTSS(): ", e)
+                return(NULL)
+            }
+        )
+
+        if (is.null(x)) {
+            return(ggplot())
+        }
+        
         if (!is.na(TSS_dist[1])) {
-            TSS_plot <- plotFeatureDist(TSS_dist, featureName="TSS")
+            TSS_plot <- plotFeatureDist(x, featureName="TSS")
             return(TSS_plot)
         } else {
             message("Unable to produce TSS distribution plot.")

From ab5396fbf232980130f5aa785b738a3d8db6871f Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Tue, 4 Aug 2020 13:05:42 -0400
Subject: [PATCH 07/10] change consensus peak to dramatically reduce memory
 requirements

---
 PEPATACr/R/PEPATACr.R | 495 ++++++++++++++++++++++++------------------
 1 file changed, 283 insertions(+), 212 deletions(-)

diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R
index 065a751e..fc21caf0 100644
--- a/PEPATACr/R/PEPATACr.R
+++ b/PEPATACr/R/PEPATACr.R
@@ -705,7 +705,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size,
         } else {
             bedCov     <- calcFRiF(bed, num_reads, reads)
             name       <- basename(tools::file_path_sans_ext(bedFile[1]))
-            name       <- gsub(sample_name, "", name)
+            name       <- gsub(sample_name, "", name, fixed=TRUE)
             name       <- gsub("^.*?_", "", name)
             numFields  <- 1
             for(i in 1:numFields) name <- gsub("_[^_]*$", "", name)
@@ -741,7 +741,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size,
                 name       <- basename(tools::file_path_sans_ext(bedFile[i]))
                 name       <- gsub(sample_name, "", name, fixed=TRUE)
                 name       <- gsub("^.*?_", "", name)
-                numFields  <- 2
+                numFields  <- 1
                 for(j in 1:numFields) name <- gsub("_[^_]*$", "", name)
             }
 
@@ -1430,7 +1430,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"),
             return(ggplot())
         }
         
-        if (!is.na(TSS_dist[1])) {
+        if (!is.na(x[1])) {
             TSS_plot <- plotFeatureDist(x, featureName="TSS")
             return(TSS_plot)
         } else {
@@ -1847,7 +1847,8 @@ plotAlignedRaw <- function(prj, summary_dir, stats) {
                                 paste(unique(stats$Genome))))
     }
 
-    align_raw$sample <- factor(align_raw$sample, levels = align_raw$sample)
+    align_raw$sample <- factor(align_raw$sample,
+                               levels = unique(align_raw$sample))
 
     melt_align_raw <- melt(align_raw, id.vars = "sample")
     max_reads      <- max(rowSums(align_raw[,2:ncol(align_raw)]))
@@ -2026,7 +2027,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) {
     }
 
     align_percent$sample <- factor(align_percent$sample,
-                                   levels=align_percent$sample)
+                                   levels=unique(align_percent$sample))
 
     # Warn user if sample has aberrant values
     aberrant_samples <- data.frame(Sample=character(),
@@ -2214,7 +2215,8 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) {
     upper_limit  <- roundUpNice(max_TSS)
     chart_height <- (length(unique(TSS_score$sample))) * 0.75
 
-    TSS_score$sample <- factor(TSS_score$sample, levels=TSS_score$sample)
+    TSS_score$sample <- factor(TSS_score$sample,
+                               levels=unique(TSS_score$sample))
 
     TSS_plot <- ggplot(TSS_score, aes(x=sample, y=as.numeric(TSS))) +
         geom_bar(colour="black", size=0.25, width=0.7, stat="identity",
@@ -2313,7 +2315,7 @@ plotLibSizes <- function(prj, summary_dir, stats) {
     chart_height    <- (length(unique(picard_lib_size$sample))) * 0.75
     
     picard_lib_size$sample <- factor(picard_lib_size$sample, 
-                                     levels = picard_lib_size$sample)
+                                     levels=unique(picard_lib_size$sample))
     
     lib_size_plot <- ggplot(picard_lib_size, 
                             aes(x = sample, y = as.numeric(LibSize))) +
@@ -2357,6 +2359,7 @@ summarizer <- function(project, output_dir) {
     summary_file <- file.path(output_dir,
         paste0(pepr::config(project)$name, "_stats_summary.tsv"))
 
+    write(paste0("Creating summary plots..."), stdout())
     # Set the output directory
     summary_dir <- suppressMessages(file.path(output_dir, "summary"))
     # Produce output directory (if needed)
@@ -2398,21 +2401,21 @@ summarizer <- function(project, output_dir) {
 #' Internal helper function for \code{collapsePeaks}
 #' Count the number of times a peak appears across a list of peak files
 #'
-#' @param peakList A list of data.table objects representing
-#'                 narrowPeak files
-#' @param peakDT A single data.table of peaks
-countReproduciblePeaks <- function(peakList, peakDT) {
-    setkey(peakDT, chr, start, end)
-    setkey(peakList, chr, start, end)
-    hits <- foverlaps(peakList, peakDT,
+#' @param peak_list A list of data.table objects representing
+#'                  narrowPeak files
+#' @param peak_DT A single data.table of peaks
+countReproduciblePeaks <- function(peak_list, peak_DT) {
+    setkey(peak_DT, chr, start, end)
+    setkey(peak_list, chr, start, end)
+    hits <- foverlaps(peak_list, peak_DT,
                       by.x=c("chr", "start", "end"),
                       type="any", which=TRUE, nomatch=0)
     # track the number of overlaps of final peak set peaks
-    if (!"count" %in% colnames(peakDT)) {
-        peakDT[hits$yid, count := 1]
-        peakDT[is.na(get("count")), ("count") := 0]
+    if (!"count" %in% colnames(peak_DT)) {
+        peak_DT[hits$yid, count := 1]
+        peak_DT[is.na(get("count")), ("count") := 0]
     } else {
-        peakDT[hits$yid, count := get("count") + 1] 
+        peak_DT[hits$yid, count := get("count") + 1] 
     }
 }
 
@@ -2443,19 +2446,29 @@ collapsePeaks <- function(sample_table, chrom_sizes, min_score=5) {
                          "peak")
     setkey(peaks, chr, start, end)
     # keep highest scored peaks
-    hits    <- foverlaps(peaks, peaks,
-                         by.x=c("chr", "start", "end"),
-                         type="any", which=TRUE, nomatch=0)
-    scores  <- data.table(index=rep(1:nrow(peaks)), score=peaks$score)
-    setkey(hits, xid)
-    setkey(scores, index)
-    out     <- hits[scores, nomatch=0]
-    keep    <- out[out[,.I[which.max(score)],by=yid]$V1]
-    indices <- unique(keep$xid)
-    final   <- peaks[indices,]
-    # trim any bad peaks (extend beyond chromosome)
-    # can't be negative
-    final[start < 0, start := 0]
+    # hits    <- foverlaps(peaks, peaks,
+                         # by.x=c("chr", "start", "end"),
+                         # type="any", which=TRUE, nomatch=0)
+    # split by chromosome to minimize memory requirements
+    peaks_by_chr   <- split(peaks, peaks$chr)
+    hit_aggregator <- function(x) {
+        peaksGR <- makeGRangesFromDataFrame(x, keep.extra.columns=FALSE)
+        hitsGR  <- suppressWarnings(
+            findOverlaps(peaksGR, peaksGR, ignore.strand=TRUE))
+        hits    <- data.table::data.table(xid=queryHits(hitsGR),
+                                          yid=subjectHits(hitsGR))
+        setkey(hits, xid)
+        scores  <- data.table(index=rep(1:nrow(x)), score=x$score)
+        setkey(scores, index)
+        out     <- hits[scores, nomatch=0]
+        keep    <- out[out[,.I[which.max(score)],by=yid]$V1]
+        indices <- unique(keep$xid)
+        final   <- x[indices,]
+        final[start < 0, start := 0]
+        return(final)
+    }
+    final <- rbindlist(lapply(peaks_by_chr, hit_aggregator))
+
     # can't extend past chromosome
     for (i in nrow(chrom_sizes)) {
         final[chr == chrom_sizes$chr[i] & end > chrom_sizes$size[i],
@@ -2463,12 +2476,11 @@ collapsePeaks <- function(sample_table, chrom_sizes, min_score=5) {
     }
 
     # identify reproducible peaks
-    peakSet <- copy(peaks)
-    peakSet[,group := gsub("_peak.*","",name)]
-    peakList <- splitDataTable(peakSet, "group")
-    #original <- copy(final)
-
-    invisible(sapply(peakList, countReproduciblePeaks, peakDT=final))
+    peaks[,group := gsub("_peak.*","",name)]
+    peak_list <- splitDataTable(peaks, "group")
+    rm(peaks)
+    invisible(gc())
+    invisible(sapply(peak_list, countReproduciblePeaks, peak_DT=final))
 
     # keep peaks present in 2 or more individual peak sets
     # keep peaks with score per million >= 5
@@ -2518,25 +2530,27 @@ consensusPeaks <- function(project, output_dir, results_subdir, assets) {
     if (nrow(files) == 0) {
         return(consensus_peak_files)
     }
-    sample_table <- sample_table[files, .SD, nomatch=0L,
-                                 on="peak_files", .SDcols=names(sample_table)]
+    #sample_table <- sample_table[files, .SD, nomatch=0L,
+    #                             on="peak_files", .SDcols=names(sample_table)]
+    sample_table <- unique(
+        sample_table[sample_table$peak_files %in% files$peak_files,])
     
     # Need to group by genome, then create a consensus list by genome!
     st_list = splitDataTable(sample_table, "genome")
 
-    for (genome in names(st_list)) {
-        if (nrow(st_list[[genome]]) == 1) {
+    for (g in names(st_list)) {
+        if (nrow(st_list[[g]]) == 1) {
             err_msg = paste0("Found only a single valid peak file for ",
-                             genome, ".")
+                             g, ".")
             warning(err_msg)
             next
         }
-        if (nrow(st_list[[genome]]) == 0) {
+        if (nrow(st_list[[g]]) == 0) {
             warning("Unable to find any valid peak files.")
             warning("Confirm peak files exist for your samples.")
             next
         }
-        c_path <- unique(sample_table[genome == genome, c_path])
+        c_path <- unique(sample_table[genome == g, c_path])
         if (file.exists(c_path)) {
             c_size <- fread(c_path)
             colnames(c_size) <- c("chr", "size")
@@ -2546,14 +2560,19 @@ consensusPeaks <- function(project, output_dir, results_subdir, assets) {
                            " is present before continuing."))
             final <- NULL
         }
-        final <- collapsePeaks(st_list[[genome]], c_size)
+        message(paste0("Calculating ", g, " consensus peak set from ",
+                       nrow(st_list[[g]]), " samples..."))
+        final <- collapsePeaks(st_list[[g]], c_size)
+        #}
         if (!is.null(final)) {
             # save consensus peak set
-            file_name   <- paste0("_", genome,"_consensusPeaks.narrowPeak")
+            file_name   <- paste0("_", g,"_consensusPeaks.narrowPeak")
             output_file <- file.path(summary_dir,
                                      paste0(project_name, file_name))
             fwrite(final, output_file, sep="\t", col.names=FALSE)
             consensus_peak_files <- c(consensus_peak_files, output_file)
+            rm(final)
+            invisible(gc())
         } else {
             warning("Unable to produce a consensus peak file.")
             warning("Check that individual peak files exist for your samples.")
@@ -2626,11 +2645,10 @@ peakCounts <- function(project, output_dir, results_subdir, assets) {
     # Produce output directory (if needed)
     dir.create(summary_dir, showWarnings = FALSE)
 
-    # TODO: handle multiple genomes in single project
     sample_names   <- pepr::sampleTable(project)$sample_name
     genomes        <- as.list(pepr::sampleTable(project)$genome)
     names(genomes) <- sample_names
-    sample_names   <- as.character(pepr::sampleTable(project)$sample_name)
+    sample_names   <- unique(as.character(pepr::sampleTable(project)$sample_name))
     
     sample_table <- data.table(
         sample_name=pepr::sampleTable(project)$sample_name,
@@ -2653,138 +2671,46 @@ peakCounts <- function(project, output_dir, results_subdir, assets) {
     
     # generate paths to peak coverage files
     sample_table[,peak_files:=.((file.path(
-               results_subdir,
-               sample_table$sample_name,
-               paste0("peak_calling_", sample_table$genome),
-               paste0(sample_table$sample_name, peak_file_name))))]
-    peak_files = sample_table$peak_files
-
-    if (reference) {
-        peaks_dt = data.table(chr=as.character(),
-                              start=as.numeric(),
-                              end=as.numeric(),
-                              name=as.character(),
-                              score=as.numeric(),
-                              strand=as.character(),
-                              signalValue=as.numeric(),
-                              pValue=as.numeric(),
-                              qValue=as.numeric(),
-                              peak=as.numeric(),
-                              read_count=as.numeric(),
-                              base_count=as.numeric(),
-                              width=as.numeric(),
-                              frac=as.numeric(),
-                              norm=as.numeric(),
-                              group=as.character())
-
-        # Load peak files
-        num_files <- 0
-        name_list <- as.character()
-        for (file in peak_files) {
-            info <- file.info(file.path(file))
-            if (file.exists(file.path(file)) && info$size != 0) {
-                num_files <- num_files + 1
-                peaks     <- fread(file)
-                name      <- gsub(peak_file_name, "", basename(file))
-                name_list <- append(name_list, name)
-                colnames(peaks) <- c("chr", "start", "end", "name", "score",
-                                     "strand", "signalValue", "pValue",
-                                     "qValue", "peak", "read_count",
-                                     "base_count", "width", "frac", "norm")
-                peaks$group <- name
-                setkey(peaks, chr, start, end)
-                peaks_dt <- rbind(peaks_dt, peaks)
-            }
-        }
-        peak_list  <- splitDataTable(peaks_dt, "group")
-        # confirm identical chr, start, end coordinates exist in all peak files
-        same <- all(sapply(lapply(peak_list, function(x) {x[,c(1:3)]}),
-                    identical,
-                    lapply(peak_list, function(x) {x[,c(1:3)]})[[1]]))
-        if (same) {
-            new_list <- lapply(names(peak_list), function(x){
-                colnames(peak_list[[x]]) <- c("chr", "start", "end", "name",
-                                              "score", "strand", "signalValue",
-                                              "pValue", "qValue", "peak",
-                                              "read_count", "base_count",
-                                              "width", "frac", x, "group")
-                peak_list[[x]]
-            })
-            names(new_list) <- names(peak_list)
-            reduce_dt <- Reduce(function(...) merge(..., all=F),
-                                lapply(new_list, function(x) {x[,c(1:3,15)]}))
-        } else {
-            warning("Reference peak set is inconsistent between samples")
-            warning("Confirm the same reference peak set was passed to `--frip-ref-peaks`")
-            return(NULL)
+                 results_subdir,
+                 sample_table$sample_name,
+                 paste0("peak_calling_", sample_table$genome),
+                 paste0(sample_table$sample_name, peak_file_name))))]
+               
+    #Only keep samples with valid peak coverage files
+    file_list   <- sample_table$peak_files
+    file_exists <- character()
+    for (i in 1:length(file_list)) {
+        if(file.exists(file.path(file_list[i]))) {
+            file_exists <- append(file_exists, file.path(file_list[i]))
         }
-    } else {
-        peaks_dt = data.table(chr=as.character(),
-                              start=as.numeric(),
-                              end=as.numeric(),
-                              read_count=as.numeric(),
-                              base_count=as.numeric(),
-                              width=as.numeric(),
-                              frac=as.numeric(),
-                              norm=as.numeric())
-        # Concatenate peak files
-        for (file in peak_files) {
-            info <- file.info(file.path(file))
-            if (file.exists(file.path(file)) && info$size != 0) {
-                peaks <- fread(file)
-                colnames(peaks) <- c("chr", "start", "end", "read_count",
-                                     "base_count", "width", "frac", "norm")
-                setkey(peaks, chr, start, end)
-                peaks_dt <- rbind(peaks_dt, peaks)
-            }
+    }
+    files <- data.table(peak_files=file_exists)
+    consensus_peak_files = list()
+    if (nrow(files) == 0) {
+        return(consensus_peak_files)
+    }
+    #sample_table <- sample_table[files, .SD, nomatch=0L,
+    #                             on="peak_files", .SDcols=names(sample_table)]
+    sample_table <- unique(
+        sample_table[sample_table$peak_files %in% files$peak_files,])
+    peak_files   <- sample_table$peak_files
+
+    # Need to group by genome, then create a counts table by genome
+    st_list <- splitDataTable(sample_table, "genome")
+    
+    for (g in names(st_list)) {
+        if (nrow(st_list[[g]]) == 1) {
+            err_msg <- paste0("Found only a single valid peak file for ",
+                              g, ".")
+            warning(err_msg)
+            next
         }
-        
-        peaksGR   <- makeGRangesFromDataFrame(peaks_dt, keep.extra.columns=T)
-        reduceGR  <- reduce(peaksGR)
-        
-        # instead, different column for each sample is the counts columns, plural
-        reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)),
-                                start=start(reduceGR),
-                                end=end(reduceGR))
-        f <- function(x) {list(0)}
-        reduce_dt[, (sample_names) := f()]
-       
-        # for each peak file
-        for (file in peak_files) {
-            info <- file.info(file.path(file))
-            if (file.exists(file.path(file)) && info$size != 0) {
-                peaks   <- fread(file)
-                name    <- gsub("_peaks_coverage.bed","", basename(file))
-                colnames(peaks) <- c("chr", "start", "end", "read_count",
-                                     "base_count", "width", "frac", "norm")
-                setkey(peaks, chr, start, end)
-                
-                reduceGR <- makeGRangesFromDataFrame(reduce_dt)
-                peaksGR  <- makeGRangesFromDataFrame(peaks)
-                hitsGR   <- findOverlaps(query=reduceGR, subject=peaksGR)
-
-                # Weight counts by percent overlap
-                olap   <- pintersect(reduceGR[queryHits(hitsGR)],
-                                     peaksGR[subjectHits(hitsGR)])
-                polap  <- width(olap) / width(peaksGR[subjectHits(hitsGR)])
-                counts <- data.table(index=rep(1:nrow(peaks)),
-                                     counts=peaks$norm*polap)
-                hits   <- data.table(xid=queryHits(hitsGR),
-                                     yid=subjectHits(hitsGR))
-                setkey(hits, yid)
-                setkey(counts, index)
-                out    <- hits[counts, nomatch=0]
-                keep   <- out[out[,.I[which.max(counts)],by=yid]$V1]
-                reduce_dt[keep$xid][[name]] <- keep$counts
-            }
+        if (nrow(st_list[[g]]) == 0) {
+            warning("Unable to find any valid peak files.")
+            warning("Confirm peak files exist for your samples.")
+            next
         }
-        # trim any bad peaks (extend beyond chromosome)
-        # can't be negative
-        reduce_dt[start < 0, start := 0]
-        # ensure sorted
-        setorderv(reduce_dt, cols = c("chr", "start"))
-        # can't extend past chromosome
-        c_path <- sample_table[peak_files == file, c_path]
+        c_path <- unique(sample_table[genome == g, c_path])
         if (file.exists(c_path)) {
             c_size <- fread(c_path)
             colnames(c_size) <- c("chr", "size")
@@ -2792,44 +2718,189 @@ peakCounts <- function(project, output_dir, results_subdir, assets) {
             warning("Unable to load the chromosome sizes file.")
             warning(paste0("Confirm that ", c_path,
                            " is present before continuing."))
+            reduce_dt <- NULL
+        }
+        message(paste0("Calculating ", g, " peak counts for ",
+                       nrow(st_list[[g]]), " samples..."))
+        if (reference) {
+            peaks_dt <- data.table(chr=as.character(),
+                                   start=as.numeric(),
+                                   end=as.numeric(),
+                                   name=as.character(),
+                                   score=as.numeric(),
+                                   strand=as.character(),
+                                   signalValue=as.numeric(),
+                                   pValue=as.numeric(),
+                                   qValue=as.numeric(),
+                                   peak=as.numeric(),
+                                   read_count=as.numeric(),
+                                   base_count=as.numeric(),
+                                   width=as.numeric(),
+                                   frac=as.numeric(),
+                                   norm=as.numeric(),
+                                   group=as.character())
+
+            # Load peak files
+            peaks <- rbindlist(lapply(st_list[[g]]$peak_files, fread))
+            colnames(peaks) <- c("chr", "start", "end", "name", "score",
+                                 "strand", "signalValue", "pValue",
+                                 "qValue", "peak", "read_count",
+                                 "base_count", "width", "frac", "norm")
+            setkey(peaks, chr, start, end)
+            peaks_dt <- rbind(peaks_dt, peaks)
+            peak_set <- copy(peaks)
+            peak_set[,group := gsub(peak_file_name, "", name)]
+            peak_list <- splitDataTable(peak_set, "group")
+
+            # confirm identical chr, start, end coordinates exist in all peak files
+            same <- all(sapply(lapply(peak_list, function(x) {x[,c(1:3)]}),
+                        identical,
+                        lapply(peak_list, function(x) {x[,c(1:3)]})[[1]]))
+            if (same) {
+                new_list <- lapply(names(peak_list), function(x){
+                    colnames(peak_list[[x]]) <- c("chr", "start", "end", "name",
+                                                  "score", "strand", "signalValue",
+                                                  "pValue", "qValue", "peak",
+                                                  "read_count", "base_count",
+                                                  "width", "frac", x, "group")
+                    peak_list[[x]]
+                })
+                names(new_list) <- names(peak_list)
+                reduce_dt       <- Reduce(function(...) merge(..., all=F),
+                    lapply(new_list, function(x) {x[,c(1:3,15)]}))
+            } else {
+                warning(strwrap(prefix = " ", initial = "",
+                        "Reference peak set was inconsistent between samples.
+                         Pass a single reference peak set with the 
+                         `--frip-ref-peaks` argument."))
+                return(NULL)
+            }
+        } else {
+            peaks_dt <- data.table(chr=as.character(),
+                                   start=as.numeric(),
+                                   end=as.numeric(),
+                                   read_count=as.numeric(),
+                                   base_count=as.numeric(),
+                                   width=as.numeric(),
+                                   frac=as.numeric(),
+                                   norm=as.numeric())
+                                  
+            peaks           <- rbindlist(lapply(st_list[[g]]$peak_files, fread))
+            colnames(peaks) <- c("chr", "start", "end", "read_count",
+                                 "base_count", "width", "frac", "norm")
+            setkey(peaks, chr, start, end)
+            peaks_dt  <- rbind(peaks_dt, peaks)
+            # Convert to GRanges for more efficient findOverlaps vs data.table
+            peaksGR   <- makeGRangesFromDataFrame(peaks_dt,
+                                                  keep.extra.columns=TRUE)
+            reduceGR  <- reduce(peaksGR)
+            
+            # instead, different column for each sample is the counts columns, plural
+            reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)),
+                                    start=start(reduceGR),
+                                    end=end(reduceGR))
+            f <- function(x) {list(0)}
+            #reduce_dt[, (sample_names) := f()]
+            # Need to make syntactically valid names
+            valid_names <- make.unique(make.names(st_list[[g]]$sample_name))
+            reduce_dt[, (valid_names) := f()]
+            
+            # Report name changes
+            original_invalid <- setdiff(
+                make.unique(st_list[[g]]$sample_name), valid_names)
+            new_valid        <- setdiff(
+                valid_names, make.unique(st_list[[g]]$sample_name))
+            if(length(new_valid) > 0) {
+                warning(strwrap(prefix = " ", initial = "", 
+                        "Some sample names were changed to be syntactically
+                         valid in R:"))
+                warning(paste(capture.output(print(
+                        data.frame(Original=original_invalid,
+                                   Valid=new_valid))),
+                        collapse = "\n"))
+            }
+            st_list[[g]]$sample_name <- make.unique(make.names(st_list[[g]]$sample_name))
+            # for each peak file
+            i <- 1
+            for (file in st_list[[g]]$peak_files) {
+                info <- file.info(file.path(file))
+                if (file.exists(file.path(file)) && info$size != 0) {
+                    p    <- fread(file)
+                    #name <- gsub("_peaks_coverage.bed","", basename(file))
+                    name <- make.unique(make.names(st_list[[g]][i]$sample_name))
+                    #message(paste0("name: ", name))
+                    i    <- i + 1
+                    #message(paste0("i: ", i))
+                    colnames(p) <- c("chr", "start", "end", "read_count",
+                                     "base_count", "width", "frac", "norm")
+                    setkey(p, chr, start, end)
+                    
+                    reduceGR <- makeGRangesFromDataFrame(reduce_dt)
+                    peaksGR  <- makeGRangesFromDataFrame(p)
+                    hitsGR   <- findOverlaps(query=reduceGR, subject=peaksGR)
+
+                    # Weight counts by percent overlap
+                    olap   <- pintersect(reduceGR[queryHits(hitsGR)],
+                                         peaksGR[subjectHits(hitsGR)])
+                    polap  <- width(olap) / width(peaksGR[subjectHits(hitsGR)])
+                    counts <- data.table(index=rep(1:nrow(p)),
+                                         counts=p$norm*polap)
+                    hits   <- data.table(xid=queryHits(hitsGR),
+                                         yid=subjectHits(hitsGR))
+                    setkey(hits, yid)
+                    setkey(counts, index)
+                    out    <- hits[counts, nomatch=0]
+                    keep   <- out[out[,.I[which.max(counts)],by=yid]$V1]
+                    reduce_dt[keep$xid][[name]] <- keep$counts
+                    rm(p)
+                    invisible(gc())
+                }
+            }
+            # trim any bad peaks (extend beyond chromosome)
+            # can't be negative
+            reduce_dt[start < 0, start := 0]
+            # ensure sorted
+            setorderv(reduce_dt, cols = c("chr", "start"))
+            # can't extend past chromosome
+            for (i in nrow(c_size)) {
+                reduce_dt[chr == c_size$chr[i] & end > c_size$size[i],
+                          end := c_size$size[i]]
+            }
+            columnIndices <- c(4:ncol(reduce_dt))
+            # Drop regions with no coverage
+            keepRows      <- rowMeans(reduce_dt[, ..columnIndices]) > 0
+            reduce_dt     <- reduce_dt[keepRows]
+            # identify reproducible peaks
+            reduce_dt$count <- apply(reduce_dt[, ..columnIndices], 1,
+                                     function(x) sum(x > 0))
+            if (length(st_list[[g]]$peak_files) > 1) {
+                # keep peaks present in 2 or more individual peak sets
+                reduce_dt <- reduce_dt[count >= 2,]
+            }
+            reduce_dt[,count := NULL]
+        }
+        # Create matrix rownames
+        reduce_dt <- cbind(name=paste(reduce_dt$chr,
+                                      reduce_dt$start,
+                                      reduce_dt$end, sep="_"),
+                           reduce_dt)
+        reduce_dt <- reduce_dt[,-c("chr","start","end")]
+
+        # save counts table
+        if (exists("reduce_dt")) {
+                output_file <- file.path(summary_dir,
+                    paste0(project_name, "_", g, "_peaks_coverage.tsv"))
+                # save consensus peak set
+                fwrite(reduce_dt, output_file, sep="\t", col.names=TRUE)
+                counts_path <- c(counts_path, output_file)
+        } else {
+            warning(strwrap(prefix = " ", initial = "",
+                    "Unable to produce a peak coverage file. Check that 
+                     individual peak coverage files exist for your samples."))
             return(NULL)
         }
-        for (i in nrow(c_size)) {
-            reduce_dt[chr == c_size$chr[i] & end > c_size$size[i],
-                      end := c_size$size[i]]
-        }
-        columnIndices <- c(4:ncol(reduce_dt))
-        # Drop regions with no coverage
-        keepRows      <- rowMeans(reduce_dt[, ..columnIndices]) > 0
-        reduce_dt     <- reduce_dt[keepRows]
-        # identify reproducible peaks
-        reduce_dt$count <- apply(reduce_dt[, ..columnIndices], 1,
-                                 function(x) sum(x > 0))
-        if (length(peak_files) > 1) {
-            # keep peaks present in 2 or more individual peak sets
-            reduce_dt <- reduce_dt[count >= 2,]
-        }        
-        reduce_dt[,count := NULL]
-    }
-    # Create matrix rownames
-    reduce_dt <- cbind(name=paste(reduce_dt$chr,
-                                  reduce_dt$start,
-                                  reduce_dt$end, sep="_"),
-                       reduce_dt)
-    reduce_dt <- reduce_dt[,-c("chr","start","end")]
-
-    # save counts table
-    if (exists("reduce_dt")) {
-            counts_path <- file.path(summary_dir,
-                paste0(project_name, "_peaks_coverage.tsv"))
-            # save consensus peak set
-            fwrite(reduce_dt, counts_path, sep="\t", col.names=TRUE)
-            return(counts_path)
-    } else {
-        warning("Unable to produce a peak coverage file.")
-        warning("Check that individual peak coverage files exist for your samples.")
-        return(NULL)
     }
+    return(counts_path)
 }
 
 

From 4c28c91ecc2576986b363d8d7b1fdc2b592edcd4 Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Tue, 4 Aug 2020 13:05:59 -0400
Subject: [PATCH 08/10] update ability to run multiple genomes for the peak
 count tables

---
 tools/PEPATAC_summarizer.R | 48 +++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R
index aaf66a2b..7819f373 100755
--- a/tools/PEPATAC_summarizer.R
+++ b/tools/PEPATAC_summarizer.R
@@ -110,7 +110,6 @@ if (dir.exists(argv$results)) {
     quit()
 }
 
-
 # Get assets
 assets  <- PEPATACr::createAssetsSummary(prj, argv$output, results_subdir)
 if (nrow(assets) == 0) {
@@ -172,7 +171,7 @@ if (!file.exists(complexity_path) || argv$new_start) {
         complexity_flag <- TRUE
     }
 } else {
-    warning("Project level library complexity plot already exists.")
+    message("Project level library complexity plot already exists.")
     message(paste0("Project library complexity plot: ", complexity_path, "\n"))
     complexity_flag <- TRUE
 }
@@ -185,7 +184,7 @@ if (summarizer_flag && complexity_flag) {
 for (genome in unique(genomes)) {
     file_name      <- paste0("_", genome,"_consensusPeaks.narrowPeak")
     consensus_path <- file.path(summary_dir, paste0(project_name, file_name))
-    if (file.exists(consensus_path)) {
+    if (file.exists(consensus_path) && !argv$new_start) {
         message(paste0("Consensus peak set (", genome, "): ",
                        consensus_path, "\n"))
     }
@@ -193,7 +192,7 @@ for (genome in unique(genomes)) {
 
 # Calculate consensus peaks
 if (!file.exists(consensus_path) || argv$new_start) {
-    write(paste0("Creating consensus peak set..."), stdout())
+    #write(paste0("Creating consensus peak set..."), stdout())
     consensus_paths <- PEPATACr::consensusPeaks(prj, argv$output,
                                                 argv$results, assets)
     if (!length(consensus_paths) == 0) {
@@ -213,24 +212,35 @@ if (!file.exists(consensus_path) || argv$new_start) {
     }
 }
 
+# Report existing counts tables
+# TODO: move genome handling out of the called function?
+for (genome in unique(genomes)) {
+    file_name   <- paste0("_", genome,"_peaks_coverage.tsv")
+    counts_path <- file.path(summary_dir, paste0(project_name, file_name))
+    if (file.exists(counts_path) && !argv$new_start) {
+        message(paste0("Peak counts table (", genome, "): ",
+                       counts_path, "\n"))
+    }
+}
+
 # Create count matrix
-counts_path <- file.path(summary_dir,
-                         paste0(project_name, "_peaks_coverage.tsv"))
 if (!file.exists(counts_path) || argv$new_start) {
-    write(paste0("Creating gene count table..."), stdout())
-    counts_path <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets)
-    if (!is.null(counts_path) && file.exists(counts_path)) {
-        message("Counts table: ", counts_path, "\n")
-        icon <- PEPATACr::fileIcon()
-        output_file <- file.path(summary_dir,
-                                 paste0(project_name, "_peaks_coverage.png"))
-        png(filename = output_file, height = 275, width=275,
-            bg="transparent")
-        suppressWarnings(print(icon))
-        invisible(dev.off())
+    #write(paste0("Creating peak count table(s)..."), stdout())
+    counts_paths <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets)
+    if (!length(counts_paths) == 0) {
+        for (counts_table in counts_paths) {
+            if (file.exists(counts_table)) {
+                message("Counts table: ", counts_table, "\n")
+                icon        <- PEPATACr::fileIcon()
+                output_file <- file.path(summary_dir,
+                    paste0(project_name, "_", genome, "_peaks_coverage.png"))
+                png(filename = output_file, height = 275, width=275,
+                    bg="transparent")
+                suppressWarnings(print(icon))
+                invisible(dev.off())
+            }
+        }
     }
-} else {
-   message("Counts table: ", counts_path, "\n")
 }
 
 ################################################################################
\ No newline at end of file

From eb1e9ec6cc728d7f17ed8ee575b44c3ece29706a Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Tue, 4 Aug 2020 13:06:10 -0400
Subject: [PATCH 09/10] updates

---
 pepatac_output_schema.yaml | 11 ++++----
 pipelines/pepatac.py       | 54 +++++++++++++++++++++++++++++++++-----
 2 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml
index 11f400b8..edd2dc1e 100644
--- a/pepatac_output_schema.yaml
+++ b/pepatac_output_schema.yaml
@@ -52,11 +52,12 @@ properties:
   consensus_peaks_file:
     title: "Consensus peaks file"
     description: "A set of consensus peaks across samples."
-    thumbnail_path: "summary/{name}_consensusPeaks.png"
-    path: "summary/{name}_consensusPeaks.narrowPeak"
-    type: image
+    thumbnail_path: "summary/{name}_*_consensusPeaks.png"
+    path: "summary/{name}_*_consensusPeaks.narrowPeak"
+    type: string
   counts_table:
     title: "Project peak coverage file"
     description: "Project peak coverages: chr_start_end X sample"
-    path: "summary/{name}_peaks_coverage.tsv"
-    type: link
+    thumbnail_path: "summary/{name}_*_peaks_coverage.png"
+    path: "summary/{name}_*_peaks_coverage.tsv"
+    type: string
diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py
index c36af5ce..4e80685d 100755
--- a/pipelines/pepatac.py
+++ b/pipelines/pepatac.py
@@ -123,6 +123,10 @@ def parse_arguments():
                         help="Only keep minimal, essential output to conserve "
                              "disk space.")
 
+    parser.add_argument("--skipqc", dest="skipqc", action='store_true',
+                        help="Skip FastQC. Useful for bugs in FastQC "
+                             "that appear with some sequence read files.")
+
     parser.add_argument("-V", "--version", action="version",
                         version="%(prog)s {v}".format(v=__version__))
 
@@ -712,6 +716,13 @@ def main():
         trimming_prefix = out_fastq_pre
     trimmed_fastq = trimming_prefix + "_R1_trim.fastq"
     trimmed_fastq_R2 = trimming_prefix + "_R2_trim.fastq"
+    fastqc_folder = os.path.join(param.outfolder, "fastqc")
+    fastqc_report = os.path.join(fastqc_folder,
+        trimming_prefix + "_R1_trim_fastqc.html")
+    fastqc_report_R2 = os.path.join(fastqc_folder,
+        trimming_prefix + "_R2_trim_fastqc.html")
+    if ngstk.check_command(tools.fastqc):
+        ngstk.make_dir(fastqc_folder)
 
     # Create trimming command(s).
     if args.trimmer == "pyadapt":
@@ -726,7 +737,7 @@ def main():
             ("-o", out_fastq_pre),
             "-u"
         ]
-        cmd = build_command(trim_cmd_chunks)
+        trim_cmd = build_command(trim_cmd_chunks)
 
     elif args.trimmer == "skewer":
         # Create the primary skewer command.
@@ -762,7 +773,7 @@ def main():
                                       for old, new in skewer_filename_pairs]
 
         # Pypiper submits the commands serially.
-        cmd = [trimming_command] + trimming_renaming_commands
+        trim_cmd = [trimming_command] + trimming_renaming_commands
 
     else:
         # Default to trimmomatic.
@@ -780,13 +791,42 @@ def main():
             trimming_prefix + "_R2_unpaired.fq" if args.paired_end else "",
             "ILLUMINACLIP:" + res.adapters + ":2:30:10"
         ]
-        cmd = build_command(trim_cmd_chunks)
+        trim_cmd = build_command(trim_cmd_chunks)
+
+    def check_trim():
+        pm.info("Evaluating read trimming")
+
+        if args.paired_end and not trimmed_fastq_R2:
+            pm.warning("Specified paired-end but no R2 file")
+
+        n_trim = float(ngstk.count_reads(trimmed_fastq, args.paired_end))
+        pm.report_result("Trimmed_reads", int(n_trim))
+        try:
+            rr = float(pm.get_stat("Raw_reads"))
+        except:
+            pm.warning("Can't calculate trim loss rate without raw read result.")
+        else:
+            pm.report_result(
+                "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2))
+
+        # Also run a fastqc (if installed/requested)
+        if fastqc_folder and not args.skipqc:
+            if fastqc_folder and os.path.isabs(fastqc_folder):
+                ngstk.make_sure_path_exists(fastqc_folder)
+            cmd = (tools.fastqc + " --noextract --outdir " +
+                   fastqc_folder + " " + trimmed_fastq)
+            pm.run(cmd, fastqc_report, nofail=False)
+            pm.report_object("FastQC report r1", fastqc_report)
+
+            if args.paired_end and trimmed_fastq_R2:
+                cmd = (tools.fastqc + " --noextract --outdir " +
+                       fastqc_folder + " " + trimmed_fastq_R2)
+                pm.run(cmd, fastqc_report_R2, nofail=False)
+                pm.report_object("FastQC report r2", fastqc_report_R2)
 
     if not os.path.exists(rmdup_bam) or args.new_start:
-        pm.run(cmd, trimmed_fastq,
-               follow=ngstk.check_trim(
-                   trimmed_fastq, args.paired_end, trimmed_fastq_R2,
-                   fastqc_folder=os.path.join(param.outfolder, "fastqc")))
+        pm.debug("trim_cmd: {}".format(trim_cmd))
+        pm.run(trim_cmd, trimmed_fastq, follow=check_trim) 
 
     pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True)
     pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True)

From 21144a2104a4a50e3c30df161d42325af66650ac Mon Sep 17 00:00:00 2001
From: jpsmith5 <jasonsmith@virginia.edu>
Date: Tue, 4 Aug 2020 13:08:47 -0400
Subject: [PATCH 10/10] version bump

---
 docs/changelog.md    | 6 ++++++
 docs/usage.md        | 6 ++++--
 pipelines/pepatac.py | 2 +-
 usage.txt            | 6 ++++--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 831b7165..36a1356d 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,6 +1,12 @@
 # Change log
 All notable changes to this project will be documented in this file.
 
+## [0.9.2] -- 2020-08-04
+
+### Changed
+ - Reduce memory requirements of consensus peak generation
+ - Enable multiple genome projects for peak count tables
+
 ## [0.9.1] -- 2020-07-13
 
 ### Changed
diff --git a/docs/usage.md b/docs/usage.md
index a1122425..a5fb5244 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -19,9 +19,9 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
                   [--peak-type {fixed,variable}] [--extend EXTEND]
                   [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob]
                   [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite]
-                  [-V]
+                  [--skipqc] [-V]
 
-PEPATAC version 0.9.1
+PEPATAC version 0.9.2
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -86,6 +86,8 @@ optional arguments:
   --noFIFO              Do NOT use named pipes during prealignments
   --lite                Only keep minimal, essential output to conserve disk
                         space.
+  --skipqc              Skip FastQC. Useful for bugs in FastQC that appear
+                        with some sequence read files.
   -V, --version         show program's version number and exit
 
 required named arguments:
diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py
index 4e80685d..0117c505 100755
--- a/pipelines/pepatac.py
+++ b/pipelines/pepatac.py
@@ -5,7 +5,7 @@
 
 __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"]
 __email__ = "jasonsmith@virginia.edu"
-__version__ = "0.9.1"
+__version__ = "0.9.2"
 
 
 from argparse import ArgumentParser
diff --git a/usage.txt b/usage.txt
index 6c31a986..5b34720b 100644
--- a/usage.txt
+++ b/usage.txt
@@ -11,9 +11,9 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
                   [--peak-type {fixed,variable}] [--extend EXTEND]
                   [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob]
                   [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite]
-                  [-V]
+                  [--skipqc] [-V]
 
-PEPATAC version 0.9.1
+PEPATAC version 0.9.2
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -78,6 +78,8 @@ optional arguments:
   --noFIFO              Do NOT use named pipes during prealignments
   --lite                Only keep minimal, essential output to conserve disk
                         space.
+  --skipqc              Skip FastQC. Useful for bugs in FastQC that appear
+                        with some sequence read files.
   -V, --version         show program's version number and exit
 
 required named arguments: