Merge pull request #143 from databio/dev

Dev
databio · Aug 4, 2020 · ad34491 · ad34491
2 parents 10e75d1 + 21144a2
commit ad34491
Show file tree

Hide file tree

Showing 9 changed files with 406 additions and 260 deletions.
diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R
diff --git a/docs/annotation.md b/docs/annotation.md
@@ -2,7 +2,7 @@
 
 The pipeline uses reference data at various stages. If you're using a common genome assembly, these resources are pre-built and can be easily downloaded using `refgenie pull`, as described in the setup instructions. If the resources are not available, you'll have to build them. Read [how to build `refgenie` assets](http://refgenie.databio.org/en/latest/build/) in the `refgenie` docs. You may also [learn about the current buildable assets](http://refgenie.databio.org/en/latest/available_assets/) for which `refgenie` has a recipe.
 
-##Use a custom `feat_annotation` asset
+## Use a custom `feat_annotation` asset
 
 The pipeline will calculate the fraction of reads in genomic features using the `refgenie` `feat_annotation` asset, but you can also specify this file yourself at the command line (`--anno-name`).
 

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,6 +1,12 @@
 # Change log
 All notable changes to this project will be documented in this file.
 
+## [0.9.2] -- 2020-08-04
+
+### Changed
+ - Reduce memory requirements of consensus peak generation
+ - Enable multiple genome projects for peak count tables
+
 ## [0.9.1] -- 2020-07-13
 
 ### Changed

diff --git a/docs/usage.md b/docs/usage.md
@@ -19,9 +19,9 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
                   [--peak-type {fixed,variable}] [--extend EXTEND]
                   [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob]
                   [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite]
-                  [-V]
+                  [--skipqc] [-V]
 
-PEPATAC version 0.9.1
+PEPATAC version 0.9.2
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -86,6 +86,8 @@ optional arguments:
   --noFIFO              Do NOT use named pipes during prealignments
   --lite                Only keep minimal, essential output to conserve disk
                         space.
+  --skipqc              Skip FastQC. Useful for bugs in FastQC that appear
+                        with some sequence read files.
   -V, --version         show program's version number and exit
 
 required named arguments:

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -2,10 +2,10 @@ theme: databio
 
 site_name: PEPATAC
 site_author: Jason Smith
-site_url: http://code.databio.org/PEPATAC/
+site_url: http://pepatac.databio.org/
 site_logo: img/pepatac_logo_white.svg
 repo_url: https://github.com/databio/pepatac/
-google_analytics: ['UA-127092878-1', 'code.databio.org/PEPATAC']
+google_analytics: ['UA-127092878-1', 'pepatac.databio.org']
 
 markdown_extensions:
   - fontawesome_markdown  # pip install --user fontawesome-markdown
@@ -47,6 +47,3 @@ navbar:
   - text: Software & Data
     icon: fa-code fa-lg
     href: http://databio.org/software/
-  # - text: Contact us
-  #   icon: fa-envelope
-  #   href: contact
diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml
@@ -52,11 +52,12 @@ properties:
   consensus_peaks_file:
     title: "Consensus peaks file"
     description: "A set of consensus peaks across samples."
-    thumbnail_path: "summary/{name}_consensusPeaks.png"
-    path: "summary/{name}_consensusPeaks.narrowPeak"
-    type: image
+    thumbnail_path: "summary/{name}_*_consensusPeaks.png"
+    path: "summary/{name}_*_consensusPeaks.narrowPeak"
+    type: string
   counts_table:
     title: "Project peak coverage file"
     description: "Project peak coverages: chr_start_end X sample"
-    path: "summary/{name}_peaks_coverage.tsv"
-    type: link
+    thumbnail_path: "summary/{name}_*_peaks_coverage.png"
+    path: "summary/{name}_*_peaks_coverage.tsv"
+    type: string
diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py
@@ -5,7 +5,7 @@
 
 __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"]
 __email__ = "[email protected]"
-__version__ = "0.9.1"
+__version__ = "0.9.2"
 
 
 from argparse import ArgumentParser
@@ -123,6 +123,10 @@ def parse_arguments():
                         help="Only keep minimal, essential output to conserve "
                              "disk space.")
 
+    parser.add_argument("--skipqc", dest="skipqc", action='store_true',
+                        help="Skip FastQC. Useful for bugs in FastQC "
+                             "that appear with some sequence read files.")
+
     parser.add_argument("-V", "--version", action="version",
                         version="%(prog)s {v}".format(v=__version__))
 
@@ -712,6 +716,13 @@ def main():
         trimming_prefix = out_fastq_pre
     trimmed_fastq = trimming_prefix + "_R1_trim.fastq"
     trimmed_fastq_R2 = trimming_prefix + "_R2_trim.fastq"
+    fastqc_folder = os.path.join(param.outfolder, "fastqc")
+    # Use the basename: os.path.join drops fastqc_folder for absolute parts.
+    fqc_pre = os.path.join(fastqc_folder, os.path.basename(trimming_prefix))
+    fastqc_report = fqc_pre + "_R1_trim_fastqc.html"
+    fastqc_report_R2 = fqc_pre + "_R2_trim_fastqc.html"
+    if ngstk.check_command(tools.fastqc):
+        ngstk.make_dir(fastqc_folder)
 
     # Create trimming command(s).
     if args.trimmer == "pyadapt":
@@ -726,7 +737,7 @@ def main():
             ("-o", out_fastq_pre),
             "-u"
         ]
-        cmd = build_command(trim_cmd_chunks)
+        trim_cmd = build_command(trim_cmd_chunks)
 
     elif args.trimmer == "skewer":
         # Create the primary skewer command.
@@ -762,7 +773,7 @@ def main():
                                       for old, new in skewer_filename_pairs]
 
         # Pypiper submits the commands serially.
-        cmd = [trimming_command] + trimming_renaming_commands
+        trim_cmd = [trimming_command] + trimming_renaming_commands
 
     else:
         # Default to trimmomatic.
@@ -780,13 +791,42 @@ def main():
             trimming_prefix + "_R2_unpaired.fq" if args.paired_end else "",
             "ILLUMINACLIP:" + res.adapters + ":2:30:10"
         ]
-        cmd = build_command(trim_cmd_chunks)
+        trim_cmd = build_command(trim_cmd_chunks)
+
+    def check_trim():
+        """Report trimming stats, then run FastQC unless --skipqc is set."""
+        pm.info("Evaluating read trimming")
+        if args.paired_end and not trimmed_fastq_R2:
+            pm.warning("Specified paired-end but no R2 file")
+
+        n_trim = float(ngstk.count_reads(trimmed_fastq, args.paired_end))
+        pm.report_result("Trimmed_reads", int(n_trim))
+        try:
+            rr = float(pm.get_stat("Raw_reads"))
+            loss_rate = round((rr - n_trim) * 100 / rr, 2)
+        except Exception:
+            # Raw_reads is unavailable, non-numeric, or zero.
+            pm.warning("Can't calculate trim loss rate without raw read result.")
+        else:
+            pm.report_result("Trim_loss_rate", loss_rate)
+
+        # Also run FastQC on the trimmed reads unless --skipqc was given.
+        if fastqc_folder and not args.skipqc:
+            if os.path.isabs(fastqc_folder):
+                ngstk.make_sure_path_exists(fastqc_folder)
+            cmd = (tools.fastqc + " --noextract --outdir " +
+                   fastqc_folder + " " + trimmed_fastq)
+            pm.run(cmd, fastqc_report, nofail=False)
+            pm.report_object("FastQC report r1", fastqc_report)
+            if args.paired_end and trimmed_fastq_R2:
+                cmd = (tools.fastqc + " --noextract --outdir " +
+                       fastqc_folder + " " + trimmed_fastq_R2)
+                pm.run(cmd, fastqc_report_R2, nofail=False)
+                pm.report_object("FastQC report r2", fastqc_report_R2)
 
     if not os.path.exists(rmdup_bam) or args.new_start:
-        pm.run(cmd, trimmed_fastq,
-               follow=ngstk.check_trim(
-                   trimmed_fastq, args.paired_end, trimmed_fastq_R2,
-                   fastqc_folder=os.path.join(param.outfolder, "fastqc")))
+        pm.debug("trim_cmd: {}".format(trim_cmd))
+        pm.run(trim_cmd, trimmed_fastq, follow=check_trim) 
 
     pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True)
     pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True)
@@ -875,8 +915,9 @@ def main():
     pm.clean_add(tempdir)
 
     # If there are no prealignments, unmap_fq1 will be unzipped
-    if pypiper.is_gzipped_fastq(unmap_fq1):
+    if os.path.exists(unmap_fq1 + ".gz"):
         unmap_fq1 = unmap_fq1 + ".gz"
+    if os.path.exists(unmap_fq2 + ".gz"):
         unmap_fq2 = unmap_fq2 + ".gz"
 
     bt2_index = os.path.join(rgc.seek(args.genome_assembly, BT2_IDX_KEY))
@@ -1742,7 +1783,7 @@ def report_peak_count():
                     black_local = os.path.join(raw_folder,
                                                args.genome_assembly +
                                                "_blacklist.bed")
-                    cmd = ("ln -sf " + res.feat_annotation + " " + black_local)
+                    cmd = ("ln -sf " + res.blacklist + " " + black_local)
                     pm.run(cmd, black_local)
                 else:
                     print("Skipping peak filtering...")

diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R
@@ -110,7 +110,6 @@ if (dir.exists(argv$results)) {
     quit()
 }
 
-
 # Get assets
 assets  <- PEPATACr::createAssetsSummary(prj, argv$output, results_subdir)
 if (nrow(assets) == 0) {
@@ -172,7 +171,7 @@ if (!file.exists(complexity_path) || argv$new_start) {
         complexity_flag <- TRUE
     }
 } else {
-    warning("Project level library complexity plot already exists.")
+    message("Project level library complexity plot already exists.")
     message(paste0("Project library complexity plot: ", complexity_path, "\n"))
     complexity_flag <- TRUE
 }
@@ -185,15 +184,15 @@ if (summarizer_flag && complexity_flag) {
 for (genome in unique(genomes)) {
     file_name      <- paste0("_", genome,"_consensusPeaks.narrowPeak")
     consensus_path <- file.path(summary_dir, paste0(project_name, file_name))
-    if (file.exists(consensus_path)) {
+    if (file.exists(consensus_path) && !argv$new_start) {
         message(paste0("Consensus peak set (", genome, "): ",
                        consensus_path, "\n"))
     }
 }
 
 # Calculate consensus peaks
 if (!file.exists(consensus_path) || argv$new_start) {
-    write(paste0("Creating consensus peak set..."), stdout())
+    #write(paste0("Creating consensus peak set..."), stdout())
     consensus_paths <- PEPATACr::consensusPeaks(prj, argv$output,
                                                 argv$results, assets)
     if (!length(consensus_paths) == 0) {
@@ -213,24 +212,35 @@ if (!file.exists(consensus_path) || argv$new_start) {
     }
 }
 
+# Report existing counts tables; counts_path keeps the last genome's path.
+# TODO: move genome handling out of the called function?
+for (genome in unique(genomes)) {
+    file_name   <- paste0("_", genome,"_peaks_coverage.tsv")
+    counts_path <- file.path(summary_dir, paste0(project_name, file_name))
+    if (file.exists(counts_path) && !argv$new_start) {
+        message(paste0("Peak counts table (", genome, "): ",
+                       counts_path, "\n"))
+    }
+}
+
 # Create count matrix
-counts_path <- file.path(summary_dir,
-                         paste0(project_name, "_peaks_coverage.tsv"))
 if (!file.exists(counts_path) || argv$new_start) {
-    write(paste0("Creating gene count table..."), stdout())
-    counts_path <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets)
-    if (!is.null(counts_path) && file.exists(counts_path)) {
-        message("Counts table: ", counts_path, "\n")
-        icon <- PEPATACr::fileIcon()
-        output_file <- file.path(summary_dir,
-                                 paste0(project_name, "_peaks_coverage.png"))
-        png(filename = output_file, height = 275, width=275,
-            bg="transparent")
-        suppressWarnings(print(icon))
-        invisible(dev.off())
+    # Each table gets an icon named after that table's own file name.
+    counts_paths <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets)
+    if (!length(counts_paths) == 0) {
+        for (counts_table in counts_paths) {
+            if (file.exists(counts_table)) {
+                message("Counts table: ", counts_table, "\n")
+                icon        <- PEPATACr::fileIcon()
+                output_file <- file.path(summary_dir, paste0(
+                    tools::file_path_sans_ext(basename(counts_table)), ".png"))
+                png(filename = output_file, height = 275, width=275,
+                    bg="transparent")
+                suppressWarnings(print(icon))
+                invisible(dev.off())
+            }
+        }
     }
-} else {
-   message("Counts table: ", counts_path, "\n")
 }
 
 ################################################################################
diff --git a/usage.txt b/usage.txt
@@ -11,9 +11,9 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
                   [--peak-type {fixed,variable}] [--extend EXTEND]
                   [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob]
                   [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite]
-                  [-V]
+                  [--skipqc] [-V]
 
-PEPATAC version 0.9.1
+PEPATAC version 0.9.2
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -78,6 +78,8 @@ optional arguments:
   --noFIFO              Do NOT use named pipes during prealignments
   --lite                Only keep minimal, essential output to conserve disk
                         space.
+  --skipqc              Skip FastQC. Useful for bugs in FastQC that appear
+                        with some sequence read files.
   -V, --version         show program's version number and exit
 
 required named arguments: