Merge pull request #161 from databio/dev

0.12.0
databio · Aug 14, 2019 · 49ceff6 · 49ceff6
2 parents a5b7f12 + 54e398c
commit 49ceff6
Show file tree

Hide file tree

Showing 14 changed files with 776 additions and 265 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,20 +1,35 @@
 # Changelog
 
+## [0.12.0] -- unreleased
+
+### Added
+- Use profile to determine total elapsed time
+- `logging` functions directly on `PipelineManager`
+- Re-export `add_logging_options` from `logmuse`, for direct use by a pipeline author.
+- `logger_via_cli` that defaults to the `strict=False` behavior of the same-named function from `logmuse`
+- Use logging for pypiper-generated output.
+
+### Fixed
+- Fix childless processes memory monitoring issue
+- Fix problems reading runtime from a pipeline profile TSV that may be formatted in either of two styles
+- Fix problems running containerized executables that would sometimes hang
+- Fix inaccurate elapsed time accumulation 
+
+### Changed
+- The hashes in the pipeline profile are produced from the entire original command, even if it is a pipe  
+
 ## [0.11.3] -- 2019-06-17
 ### Fixed
 - Fixed a bug that caused an OSError removing lock files for some filesystems.
 
-
 ## [0.11.2] -- 2019-06-06
 ### Fixed
 - Elevate `attmap` dependency bound to require inclusion of improved path expansion behavior.
 
-
 ## [0.11.1] -- 2019-05-30
 ### Fixed
 - Elevate `attmap` dependency bound to require inclusion of a bugfix there.
 
-
 ## [0.11.0] -- 2019-05-13
 - Improve python3 handling of integers and strings
 - Fixed a bug with cleanup scripts in `dirty` mode
@@ -25,7 +40,6 @@
 - Some performance improvements for ngstk functions
 - Allow `ngstk.input_to_fastq` to yield gzipped fastq files
 
-
 ## [0.10.0] -- 2019-03-22
 - Fixed a bug that raised exception with empty commands
 - Fixed the pipeline profiling issues

diff --git a/example_pipelines/logmuse_example.py b/example_pipelines/logmuse_example.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+
+"""
+Counts reads.
+"""
+
+__author__ = "Nathan Sheffield"
+__email__ = "[email protected]"
+__license__ = "GPL3"
+__version__ = "0.1"
+
+from argparse import ArgumentParser
+import os, re
+import sys
+import subprocess
+import yaml
+import pypiper
+
+
+
+def build_argparser():
+
+    parser = ArgumentParser(
+        description="A pipeline to count the number of reads and file size. Accepts"
+        " BAM, fastq, or fastq.gz files.")
+
+    # First, add standard arguments from Pypiper.
+    # groups="pypiper" will add all the arguments that pypiper uses,
+    # and adding "common" adds arguments for --input and --sample--name
+    # and "output_parent". You can read more about your options for standard
+    # arguments in the pypiper docs (section "command-line arguments")
+    parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs", "logmuse"],
+                                        args=["output-parent", "config"],
+                                        required=['sample-name', 'output-parent'])
+
+    # Add any pipeline-specific arguments if you like here.
+
+    # args for `output_parent` and `sample_name` were added by the standard 
+    # `add_pypiper_args` function. 
+
+    return parser
+
+def run_pipeline():
+    # A good practice is to make an output folder for each sample, housed under
+    # the parent output folder, like this:
+    outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
+
+    # Create a PipelineManager object and start the pipeline
+    pm = pypiper.PipelineManager(name="logmuse-test",
+                                 outfolder=outfolder, 
+                                 args=args)
+    pm.info("Getting started!")
+    # NGSTk is a "toolkit" that comes with pypiper, providing some functions
+    # for dealing with genome sequence data. You can read more about toolkits in the
+    # documentation
+
+    files = [str(x) + ".tmp" for x in range(1,20)]
+
+    pm.run("touch " + " ".join(files), target=files, clean=True)
+
+    # Create a ngstk object
+    ngstk = pypiper.NGSTk(pm=pm)
+
+    raw_folder = os.path.join(outfolder, "raw/")
+    fastq_folder = os.path.join(outfolder, "fastq/")
+
+    # Merge/Link sample input and Fastq conversion
+    # These commands merge (if multiple) or link (if single) input files,
+    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.
+
+    # We'll start with a timestamp that will provide a division for this section
+    # in the log file
+    pm.timestamp("### Merge/link and fastq conversion: ")
+
+    # Now we'll rely on 2 NGSTk functions that can handle inputs of various types
+    # and convert these to fastq files.
+
+    local_input_files = ngstk.merge_or_link(
+                            [args.input, args.input2],
+                            raw_folder,
+                            args.sample_name)
+
+    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
+                                                local_input_files,
+                                                args.sample_name,
+                                                args.paired_end,
+                                                fastq_folder)
+
+
+    # Now we'll use another NGSTk function to grab the file size from the input files
+    #
+    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
+
+
+    # And then count the number of reads in the file
+
+    n_input_files = len(list(filter(bool, local_input_files)))
+
+    raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end)) 
+                    for input_file in local_input_files]) / n_input_files
+
+    # Finally, we use the report_result() function to print the output and 
+    # log the key-value pair in the standard stats.tsv file
+    pm.report_result("Raw_reads", str(raw_reads))
+
+    # Cleanup
+    pm.stop_pipeline()
+
+
+if __name__ == '__main__':
+    try:
+        parser = build_argparser()
+        args = parser.parse_args()
+
+        if not args.input or not args.output_parent:
+            parser.print_help()
+            raise SystemExit
+
+        if args.single_or_paired == "paired":
+            args.paired_end = True
+        else:
+            args.paired_end = False
+
+        sys.exit(run_pipeline())
+    except KeyboardInterrupt:
+        sys.exit(1)
diff --git a/pypiper/__init__.py b/pypiper/__init__.py
@@ -5,3 +5,6 @@
 from .pipeline import *
 from .exceptions import *
 from .stage import *
+
+# Implicitly re-export so logmuse usage by pipeline author routes through here.
+from logmuse import add_logging_options
diff --git a/pypiper/_version.py b/pypiper/_version.py
@@ -1 +1 @@
-__version__ = "0.11.3"
+__version__ = "0.12.0dev"
diff --git a/pypiper/const.py b/pypiper/const.py
@@ -4,3 +4,4 @@
 CHECKPOINT_EXTENSION = ".checkpoint"
 PIPELINE_CHECKPOINT_DELIMITER = "_"
 STAGE_NAME_SPACE_REPLACEMENT = "-"
+PROFILE_COLNAMES = ['pid', 'hash', 'cid', 'runtime', 'mem', 'cmd', 'lock']