diff --git a/.travis.yml b/.travis.yml index f345282e..2308e86f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python python: - "2.7" - - "3.4" - "3.5" - "3.6" os: diff --git a/README.md b/README.md index f17aa958..046003da 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,9 @@ # Pypiper [![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest) -[![Build Status](https://travis-ci.org/databio/pypiper.svg?branch=master)](https://travis-ci.org/databio/pypiper) +[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml) +[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) +[![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) A lightweight python toolkit for gluing together restartable, robust shell pipelines. Learn more in the [documentation](http://pypiper.databio.org). diff --git a/docs/README.md b/docs/README.md index c13e8ba0..00f4b633 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,12 +1,17 @@ # a developer's pipeline framework [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) +[![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper) +[![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest) +[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) ## What is pypiper? `Pypiper` is a **development-oriented** pipeline framework. It is a python package that helps you write robust pipelines directly in python, handling mundane tasks like restartability, monitoring for time and memory use, monitoring job status, copious log output, robust error handling, easy debugging tools, and guaranteed file output integrity. + ## What makes pypiper better? With Pypiper, **simplicity is paramount**. Prerequisites are few: base python and 2 common packages (`pyyaml` and `psutil`). It should take fewer than 15 minutes to build your first pipeline and only an hour or two to learn the advanced features. Pypiper pipelines are: diff --git a/docs/changelog.md b/docs/changelog.md index ddecfdf5..34a500c9 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,9 @@ # Changelog +## [0.13.0] -- 2023-06-29 +### Added + +- [pipestat](http://pipestat.databio.org/en/latest/) support ## [0.12.3] -- 2022-01-25 @@ -11,7 +15,6 @@ ### Fixed - Removed use2to3 for compatibility with setuptools 58 - ## [0.12.1] -- 2019-08-29 ### Fixed diff --git a/docs/conf.py b/docs/conf.py index 27ec7815..0566b0b9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,66 +12,72 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-#sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../../')) +# sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath("../../")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'pypiper' -copyright = u'2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' +project = "pypiper" +copyright = "2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = open(os.path.join("..", "..", "pypiper", "_version.py")).read().strip().split(" ")[-1].strip('"') +version = ( + open(os.path.join("..", "..", "pypiper", "_version.py")) + .read() + .strip() + .split(" ")[-1] + .strip('"') +) # The full version, including alpha/beta/rc tags. release = version # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -79,27 +85,27 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. 
-#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- @@ -115,122 +121,125 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. 
-htmlhelp_basename = 'pypiperdoc' +htmlhelp_basename = "pypiperdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'pypiper.tex', u'pypiper Documentation', - u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', 'manual'), + ( + "index", + "pypiper.tex", + "pypiper Documentation", + "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -238,12 +247,17 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pypiper', u'pypiper Documentation', - [u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'], 1) + ( + "index", + "pypiper", + "pypiper Documentation", + ["Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"], + 1, + ) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -252,93 +266,99 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pypiper', u'pypiper Documentation', - u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', 'pypiper', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "pypiper", + "pypiper Documentation", + "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + "pypiper", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. 
-epub_title = u'pypiper' -epub_author = u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' -epub_publisher = u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' -epub_copyright = u'2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' +epub_title = "pypiper" +epub_author = "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_publisher = "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_copyright = "2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" # The basename for the epub file. It defaults to the project name. -#epub_basename = u'pypiper' +# epub_basename = u'pypiper' # The HTML theme for the epub output. Since the default themes are not optimized # for small screen space, using the same theme for HTML and epub output is # usually not wise. This defaults to 'epub', a theme designed to save visual # space. -#epub_theme = 'epub' +# epub_theme = 'epub' # The language of the text. It defaults to the language option # or en if the language is not set. -#epub_language = '' +# epub_language = '' # The scheme of the identifier. Typical schemes are ISBN or URL. -#epub_scheme = '' +# epub_scheme = '' # The unique identifier of the text. This can be a ISBN number # or the project homepage. -#epub_identifier = '' +# epub_identifier = '' # A unique identification for the text. -#epub_uid = '' +# epub_uid = '' # A tuple containing the cover image and cover page html template filenames. -#epub_cover = () +# epub_cover = () # A sequence of (type, uri, title) tuples for the guide element of content.opf. -#epub_guide = () +# epub_guide = () # HTML files that should be inserted before the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_pre_files = [] +# epub_pre_files = [] # HTML files shat should be inserted after the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_post_files = [] +# epub_post_files = [] # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # The depth of the table of contents in toc.ncx. -#epub_tocdepth = 3 +# epub_tocdepth = 3 # Allow duplicate toc entries. -#epub_tocdup = True +# epub_tocdup = True # Choose between 'default' and 'includehidden'. -#epub_tocscope = 'default' +# epub_tocscope = 'default' # Fix unsupported image types using the PIL. -#epub_fix_images = False +# epub_fix_images = False # Scale large images. -#epub_max_image_width = 0 +# epub_max_image_width = 0 # How to display URL addresses: 'footnote', 'no', or 'inline'. -#epub_show_urls = 'inline' +# epub_show_urls = 'inline' # If false, no index is generated. -#epub_use_index = True +# epub_use_index = True # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None} +intersphinx_mapping = {"http://docs.python.org/": None} diff --git a/docs/outputs.md b/docs/outputs.md index 30ad8b5e..786086b1 100644 --- a/docs/outputs.md +++ b/docs/outputs.md @@ -9,7 +9,7 @@ Assume you are using a pypiper pipeline named `PIPE` ( it passes `name="PIPE"` t * **PIPE_status.flag** As the pipeline runs, it produces a flag in the output directory, which can be either `PIPE_running.flag`, `PIPE_failed.flag`, or `PIPE_completed.flag`. These flags make it easy to assess the current state of running pipelines for individual samples, and for many samples in a project simultaneously. 
-* **stats.tsv**
+* **stats.yaml**
 
 Any results reported by the pipeline are saved as key-value pairs in this file, for easy parsing.
 
 * **PIPE_profile.md**
diff --git a/docs/pipestat.md b/docs/pipestat.md
new file mode 100644
index 00000000..77534f09
--- /dev/null
+++ b/docs/pipestat.md
@@ -0,0 +1,122 @@
+# Pipestat
+
+Starting with pypiper v0.13.0, [pipestat](http://pipestat.databio.org) is the recommended way of reporting pipeline statistics.
+You can browse the pipestat documentation to learn more about it, but briefly, pipestat is a tool that standardizes reporting of pipeline results. It provides 1) a standard specification for how pipeline outputs should be stored; and 2) an implementation to easily write results to that format from within Python or from the command line.
+
+## Advancements
+
+There are multiple advantages to using pipestat instead of the current pipeline results reporting system:
+
+1. **Database results storage:** the results can be stored either in a database or in a YAML-formatted results file. This way a pypiper pipeline running in an ephemeral compute environment can report the results to the database and exit. There is no need to sync the results with a central results store.
+2. **Strict and clear results definition:** all the results that can be reported by a pipeline run *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) that in the simplest case just indicates the result's type. This allows pipestat clients to *reliably* gather all the possible results and related metadata.
+3. **On-the-fly results validation:** the schema is used to validate and/or convert the reported result to a strictly determined type, which makes the connection of pypiper with downstream pipeline results processing software seamless.
+4. **Unified, pipeline-agnostic results interface:** other pipelines, possibly created with different pipeline frameworks, can read and write results via the Python API or the command line interface. This feature significantly increases your pipeline interoperability.
+
+## Setup
+
+In order to start reporting results with pipestat in your pipeline, all you need to do is define a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format):
+
+```yaml
+my_int_result:
+  type: integer
+  description: "This is my first result"
+my_str_result:
+  type: string
+```
+
+And in the simplest case... that's it! Now you can use the `pipestat` property of the `PipelineManager` object to report and retrieve results.
+
+Pypiper will *by default* use a YAML-formatted file to store the reported results in the selected `outfolder` and will look for a `pipestat_results_schema.yaml` file in the pipeline Python script directory.
+
+### Advanced features
+
+Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up.
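+
+For contrast with the advanced options below, here is a minimal end-to-end sketch of the default setup described above. It is only an illustration: the pipeline name, output folder, and reported values are arbitrary assumptions, and it expects a `pipestat_results_schema.yaml` like the one shown earlier to sit next to the pipeline script.
+
+```python
+import pypiper
+
+# Creating the manager also sets up pipestat, using the default schema
+# location and a YAML results file inside the output folder.
+pm = pypiper.PipelineManager(name="example", outfolder="pipeline_output/")
+
+# Report results defined in the schema via the pipestat property
+# (the same call is shown in the Usage section below).
+pm.pipestat.report(values={"my_int_result": 10, "my_str_result": "test"})
+
+pm.stop_pipeline()
+```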
+
+#### Configure custom pipestat options
+
+You can configure pipestat by passing arguments with custom values to the `pypiper.PipelineManager` constructor:
+
+```python
+pm = pypiper.PipelineManager(
+    ...,
+    pipestat_schema="custom_results_schema.yaml",
+    pipestat_results_file="custom_results_file.yaml",
+    pipestat_sample_name="my_record",
+    pipestat_project_name="my_namespace",
+    pipestat_config="custom_pipestat_config.yaml",
+)
+```
+
+#### Use a database to store reported results
+
+In order to establish a database connection, pipestat requires a few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor.
+
+This is an example of such a file:
+
+```yaml
+database:
+  name: pypiper # database name
+  user: pypiper # database user name
+  password: pypiper # database password
+  host: localhost # database host address
+  port: 5433 # port the database is running on
+  dialect: postgresql # type of the database
+  driver: psycopg2 # driver to use to communicate
+```
+
+For reference, here is a Docker command that would run a PostgreSQL instance that could be used to store the pipeline results when configured with the configuration file above:
+
+```console
+docker volume create postgres-data
+
+docker run -d --name pypiper-postgres \
+-p 5433:5432 -e POSTGRES_PASSWORD=pypiper \
+-e POSTGRES_USER=pypiper -e POSTGRES_DB=pypiper \
+-v postgres-data:/var/lib/postgresql/data postgres
+```
+
+#### Highlight results
+
+The pipestat results schema can include any number of additional attributes for results. An example of that is *results highlighting*.
+
+When a `highlight: true` attribute is included under a result identifier in the schema file, the highlighted results can later be retrieved by pipestat clients via the `PipelineManager.pipestat.highlighted_results` property, which simply returns a list of result identifiers to be presented in a special way.
+
+### Usage
+
+Since a pipeline run-specific `PipestatManager` instance is attached to the `PipelineManager` object, all of the public pipestat API can be used. Please refer to the [pipestat API documentation](http://pipestat.databio.org/en/latest/autodoc_build/pipestat/) to read about all the currently available features.
+
+Here we present the most commonly used features:
+
+- results reporting
+
+*report a result, convert it to the schema-defined type, and overwrite any previously reported result*
+
+```python
+results = {
+    "my_int_result": 10,
+    "my_str_result": "test"
+}
+pm.pipestat.report(
+    values=results,
+    strict_type=True,
+    force_overwrite=True
+)
+```
+
+- results retrieval
+
+```python
+pm.pipestat.retrieve(result_identifier="my_int_result")
+```
+
+- results schema exploration
+
+```python
+pm.pipestat.schema
+```
+
+- exploration of the canonical [jsonschema](https://json-schema.org/) representation of result schemas
+
+```python
+pm.pipestat.result_schemas
+```
diff --git a/docs/report.md b/docs/report.md
index 7e8e5f46..fd50c270 100644
--- a/docs/report.md
+++ b/docs/report.md
@@ -6,6 +6,8 @@ When you call `pm.report_result(key, value)`, pypiper simply writes the key-valu
 
 ## Reporting objects
 
+**Note**: Reporting objects will be deprecated in a future release. It is recommended to use `report_result`.
+
 Starting in version 0.8, pypiper now implements a second reporting function, `report_object`.
This is analogous to the `report_result` function, but instead of reporting simple key-value pairs, it lets you record any produced file as an output. Most commonly, this is used to record figures (PDFs, PNGs, etc.) produced by the pipeline. It can also be used to report other files, like HTML files. Pypiper writes results to `objects.tsv`, which can then be aggregated for project-level summaries of plots and other pipeline result files. diff --git a/example_pipelines/basic.py b/example_pipelines/basic.py index d4c7bd55..34a0d377 100755 --- a/example_pipelines/basic.py +++ b/example_pipelines/basic.py @@ -8,13 +8,13 @@ # First, make sure you can import the pypiper package import os + import pypiper # Create a PipelineManager instance (don't forget to name it!) # This starts the pipeline. -pm = pypiper.PipelineManager(name="BASIC", - outfolder="pipeline_output/") +pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/") # Now just build shell command strings, and use the run function # to execute them in order. run needs 2 things: a command, and the @@ -57,5 +57,5 @@ # Now, stop the pipeline to complete gracefully. pm.stop_pipeline() -# Observe your outputs in the pipeline_output folder +# Observe your outputs in the pipeline_output folder # to see what you've created. diff --git a/example_pipelines/count_reads.py b/example_pipelines/count_reads.py index c9703da9..f7648dec 100755 --- a/example_pipelines/count_reads.py +++ b/example_pipelines/count_reads.py @@ -9,25 +9,32 @@ __license__ = "GPL3" __version__ = "0.1" -from argparse import ArgumentParser -import os, re -import sys +import os +import re import subprocess +import sys +from argparse import ArgumentParser + import yaml + import pypiper parser = ArgumentParser( description="A pipeline to count the number of reads and file size. Accepts" - " BAM, fastq, or fastq.gz files.") + " BAM, fastq, or fastq.gz files." +) # First, add standard arguments from Pypiper. # groups="pypiper" will add all the arguments that pypiper uses, # and adding "common" adds arguments for --input and --sample--name # and "output_parent". You can read more about your options for standard # arguments in the pypiper docs (section "command-line arguments") -parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs"], - args=["output-parent", "config"], - required=['sample-name', 'output-parent']) +parser = pypiper.add_pypiper_args( + parser, + groups=["pypiper", "common", "ngs"], + args=["output-parent", "config"], + required=["sample-name", "output-parent"], +) # Add any pipeline-specific arguments if you like here. @@ -42,16 +49,14 @@ else: args.paired_end = False -# args for `output_parent` and `sample_name` were added by the standard -# `add_pypiper_args` function. +# args for `output_parent` and `sample_name` were added by the standard +# `add_pypiper_args` function. # A good practice is to make an output folder for each sample, housed under # the parent output folder, like this: outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) # Create a PipelineManager object and start the pipeline -pm = pypiper.PipelineManager(name="count", - outfolder=outfolder, - args=args) +pm = pypiper.PipelineManager(name="count", outfolder=outfolder, args=args) # NGSTk is a "toolkit" that comes with pypiper, providing some functions # for dealing with genome sequence data. You can read more about toolkits in the @@ -75,15 +80,12 @@ # and convert these to fastq files. 
local_input_files = ngstk.merge_or_link( - [args.input, args.input2], - raw_folder, - args.sample_name) + [args.input, args.input2], raw_folder, args.sample_name +) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( - local_input_files, - args.sample_name, - args.paired_end, - fastq_folder) + local_input_files, args.sample_name, args.paired_end, fastq_folder +) # Now we'll use another NGSTk function to grab the file size from the input files @@ -95,10 +97,17 @@ n_input_files = len(list(filter(bool, local_input_files))) -raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end)) - for input_file in local_input_files]) / n_input_files - -# Finally, we use the report_result() function to print the output and +raw_reads = ( + sum( + [ + int(ngstk.count_reads(input_file, args.paired_end)) + for input_file in local_input_files + ] + ) + / n_input_files +) + +# Finally, we use the report_result() function to print the output and # log the key-value pair in the standard stats.tsv file pm.report_result("Raw_reads", str(raw_reads)) diff --git a/example_pipelines/hello_pypiper.py b/example_pipelines/hello_pypiper.py index 2824a142..88abecfd 100755 --- a/example_pipelines/hello_pypiper.py +++ b/example_pipelines/hello_pypiper.py @@ -1,7 +1,8 @@ #!/usr/bin/env python import pypiper -outfolder = "hello_pypiper_results" # Choose a folder for your results + +outfolder = "hello_pypiper_results" # Choose a folder for your results # Create a PipelineManager, the workhorse of pypiper pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder) diff --git a/example_pipelines/logmuse_example.py b/example_pipelines/logmuse_example.py index 91fe73f2..3b98b6df 100755 --- a/example_pipelines/logmuse_example.py +++ b/example_pipelines/logmuse_example.py @@ -9,52 +9,56 @@ __license__ = "GPL3" __version__ = "0.1" -from argparse import ArgumentParser -import os, re -import sys +import os +import re import subprocess +import sys +from argparse import ArgumentParser + import yaml -import pypiper +import pypiper def build_argparser(): - parser = ArgumentParser( description="A pipeline to count the number of reads and file size. Accepts" - " BAM, fastq, or fastq.gz files.") + " BAM, fastq, or fastq.gz files." + ) # First, add standard arguments from Pypiper. # groups="pypiper" will add all the arguments that pypiper uses, # and adding "common" adds arguments for --input and --sample--name # and "output_parent". You can read more about your options for standard # arguments in the pypiper docs (section "command-line arguments") - parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs", "logmuse"], - args=["output-parent", "config"], - required=['sample-name', 'output-parent']) + parser = pypiper.add_pypiper_args( + parser, + groups=["pypiper", "common", "ngs", "logmuse"], + args=["output-parent", "config"], + required=["sample-name", "output-parent"], + ) # Add any pipeline-specific arguments if you like here. - # args for `output_parent` and `sample_name` were added by the standard - # `add_pypiper_args` function. + # args for `output_parent` and `sample_name` were added by the standard + # `add_pypiper_args` function. 
return parser + def run_pipeline(): # A good practice is to make an output folder for each sample, housed under # the parent output folder, like this: outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) # Create a PipelineManager object and start the pipeline - pm = pypiper.PipelineManager(name="logmuse-test", - outfolder=outfolder, - args=args) + pm = pypiper.PipelineManager(name="logmuse-test", outfolder=outfolder, args=args) pm.info("Getting started!") # NGSTk is a "toolkit" that comes with pypiper, providing some functions # for dealing with genome sequence data. You can read more about toolkits in the # documentation - files = [str(x) + ".tmp" for x in range(1,20)] + files = [str(x) + ".tmp" for x in range(1, 20)] pm.run("touch " + " ".join(files), target=files, clean=True) @@ -76,30 +80,32 @@ def run_pipeline(): # and convert these to fastq files. local_input_files = ngstk.merge_or_link( - [args.input, args.input2], - raw_folder, - args.sample_name) + [args.input, args.input2], raw_folder, args.sample_name + ) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( - local_input_files, - args.sample_name, - args.paired_end, - fastq_folder) - + local_input_files, args.sample_name, args.paired_end, fastq_folder + ) # Now we'll use another NGSTk function to grab the file size from the input files # pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) - # And then count the number of reads in the file n_input_files = len(list(filter(bool, local_input_files))) - raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end)) - for input_file in local_input_files]) / n_input_files - - # Finally, we use the report_result() function to print the output and + raw_reads = ( + sum( + [ + int(ngstk.count_reads(input_file, args.paired_end)) + for input_file in local_input_files + ] + ) + / n_input_files + ) + + # Finally, we use the report_result() function to print the output and # log the key-value pair in the standard stats.tsv file pm.report_result("Raw_reads", str(raw_reads)) @@ -107,7 +113,7 @@ def run_pipeline(): pm.stop_pipeline() -if __name__ == '__main__': +if __name__ == "__main__": try: parser = build_argparser() args = parser.parse_args() diff --git a/init_interactive.py b/init_interactive.py index b63e4fb5..15dfab1f 100644 --- a/init_interactive.py +++ b/init_interactive.py @@ -1,14 +1,12 @@ """ Create dummy PipelineManager and NGSTk instance for interactive session. """ import os -from pypiper import PipelineManager -from pypiper import NGSTk +from pypiper import NGSTk, PipelineManager __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - pm = PipelineManager(name="interactive", outfolder=os.path.expanduser("~")) tk = NGSTk(pm=pm) diff --git a/mkdocs.yml b/mkdocs.yml index e3eb2694..f3a3a1fd 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,6 +16,7 @@ nav: - Automatic command-line arguments: cli.md - Configuring pipelines: configuration.md - Reporting statistics: report.md + - Reporting statistics with pipestat: pipestat.md - Cleaning up intermediate files: clean.md - Best practices: best-practices.md - Toolkits: diff --git a/pypiper/__init__.py b/pypiper/__init__.py index 6a1802d1..3076285e 100644 --- a/pypiper/__init__.py +++ b/pypiper/__init__.py @@ -1,10 +1,10 @@ +# Implicitly re-export so logmuse usage by pipeline author routes through here. 
+from logmuse import add_logging_options + from ._version import __version__ +from .exceptions import * from .manager import * from .ngstk import * -from .utils import * from .pipeline import * -from .exceptions import * from .stage import * - -# Implicitly re-export so logmuse usage by pipeline author routes through here. -from logmuse import add_logging_options +from .utils import * diff --git a/pypiper/_version.py b/pypiper/_version.py index 8e1395bd..f23a6b39 100644 --- a/pypiper/_version.py +++ b/pypiper/_version.py @@ -1 +1 @@ -__version__ = "0.12.3" +__version__ = "0.13.0" diff --git a/pypiper/const.py b/pypiper/const.py index 5f2d66e8..27495297 100644 --- a/pypiper/const.py +++ b/pypiper/const.py @@ -2,6 +2,7 @@ CHECKPOINT_EXTENSION = ".checkpoint" +DEFAULT_SAMPLE_NAME = "DEFAULT_SAMPLE_NAME" PIPELINE_CHECKPOINT_DELIMITER = "_" STAGE_NAME_SPACE_REPLACEMENT = "-" -PROFILE_COLNAMES = ['pid', 'hash', 'cid', 'runtime', 'mem', 'cmd', 'lock'] +PROFILE_COLNAMES = ["pid", "hash", "cid", "runtime", "mem", "cmd", "lock"] diff --git a/pypiper/exceptions.py b/pypiper/exceptions.py index 33e3a10c..063b3641 100644 --- a/pypiper/exceptions.py +++ b/pypiper/exceptions.py @@ -4,41 +4,46 @@ __email__ = "vreuter@virginia.edu" -__all__ = ["PipelineError", "PipelineHalt", "IllegalPipelineDefinitionError", - "IllegalPipelineExecutionError", "MissingCheckpointError", - "UnknownPipelineStageError", "UnsupportedFiletypeException", - "SubprocessError"] - - +__all__ = [ + "PipelineError", + "PipelineHalt", + "IllegalPipelineDefinitionError", + "IllegalPipelineExecutionError", + "MissingCheckpointError", + "UnknownPipelineStageError", + "UnsupportedFiletypeException", + "SubprocessError", +] class PipelineError(Exception): - """ General pipeline error. """ + """General pipeline error.""" + pass + class SubprocessError(Exception): pass + class IllegalPipelineDefinitionError(PipelineError): pass - class IllegalPipelineExecutionError(PipelineError): - """ Represent cases of illogical start/stop run() declarations. """ - pass + """Represent cases of illogical start/stop run() declarations.""" + pass class MissingCheckpointError(Exception): - """ Represent case of expected but absent checkpoint file. """ + """Represent case of expected but absent checkpoint file.""" def __init__(self, checkpoint, filepath): msg = "{}: '{}'".format(checkpoint, filepath) super(MissingCheckpointError, self).__init__(msg) - class UnknownPipelineStageError(Exception): """ Triggered by use of unknown/undefined name for a pipeline stage. @@ -47,7 +52,6 @@ class UnknownPipelineStageError(Exception): :param pypiper.Pipeline pipeline: Pipeline for which the stage is unknown/undefined. """ - def __init__(self, stage_name, pipeline=None): message = stage_name if pipeline is not None: @@ -57,12 +61,12 @@ def __init__(self, stage_name, pipeline=None): # Just don't contextualize the error with known stages. pass else: - message = "{}; defined stages: {}". \ - format(message, ", ".join(map(str, stages))) + message = "{}; defined stages: {}".format( + message, ", ".join(map(str, stages)) + ) super(UnknownPipelineStageError, self).__init__(message) - class PipelineHalt(Exception): """ Execution-stopping exception for halting a pipeline. @@ -74,6 +78,7 @@ class PipelineHalt(Exception): PipelineManager's halt method raise this exception. 
""" + def __init__(self, checkpoint=None, finished=None): if checkpoint is None: super(PipelineHalt, self).__init__() @@ -81,8 +86,9 @@ def __init__(self, checkpoint=None, finished=None): if isinstance(checkpoint, str): last_stage_done = checkpoint else: - last_stage_done = getattr(checkpoint, "name", None) or \ - getattr(checkpoint, "__name__", None) + last_stage_done = getattr(checkpoint, "name", None) or getattr( + checkpoint, "__name__", None + ) if not last_stage_done: super(PipelineHalt, self).__init__() else: @@ -95,9 +101,9 @@ def __init__(self, checkpoint=None, finished=None): super(PipelineHalt, self).__init__(msg) - class UnsupportedFiletypeException(Exception): - """ Restrict filetype domain. """ + """Restrict filetype domain.""" + # Use superclass ctor to allow file name/path or extension to pass # through as the message for why this error is occurring. pass diff --git a/pypiper/flags.py b/pypiper/flags.py index 09e3fb85..21e97d27 100644 --- a/pypiper/flags.py +++ b/pypiper/flags.py @@ -8,5 +8,4 @@ PAUSE_FLAG = "partial" FLAGS = [RUN_FLAG, COMPLETE_FLAG, FAIL_FLAG, WAIT_FLAG, PAUSE_FLAG] -__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS", - "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"] +__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS", "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"] diff --git a/pypiper/folder_context.py b/pypiper/folder_context.py index 360d6c0c..77828af5 100644 --- a/pypiper/folder_context.py +++ b/pypiper/folder_context.py @@ -2,14 +2,12 @@ import os - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - class FolderContext(object): - """ Context manager for temporarily changing directory. """ + """Context manager for temporarily changing directory.""" def __init__(self, folder): """ @@ -18,18 +16,18 @@ def __init__(self, folder): :param str folder: Path to set as new working directory """ if not os.path.isdir(folder): - raise ValueError( - "Requested temp entry to non-folder: {}".format(folder)) + raise ValueError("Requested temp entry to non-folder: {}".format(folder)) self._prevdir = os.getcwd() self._currdir = folder def __enter__(self): - """ Make the working directory switch. """ + """Make the working directory switch.""" os.chdir(self._currdir) def __exit__(self, exc_type, exc_val, exc_tb): - """ Switch back to the previous working directory. """ + """Switch back to the previous working directory.""" if not os.path.isdir(self._prevdir): - raise RuntimeError("Return path is no longer a directory: {}". 
- format(self._prevdir)) + raise RuntimeError( + "Return path is no longer a directory: {}".format(self._prevdir) + ) os.chdir(self._prevdir) diff --git a/pypiper/manager.py b/pypiper/manager.py index aa43389a..9d32100b 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -8,35 +8,51 @@ """ import atexit -from collections.abc import Iterable import datetime import errno import glob import os import platform -import psutil import re import shlex # for splitting commands like a shell does import signal import subprocess import sys import time -import pandas as _pd +import warnings -from attmap import AttMapEcho +from collections.abc import Iterable from hashlib import md5 + +import __main__ import logmuse +import pandas as _pd +import psutil +from attmap import AttMapEcho +from pipestat import PipestatError, PipestatManager from yacman import load_yaml + +from ._version import __version__ +from .const import PROFILE_COLNAMES, DEFAULT_SAMPLE_NAME from .exceptions import PipelineHalt, SubprocessError from .flags import * -from .utils import \ - check_shell, checkpoint_filepath, clear_flags, default_pipeline_config, \ - flag_name, get_proc_name, is_multi_target, logger_via_cli, make_lock_name, \ - parse_cmd, pipeline_filepath, CHECKPOINT_SPECIFICATIONS -from .const import PROFILE_COLNAMES -from ._version import __version__ -import __main__ - +from .utils import ( + CHECKPOINT_SPECIFICATIONS, + check_shell, + checkpoint_filepath, + clear_flags, + default_pipeline_config, + flag_name, + get_proc_name, + is_multi_target, + logger_via_cli, + make_lock_name, + parse_cmd, + pipeline_filepath, + default_pipestat_output_schema, + result_formatter_markdown, +) +from pipestat.helpers import read_yaml_data __all__ = ["PipelineManager"] @@ -84,7 +100,7 @@ class PipelineManager(object): even if the preceding command is not run. By default, following functions are only run if the preceding command is run. :param int cores: number of processors to use, default 1 - :param str mem: amount of memory to use. Default units are megabytes unless + :param str mem: amount of memory to use. Default units are megabytes unless specified using the suffix [K|M|G|T]." :param str config_file: path to pipeline configuration file, optional :param str output_parent: path to folder in which output folder will live @@ -100,12 +116,34 @@ class PipelineManager(object): via args namespace, or if both stopping types (exclusive/prospective and inclusive/retrospective) are provided. """ - def __init__( - self, name, outfolder, version=None, args=None, multi=False, - dirty=False, recover=False, new_start=False, force_follow=False, - cores=1, mem="1000M", config_file=None, output_parent=None, - overwrite_checkpoints=False, logger_kwargs=None, **kwargs): + # TODO: add pipestat-related args docstrings + + def __init__( + self, + name, + outfolder, + version=None, + args=None, + multi=False, + dirty=False, + recover=False, + new_start=False, + force_follow=False, + cores=1, + mem="1000M", + config_file=None, + output_parent=None, + overwrite_checkpoints=False, + logger_kwargs=None, + pipestat_project_name=None, + pipestat_sample_name=None, + pipestat_schema=None, + pipestat_results_file=None, + pipestat_config=None, + pipestat_result_formatter=None, + **kwargs, + ): # Params defines the set of options that could be updated via # command line args to a pipeline run, that can be forwarded # to Pypiper. 
If any pypiper arguments are passed @@ -114,15 +152,15 @@ def __init__( # Establish default params params = { - 'dirty': dirty, - 'recover': recover, - 'new_start': new_start, - 'force_follow': force_follow, - 'config_file': config_file, - 'output_parent': output_parent, - 'cores': cores, - 'mem': mem, - 'testmode': False + "dirty": dirty, + "recover": recover, + "new_start": new_start, + "force_follow": force_follow, + "config_file": config_file, + "output_parent": output_parent, + "cores": cores, + "mem": mem, + "testmode": False, } # Transform the command-line namespace into a Mapping. @@ -142,8 +180,10 @@ def __init__( checkpoint = args_dict.pop(optname, None) setattr(self, optname, checkpoint) if self.stop_before and self.stop_after: - raise TypeError("Cannot specify both pre-stop and post-stop; " - "got '{}' and '{}'".format(self.stop_before, self.stop_after)) + raise TypeError( + "Cannot specify both pre-stop and post-stop; " + "got '{}' and '{}'".format(self.stop_before, self.stop_after) + ) # Update this manager's parameters with non-checkpoint-related # command-line parameterization. @@ -161,14 +201,13 @@ def __init__( # Pipeline settings self.name = name self.tee = None - self.overwrite_locks = params['recover'] - self.new_start = params['new_start'] - self.force_follow = params['force_follow'] - self.dirty = params['dirty'] - self.cores = params['cores'] - self.output_parent = params['output_parent'] - self.testmode = params['testmode'] - + self.overwrite_locks = params["recover"] + self.new_start = params["new_start"] + self.force_follow = params["force_follow"] + self.dirty = params["dirty"] + self.cores = params["cores"] + self.output_parent = params["output_parent"] + self.testmode = params["testmode"] # Set up logger logger_kwargs = logger_kwargs or {} @@ -203,11 +242,11 @@ def __init__( # total memory limit provided. # This will give a little breathing room for non-heap java memory use. - if not params['mem'].endswith(('K','M','G','T')): - self.mem = params['mem'] + "M" + if not params["mem"].endswith(("K", "M", "G", "T")): + self.mem = params["mem"] + "M" else: # Assume the memory is in megabytes. - self.mem = params['mem'] + self.mem = params["mem"] self.javamem = str(int(int(self.mem[:-1]) * 0.95)) + self.mem[-1:] @@ -232,27 +271,21 @@ def __init__( self.pl_version = version # Set relative output_parent directory to absolute # not necessary after all. . . - #if self.output_parent and not os.path.isabs(self.output_parent): + # if self.output_parent and not os.path.isabs(self.output_parent): # self.output_parent = os.path.join(os.getcwd(), self.output_parent) # File paths: - self.outfolder = os.path.join(outfolder, '') # trailing slash + self.outfolder = os.path.join(outfolder, "") # trailing slash + self.make_sure_path_exists(self.outfolder) self.pipeline_log_file = pipeline_filepath(self, suffix="_log.md") - self.pipeline_profile_file = \ - pipeline_filepath(self, suffix="_profile.tsv") + self.pipeline_profile_file = pipeline_filepath(self, suffix="_profile.tsv") # Stats and figures are general and so lack the pipeline name. - self.pipeline_stats_file = \ - pipeline_filepath(self, filename="stats.tsv") - self.pipeline_figures_file = \ - pipeline_filepath(self, filename="figures.tsv") - self.pipeline_objects_file = \ - pipeline_filepath(self, filename="objects.tsv") + self.pipeline_stats_file = pipeline_filepath(self, filename="stats.yaml") # Record commands used and provide manual cleanup script. 
- self.pipeline_commands_file = \ - pipeline_filepath(self, suffix="_commands.sh") + self.pipeline_commands_file = pipeline_filepath(self, suffix="_commands.sh") self.cleanup_file = pipeline_filepath(self, suffix="_cleanup.sh") # Pipeline status variables @@ -263,7 +296,7 @@ def __init__( self.locks = [] self.running_procs = {} self.completed_procs = {} - + self.wait = True # turn off for debugging # Initialize status and flags @@ -275,6 +308,11 @@ def __init__( # In-memory holder for report_result self.stats_dict = {} + # Result formatter to pass to pipestat + self.pipestat_result_formatter = ( + pipestat_result_formatter or result_formatter_markdown + ) + # Checkpoint-related parameters self.overwrite_checkpoints = overwrite_checkpoints self.halt_on_next = False @@ -290,6 +328,35 @@ def __init__( signal.signal(signal.SIGINT, self._signal_int_handler) signal.signal(signal.SIGTERM, self._signal_term_handler) + # pipestat setup + self.pipestat_sample_name = pipestat_sample_name or DEFAULT_SAMPLE_NAME + # getattr(self, "sample_name", DEFAULT_SAMPLE_NAME) + + # don't force default pipestat_results_file value unless + # pipestat config not provided + if pipestat_config is None and pipestat_results_file is None: + pipestat_results_file = pipeline_filepath( + self, filename="pipestat_results.yaml" + ) + + def _get_arg(args_dict, arg_name): + """safely get argument from arg dict -- return None if doesn't exist""" + return None if arg_name not in args_dict else args_dict[arg_name] + + self._pipestat_manager = PipestatManager( + sample_name=self.pipestat_sample_name + or _get_arg(args_dict, "pipestat_sample_name") + or DEFAULT_SAMPLE_NAME, + pipeline_name=self.name, + schema_path=pipestat_schema + or _get_arg(args_dict, "pipestat_schema") + or default_pipestat_output_schema(sys.argv[0]), + results_file_path=self.pipeline_stats_file + or _get_arg(args_dict, "pipestat_results_file"), + config_file=pipestat_config or _get_arg(args_dict, "pipestat_config"), + multi_pipelines=multi, + ) + self.start_pipeline(args, multi) # Handle config file if it exists @@ -330,8 +397,9 @@ def __init__( default_config = default_pipeline_config(sys.argv[0]) if os.path.isfile(default_config): config_to_load = default_config - self.debug("Using default pipeline config file: {}". - format(config_to_load)) + self.debug( + "Using default pipeline config file: {}".format(config_to_load) + ) # Finally load the config we found. if config_to_load is not None: @@ -341,7 +409,25 @@ def __init__( self.debug("No config file") self.config = None + @property + def pipestat(self): + """ + `pipestat.PipestatManager` object to use for pipeline results reporting and status management + Depending on the object configuration it can report to + a YAML-formatted file or PostgreSQL database. Please refer to pipestat + documentation for more details: http://pipestat.databio.org/ + + :return pipestat.PipestatManager: object to use for results reporting + """ + try: + return getattr(self, "_pipestat_manager") + except AttributeError: + raise PipestatError( + f"{PipestatManager.__name__} has not been configured for this pipeline run. " + f"Provide an output schema to the {PipelineManager.__name__} object " + f"in order to initialize it." + ) @property def _completed(self): @@ -350,7 +436,10 @@ def _completed(self): :return bool: Whether the managed pipeline is in a completed state. 
""" - return self.status == COMPLETE_FLAG + return ( + self.pipestat.get_status(self._pipestat_manager.sample_name) + == COMPLETE_FLAG + ) @property def _failed(self): @@ -359,16 +448,17 @@ def _failed(self): :return bool: Whether the managed pipeline is in a failed state. """ - return self.status == FAIL_FLAG + return self.pipestat.get_status(self._pipestat_manager.sample_name) == FAIL_FLAG @property def halted(self): """ Is the managed pipeline in a paused/halted state? - :return bool: Whether the managed pipeline is in a paused/halted state. """ - return self.status == PAUSE_FLAG + return ( + self.pipestat.get_status(self._pipestat_manager.sample_name) == PAUSE_FLAG + ) @property def _has_exit_status(self): @@ -395,20 +485,22 @@ def start_pipeline(self, args=None, multi=False): You provide only the output directory (used for pipeline stats, log, and status flag files). """ # Perhaps this could all just be put into __init__, but I just kind of like the idea of a start function - self.make_sure_path_exists(self.outfolder) + # self.make_sure_path_exists(self.outfolder) # By default, Pypiper will mirror every operation so it is displayed both # on sys.stdout **and** to a log file. Unfortunately, interactive python sessions - # ruin this by interfering with stdout. So, for interactive mode, we do not enable + # ruin this by interfering with stdout. So, for interactive mode, we do not enable # the tee subprocess, sending all output to screen only. # Starting multiple PipelineManagers in the same script has the same problem, and # must therefore be run in interactive_mode. interactive_mode = multi or not hasattr(__main__, "__file__") if interactive_mode: - self.warning("Warning: You're running an interactive python session. " - "This works, but pypiper cannot tee the output, so results " - "are only logged to screen.") + self.warning( + "Warning: You're running an interactive python session. " + "This works, but pypiper cannot tee the output, so results " + "are only logged to screen." + ) else: sys.stdout = Unbuffered(sys.stdout) # sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # Unbuffer output @@ -423,10 +515,12 @@ def start_pipeline(self, args=None, multi=False): # manually (in the exit handler). # a for append to file - + tee = subprocess.Popen( - ["tee", "-a", self.pipeline_log_file], stdin=subprocess.PIPE, - preexec_fn=self._ignore_interrupts) + ["tee", "-a", self.pipeline_log_file], + stdin=subprocess.PIPE, + preexec_fn=self._ignore_interrupts, + ) # If the pipeline is terminated with SIGTERM/SIGINT, # make sure we kill this spawned tee subprocess as well. 
@@ -456,29 +550,83 @@ def start_pipeline(self, args=None, multi=False): try: # pypiper dir ppd = os.path.dirname(os.path.realpath(__file__)) - gitvars['pypiper_dir'] = ppd - gitvars['pypiper_hash'] = subprocess.check_output("cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pypiper_date'] = subprocess.check_output("cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pypiper_diff'] = subprocess.check_output("cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pypiper_branch'] = subprocess.check_output("cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True).decode().strip() + gitvars["pypiper_dir"] = ppd + gitvars["pypiper_hash"] = ( + subprocess.check_output( + "cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pypiper_date"] = ( + subprocess.check_output( + "cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pypiper_diff"] = ( + subprocess.check_output( + "cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True + ) + .decode() + .strip() + ) + gitvars["pypiper_branch"] = ( + subprocess.check_output( + "cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True + ) + .decode() + .strip() + ) except Exception: pass try: # pipeline dir pld = os.path.dirname(os.path.realpath(sys.argv[0])) - gitvars['pipe_dir'] = pld - gitvars['pipe_hash'] = subprocess.check_output("cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pipe_date'] = subprocess.check_output("cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pipe_diff'] = subprocess.check_output("cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pipe_branch'] = subprocess.check_output("cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True).decode().strip() + gitvars["pipe_dir"] = pld + gitvars["pipe_hash"] = ( + subprocess.check_output( + "cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pipe_date"] = ( + subprocess.check_output( + "cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pipe_diff"] = ( + subprocess.check_output( + "cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True + ) + .decode() + .strip() + ) + gitvars["pipe_branch"] = ( + subprocess.check_output( + "cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True + ) + .decode() + .strip() + ) except Exception: pass - + # Print out a header section in the pipeline log: # Wrap things in backticks to prevent markdown from interpreting underscores as emphasis. 
# print("----------------------------------------") self.info("### Pipeline run code and environment:\n") - self.info("* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`") + self.info( + "* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`" + ) self.info("* " + "Compute host".rjust(20) + ": " + platform.node()) self.info("* " + "Working dir".rjust(20) + ": " + os.getcwd()) self.info("* " + "Outfolder".rjust(20) + ": " + self.outfolder) @@ -488,25 +636,75 @@ def start_pipeline(self, args=None, multi=False): self.info("\n### Version log:\n") self.info("* " + "Python version".rjust(20) + ": " + platform.python_version()) try: - self.info("* " + "Pypiper dir".rjust(20) + ": " + "`" + gitvars['pypiper_dir'].strip() + "`") + self.info( + "* " + + "Pypiper dir".rjust(20) + + ": " + + "`" + + gitvars["pypiper_dir"].strip() + + "`" + ) self.info("* " + "Pypiper version".rjust(20) + ": " + __version__) - self.info("* " + "Pypiper hash".rjust(20) + ": " + str(gitvars['pypiper_hash'])) - self.info("* " + "Pypiper branch".rjust(20) + ": " + str(gitvars['pypiper_branch'])) - self.info("* " + "Pypiper date".rjust(20) + ": " + str(gitvars['pypiper_date'])) - if gitvars['pypiper_diff']: - self.info("* " + "Pypiper diff".rjust(20) + ": " + str(gitvars['pypiper_diff'])) + self.info( + "* " + "Pypiper hash".rjust(20) + ": " + str(gitvars["pypiper_hash"]) + ) + self.info( + "* " + + "Pypiper branch".rjust(20) + + ": " + + str(gitvars["pypiper_branch"]) + ) + self.info( + "* " + "Pypiper date".rjust(20) + ": " + str(gitvars["pypiper_date"]) + ) + if gitvars["pypiper_diff"]: + self.info( + "* " + + "Pypiper diff".rjust(20) + + ": " + + str(gitvars["pypiper_diff"]) + ) except KeyError: # It is ok if keys aren't set, it means pypiper isn't in a git repo. pass try: - self.info("* " + "Pipeline dir".rjust(20) + ": " + "`" + gitvars['pipe_dir'].strip() + "`") - self.info("* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version)) - self.info("* " + "Pipeline hash".rjust(20) + ": " + str(gitvars['pipe_hash']).strip()) - self.info("* " + "Pipeline branch".rjust(20) + ": " + str(gitvars['pipe_branch']).strip()) - self.info("* " + "Pipeline date".rjust(20) + ": " + str(gitvars['pipe_date']).strip()) - if (gitvars['pipe_diff'] != ""): - self.info("* " + "Pipeline diff".rjust(20) + ": " + str(gitvars['pipe_diff']).strip()) + self.info( + "* " + + "Pipeline dir".rjust(20) + + ": " + + "`" + + gitvars["pipe_dir"].strip() + + "`" + ) + self.info( + "* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version) + ) + self.info( + "* " + + "Pipeline hash".rjust(20) + + ": " + + str(gitvars["pipe_hash"]).strip() + ) + self.info( + "* " + + "Pipeline branch".rjust(20) + + ": " + + str(gitvars["pipe_branch"]).strip() + ) + self.info( + "* " + + "Pipeline date".rjust(20) + + ": " + + str(gitvars["pipe_date"]).strip() + ) + if gitvars["pipe_diff"] != "": + self.info( + "* " + + "Pipeline diff".rjust(20) + + ": " + + str(gitvars["pipe_diff"]).strip() + ) except KeyError: # It is ok if keys aren't set, it means the pipeline isn't a git repo. 
pass @@ -517,18 +715,37 @@ def start_pipeline(self, args=None, multi=False): argtext = "`{}`".format(arg) valtext = "`{}`".format(val) self.info("* {}: {}".format(argtext.rjust(20), valtext)) + + self.info("\n### Initialized Pipestat Object:\n") + results = self._pipestat_manager.__str__().split("\n") + for i in results: + self.info("* " + i) + self.info("* Sample name: " + self.pipestat_sample_name + "\n") self.info("\n----------------------------------------\n") - self._set_status_flag(RUN_FLAG) + self.status = "running" + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, status_identifier="running" + ) # Record the start in PIPE_profile and PIPE_commands output files so we # can trace which run they belong to with open(self.pipeline_commands_file, "a") as myfile: - myfile.write("# Pipeline started at " + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + "\n\n") + myfile.write( + "# Pipeline started at " + + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + + "\n\n" + ) with open(self.pipeline_profile_file, "a") as myfile: - myfile.write("# Pipeline started at " + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) - + "\n\n" + "# " + "\t".join(PROFILE_COLNAMES) + "\n") + myfile.write( + "# Pipeline started at " + + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + + "\n\n" + + "# " + + "\t".join(PROFILE_COLNAMES) + + "\n" + ) def _set_status_flag(self, status): """ @@ -552,9 +769,10 @@ def _set_status_flag(self, status): # Set new status. prev_status = self.status self.status = status - self._create_file(self._flag_file_path()) - self.debug("\nChanged status from {} to {}.".format( - prev_status, self.status)) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, status_identifier=status + ) + self.debug("\nChanged status from {} to {}.".format(prev_status, self.status)) def _flag_file_path(self, status=None): """ @@ -566,14 +784,29 @@ def _flag_file_path(self, status=None): :param str status: flag file type to create, default to current status :return str: path to flag file of indicated or current status. """ - flag_file_name = "{}_{}".format( - self.name, flag_name(status or self.status)) + + flag_file_name = "{}_{}_{}".format( + self._pipestat_manager["_pipeline_name"], + self.pipestat_sample_name, + flag_name(status or self.status), + ) return pipeline_filepath(self, filename=flag_file_name) ################################### # Process calling functions ################################### - def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=False, follow=None, container=None): + def run( + self, + cmd, + target=None, + lock_name=None, + shell=None, + nofail=False, + clean=False, + follow=None, + container=None, + default_return_code=0, + ): """ The primary workhorse function of PipelineManager, this runs a command. @@ -603,29 +836,55 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= to an auto cleanup list. Optional. :param callable follow: Function to call after executing (each) command. :param str container: Name for Docker container in which to run commands. + :param Any default_return_code: Return code to use, might be used to discriminate + between runs that did not execute any commands and runs that did. :return int: Return code of process. If a list of commands is passed, this is the maximum of all return codes for all commands. 
""" + def _max_ret_code(codes_list): + """ + Return the maximum of a list of return codes. + + :param list[int] code: List of return codes to compare. + :return int: Maximum of list. + """ + # filter out codes that are None + codes_list = [code for code in codes_list if code is not None] + # get the max of the remaining codes + if codes_list: + return max(codes_list) + # if no codes are left, return None + return + + # validate default return code + if default_return_code is not None and not isinstance(default_return_code, int): + raise TypeError("default_return_code must be an int or None") + # If the pipeline's not been started, skip ahead. if not self._active: cmds = [cmd] if isinstance(cmd, str) else cmd cmds_text = [c if isinstance(c, str) else " ".join(c) for c in cmds] - self.info("Pipeline is inactive; skipping {} command(s):\n{}". - format(len(cmds), "\n".join(cmds_text))) - return 0 + self.info( + "Pipeline is inactive; skipping {} command(s):\n{}".format( + len(cmds), "\n".join(cmds_text) + ) + ) + return default_return_code # Short-circuit if the checkpoint file exists and the manager's not # been configured to overwrite such files. if self.curr_checkpoint is not None: check_fpath = checkpoint_filepath(self.curr_checkpoint, self) if os.path.isfile(check_fpath) and not self.overwrite_checkpoints: - self.info("Checkpoint file exists for '{}' ('{}'), and the {} has " - "been configured to not overwrite checkpoints; " - "skipping command '{}'".format( - self.curr_checkpoint, check_fpath, - self.__class__.__name__, cmd)) - return 0 + self.info( + "Checkpoint file exists for '{}' ('{}'), and the {} has " + "been configured to not overwrite checkpoints; " + "skipping command '{}'".format( + self.curr_checkpoint, check_fpath, self.__class__.__name__, cmd + ) + ) + return default_return_code # TODO: consider making the logic such that locking isn't implied, or # TODO (cont.): that we can make it otherwise such that it's not @@ -634,25 +893,30 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= # Therefore, a targetless command that you want # to lock must specify a lock_name manually. if target is None and lock_name is None: - self.fail_pipeline(Exception( - "You must provide either a target or a lock_name.")) + self.fail_pipeline( + Exception("You must provide either a target or a lock_name.") + ) # Downstream code requires target to be a list, so convert if only # a single item was given if not is_multi_target(target) and target is not None: target = [target] - # Downstream code requires a list of locks; convert + # Downstream code requires a list of locks; convert if isinstance(lock_name, str): lock_name = [lock_name] - + # Default lock_name (if not provided) is based on the target file name, # but placed in the parent pipeline outfolder - self.debug("Lock_name {}; target '{}', outfolder '{}'".format(lock_name, target, self.outfolder)) + self.debug( + "Lock_name {}; target '{}', outfolder '{}'".format( + lock_name, target, self.outfolder + ) + ) lock_name = lock_name or make_lock_name(target, self.outfolder) lock_files = [self._make_lock_path(ln) for ln in lock_name] - process_return_code = 0 + process_return_code = default_return_code local_maxmem = 0 # Decide how to do follow-up. @@ -660,8 +924,11 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= call_follow = lambda: None elif not hasattr(follow, "__call__"): # Warn about non-callable argument to follow-up function. 
- self.warning("Follow-up function is not callable and won't be used: {}". - format(type(follow))) + self.warning( + "Follow-up function is not callable and won't be used: {}".format( + type(follow) + ) + ) call_follow = lambda: None else: # Wrap the follow-up function so that the log shows what's going on. @@ -672,7 +939,6 @@ def call_follow(): follow() self.in_follow = False - # The while=True loop here is unlikely to be triggered, and is just a # wrapper to prevent race conditions; the lock_file must be created by # the current loop. If not, we loop again and then re-do the tests. @@ -684,18 +950,22 @@ def call_follow(): # is found that needs to be recovered or overwritten. It instructs us to # ignore lock files on the next iteration. local_recover = False - local_newstart = False + local_newstart = False proceed_through_locks = False while True: ##### Tests block # Base case: All targets exists and not set to overwrite targets break loop, don't run process. # os.path.exists returns True for either a file or directory; .isfile is file-only - if target is not None and all([os.path.exists(t) for t in target]) \ - and not any([os.path.isfile(l) for l in lock_files]) \ - and not local_newstart: + if ( + target is not None + and all([os.path.exists(t) for t in target]) + and not any([os.path.isfile(l) for l in lock_files]) + and not local_newstart + ): for tgt in target: - if os.path.exists(tgt): self.info("Target exists: `" + tgt + "` ") + if os.path.exists(tgt): + self.info("Target exists: `" + tgt + "` ") if self.new_start: self.info("New start mode; run anyway. ") # Set the local_newstart flag so the command will run anyway. @@ -712,11 +982,17 @@ def call_follow(): for c in cmd: count = len(parse_cmd(c, shell)) self.proc_count += count - self.debug(increment_info_pattern.format(str(c), count, self.proc_count)) + self.debug( + increment_info_pattern.format( + str(c), count, self.proc_count + ) + ) else: count = len(parse_cmd(cmd, shell)) self.proc_count += count - self.debug(increment_info_pattern.format(str(cmd), count, self.proc_count)) + self.debug( + increment_info_pattern.format(str(cmd), count, self.proc_count) + ) break # Do not run command # Scenario 1: Lock file exists, but we're supposed to overwrite target; Run process. @@ -729,8 +1005,10 @@ def call_follow(): self.info("Overwriting target...") proceed_through_locks = True elif os.path.isfile(recover_file): - self.info("Found dynamic recovery file ({}); " - "overwriting target...".format(recover_file)) + self.info( + "Found dynamic recovery file ({}); " + "overwriting target...".format(recover_file) + ) # remove the lock file which will then be promptly re-created for the current run. local_recover = True proceed_through_locks = True @@ -742,8 +1020,7 @@ def call_follow(): # time (to see if the target exists now) continue - - # If you get to this point, the target doesn't exist, and the lock_file doesn't exist + # If you get to this point, the target doesn't exist, and the lock_file doesn't exist # (or we should overwrite). create the lock (if you can) # Initialize lock in master lock list for lock_file in lock_files: @@ -755,10 +1032,13 @@ def call_follow(): self._create_file_racefree(lock_file) # Create lock except OSError as e: if e.errno == errno.EEXIST: # File already exists - self.info("Lock file created after test! Looping again: {}".format( - lock_file)) + self.info( + "Lock file created after test! 
Looping again: {}".format( + lock_file + ) + ) - # Since a lock file was created by a different source, + # Since a lock file was created by a different source, # we need to reset this flag to re-check the locks. proceed_through_locks = False continue # Go back to start @@ -767,24 +1047,34 @@ def call_follow(): # If you make it past these tests, we should proceed to run the process. if target is not None: - self.info("Target to produce: {} ".format(",".join(['`'+x+'`' for x in target]))) + self.info( + "Target to produce: {} ".format( + ",".join(["`" + x + "`" for x in target]) + ) + ) else: self.info("Targetless command, running... ") if isinstance(cmd, list): # Handle command lists for cmd_i in cmd: - list_ret, maxmem = \ - self.callprint(cmd_i, shell, lock_file, nofail, container) + list_ret, maxmem = self.callprint( + cmd_i, shell, lock_file, nofail, container + ) maxmem = max(maxmem) if isinstance(maxmem, Iterable) else maxmem - local_maxmem = max(local_maxmem, maxmem) - list_ret = max(list_ret) if isinstance(list_ret, Iterable) else list_ret - process_return_code = max(process_return_code, list_ret) + local_maxmem = max(local_maxmem, maxmem) + list_ret = ( + _max_ret_code(list_ret) + if isinstance(list_ret, Iterable) + else list_ret + ) + process_return_code = _max_ret_code([process_return_code, list_ret]) else: # Single command (most common) - process_return_code, local_maxmem = \ - self.callprint(cmd, shell, lock_file, nofail, container) # Run command + process_return_code, local_maxmem = self.callprint( + cmd, shell, lock_file, nofail, container + ) # Run command if isinstance(process_return_code, list): - process_return_code = max(process_return_code) + process_return_code = _max_ret_code(process_return_code) # For temporary files, you can specify a clean option to automatically # add them to the clean list, saving you a manual call to clean_add @@ -806,7 +1096,7 @@ def checkprint(self, cmd, shell=None, nofail=False): """ Just like callprint, but checks output -- so you can get a variable in python corresponding to the return value of the command you call. - This is equivalent to running subprocess.check_output() + This is equivalent to running subprocess.check_output() instead of subprocess.call(). :param str | Iterable[str] cmd: Bash command(s) to be run. :param bool | str shell: If command requires should be run in its own shell. Optional. @@ -830,9 +1120,11 @@ def checkprint(self, cmd, shell=None, nofail=False): if not shell: if likely_shell: - self.debug("Should this command run in a shell instead of directly in a subprocess?") + self.debug( + "Should this command run in a shell instead of directly in a subprocess?" + ) cmd = shlex.split(cmd) - + try: return subprocess.check_output(cmd, shell=shell).decode().strip() except Exception as e: @@ -841,7 +1133,7 @@ def checkprint(self, cmd, shell=None, nofail=False): def _attend_process(self, proc, sleeptime): """ Waits on a process for a given time to see if it finishes, returns True - if it's still running after the given time or False as soon as it + if it's still running after the given time or False as soon as it returns. 
:param psutil.Popen proc: Process object opened by psutil.Popen() @@ -892,10 +1184,12 @@ def get_mem_child_sum(proc): if children: mem_sum += sum([x.memory_info().rss for x in children]) # return in gigs - return mem_sum/1e9 + return mem_sum / 1e9 except (psutil.NoSuchProcess, psutil.ZombieProcess) as e: self.warning(e) - self.warning("Warning: couldn't add memory use for process: {}".format(proc.pid)) + self.warning( + "Warning: couldn't add memory use for process: {}".format(proc.pid) + ) return 0 def display_memory(memval): @@ -910,7 +1204,11 @@ def make_hash(o): try: hsh = md5(str(o).encode("utf-8")).hexdigest()[:10] except Exception as e: - self.debug("Could not create hash for '{}', caught exception: {}".format(str(o), e.__class__.__name__)) + self.debug( + "Could not create hash for '{}', caught exception: {}".format( + str(o), e.__class__.__name__ + ) + ) hsh = None return hsh @@ -943,7 +1241,7 @@ def make_hash(o): "container": container, "p": processes[-1], "args_hash": make_hash(conc_cmd), - "local_proc_id": self.process_counter() + "local_proc_id": self.process_counter(), } self._report_command(cmd, [x.pid for x in processes]) @@ -969,16 +1267,22 @@ def proc_wrapup(i): current_pid = processes[i].pid info = "PID: {pid};\tCommand: {cmd};\tReturn code: {ret};\tMemory used: {mem}".format( - pid=current_pid, + pid=current_pid, cmd=self.running_procs[current_pid]["proc_name"], ret=processes[i].returncode, - mem=display_memory(local_maxmems[i])) - + mem=display_memory(local_maxmems[i]), + ) + # report process profile - self._report_profile(self.running_procs[current_pid]["proc_name"], lock_file, - time.time() - self.running_procs[current_pid]["start_time"], local_maxmems[i], - current_pid, self.running_procs[current_pid]["args_hash"], - self.running_procs[current_pid]["local_proc_id"]) + self._report_profile( + self.running_procs[current_pid]["proc_name"], + lock_file, + time.time() - self.running_procs[current_pid]["start_time"], + local_maxmems[i], + current_pid, + self.running_procs[current_pid]["args_hash"], + self.running_procs[current_pid]["local_proc_id"], + ) # Remove this as a running subprocess self.running_procs[current_pid]["info"] = info @@ -991,29 +1295,37 @@ def proc_wrapup(i): returncodes[i] = returncode return info - sleeptime = .0001 - + sleeptime = 0.0001 + while running_processes: self.debug("running") for i in running_processes: - local_maxmems[i] = max(local_maxmems[i], (get_mem_child_sum(processes[i]))) + local_maxmems[i] = max( + local_maxmems[i], (get_mem_child_sum(processes[i])) + ) self.peak_memory = max(self.peak_memory, local_maxmems[i]) self.debug(processes[i]) if not self._attend_process(processes[i], sleeptime): proc_wrapup_text[i] = proc_wrapup(i) - # the sleeptime is extremely short at the beginning and gets longer exponentially + # the sleeptime is extremely short at the beginning and gets longer exponentially # (+ constant to prevent copious checks at the very beginning) # = more precise mem tracing for short processes - sleeptime = min((sleeptime + 0.25) * 3, 60/len(processes)) + sleeptime = min((sleeptime + 0.25) * 3, 60 / len(processes)) # All jobs are done, print a final closing and job info stop_time = time.time() proc_message = "Command completed. {info}" - info = "Elapsed time: " + str(datetime.timedelta(seconds=self.time_elapsed(start_time))) + "." - info += " Running peak memory: {pipe}.".format(pipe=display_memory(self.peak_memory)) + info = ( + "Elapsed time: " + + str(datetime.timedelta(seconds=self.time_elapsed(start_time))) + + "." 
+ ) + info += " Running peak memory: {pipe}.".format( + pipe=display_memory(self.peak_memory) + ) # if len(proc_wrapup_text) == 1: - # info += " {}".format(proc_wrapup_text[0]) + # info += " {}".format(proc_wrapup_text[0]) for i in completed_processes: info += " \n {}".format(self.completed_procs[processes[i].pid]["info"]) @@ -1024,7 +1336,9 @@ def proc_wrapup(i): for rc in returncodes: if rc != 0: - msg = "Subprocess returned nonzero result. Check above output for details" + msg = ( + "Subprocess returned nonzero result. Check above output for details" + ) self._triage_error(SubprocessError(msg), nofail) return [returncodes, local_maxmems] @@ -1059,7 +1373,7 @@ def _wait_for_process(self, p, shell=False): :param bool shell: If command requires should be run in its own shell. Optional. Default: False. """ local_maxmem = -1 - sleeptime = .5 + sleeptime = 0.5 while p.poll() is None: if not shell: local_maxmem = max(local_maxmem, self._memory_usage(p.pid) / 1e6) @@ -1068,7 +1382,7 @@ def _wait_for_process(self, p, shell=False): sleeptime = min(sleeptime + 5, 60) self.peak_memory = max(self.peak_memory, local_maxmem) - + del self.running_procs[p.pid] info = "Process " + str(p.pid) + " returned: (" + str(p.returncode) + ")." @@ -1087,7 +1401,7 @@ def _wait_for_lock(self, lock_file): :param str lock_file: Lock file to wait upon. """ - sleeptime = .5 + sleeptime = 0.5 first_message_flag = False long_message_flag = False dot_count = 0 @@ -1096,12 +1410,18 @@ def _wait_for_lock(self, lock_file): while os.path.isfile(lock_file): if first_message_flag is False: self.timestamp("Waiting for file lock: " + lock_file) - self.warning("This indicates that another process may be executing this " + self.warning( + "This indicates that another process may be executing this " "command, or the pipeline was not properly shut down. If the " "pipeline was not properly shut down last time, " "you should restart it in 'recover' mode (-R) to indicate that " - "this step should be restarted.") - self._set_status_flag(WAIT_FLAG) + "this step should be restarted." + ) + # self._set_status_flag(WAIT_FLAG) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, + status_identifier="waiting", + ) first_message_flag = True else: sys.stdout.write(".") @@ -1121,7 +1441,11 @@ def _wait_for_lock(self, lock_file): if first_message_flag: self.timestamp("File unlocked.") - self._set_status_flag(RUN_FLAG) + # self._set_status_flag(RUN_FLAG) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, + status_identifier="running", + ) ################################### # Logging functions @@ -1145,8 +1469,7 @@ def critical(self, msg, *args, **kwargs): def fatal(self, msg, *args, **kwargs): self._logger.fatal(msg, *args, **kwargs) - def timestamp(self, message="", checkpoint=None, - finished=False, raise_error=True): + def timestamp(self, message="", checkpoint=None, finished=False, raise_error=True): """ Print message, time, and time elapsed, perhaps creating checkpoint. @@ -1189,7 +1512,9 @@ def timestamp(self, message="", checkpoint=None, self.curr_checkpoint = checkpoint self._checkpoint(self.prev_checkpoint) # Handle the two halting conditions. - if (finished and checkpoint == self.stop_after) or (not finished and checkpoint == self.stop_before): + if (finished and checkpoint == self.stop_after) or ( + not finished and checkpoint == self.stop_before + ): self.halt(checkpoint, finished, raise_error=raise_error) # Determine if we've started executing. 
elif checkpoint == self.start_point: @@ -1203,13 +1528,17 @@ def timestamp(self, message="", checkpoint=None, elapsed = self.time_elapsed(self.last_timestamp) t = time.strftime("%m-%d %H:%M:%S") if checkpoint is None: - msg = "{m} ({t}) elapsed: {delta_t} _TIME_".\ - format(m=message, t=t, delta_t=elapsed) + msg = "{m} ({t}) elapsed: {delta_t} _TIME_".format( + m=message, t=t, delta_t=elapsed + ) else: - msg = "{m} ({t}) ({status} {stage}) elapsed: {delta_t} _TIME_".\ - format(m=message, t=t, - status="finished" if finished else "starting", - stage=checkpoint, delta_t=elapsed) + msg = "{m} ({t}) ({status} {stage}) elapsed: {delta_t} _TIME_".format( + m=message, + t=t, + status="finished" if finished else "starting", + stage=checkpoint, + delta_t=elapsed, + ) if re.match("^###", message): msg = "\n{}\n".format(msg) self.info(msg) @@ -1224,59 +1553,78 @@ def time_elapsed(time_since): """ return round(time.time() - time_since, 0) - def _report_profile(self, command, lock_name, elapsed_time, memory, pid, args_hash, proc_count): + def _report_profile( + self, command, lock_name, elapsed_time, memory, pid, args_hash, proc_count + ): """ Writes a string to self.pipeline_profile_file. """ - rel_lock_name = lock_name if lock_name is None else os.path.relpath(lock_name, self.outfolder) - message_raw = str(pid) + "\t" + \ - str(args_hash) + "\t" + \ - str(proc_count) + "\t" + \ - str(datetime.timedelta(seconds=round(elapsed_time, 2))) + "\t " + \ - str(round(memory, 4)) + "\t" + \ - str(command) + "\t" + \ - str(rel_lock_name) + rel_lock_name = ( + lock_name + if lock_name is None + else os.path.relpath(lock_name, self.outfolder) + ) + message_raw = ( + str(pid) + + "\t" + + str(args_hash) + + "\t" + + str(proc_count) + + "\t" + + str(datetime.timedelta(seconds=round(elapsed_time, 2))) + + "\t " + + str(round(memory, 4)) + + "\t" + + str(command) + + "\t" + + str(rel_lock_name) + ) with open(self.pipeline_profile_file, "a") as myfile: myfile.write(message_raw + "\n") - def report_result(self, key, value, annotation=None, nolog=False): + def report_result(self, key, value, nolog=False, result_formatter=None): """ - Writes a string to self.pipeline_stats_file. - + Writes a key:value pair to self.pipeline_stats_file. + :param str key: name (key) of the stat - :param str annotation: By default, the stats will be annotated with the - pipeline name, so you can tell which pipeline records which stats. - If you want, you can change this; use annotation='shared' if you - need the stat to be used by another pipeline (using get_stat()). + :param dict value: value of the stat to report. :param bool nolog: Turn on this flag to NOT print this result in the logfile. Use sparingly in case you will be printing the result in a different format. - """ - # Default annotation is current pipeline name. - annotation = str(annotation or self.name) - - # In case the value is passed with trailing whitespace. - value = str(value).strip() + :param str result_formatter: function for formatting via pipestat backend + :return str reported_result: the reported result is returned as a list of formatted strings. 
+ """ # keep the value in memory: self.stats_dict[key] = value - message_raw = "{key}\t{value}\t{annotation}".format( - key=key, value=value, annotation=annotation) - message_markdown = "\n> `{key}`\t{value}\t{annotation}\t_RES_".format( - key=key, value=value, annotation=annotation) + rf = result_formatter or self.pipestat_result_formatter + + reported_result = self.pipestat.report( + values={key: value}, + sample_name=self.pipestat_sample_name, + result_formatter=rf, + ) if not nolog: - self.info(message_markdown) + for r in reported_result: + self.info(r) - # Just to be extra careful, let's lock the file while we we write - # in case multiple pipelines write to the same file. - self._safe_write_to_file(self.pipeline_stats_file, message_raw) + return reported_result - def report_object(self, key, filename, anchor_text=None, anchor_image=None, annotation=None): + def report_object( + self, + key, + filename, + anchor_text=None, + anchor_image=None, + annotation=None, + nolog=False, + result_formatter=None, + ): """ - Writes a string to self.pipeline_objects_file. Used to report figures - and others. + Writes a key:value pair to self.pipeline_stats_file. Note: this function + will be deprecated. Using report_result is recommended. :param str key: name (key) of the object :param str filename: relative path to the file (relative to parent @@ -1289,74 +1637,63 @@ def report_object(self, key, filename, anchor_text=None, anchor_image=None, anno :param str annotation: By default, the figures will be annotated with the pipeline name, so you can tell which pipeline records which figures. If you want, you can change this. - """ - + :param bool nolog: Turn on this flag to NOT print this result in the + logfile. Use sparingly in case you will be printing the result in a + different format. + :param str result_formatter: function for formatting via pipestat backend + :return str reported_result: the reported result is returned as a list of formatted strings. + """ + warnings.warn( + "This function may be removed in future release. " + "The recommended way to report pipeline results is using PipelineManager.pipestat.report().", + category=DeprecationWarning, + ) + rf = result_formatter or self.pipestat_result_formatter # Default annotation is current pipeline name. annotation = str(annotation or self.name) - # In case the value is passed with trailing whitespace. 
filename = str(filename).strip() if anchor_text: anchor_text = str(anchor_text).strip() else: anchor_text = str(key).strip() - # better to use a relative path in this file # convert any absolute paths into relative paths - relative_filename = os.path.relpath(filename, self.outfolder) \ - if os.path.isabs(filename) else filename + relative_filename = ( + os.path.relpath(filename, self.outfolder) + if os.path.isabs(filename) + else filename + ) if anchor_image: - relative_anchor_image = os.path.relpath(anchor_image, self.outfolder) \ - if os.path.isabs(anchor_image) else anchor_image + relative_anchor_image = ( + os.path.relpath(anchor_image, self.outfolder) + if os.path.isabs(anchor_image) + else anchor_image + ) else: relative_anchor_image = "None" - message_raw = "{key}\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format( - key=key, filename=relative_filename, anchor_text=anchor_text, - anchor_image=relative_anchor_image, annotation=annotation) - - message_markdown = "> `{key}`\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}\t_OBJ_".format( - key=key, filename=relative_filename, anchor_text=anchor_text, - anchor_image=relative_anchor_image, annotation=annotation) - - self.warning(message_markdown) + message_raw = "{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format( + filename=relative_filename, + anchor_text=anchor_text, + anchor_image=relative_anchor_image, + annotation=annotation, + ) - self._safe_write_to_file(self.pipeline_objects_file, message_raw) - - def _safe_write_to_file(self, file, message): - """ - Writes a string to a file safely (with file locks). - """ - target = file - lock_name = make_lock_name(target, self.outfolder) - lock_file = self._make_lock_path(lock_name) + val = {key: message_raw.replace("\t", " ")} - while True: - if os.path.isfile(lock_file): - self._wait_for_lock(lock_file) - else: - try: - self.locks.append(lock_file) - self._create_file_racefree(lock_file) - except OSError as e: - if e.errno == errno.EEXIST: - self.warning("Lock file created after test! Looping again.") - continue # Go back to start - - # Proceed with file writing - with open(file, "a") as myfile: - myfile.write(message + "\n") - - os.remove(lock_file) - self.locks.remove(lock_file) - - # If you make it to the end of the while loop, you're done - break + reported_result = self.pipestat.report( + values=val, sample_name=self.pipestat_sample_name, result_formatter=rf + ) + if not nolog: + for r in reported_result: + self.info(r) + return reported_result def _report_command(self, cmd, procs=None): """ - Writes a command to both stdout and to the commands log file + Writes a command to both stdout and to the commands log file (self.pipeline_commands_file). :param str cmd: command to report @@ -1385,22 +1722,22 @@ def _report_command(self, cmd, procs=None): @staticmethod def _create_file(file): """ - Creates a file, but will not fail if the file already exists. - This is vulnerable to race conditions; use this for cases where it + Creates a file, but will not fail if the file already exists. + This is vulnerable to race conditions; use this for cases where it doesn't matter if this process is the one that created the file. :param str file: File to create. """ - with open(file, 'w') as fout: - fout.write('') + with open(file, "w") as fout: + fout.write("") @staticmethod def _create_file_racefree(file): """ Creates a file, but fails if the file already exists. 
- - This function will thus only succeed if this process actually creates - the file; if the file already exists, it will cause an OSError, + + This function will thus only succeed if this process actually creates + the file; if the file already exists, it will cause an OSError, solving race conditions. :param str file: File to create. @@ -1411,15 +1748,18 @@ def _create_file_racefree(file): @staticmethod def _ensure_lock_prefix(lock_name_base): - """ Ensure that an alleged lock file is correctly prefixed. """ - return lock_name_base if lock_name_base.startswith(LOCK_PREFIX) \ - else LOCK_PREFIX + lock_name_base + """Ensure that an alleged lock file is correctly prefixed.""" + return ( + lock_name_base + if lock_name_base.startswith(LOCK_PREFIX) + else LOCK_PREFIX + lock_name_base + ) def _make_lock_path(self, lock_name_base): """ Create path to lock file with given name as base. - - :param str lock_name_base: Lock file name, designed to not be prefixed + + :param str lock_name_base: Lock file name, designed to not be prefixed with the lock file designation, but that's permitted. :return str: Path to the lock file. """ @@ -1436,8 +1776,8 @@ def _make_lock_path(self, lock_name_base): def _recoverfile_from_lockfile(self, lockfile): """ Create path to recovery file with given name as base. - - :param str lockfile: Name of file on which to base this path, + + :param str lockfile: Name of file on which to base this path, perhaps already prefixed with the designation of a lock file. :return str: Path to recovery file. """ @@ -1453,7 +1793,7 @@ def make_sure_path_exists(path): Creates all directories in a path if it does not exist. :param str path: Path to create. - :raises Exception: if the path creation attempt hits an error with + :raises Exception: if the path creation attempt hits an error with a code indicating a cause other than pre-existence. """ try: @@ -1468,41 +1808,32 @@ def make_sure_path_exists(path): def _refresh_stats(self): """ - Loads up the stats sheet created for this pipeline run and reads + Loads up the stats yaml created for this pipeline run and reads those stats into memory """ - # regex identifies all possible stats files. - #regex = self.outfolder + "*_stats.tsv" - #stats_files = glob.glob(regex) - #stats_files.insert(self.pipeline_stats_file) # last one is the current pipeline - #for stats_file in stats_files: - - stats_file = self.pipeline_stats_file if os.path.isfile(self.pipeline_stats_file): - with open(stats_file, 'r') as stat_file: - for line in stat_file: - try: - # Someone may have put something that's not 3 columns in the stats file - # if so, shame on him, but we can just ignore it. - key, value, annotation = line.split('\t') - except ValueError: - self.warning("WARNING: Each row in a stats file is expected to have 3 columns") - - if annotation.rstrip() == self.name or annotation.rstrip() == "shared": - self.stats_dict[key] = value.strip() - #if os.path.isfile(self.pipeline_stats_file): + _, data = read_yaml_data(path=self.pipeline_stats_file, what="stats_file") + print(data) + pipeline_key = list( + data[self.pipestat["_pipeline_name"]][self.pipestat["_pipeline_type"]] + )[0] + if self.name == pipeline_key: + for key, value in data[self.pipestat["_pipeline_name"]][ + self.pipestat["_pipeline_type"] + ][pipeline_key].items(): + self.stats_dict[key] = value.strip() def get_stat(self, key): """ Returns a stat that was previously reported. 
This is necessary for reporting new stats that are - derived from two stats, one of which may have been reported by an earlier run. For example, + derived from two stats, one of which may have been reported by an earlier run. For example, if you first use report_result to report (number of trimmed reads), and then in a later stage - want to report alignment rate, then this second stat (alignment rate) will require knowing the + want to report alignment rate, then this second stat (alignment rate) will require knowing the first stat (number of trimmed reads); however, that may not have been calculated in the current - pipeline run, so we must retrieve it from the stats.tsv output file. This command will retrieve + pipeline run, so we must retrieve it from the stats.yaml output file. This command will retrieve such previously reported stats if they were not already calculated in the current pipeline run. - :param key: key of stat to retrieve + :param key: key of stat to retrieve """ try: @@ -1562,9 +1893,12 @@ def _checkpoint(self, stage): # be expected to characterize the extension of a file name/path. base, ext = os.path.splitext(stage) if ext and "." not in base: - self.warning("WARNING: '{}' looks like it may be the name or path of " - "a file; for such a checkpoint, use touch_checkpoint.". - format(stage)) + self.warning( + "WARNING: '{}' looks like it may be the name or path of " + "a file; for such a checkpoint, use touch_checkpoint.".format( + stage + ) + ) else: if not is_checkpoint: self.warning("Not a checkpoint: {}".format(stage)) @@ -1596,9 +1930,12 @@ def _touch_checkpoint(self, check_file): other_folder = os.path.join(folder, "") this_folder = os.path.join(self.outfolder, "") if other_folder != this_folder: - errmsg = "Path provided as checkpoint file isn't in pipeline " \ - "output folder. '{}' is not in '{}'".format( - check_file, self.outfolder) + errmsg = ( + "Path provided as checkpoint file isn't in pipeline " + "output folder. '{}' is not in '{}'".format( + check_file, self.outfolder + ) + ) raise ValueError(errmsg) fpath = check_file else: @@ -1607,14 +1944,14 @@ def _touch_checkpoint(self, check_file): # Create/update timestamp for checkpoint, but base return value on # whether the action was a simple update or a novel creation. already_exists = os.path.isfile(fpath) - open(fpath, 'w').close() + open(fpath, "w").close() action = "Updated" if already_exists else "Created" self.info("{} checkpoint file: '{}'".format(action, fpath)) return already_exists def complete(self): - """ Stop a completely finished pipeline. 
""" + """Stop a completely finished pipeline.""" self.stop_pipeline(status=COMPLETE_FLAG) def fail_pipeline(self, exc, dynamic_recover=False): @@ -1652,7 +1989,11 @@ def fail_pipeline(self, exc, dynamic_recover=False): total_time = datetime.timedelta(seconds=self.time_elapsed(self.starttime)) self.info("Total time: " + str(total_time)) self.info("Failure reason: " + str(exc)) - self._set_status_flag(FAIL_FLAG) + # self._set_status_flag(FAIL_FLAG) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, + status_identifier="failed", + ) if isinstance(exc, str): exc = RuntimeError(exc) @@ -1683,16 +2024,21 @@ def get_elapsed_time(self): :return int: sum of runtimes in seconds """ if os.path.isfile(self.pipeline_profile_file): - df = _pd.read_csv(self.pipeline_profile_file, sep="\t", comment="#", names=PROFILE_COLNAMES) + df = _pd.read_csv( + self.pipeline_profile_file, + sep="\t", + comment="#", + names=PROFILE_COLNAMES, + ) try: - df['runtime'] = _pd.to_timedelta(df['runtime']) + df["runtime"] = _pd.to_timedelta(df["runtime"]) except ValueError: # return runtime estimate # this happens if old profile style is mixed with the new one # and the columns do not match return self.time_elapsed(self.starttime) - unique_df = df[~df.duplicated('cid', keep='last').values] - return sum(unique_df['runtime'].apply(lambda x: x.total_seconds())) + unique_df = df[~df.duplicated("cid", keep="last").values] + return sum(unique_df["runtime"].apply(lambda x: x.total_seconds())) return self.time_elapsed(self.starttime) def stop_pipeline(self, status=COMPLETE_FLAG): @@ -1701,30 +2047,41 @@ def stop_pipeline(self, status=COMPLETE_FLAG): This is the "healthy" pipeline completion function. The normal pipeline completion function, to be run by the pipeline - at the end of the script. It sets status flag to completed and records + at the end of the script. It sets status flag to completed and records some time and memory statistics to the log file. """ - self._set_status_flag(status) + # self._set_status_flag(status) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, status_identifier=status + ) self._cleanup() - elapsed_time_this_run = str(datetime.timedelta(seconds=self.time_elapsed(self.starttime))) - self.report_result("Time", - elapsed_time_this_run, - nolog=True) - self.report_result("Success", - time.strftime("%m-%d-%H:%M:%S"), - nolog=True) + elapsed_time_this_run = str( + datetime.timedelta(seconds=self.time_elapsed(self.starttime)) + ) + self.report_result("Time", elapsed_time_this_run, nolog=True) + self.report_result("Success", time.strftime("%m-%d-%H:%M:%S"), nolog=True) self.info("\n### Pipeline completed. 
Epilogue") # print("* " + "Total elapsed time".rjust(20) + ": " # + str(datetime.timedelta(seconds=self.time_elapsed(self.starttime)))) - self.info("* " + "Elapsed time (this run)".rjust(30) + ": " + - elapsed_time_this_run) - self.info("* " + "Total elapsed time (all runs)".rjust(30) + ": " + - str(datetime.timedelta(seconds=round(self.get_elapsed_time())))) - self.info("* " + "Peak memory (this run)".rjust(30) + ": " + - str(round(self.peak_memory, 4)) + " GB") - # self.info("* " + "Total peak memory (all runs)".rjust(30) + ": " + - # str(round(self.peak_memory, 4)) + " GB") + self.info( + "* " + "Elapsed time (this run)".rjust(30) + ": " + elapsed_time_this_run + ) + self.info( + "* " + + "Total elapsed time (all runs)".rjust(30) + + ": " + + str(datetime.timedelta(seconds=round(self.get_elapsed_time()))) + ) + self.info( + "* " + + "Peak memory (this run)".rjust(30) + + ": " + + str(round(self.peak_memory, 4)) + + " GB" + ) + # self.info("* " + "Total peak memory (all runs)".rjust(30) + ": " + + # str(round(self.peak_memory, 4)) + " GB") if self.halted: return @@ -1745,7 +2102,7 @@ def _signal_term_handler(self, signal, frame): """ signal_type = "SIGTERM" self._generic_signal_handler(signal_type) - + def _generic_signal_handler(self, signal_type): """ Function for handling both SIGTERM and SIGINT @@ -1764,7 +2121,7 @@ def _generic_signal_handler(self, signal_type): # passed directly to the tee subprocess, so I could handle that on # my own; hence, now I believe I no longer need to do this. I'm # leaving this code here as a relic in case something comes up. - #with open(self.pipeline_log_file, "a") as myfile: + # with open(self.pipeline_log_file, "a") as myfile: # myfile.write(message + "\n") def _signal_int_handler(self, signal, frame): @@ -1799,10 +2156,9 @@ def _exit_handler(self): self.fail_pipeline(Exception("Pipeline failure. See details above.")) if self.tee: - self.tee.kill() + self.tee.kill() def _terminate_running_subprocesses(self): - # make a copy of the list to iterate over since we'll be removing items for pid in self.running_procs.copy(): proc_dict = self.running_procs[pid] @@ -1810,9 +2166,18 @@ def _terminate_running_subprocesses(self): # Close the preformat tag that we opened when the process was spawned. # record profile of any running processes before killing elapsed_time = time.time() - self.running_procs[pid]["start_time"] - process_peak_mem = self._memory_usage(pid, container=proc_dict["container"])/1e6 - self._report_profile(self.running_procs[pid]["proc_name"], None, elapsed_time, process_peak_mem, pid, - self.running_procs[pid]["args_hash"], self.running_procs[pid]["local_proc_id"]) + process_peak_mem = ( + self._memory_usage(pid, container=proc_dict["container"]) / 1e6 + ) + self._report_profile( + self.running_procs[pid]["proc_name"], + None, + elapsed_time, + process_peak_mem, + pid, + self.running_procs[pid]["args_hash"], + self.running_procs[pid]["local_proc_id"], + ) self._kill_child_process(pid, proc_dict["proc_name"]) del self.running_procs[pid] @@ -1842,10 +2207,10 @@ def pskill(proc_pid, sig=signal.SIGINT): if proc_name: proc_string = " ({proc_name})".format(proc_name=proc_name) - # First a gentle kill + # First a gentle kill sys.stdout.flush() still_running = self._attend_process(psutil.Process(child_pid), 0) - sleeptime = .25 + sleeptime = 0.25 time_waiting = 0 while still_running and time_waiting < 3: @@ -1873,9 +2238,12 @@ def pskill(proc_pid, sig=signal.SIGINT): if still_running: # still running!? 
- self.warning("Child process {child_pid}{proc_string} never responded" - "I just can't take it anymore. I don't know what to do...".format(child_pid=child_pid, - proc_string=proc_string)) + self.warning( + "Child process {child_pid}{proc_string} never responded" + "I just can't take it anymore. I don't know what to do...".format( + child_pid=child_pid, proc_string=proc_string + ) + ) else: if time_waiting > 0: note = "terminated after {time} sec".format(time=int(time_waiting)) @@ -1883,12 +2251,13 @@ def pskill(proc_pid, sig=signal.SIGINT): note = "was already terminated" msg = "Child process {child_pid}{proc_string} {note}.".format( - child_pid=child_pid, proc_string=proc_string, note=note) + child_pid=child_pid, proc_string=proc_string, note=note + ) self.info(msg) @staticmethod def _atexit_register(*args): - """ Convenience alias to register exit functions without having to import atexit in the pipeline. """ + """Convenience alias to register exit functions without having to import atexit in the pipeline.""" atexit.register(*args) def get_container(self, image, mounts): @@ -1954,11 +2323,17 @@ def clean_add(self, regex, conditional=False, manual=False): try: with open(self.cleanup_file, "a") as myfile: if os.path.isabs(filename): - relative_filename = os.path.relpath(filename, self.outfolder) + relative_filename = os.path.relpath( + filename, self.outfolder + ) absolute_filename = filename else: - relative_filename = os.path.relpath(filename, self.outfolder) - absolute_filename = os.path.abspath(os.path.join(self.outfolder, relative_filename)) + relative_filename = os.path.relpath( + filename, self.outfolder + ) + absolute_filename = os.path.abspath( + os.path.join(self.outfolder, relative_filename) + ) if os.path.isfile(absolute_filename): # print("Adding file to cleanup: {}".format(filename)) myfile.write("rm " + relative_filename + "\n") @@ -1969,9 +2344,15 @@ def clean_add(self, regex, conditional=False, manual=False): # and the directory itself myfile.write("rmdir " + relative_filename + "\n") else: - self.info("File not added to cleanup: {}".format(relative_filename)) + self.info( + "File not added to cleanup: {}".format( + relative_filename + ) + ) except Exception as e: - self.error("Error in clean_add on path {}: {}".format(filename, str(e))) + self.error( + "Error in clean_add on path {}: {}".format(filename, str(e)) + ) elif conditional: self.cleanup_list_conditional.append(regex) else: @@ -1998,9 +2379,11 @@ def _cleanup(self, dry_run=False): n_to_clean_cond = len(self.cleanup_list_conditional) if n_to_clean + n_to_clean_cond > 0: - self.info("Starting cleanup: {} files; {} conditional files for cleanup".format( - n_to_clean, - n_to_clean_cond)) + self.info( + "Starting cleanup: {} files; {} conditional files for cleanup".format( + n_to_clean, n_to_clean_cond + ) + ) else: self.debug("No files to clean.") @@ -2034,9 +2417,17 @@ def _cleanup(self, dry_run=False): if n_to_clean_cond > 0: run_flag = flag_name(RUN_FLAG) - flag_files = [fn for fn in glob.glob(self.outfolder + flag_name("*")) - if COMPLETE_FLAG not in os.path.basename(fn) - and not "{}_{}".format(self.name, run_flag) == os.path.basename(fn)] + flag_files = [ + fn + for fn in glob.glob(self.outfolder + flag_name("*")) + if COMPLETE_FLAG not in os.path.basename(fn) + and not "{}_{}_{}".format( + self._pipestat_manager["_pipeline_name"], + self.pipestat_sample_name, + run_flag, + ) + == os.path.basename(fn) + ] if len(flag_files) == 0 and not dry_run: self.info("\nCleaning up conditional list. . 
.") for expr in self.cleanup_list_conditional: @@ -2055,9 +2446,14 @@ def _cleanup(self, dry_run=False): except: pass else: - self.info("\nConditional flag found: " + str([os.path.basename(i) for i in flag_files])) - self.info("\nThese conditional files were left in place:\n\n- " + - "\n- ".join(self.cleanup_list_conditional)) + self.info( + "\nConditional flag found: " + + str([os.path.basename(i) for i in flag_files]) + ) + self.info( + "\nThese conditional files were left in place:\n\n- " + + "\n- ".join(self.cleanup_list_conditional) + ) # Produce a cleanup script. no_cleanup_script = [] for cleandir in self.cleanup_list_conditional: @@ -2071,10 +2467,13 @@ def _cleanup(self, dry_run=False): clean_script.write("rmdir " + clean_item + "\n") except Exception as e: no_cleanup_script.append(cleandir) - if no_cleanup_script: - self.warning('\n\nCould not produce cleanup script for item(s):\n\n- ' + '\n- '.join(no_cleanup_script)) + if no_cleanup_script: + self.warning( + "\n\nCould not produce cleanup script for item(s):\n\n- " + + "\n- ".join(no_cleanup_script) + ) - def _memory_usage(self, pid='self', category="hwm", container=None): + def _memory_usage(self, pid="self", category="hwm", container=None): """ Memory usage of the process in kilobytes. @@ -2087,8 +2486,8 @@ def _memory_usage(self, pid='self', category="hwm", container=None): cmd = "docker stats " + container + " --format '{{.MemUsage}}' --no-stream" mem_use_str = subprocess.check_output(cmd, shell=True).decode() - mem_num = re.findall('[\d\.]+', mem_use_str.split("/")[0])[0] - mem_scale = re.findall('[A-Za-z]+', mem_use_str.split("/")[0])[0] + mem_num = re.findall("[\d\.]+", mem_use_str.split("/")[0])[0] + mem_scale = re.findall("[A-Za-z]+", mem_use_str.split("/")[0])[0] mem_num = float(mem_num) if mem_scale == "GiB": @@ -2103,13 +2502,13 @@ def _memory_usage(self, pid='self', category="hwm", container=None): # Thanks Martin Geisler: status = None - result = {'peak': 0, 'rss': 0, 'hwm': 0} - + result = {"peak": 0, "rss": 0, "hwm": 0} + try: # This will only work on systems with a /proc file system # (like Linux). # status = open('/proc/self/status') - proc_spot = '/proc/%s/status' % pid + proc_spot = "/proc/%s/status" % pid status = open(proc_spot) for line in status: parts = line.split() @@ -2126,13 +2525,17 @@ def _memory_usage(self, pid='self', category="hwm", container=None): return result[category] def _triage_error(self, e, nofail): - """ Print a message and decide what to do about an error. """ + """Print a message and decide what to do about an error.""" if not nofail: self.fail_pipeline(e) elif self._failed: - self.info("This is a nofail process, but the pipeline was terminated for other reasons, so we fail.") + self.info( + "This is a nofail process, but the pipeline was terminated for other reasons, so we fail." + ) raise e else: self.error(e) - self.error("ERROR: Subprocess returned nonzero result, but pipeline is continuing because nofail=True") + self.error( + "ERROR: Subprocess returned nonzero result, but pipeline is continuing because nofail=True" + ) # TODO: return nonzero, or something. . .? 
diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py index dcc57e8e..329b321b 100755 --- a/pypiper/ngstk.py +++ b/pypiper/ngstk.py @@ -1,11 +1,13 @@ """ Broadly applicable NGS processing/analysis functionality """ +import errno import os import re import subprocess -import errno + from attmap import AttMapEcho from yacman import load_yaml + from .exceptions import UnsupportedFiletypeException from .utils import is_fastq, is_gzipped_fastq, is_sam_or_bam @@ -43,7 +45,8 @@ def __init__(self, config_file=None, pm=None): # parse yaml into the project's attributes # self.add_entries(**config) super(NGSTk, self).__init__( - None if config_file is None else load_yaml(config_file)) + None if config_file is None else load_yaml(config_file) + ) # Keep a link to the pipeline manager, if one is provided. # if None is provided, instantiate "tools" and "parameters" with empty AttMaps @@ -63,12 +66,15 @@ def __init__(self, config_file=None, pm=None): self.parameters = AttMapEcho() # If pigz is available, use that. Otherwise, default to gzip. - if hasattr(self.pm, "cores") and self.pm.cores > 1 and self.check_command("pigz"): + if ( + hasattr(self.pm, "cores") + and self.pm.cores > 1 + and self.check_command("pigz") + ): self.ziptool_cmd = "pigz -f -p {}".format(self.pm.cores) else: self.ziptool_cmd = "gzip -f" - def _ensure_folders(self, *paths): """ Ensure that paths to folder(s) exist. @@ -90,7 +96,6 @@ def _ensure_folders(self, *paths): # Otherwise, just ensure that we have path to file's folder. self.make_dir(fpath if ext else p) - @property def ziptool(self): """ @@ -100,7 +105,6 @@ def ziptool(self): """ return self.ziptool_cmd - def make_dir(self, path): """ Forge path to directory, creating intermediates as needed. @@ -113,12 +117,10 @@ def make_dir(self, path): if exception.errno != errno.EEXIST: raise - def make_sure_path_exists(self, path): - """ Alias for make_dir """ + """Alias for make_dir""" self.make_dir(path) - # Borrowed from looper def check_command(self, command): """ @@ -126,7 +128,9 @@ def check_command(self, command): """ # Use `command` to see if command is callable, store exit code - code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)) + code = os.system( + "command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command) + ) # If exit code is not 0, report which command failed and return False, else return True if code != 0: @@ -135,7 +139,6 @@ def check_command(self, command): else: return True - def get_file_size(self, filenames): """ Get size of all files in string (space-separated) in megabytes (Mb). @@ -149,10 +152,15 @@ def get_file_size(self, filenames): if type(filenames) is list: return sum([self.get_file_size(filename) for filename in filenames]) - return round(sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) / (1024 ** 2), 4) - + return round( + sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) + / (1024**2), + 4, + ) - def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicates="True"): + def mark_duplicates( + self, aligned_file, out_file, metrics_file, remove_duplicates="True" + ): cmd = self.tools.java if self.pm.javamem: # If a memory restriction exists. 
cmd += " -Xmx" + self.pm.javamem @@ -163,9 +171,9 @@ def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicate cmd += " REMOVE_DUPLICATES=" + remove_duplicates return cmd - - def bam2fastq(self, input_bam, output_fastq, - output_fastq2=None, unpaired_fastq=None): + def bam2fastq( + self, input_bam, output_fastq, output_fastq2=None, unpaired_fastq=None + ): """ Create command to convert BAM(s) to FASTQ(s). @@ -185,7 +193,6 @@ def bam2fastq(self, input_bam, output_fastq, cmd += " UNPAIRED_FASTQ={0}".format(unpaired_fastq) return cmd - def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end): """ Build command to convert BAM file to FASTQ file(s) (R1/R2). @@ -209,11 +216,10 @@ def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end): cmd += " VALIDATION_STRINGENCY=SILENT" return cmd - def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False): """ - This converts bam file to fastq files, but using awk. As of 2016, this is much faster - than the standard way of doing this using Picard, and also much faster than the + This converts bam file to fastq files, but using awk. As of 2016, this is much faster + than the standard way of doing this using Picard, and also much faster than the bedtools implementation as well; however, it does no sanity checks and assumes the reads (for paired data) are all paired (no singletons), in the correct order. :param bool zipmode: Should the output be zipped? @@ -222,29 +228,27 @@ def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False): fq1 = out_fastq_pre + "_R1.fastq" fq2 = out_fastq_pre + "_R2.fastq" - if zipmode: fq1 = fq1 + ".gz" fq2 = fq2 + ".gz" - fq1_target = " | \"" + self.ziptool + " -c > " + fq1 + '"' - fq2_target = " | \"" + self.ziptool + " -c > " + fq2 + '"' + fq1_target = ' | "' + self.ziptool + " -c > " + fq1 + '"' + fq2_target = ' | "' + self.ziptool + " -c > " + fq2 + '"' else: fq1_target = ' > "' + fq1 + '"' fq2_target = ' > "' + fq2 + '"' - + if paired_end: cmd = self.tools.samtools + " view " + bam_file + " | awk '" - cmd += r'{ if (NR%2==1) print "@"$1"/1\n"$10"\n+\n"$11' + fq1_target + ';' - cmd += r' else print "@"$1"/2\n"$10"\n+\n"$11' + fq2_target + '; }' + cmd += r'{ if (NR%2==1) print "@"$1"/1\n"$10"\n+\n"$11' + fq1_target + ";" + cmd += r' else print "@"$1"/2\n"$10"\n+\n"$11' + fq2_target + "; }" cmd += "'" # end the awk command else: fq2 = None cmd = self.tools.samtools + " view " + bam_file + " | awk '" - cmd += r'{ print "@"$1"\n"$10"\n+\n"$11' + fq1_target + '; }' + cmd += r'{ print "@"$1"\n"$10"\n+\n"$11' + fq1_target + "; }" cmd += "'" return cmd, fq1, fq2 - def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end): """ Converts bam to fastq; A version using bedtools @@ -252,14 +256,20 @@ def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end): self.make_sure_path_exists(os.path.dirname(out_fastq_pre)) fq1 = out_fastq_pre + "_R1.fastq" fq2 = None - cmd = self.tools.bedtools + " bamtofastq -i " + bam_file + " -fq " + fq1 + ".fastq" + cmd = ( + self.tools.bedtools + + " bamtofastq -i " + + bam_file + + " -fq " + + fq1 + + ".fastq" + ) if paired_end: fq2 = out_fastq_pre + "_R2.fastq" cmd += " -fq2 " + fq2 return cmd, fq1, fq2 - def get_input_ext(self, input_file): """ Get the extension of the input_file. 
Assumes you're using either @@ -272,12 +282,13 @@ def get_input_ext(self, input_file): elif input_file.endswith(".fastq") or input_file.endswith(".fq"): input_ext = ".fastq" else: - errmsg = "'{}'; this pipeline can only deal with .bam, .fastq, " \ - "or .fastq.gz files".format(input_file) + errmsg = ( + "'{}'; this pipeline can only deal with .bam, .fastq, " + "or .fastq.gz files".format(input_file) + ) raise UnsupportedFiletypeException(errmsg) return input_ext - def merge_or_link(self, input_args, raw_folder, local_base="sample"): """ Standardizes various input possibilities by converting either .bam, @@ -312,8 +323,7 @@ class of inputs (which can in turn be a string or a list). else: local_base_extended = local_base if input_arg: - out = self.merge_or_link( - input_arg, raw_folder, local_base_extended) + out = self.merge_or_link(input_arg, raw_folder, local_base_extended) print("Local input file: '{}'".format(out)) # Make sure file exists: @@ -343,7 +353,8 @@ class of inputs (which can in turn be a string or a list). self.pm.run( "ln -sf " + input_arg + " " + local_input_abs, target=local_input_abs, - shell=True) + shell=True, + ) # return the local (linked) filename absolute path return local_input_abs @@ -365,11 +376,11 @@ class of inputs (which can in turn be a string or a list). if all([self.get_input_ext(x) == ".fastq.gz" for x in input_args]): sample_merged_gz = local_base + ".merged.fastq.gz" output_merge_gz = os.path.join(raw_folder, sample_merged_gz) - #cmd1 = self.ziptool + "-d -c " + " ".join(input_args) + " > " + output_merge - #cmd2 = self.ziptool + " " + output_merge - #self.pm.run([cmd1, cmd2], output_merge_gz) + # cmd1 = self.ziptool + "-d -c " + " ".join(input_args) + " > " + output_merge + # cmd2 = self.ziptool + " " + output_merge + # self.pm.run([cmd1, cmd2], output_merge_gz) # you can save yourself the decompression/recompression: - cmd = "cat " + " ".join(input_args) + " > " + output_merge_gz + cmd = "cat " + " ".join(input_args) + " > " + output_merge_gz self.pm.run(cmd, output_merge_gz) return output_merge_gz @@ -383,13 +394,20 @@ class of inputs (which can in turn be a string or a list). # At this point, we don't recognize the input file types or they # do not match. raise NotImplementedError( - "Input files must be of the same type, and can only " - "merge bam or fastq.") - + "Input files must be of the same type, and can only " + "merge bam or fastq." + ) def input_to_fastq( - self, input_file, sample_name, paired_end, fastq_folder, - output_file=None, multiclass=False, zipmode=False): + self, + input_file, + sample_name, + paired_end, + fastq_folder, + output_file=None, + multiclass=False, + zipmode=False, + ): """ Builds a command to convert input file to fastq, for various inputs. 
@@ -424,10 +442,15 @@ def input_to_fastq( output_file = [] for in_i, in_arg in enumerate(input_file): output = fastq_prefix + "_R" + str(in_i + 1) + ".fastq" - result_cmd, uf, result_file = \ - self.input_to_fastq(in_arg, sample_name, paired_end, - fastq_folder, output, multiclass=True, - zipmode=zipmode) + result_cmd, uf, result_file = self.input_to_fastq( + in_arg, + sample_name, + paired_end, + fastq_folder, + output, + multiclass=True, + zipmode=zipmode, + ) cmd.append(result_cmd) output_file.append(result_file) @@ -444,8 +467,10 @@ def input_to_fastq( if input_ext == ".bam": print("Found .bam file") - #cmd = self.bam_to_fastq(input_file, fastq_prefix, paired_end) - cmd, fq1, fq2 = self.bam_to_fastq_awk(input_file, fastq_prefix, paired_end, zipmode) + # cmd = self.bam_to_fastq(input_file, fastq_prefix, paired_end) + cmd, fq1, fq2 = self.bam_to_fastq_awk( + input_file, fastq_prefix, paired_end, zipmode + ) # pm.run(cmd, output_file, follow=check_fastq) if fq2: output_file = [fq1, fq2] @@ -455,20 +480,24 @@ def input_to_fastq( print("Found .fastq.gz file") if paired_end and not multiclass: if zipmode: - raise NotImplementedError("Can't use zipmode on interleaved fastq data.") + raise NotImplementedError( + "Can't use zipmode on interleaved fastq data." + ) # For paired-end reads in one fastq file, we must split the # file into 2. The pipeline author will need to include this - # python script in the scripts directory. + # python script in the scripts directory. # TODO: make this self-contained in pypiper. This is a rare # use case these days, as fastq files are almost never # interleaved anymore. - script_path = os.path.join( - self.tools.scripts_dir, "fastq_split.py") + script_path = os.path.join(self.tools.scripts_dir, "fastq_split.py") cmd = self.tools.python + " -u " + script_path cmd += " -i " + input_file cmd += " -o " + fastq_prefix # Must also return the set of output files - output_file = [fastq_prefix + "_R1.fastq", fastq_prefix + "_R2.fastq"] + output_file = [ + fastq_prefix + "_R1.fastq", + fastq_prefix + "_R2.fastq", + ] else: if zipmode: # we do nothing! @@ -477,7 +506,9 @@ def input_to_fastq( else: # For single-end reads, we just unzip the fastq.gz file. # or, paired-end reads that were already split. - cmd = self.ziptool + " -d -c " + input_file + " > " + output_file + cmd = ( + self.ziptool + " -d -c " + input_file + " > " + output_file + ) # a non-shell version # cmd1 = "gunzip --force " + input_file # cmd2 = "mv " + os.path.splitext(input_file)[0] + " " + output_file @@ -491,7 +522,6 @@ def input_to_fastq( return [cmd, fastq_prefix, output_file] - def check_fastq(self, input_files, output_files, paired_end): """ Returns a follow sanity-check function to be run after a fastq conversion. @@ -510,9 +540,9 @@ def check_fastq(self, input_files, output_files, paired_end): # This is AFTER merge, so if there are multiple files it means the # files were split into read1/read2; therefore I must divide by number # of files for final reads. 
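`check_fastq` does not run anything itself; it returns a closure intended for `pm.run()`'s `follow=` argument, so the read-count sanity checks (and the pipestat reporting shown below) execute only when the conversion command actually runs. Continuing the illustrative sketch:

```python
# Run the conversion and verify/report read counts afterwards.
pm.run(
    cmd,
    target=unaligned_fastq,
    follow=ngstk.check_fastq(local_input, unaligned_fastq, paired_end=True),
)
```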
- def temp_func(input_files=input_files, output_files=output_files, - paired_end=paired_end): - + def temp_func( + input_files=input_files, output_files=output_files, paired_end=paired_end + ): if type(input_files) != list: input_files = [input_files] if type(output_files) != list: @@ -521,35 +551,45 @@ def temp_func(input_files=input_files, output_files=output_files, n_input_files = len(list(filter(bool, input_files))) n_output_files = len(list(filter(bool, output_files))) - total_reads = sum([int(self.count_reads(input_file, paired_end)) - for input_file in input_files]) + total_reads = sum( + [ + int(self.count_reads(input_file, paired_end)) + for input_file in input_files + ] + ) raw_reads = int(total_reads / n_input_files) - self.pm.report_result("Raw_reads", str(raw_reads)) + self.pm.pipestat.report(values={"Raw_reads": str(raw_reads)}) total_fastq_reads = sum( - [int(self.count_reads(output_file, paired_end)) - for output_file in output_files]) + [ + int(self.count_reads(output_file, paired_end)) + for output_file in output_files + ] + ) fastq_reads = int(total_fastq_reads / n_output_files) - self.pm.report_result("Fastq_reads", fastq_reads) + self.pm.pipestat.report(values={"Fastq_reads": fastq_reads}) input_ext = self.get_input_ext(input_files[0]) # We can only assess pass filter reads in bam files with flags. if input_ext == ".bam": num_failed_filter = sum( - [int(self.count_fail_reads(f, paired_end)) - for f in input_files]) + [int(self.count_fail_reads(f, paired_end)) for f in input_files] + ) pf_reads = int(raw_reads) - num_failed_filter - self.pm.report_result("PF_reads", str(pf_reads)) + self.pm.pipestat.report(values={"PF_reads": str(pf_reads)}) if fastq_reads != int(raw_reads): - raise Exception("Fastq conversion error? Number of input reads " - "doesn't number of output reads.") + raise Exception( + "Fastq conversion error? Number of input reads " + "doesn't number of output reads." + ) return fastq_reads return temp_func - - def check_trim(self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None): + def check_trim( + self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None + ): """ Build function to evaluate read trimming, and optionally run fastqc. 
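The hunk above is part of the migration from `pm.report_result()`/`pm.get_stat()` to pipestat: scalar results are now written with `pm.pipestat.report(values={...})` and read back with `pm.pipestat.retrieve()`. The same pattern can be used directly from pipeline code (the result names here are illustrative, and the manager is assumed to have been created with pipestat configured):

```python
# Report a scalar result.
pm.pipestat.report(values={"Aligned_reads": 123456})

# Read back a previously reported value and derive another result from it.
raw_reads = float(pm.pipestat.retrieve("Raw_reads"))
pm.pipestat.report(values={"Alignment_rate": round(123456 * 100 / raw_reads, 2)})
```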
@@ -567,21 +607,21 @@ def check_trim(self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_fo """ def temp_func(): - print("Evaluating read trimming") if paired_end and not trimmed_fastq_R2: print("WARNING: specified paired-end but no R2 file") n_trim = float(self.count_reads(trimmed_fastq, paired_end)) - self.pm.report_result("Trimmed_reads", int(n_trim)) + self.pm.pipestat.report(values={"Trimmed_reads": int(n_trim)}) try: - rr = float(self.pm.get_stat("Raw_reads")) + rr = float(self.pm.pipestat.retrieve("Raw_reads")) except: print("Can't calculate trim loss rate without raw read result.") else: self.pm.report_result( - "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2)) + "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2) + ) # Also run a fastqc (if installed/requested) if fastqc_folder: @@ -591,18 +631,31 @@ def temp_func(): self.pm.run(cmd, lock_name="trimmed_fastqc", nofail=True) fname, ext = os.path.splitext(os.path.basename(trimmed_fastq)) fastqc_html = os.path.join(fastqc_folder, fname + "_fastqc.html") - self.pm.report_object("FastQC report r1", fastqc_html) + self.pm.pipestat.report( + values={ + "FastQC_report_R1": { + "path": fastqc_html, + "title": "FastQC report R1", + } + } + ) if paired_end and trimmed_fastq_R2: cmd = self.fastqc(trimmed_fastq_R2, fastqc_folder) self.pm.run(cmd, lock_name="trimmed_fastqc_R2", nofail=True) fname, ext = os.path.splitext(os.path.basename(trimmed_fastq_R2)) fastqc_html = os.path.join(fastqc_folder, fname + "_fastqc.html") - self.pm.report_object("FastQC report r2", fastqc_html) + self.pm.pipestat.report( + values={ + "FastQC_report_R2": { + "path": fastqc_html, + "title": "FastQC report R2", + } + } + ) return temp_func - def validate_bam(self, input_bam): """ Wrapper for Picard's ValidateSamFile. @@ -615,7 +668,6 @@ def validate_bam(self, input_bam): cmd += " INPUT=" + input_bam return cmd - def merge_bams(self, input_bams, merged_bam, in_sorted="TRUE", tmp_dir=None): """ Combine multiple files into one. @@ -653,27 +705,25 @@ def merge_bams(self, input_bams, merged_bam, in_sorted="TRUE", tmp_dir=None): cmd += " TMP_DIR=" + tmp_dir return cmd - - + def merge_bams_samtools(self, input_bams, merged_bam): - cmd = self.tools.samtools + " merge -f " + cmd = self.tools.samtools + " merge -f " cmd += " -@ " + str(self.pm.cores) - cmd += " " + merged_bam + " " + cmd += " " + merged_bam + " " cmd += " ".join(input_bams) return cmd - def merge_fastq(self, inputs, output, run=False, remove_inputs=False): """ Merge FASTQ files (zipped or not) into one. - + :param Iterable[str] inputs: Collection of paths to files to merge. :param str output: Path to single output file. :param bool run: Whether to run the command. :param bool remove_inputs: Whether to keep the original files. - :return NoneType | str: Null if running the command, otherwise the + :return NoneType | str: Null if running the command, otherwise the command itself - :raise ValueError: Raise ValueError if the call is such that + :raise ValueError: Raise ValueError if the call is such that inputs are to be deleted but command is not run. """ if remove_inputs and not run: @@ -687,14 +737,16 @@ def merge_fastq(self, inputs, output, run=False, remove_inputs=False): else: return cmd - def count_lines(self, file_name): """ Uses the command-line utility wc to count the number of lines in a file. For MacOS, must strip leading whitespace from wc. 
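`check_trim` follows the same pattern as `check_fastq`: it returns a closure for `follow=`, and when a `fastqc_folder` is supplied it also runs FastQC and reports the HTML report as a path/title mapping rather than a plain number. A sketch with illustrative file names (`trim_cmd` is a stand-in for whatever trimmer command the pipeline builds, e.g. via `ngstk.trimmomatic` or `ngstk.skewer`):

```python
trim_cmd = "..."  # adapter-trimming command built elsewhere in the pipeline

pm.run(
    trim_cmd,
    target="output/fastq/sample_R1.trim.fastq",
    follow=ngstk.check_trim(
        "output/fastq/sample_R1.trim.fastq",
        paired_end=True,
        trimmed_fastq_R2="output/fastq/sample_R2.trim.fastq",
        fastqc_folder="output/fastqc/",
    ),
)
```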
:param str file_name: name of file whose lines are to be counted """ - x = subprocess.check_output("wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", shell=True) + x = subprocess.check_output( + "wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", + shell=True, + ) return x.decode().strip() def count_lines_zip(self, file_name): @@ -703,7 +755,13 @@ def count_lines_zip(self, file_name): For compressed files. :param file: file_name """ - x = subprocess.check_output(self.ziptool + " -d -c " + file_name + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", shell=True) + x = subprocess.check_output( + self.ziptool + + " -d -c " + + file_name + + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", + shell=True, + ) return x.decode().strip() def get_chrs_from_bam(self, file_name): @@ -711,7 +769,13 @@ def get_chrs_from_bam(self, file_name): Uses samtools to grab the chromosomes from the header that are contained in this bam file. """ - x = subprocess.check_output(self.tools.samtools + " view -H " + file_name + " | grep '^@SQ' | cut -f2| sed s'/SN://'", shell=True) + x = subprocess.check_output( + self.tools.samtools + + " view -H " + + file_name + + " | grep '^@SQ' | cut -f2| sed s'/SN://'", + shell=True, + ) # Chromosomes will be separated by newlines; split into list to return return x.decode().split() @@ -735,14 +799,25 @@ def count_unique_reads(self, file_name, paired_end): if file_name.endswith("bam"): param = "" if paired_end: - r1 = self.samtools_view(file_name, param=param + " -f64", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") - r2 = self.samtools_view(file_name, param=param + " -f128", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + r1 = self.samtools_view( + file_name, + param=param + " -f64", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) + r2 = self.samtools_view( + file_name, + param=param + " -f128", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) else: - r1 = self.samtools_view(file_name, param=param + "", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + r1 = self.samtools_view( + file_name, + param=param + "", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) r2 = 0 return int(r1) + int(r2) - def count_unique_mapped_reads(self, file_name, paired_end): """ For a bam or sam file with paired or or single-end reads, returns the @@ -759,21 +834,32 @@ def count_unique_mapped_reads(self, file_name, paired_end): if ext == ".sam": param = "-S -F4" - elif ext == "bam": + elif ext == ".bam": param = "-F4" else: raise ValueError("Not a SAM or BAM: '{}'".format(file_name)) - if paired_end: - r1 = self.samtools_view(file_name, param=param + " -f64", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") - r2 = self.samtools_view(file_name, param=param + " -f128", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + if paired_end: + r1 = self.samtools_view( + file_name, + param=param + " -f64", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) + r2 = self.samtools_view( + file_name, + param=param + " -f128", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) else: - r1 = self.samtools_view(file_name, param=param + "", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + r1 = self.samtools_view( + file_name, + param=param + "", 
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) r2 = 0 return int(r1) + int(r2) - def count_flag_reads(self, file_name, flag, paired_end): """ Counts the number of reads with the specified flag. @@ -791,7 +877,6 @@ def count_flag_reads(self, file_name, flag, paired_end): param += " -S" return self.samtools_view(file_name, param=param) - def count_multimapping_reads(self, file_name, paired_end): """ Counts the number of reads that mapped to multiple locations. Warning: @@ -807,7 +892,6 @@ def count_multimapping_reads(self, file_name, paired_end): """ return int(self.count_flag_reads(file_name, 256, paired_end)) - def count_uniquelymapping_reads(self, file_name, paired_end): """ Counts the number of reads that mapped to a unique position. @@ -820,7 +904,6 @@ def count_uniquelymapping_reads(self, file_name, paired_end): param += " -S" return self.samtools_view(file_name, param=param) - def count_fail_reads(self, file_name, paired_end): """ Counts the number of reads that failed platform/vendor quality checks. @@ -831,7 +914,6 @@ def count_fail_reads(self, file_name, paired_end): """ return int(self.count_flag_reads(file_name, 512, paired_end)) - def samtools_view(self, file_name, param, postpend=""): """ Run samtools view, with flexible parameters and post-processing. @@ -843,13 +925,11 @@ def samtools_view(self, file_name, param, postpend=""): :param str postpend: String to append to the samtools command; useful to add cut, sort, wc operations to the samtools view output. """ - cmd = "{} view {} {} {}".format( - self.tools.samtools, param, file_name, postpend) + cmd = "{} view {} {} {}".format(self.tools.samtools, param, file_name, postpend) # in python 3, check_output returns a byte string which causes issues. # with python 3.6 we could use argument: "encoding='UTF-8'"" return subprocess.check_output(cmd, shell=True).decode().strip() - def count_reads(self, file_name, paired_end): """ Count reads in a file. @@ -874,13 +954,14 @@ def count_reads(self, file_name, paired_end): param_text = "-c" if ext == ".bam" else "-c -S" return self.samtools_view(file_name, param=param_text) else: - num_lines = self.count_lines_zip(file_name) \ - if is_gzipped_fastq(file_name) \ - else self.count_lines(file_name) + num_lines = ( + self.count_lines_zip(file_name) + if is_gzipped_fastq(file_name) + else self.count_lines(file_name) + ) divisor = 2 if paired_end else 4 return int(num_lines) / divisor - def count_concordant(self, aligned_bam): """ Count only reads that "aligned concordantly exactly 1 time." @@ -889,9 +970,8 @@ def count_concordant(self, aligned_bam): """ cmd = self.tools.samtools + " view " + aligned_bam + " | " cmd += "grep 'YT:Z:CP'" + " | uniq -u | wc -l | sed -E 's/^[[:space:]]+//'" - - return subprocess.check_output(cmd, shell=True).decode().strip() + return subprocess.check_output(cmd, shell=True).decode().strip() def count_mapped_reads(self, file_name, paired_end): """ @@ -912,35 +992,84 @@ def count_mapped_reads(self, file_name, paired_end): return self.samtools_view(file_name, param="-c -F4 -S") return -1 - def sam_conversions(self, sam_file, depth=True): """ Convert sam files to bam files, then sort and index them for later use. 
:param bool depth: also calculate coverage over each position """ - cmd = self.tools.samtools + " view -bS " + sam_file + " > " + sam_file.replace(".sam", ".bam") + "\n" - cmd += self.tools.samtools + " sort " + sam_file.replace(".sam", ".bam") + " -o " + sam_file.replace(".sam", "_sorted.bam") + "\n" - cmd += self.tools.samtools + " index " + sam_file.replace(".sam", "_sorted.bam") + "\n" + cmd = ( + self.tools.samtools + + " view -bS " + + sam_file + + " > " + + sam_file.replace(".sam", ".bam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " sort " + + sam_file.replace(".sam", ".bam") + + " -o " + + sam_file.replace(".sam", "_sorted.bam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " index " + + sam_file.replace(".sam", "_sorted.bam") + + "\n" + ) if depth: - cmd += self.tools.samtools + " depth " + sam_file.replace(".sam", "_sorted.bam") + " > " + sam_file.replace(".sam", "_sorted.depth") + "\n" + cmd += ( + self.tools.samtools + + " depth " + + sam_file.replace(".sam", "_sorted.bam") + + " > " + + sam_file.replace(".sam", "_sorted.depth") + + "\n" + ) return cmd - def bam_conversions(self, bam_file, depth=True): """ Sort and index bam files for later use. :param bool depth: also calculate coverage over each position """ - cmd = self.tools.samtools + " view -h " + bam_file + " > " + bam_file.replace(".bam", ".sam") + "\n" - cmd += self.tools.samtools + " sort " + bam_file + " -o " + bam_file.replace(".bam", "_sorted.bam") + "\n" - cmd += self.tools.samtools + " index " + bam_file.replace(".bam", "_sorted.bam") + "\n" + cmd = ( + self.tools.samtools + + " view -h " + + bam_file + + " > " + + bam_file.replace(".bam", ".sam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " sort " + + bam_file + + " -o " + + bam_file.replace(".bam", "_sorted.bam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " index " + + bam_file.replace(".bam", "_sorted.bam") + + "\n" + ) if depth: - cmd += self.tools.samtools + " depth " + bam_file.replace(".bam", "_sorted.bam") + " > " + bam_file.replace(".bam", "_sorted.depth") + "\n" + cmd += ( + self.tools.samtools + + " depth " + + bam_file.replace(".bam", "_sorted.bam") + + " > " + + bam_file.replace(".bam", "_sorted.depth") + + "\n" + ) return cmd - def fastqc(self, file, output_dir): """ Create command to run fastqc on a FASTQ file @@ -959,9 +1088,9 @@ def fastqc(self, file, output_dir): if not os.path.isabs(output_dir) and pm is not None: output_dir = os.path.join(pm.outfolder, output_dir) self.make_sure_path_exists(output_dir) - return "{} --noextract --outdir {} {}".\ - format(self.tools.fastqc, output_dir, file) - + return "{} --noextract --outdir {} {}".format( + self.tools.fastqc, output_dir, file + ) def fastqc_rename(self, input_bam, output_dir, sample_name): """ @@ -984,20 +1113,29 @@ def fastqc_rename(self, input_bam, output_dir, sample_name): cmd1 = self.fastqc(input_bam, output_dir) cmds.append(cmd1) cmd2 = "if [[ ! 
-s {1}_fastqc.html ]]; then mv {0}_fastqc.html {1}_fastqc.html; mv {0}_fastqc.zip {1}_fastqc.zip; fi".format( - os.path.join(output_dir, initial), os.path.join(output_dir, sample_name)) + os.path.join(output_dir, initial), os.path.join(output_dir, sample_name) + ) cmds.append(cmd2) return cmds - def samtools_index(self, bam_file): """Index a bam file.""" cmd = self.tools.samtools + " index {0}".format(bam_file) return cmd - def slurm_header( - self, job_name, output, queue="shortq", n_tasks=1, time="10:00:00", - cpus_per_task=8, mem_per_cpu=2000, nodes=1, user_mail="", mail_type="end"): + self, + job_name, + output, + queue="shortq", + n_tasks=1, + time="10:00:00", + cpus_per_task=8, + mem_per_cpu=2000, + nodes=1, + user_mail="", + mail_type="end", + ): cmd = """ #!/bin/bash #SBATCH --partition={0} #SBATCH --ntasks={1} @@ -1018,51 +1156,65 @@ def slurm_header( date """.format( - queue, n_tasks, time, cpus_per_task, mem_per_cpu, - nodes, job_name, output, mail_type, user_mail) + queue, + n_tasks, + time, + cpus_per_task, + mem_per_cpu, + nodes, + job_name, + output, + mail_type, + user_mail, + ) return cmd - def slurm_footer(self): return " date" - def slurm_submit_job(self, job_file): return os.system("sbatch %s" % job_file) - def remove_file(self, file_name): return "rm {0}".format(file_name) - def move_file(self, old, new): return "mv {0} {1}".format(old, new) - def preseq_curve(self, bam_file, output_prefix): return """ preseq c_curve -B -P -o {0}.yield.txt {1} - """.format(output_prefix, bam_file) - + """.format( + output_prefix, bam_file + ) def preseq_extrapolate(self, bam_file, output_prefix): return """ preseq lc_extrap -v -B -P -e 1e+9 -o {0}.future_yield.txt {1} - """.format(output_prefix, bam_file) - + """.format( + output_prefix, bam_file + ) def preseq_coverage(self, bam_file, output_prefix): return """ preseq gc_extrap -o {0}.future_coverage.txt {1} - """.format(output_prefix, bam_file) - + """.format( + output_prefix, bam_file + ) def trimmomatic( - self, input_fastq1, output_fastq1, cpus, adapters, log, - input_fastq2=None, output_fastq1_unpaired=None, - output_fastq2=None, output_fastq2_unpaired=None): - + self, + input_fastq1, + output_fastq1, + cpus, + adapters, + log, + input_fastq2=None, + output_fastq1_unpaired=None, + output_fastq2=None, + output_fastq2_unpaired=None, + ): PE = False if input_fastq2 is None else True pe = "PE" if PE else "SE" cmd = self.tools.java + " -Xmx" + self.pm.javamem @@ -1072,17 +1224,26 @@ def trimmomatic( cmd += " {0}".format(input_fastq2) cmd += " {0}".format(output_fastq1) if PE: - cmd += " {0} {1} {2}".format(output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired) + cmd += " {0} {1} {2}".format( + output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired + ) cmd += " ILLUMINACLIP:{0}:1:40:15:8:true".format(adapters) cmd += " LEADING:3 TRAILING:3" cmd += " SLIDINGWINDOW:4:10" cmd += " MINLEN:36" return cmd - def skewer( - self, input_fastq1, output_prefix, output_fastq1, - log, cpus, adapters, input_fastq2=None, output_fastq2=None): + self, + input_fastq1, + output_prefix, + output_fastq1, + log, + cpus, + adapters, + input_fastq2=None, + output_fastq2=None, + ): """ Create commands with which to run skewer. 
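The SLURM helpers in the hunk above are thin wrappers for writing and submitting a job script by hand: `slurm_header` returns the `#SBATCH` preamble, `slurm_footer` a trailing `date`, and `slurm_submit_job` shells out to `sbatch`. A sketch with illustrative resource values and paths:

```python
job_file = "jobs/sample_align.sh"
with open(job_file, "w") as fh:
    fh.write(
        ngstk.slurm_header(
            job_name="sample_align",
            output="jobs/sample_align.log",
            queue="shortq",
            cpus_per_task=8,
            mem_per_cpu=2000,
        )
    )
    fh.write("bowtie2 ...\n")       # the actual work goes here
    fh.write(ngstk.slurm_footer())

ngstk.slurm_submit_job(job_file)    # runs "sbatch jobs/sample_align.sh"
```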
@@ -1117,17 +1278,33 @@ def skewer( cmd2 = "mv {0} {1}".format(output_prefix + "-trimmed.fastq", output_fastq1) cmds.append(cmd2) else: - cmd2 = "mv {0} {1}".format(output_prefix + "-trimmed-pair1.fastq", output_fastq1) + cmd2 = "mv {0} {1}".format( + output_prefix + "-trimmed-pair1.fastq", output_fastq1 + ) cmds.append(cmd2) - cmd3 = "mv {0} {1}".format(output_prefix + "-trimmed-pair2.fastq", output_fastq2) + cmd3 = "mv {0} {1}".format( + output_prefix + "-trimmed-pair2.fastq", output_fastq2 + ) cmds.append(cmd3) cmd4 = "mv {0} {1}".format(output_prefix + "-trimmed.log", log) cmds.append(cmd4) return cmds - def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_insert, cpus, input_fastq2=None): + def bowtie2_map( + self, + input_fastq1, + output_bam, + log, + metrics, + genome_index, + max_insert, + cpus, + input_fastq2=None, + ): # Admits 2000bp-long fragments (--maxins option) - cmd = self.tools.bowtie2 + " --very-sensitive --no-discordant -p {0}".format(cpus) + cmd = self.tools.bowtie2 + " --very-sensitive --no-discordant -p {0}".format( + cpus + ) cmd += " -x {0}".format(genome_index) cmd += " --met-file {0}".format(metrics) if input_fastq2 is None: @@ -1136,15 +1313,24 @@ def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_ cmd += " --maxins {0}".format(max_insert) cmd += " -1 {0}".format(input_fastq1) cmd += " -2 {0}".format(input_fastq2) - cmd += " 2> {0} | samtools view -S -b - | samtools sort -o {1} -".format(log, output_bam) + cmd += " 2> {0} | samtools view -S -b - | samtools sort -o {1} -".format( + log, output_bam + ) return cmd def topHat_map(self, input_fastq, output_dir, genome, transcriptome, cpus): # TODO: # Allow paired input - cmd = self.tools.tophat + " --GTF {0} --b2-L 15 --library-type fr-unstranded --mate-inner-dist 120".format(transcriptome) + cmd = ( + self.tools.tophat + + " --GTF {0} --b2-L 15 --library-type fr-unstranded --mate-inner-dist 120".format( + transcriptome + ) + ) cmd += " --max-multihits 100 --no-coverage-search" - cmd += " --num-threads {0} --output-dir {1} {2} {3}".format(cpus, output_dir, genome, input_fastq) + cmd += " --num-threads {0} --output-dir {1} {2} {3}".format( + cpus, output_dir, genome, input_fastq + ) return cmd def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir="."): @@ -1164,33 +1350,50 @@ def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir=" return [cmd1, cmd2, cmd3] def sambamba_remove_duplicates(self, input_bam, output_bam, cpus=16): - cmd = self.tools.sambamba + " markdup -t {0} -r {1} {2}".format(cpus, input_bam, output_bam) + cmd = self.tools.sambamba + " markdup -t {0} -r {1} {2}".format( + cpus, input_bam, output_bam + ) return cmd def get_mitochondrial_reads(self, bam_file, output, cpus=4): - """ - """ + """ """ tmp_bam = bam_file + "tmp_rmMe" cmd1 = self.tools.sambamba + " index -t {0} {1}".format(cpus, bam_file) - cmd2 = self.tools.sambamba + " slice {0} chrM | {1} markdup -t 4 /dev/stdin {2} 2> {3}".format(bam_file, self.tools.sambamba, tmp_bam, output) + cmd2 = ( + self.tools.sambamba + + " slice {0} chrM | {1} markdup -t 4 /dev/stdin {2} 2> {3}".format( + bam_file, self.tools.sambamba, tmp_bam, output + ) + ) cmd3 = "rm {}".format(tmp_bam) return [cmd1, cmd2, cmd3] - def filter_reads(self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30): + def filter_reads( + self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30 + ): """ Remove duplicates, filter for >Q, remove multiple 
mapping reads. For paired-end reads, keep only proper pairs. """ nodups = re.sub("\.bam$", "", output_bam) + ".nodups.nofilter.bam" - cmd1 = self.tools.sambamba + " markdup -t {0} -r --compression-level=0 {1} {2} 2> {3}".format(cpus, input_bam, nodups, metrics_file) - cmd2 = self.tools.sambamba + ' view -t {0} -f bam --valid'.format(cpus) + cmd1 = ( + self.tools.sambamba + + " markdup -t {0} -r --compression-level=0 {1} {2} 2> {3}".format( + cpus, input_bam, nodups, metrics_file + ) + ) + cmd2 = self.tools.sambamba + " view -t {0} -f bam --valid".format(cpus) if paired: cmd2 += ' -F "not (unmapped or mate_is_unmapped) and proper_pair' else: cmd2 += ' -F "not unmapped' - cmd2 += ' and not (secondary_alignment or supplementary) and mapping_quality >= {0}"'.format(Q) - cmd2 += ' {0} |'.format(nodups) - cmd2 += self.tools.sambamba + " sort -t {0} /dev/stdin -o {1}".format(cpus, output_bam) + cmd2 += ' and not (secondary_alignment or supplementary) and mapping_quality >= {0}"'.format( + Q + ) + cmd2 += " {0} |".format(nodups) + cmd2 += self.tools.sambamba + " sort -t {0} /dev/stdin -o {1}".format( + cpus, output_bam + ) cmd3 = "if [[ -s {0} ]]; then rm {0}; fi".format(nodups) cmd4 = "if [[ -s {0} ]]; then rm {0}; fi".format(nodups + ".bai") return [cmd1, cmd2, cmd3, cmd4] @@ -1203,7 +1406,6 @@ def shift_reads(self, input_bam, genome, output_bam): cmd += " " + self.tools.samtools + " sort -o {0} -".format(output_bam) return cmd - def sort_index_bam(self, input_bam, output_bam): tmp_bam = re.sub("\.bam", ".sorted", input_bam) cmd1 = self.tools.samtools + " sort {0} {1}".format(input_bam, tmp_bam) @@ -1211,12 +1413,10 @@ def sort_index_bam(self, input_bam, output_bam): cmd3 = self.tools.samtools + " index {0}".format(output_bam) return [cmd1, cmd2, cmd3] - def index_bam(self, input_bam): cmd = self.tools.samtools + " index {0}".format(input_bam) return cmd - def run_spp(self, input_bam, output, plot, cpus): """ Run the SPP read peak analysis tool. 
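Several builders above (`picard_mark_duplicates`, `get_mitochondrial_reads`, `sort_index_bam`, and `filter_reads` below) return a *list* of shell commands rather than a single string; `pm.run()` accepts such a list and executes the commands in order against one target. A sketch for the sambamba-based `filter_reads`, with illustrative paths:

```python
cmds = ngstk.filter_reads(
    "aligned/sample.bam",
    "aligned/sample.filtered.bam",
    "aligned/sample.dedup_metrics.txt",
    paired=True,
    cpus=8,
    Q=30,
)
pm.run(cmds, target="aligned/sample.filtered.bam")
```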
@@ -1229,38 +1429,40 @@ def run_spp(self, input_bam, output, plot, cpus): """ base = "{} {} -rf -savp".format(self.tools.Rscript, self.tools.spp) cmd = base + " -savp={} -s=0:5:500 -c={} -out={} -p={}".format( - plot, input_bam, output, cpus) + plot, input_bam, output, cpus + ) return cmd - def get_fragment_sizes(self, bam_file): try: - import pysam import numpy as np + import pysam except: return frag_sizes = list() - bam = pysam.Samfile(bam_file, 'rb') + bam = pysam.Samfile(bam_file, "rb") for read in bam: if bam.getrname(read.tid) != "chrM" and read.tlen < 1500: frag_sizes.append(read.tlen) bam.close() return np.array(frag_sizes) - - def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smallest_insert=30): + def plot_atacseq_insert_sizes( + self, bam, plot, output_csv, max_insert=1500, smallest_insert=30 + ): """ Heavy inspiration from here: https://github.com/dbrg77/ATAC/blob/master/ATAC_seq_read_length_curve_fitting.ipynb """ try: - import pysam - import numpy as np + import matplotlib import matplotlib.mlab as mlab - from scipy.optimize import curve_fit + import numpy as np + import pysam from scipy.integrate import simps - import matplotlib - matplotlib.use('Agg') + from scipy.optimize import curve_fit + + matplotlib.use("Agg") import matplotlib.pyplot as plt except: print("Necessary Python modules couldn't be loaded.") @@ -1268,6 +1470,7 @@ def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smal try: import seaborn as sns + sns.set_style("whitegrid") except: pass @@ -1275,7 +1478,7 @@ def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smal def get_fragment_sizes(bam, max_insert=1500): frag_sizes = list() - bam = pysam.Samfile(bam, 'rb') + bam = pysam.Samfile(bam, "rb") for i, read in enumerate(bam): if read.tlen < max_insert: @@ -1293,11 +1496,13 @@ def mixture_function(x, *p): nfr = expo(x, 2.9e-02, 2.8e-02) nfr[:smallest_insert] = 0 - return (mlab.normpdf(x, m1, s1) * w1 + - mlab.normpdf(x, m2, s2) * w2 + - mlab.normpdf(x, m3, s3) * w3 + - mlab.normpdf(x, m4, s4) * w4 + - nfr) + return ( + mlab.normpdf(x, m1, s1) * w1 + + mlab.normpdf(x, m2, s2) * w2 + + mlab.normpdf(x, m3, s3) * w3 + + mlab.normpdf(x, m4, s4) * w4 + + nfr + ) def expo(x, q, r): """ @@ -1316,17 +1521,30 @@ def expo(x, q, r): # Parameters are empirical, need to check paramGuess = [ - 200, 50, 0.7, # gaussians - 400, 50, 0.15, - 600, 50, 0.1, - 800, 55, 0.045, - 2.9e-02, 2.8e-02 # exponential + 200, + 50, + 0.7, # gaussians + 400, + 50, + 0.15, + 600, + 50, + 0.1, + 800, + 55, + 0.045, + 2.9e-02, + 2.8e-02, # exponential ] try: popt3, pcov3 = curve_fit( - mixture_function, x[smallest_insert:], y[smallest_insert:], - p0=paramGuess, maxfev=100000) + mixture_function, + x[smallest_insert:], + y[smallest_insert:], + p0=paramGuess, + maxfev=100000, + ) except: print("Nucleosomal fit could not be found.") return @@ -1340,19 +1558,19 @@ def expo(x, q, r): plt.hist(frag_sizes, numBins, histtype="step", ec="k", normed=1, alpha=0.5) # Plot nucleosomal fits - plt.plot(x, mlab.normpdf(x, m1, s1) * w1, 'r-', lw=1.5, label="1st nucleosome") - plt.plot(x, mlab.normpdf(x, m2, s2) * w2, 'g-', lw=1.5, label="2nd nucleosome") - plt.plot(x, mlab.normpdf(x, m3, s3) * w3, 'b-', lw=1.5, label="3rd nucleosome") - plt.plot(x, mlab.normpdf(x, m4, s4) * w4, 'c-', lw=1.5, label="4th nucleosome") + plt.plot(x, mlab.normpdf(x, m1, s1) * w1, "r-", lw=1.5, label="1st nucleosome") + plt.plot(x, mlab.normpdf(x, m2, s2) * w2, "g-", lw=1.5, label="2nd nucleosome") + plt.plot(x, 
mlab.normpdf(x, m3, s3) * w3, "b-", lw=1.5, label="3rd nucleosome") + plt.plot(x, mlab.normpdf(x, m4, s4) * w4, "c-", lw=1.5, label="4th nucleosome") # Plot nucleosome-free fit nfr = expo(x, 2.9e-02, 2.8e-02) nfr[:smallest_insert] = 0 - plt.plot(x, nfr, 'k-', lw=1.5, label="nucleosome-free") + plt.plot(x, nfr, "k-", lw=1.5, label="nucleosome-free") # Plot sum of fits ys = mixture_function(x, *popt3) - plt.plot(x, ys, 'k--', lw=3.5, label="fit sum") + plt.plot(x, ys, "k--", lw=3.5, label="fit sum") plt.legend() plt.xlabel("Fragment size (bp)") @@ -1363,10 +1581,26 @@ def expo(x, q, r): areas = [ ["fraction", "area under curve", "max density"], ["Nucleosome-free fragments", simps(nfr), max(nfr)], - ["1st nucleosome", simps(mlab.normpdf(x, m1, s1) * w1), max(mlab.normpdf(x, m1, s1) * w1)], - ["2nd nucleosome", simps(mlab.normpdf(x, m2, s2) * w1), max(mlab.normpdf(x, m2, s2) * w2)], - ["3rd nucleosome", simps(mlab.normpdf(x, m3, s3) * w1), max(mlab.normpdf(x, m3, s3) * w3)], - ["4th nucleosome", simps(mlab.normpdf(x, m4, s4) * w1), max(mlab.normpdf(x, m4, s4) * w4)] + [ + "1st nucleosome", + simps(mlab.normpdf(x, m1, s1) * w1), + max(mlab.normpdf(x, m1, s1) * w1), + ], + [ + "2nd nucleosome", + simps(mlab.normpdf(x, m2, s2) * w1), + max(mlab.normpdf(x, m2, s2) * w2), + ], + [ + "3rd nucleosome", + simps(mlab.normpdf(x, m3, s3) * w1), + max(mlab.normpdf(x, m3, s3) * w3), + ], + [ + "4th nucleosome", + simps(mlab.normpdf(x, m4, s4) * w1), + max(mlab.normpdf(x, m4, s4) * w4), + ], ] try: @@ -1380,8 +1614,15 @@ def expo(x, q, r): # TODO: parameterize in terms of normalization factor. def bam_to_bigwig( - self, input_bam, output_bigwig, genome_sizes, genome, - tagmented=False, normalize=False, norm_factor=1000): + self, + input_bam, + output_bigwig, + genome_sizes, + genome, + tagmented=False, + normalize=False, + norm_factor=1000, + ): """ Convert a BAM file to a bigWig file. 
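A portability note on the hunk above: `matplotlib.mlab.normpdf` was removed in matplotlib 3.1, so on a current install the fitting and plotting code in `plot_atacseq_insert_sizes` fails with an `AttributeError` once it reaches those calls. A drop-in replacement is a one-liner (this substitute is an editorial suggestion, not part of the diff):

```python
import numpy as np

def normpdf(x, mu, sigma):
    """Gaussian density; equivalent to the removed matplotlib.mlab.normpdf."""
    x = np.asarray(x, dtype=float)
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2.0 * np.pi))
```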
@@ -1401,34 +1642,63 @@ def bam_to_bigwig( transient_file = os.path.abspath(re.sub("\.bigWig", "", output_bigwig)) cmd1 = self.tools.bedtools + " bamtobed -i {0} |".format(input_bam) if not tagmented: - cmd1 += " " + self.tools.bedtools + " slop -i stdin -g {0} -s -l 0 -r 130 |".format(genome_sizes) + cmd1 += ( + " " + + self.tools.bedtools + + " slop -i stdin -g {0} -s -l 0 -r 130 |".format(genome_sizes) + ) cmd1 += " fix_bedfile_genome_boundaries.py {0} |".format(genome) - cmd1 += " " + self.tools.genomeCoverageBed + " {0}-bg -g {1} -i stdin > {2}.cov".format( - "-5 " if tagmented else "", - genome_sizes, - transient_file + cmd1 += ( + " " + + self.tools.genomeCoverageBed + + " {0}-bg -g {1} -i stdin > {2}.cov".format( + "-5 " if tagmented else "", genome_sizes, transient_file + ) ) cmds.append(cmd1) if normalize: - cmds.append("""awk 'NR==FNR{{sum+= $4; next}}{{ $4 = ($4 / sum) * {1}; print}}' {0}.cov {0}.cov | sort -k1,1 -k2,2n > {0}.normalized.cov""".format(transient_file, norm_factor)) - cmds.append(self.tools.bedGraphToBigWig + " {0}{1}.cov {2} {3}".format(transient_file, ".normalized" if normalize else "", genome_sizes, output_bigwig)) + cmds.append( + """awk 'NR==FNR{{sum+= $4; next}}{{ $4 = ($4 / sum) * {1}; print}}' {0}.cov {0}.cov | sort -k1,1 -k2,2n > {0}.normalized.cov""".format( + transient_file, norm_factor + ) + ) + cmds.append( + self.tools.bedGraphToBigWig + + " {0}{1}.cov {2} {3}".format( + transient_file, + ".normalized" if normalize else "", + genome_sizes, + output_bigwig, + ) + ) # remove tmp files cmds.append("if [[ -s {0}.cov ]]; then rm {0}.cov; fi".format(transient_file)) if normalize: - cmds.append("if [[ -s {0}.normalized.cov ]]; then rm {0}.normalized.cov; fi".format(transient_file)) + cmds.append( + "if [[ -s {0}.normalized.cov ]]; then rm {0}.normalized.cov; fi".format( + transient_file + ) + ) cmds.append("chmod 755 {0}".format(output_bigwig)) return cmds - - def add_track_to_hub(self, sample_name, track_url, track_hub, colour, five_prime=""): - cmd1 = """echo "track type=bigWig name='{0} {1}' description='{0} {1}'""".format(sample_name, five_prime) - cmd1 += """ height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> {2}""".format(track_url, colour, track_hub) + def add_track_to_hub( + self, sample_name, track_url, track_hub, colour, five_prime="" + ): + cmd1 = ( + """echo "track type=bigWig name='{0} {1}' description='{0} {1}'""".format( + sample_name, five_prime + ) + ) + cmd1 += """ height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> {2}""".format( + track_url, colour, track_hub + ) cmd2 = "chmod 755 {0}".format(track_hub) return [cmd1, cmd2] - def link_to_track_hub(self, track_hub_url, file_name, genome): import textwrap + db = "org" if genome == "hg19" else "db" # different database call for human genome = "human" if genome == "hg19" else genome # change hg19 to human html = """ @@ -1438,35 +1708,56 @@ def link_to_track_hub(self, track_hub_url, file_name, genome): html += """{db}={genome}&hgt.customText={track_hub_url}" /> - """.format(track_hub_url=track_hub_url, genome=genome, db=db) - with open(file_name, 'w') as handle: + """.format( + track_hub_url=track_hub_url, genome=genome, db=db + ) + with open(file_name, "w") as handle: handle.write(textwrap.dedent(html)) - def htseq_count(self, input_bam, gtf, output): sam = input_bam.replace("bam", "sam") cmd1 = "samtools view {0} > {1}".format(input_bam, sam) - cmd2 = "htseq-count -f sam -t exon -i transcript_id -m union {0} {1} > {2}".format(sam, gtf, 
output) + cmd2 = ( + "htseq-count -f sam -t exon -i transcript_id -m union {0} {1} > {2}".format( + sam, gtf, output + ) + ) cmd3 = "rm {0}".format(sam) return [cmd1, cmd2, cmd3] - - def kallisto(self, input_fastq, output_dir, output_bam, transcriptome_index, cpus, input_fastq2=None, size=180, b=200): - cmd1 = self.tools.kallisto + " quant --bias --pseudobam -b {0} -l {1} -i {2} -o {3} -t {4}".format(b, size, transcriptome_index, output_dir, cpus) + def kallisto( + self, + input_fastq, + output_dir, + output_bam, + transcriptome_index, + cpus, + input_fastq2=None, + size=180, + b=200, + ): + cmd1 = ( + self.tools.kallisto + + " quant --bias --pseudobam -b {0} -l {1} -i {2} -o {3} -t {4}".format( + b, size, transcriptome_index, output_dir, cpus + ) + ) if input_fastq2 is None: cmd1 += " --single {0}".format(input_fastq) else: cmd1 += " {0} {1}".format(input_fastq, input_fastq2) cmd1 += " | " + self.tools.samtools + " view -Sb - > {0}".format(output_bam) - cmd2 = self.tools.kallisto + " h5dump -o {0} {0}/abundance.h5".format(output_dir) + cmd2 = self.tools.kallisto + " h5dump -o {0} {0}/abundance.h5".format( + output_dir + ) return [cmd1, cmd2] - def genome_wide_coverage(self, input_bam, genome_windows, output): - cmd = self.tools.bedtools + " coverage -counts -abam {0} -b {1} > {2}".format(input_bam, genome_windows, output) + cmd = self.tools.bedtools + " coverage -counts -abam {0} -b {1} > {2}".format( + input_bam, genome_windows, output + ) return cmd - def calc_frip(self, input_bam, input_bed, threads=4): """ Calculate fraction of reads in peaks. @@ -1483,14 +1774,12 @@ def calc_frip(self, input_bam, input_bed, threads=4): cmd = self.simple_frip(input_bam, input_bed, threads) return subprocess.check_output(cmd.split(" "), shell=True).decode().strip() - def simple_frip(self, input_bam, input_bed, threads=4): cmd = "{} view".format(self.tools.samtools) cmd += " -@ {} -c -L {}".format(threads, input_bed) cmd += " " + input_bam return cmd - def calculate_frip(self, input_bam, input_bed, output, cpus=4): cmd = self.tools.sambamba + " depth region -t {0}".format(cpus) cmd += " -L {0}".format(input_bed) @@ -1498,11 +1787,19 @@ def calculate_frip(self, input_bam, input_bed, output, cpus=4): cmd += " | awk '{{sum+=$5}} END {{print sum}}' > {0}".format(output) return cmd - def macs2_call_peaks( - self, treatment_bams, output_dir, sample_name, genome, - control_bams=None, broad=False, paired=False, - pvalue=None, qvalue=None, include_significance=None): + self, + treatment_bams, + output_dir, + sample_name, + genome, + control_bams=None, + broad=False, + paired=False, + pvalue=None, + qvalue=None, + include_significance=None, + ): """ Use MACS2 to call peaks. @@ -1527,7 +1824,13 @@ def macs2_call_peaks( specified but no value is provided for p-value or q-value. :return str: Command to run. 
""" - sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9} + sizes = { + "hg38": 2.7e9, + "hg19": 2.7e9, + "mm10": 1.87e9, + "dr7": 1.412e9, + "mm9": 1.87e9, + } # Whether to specify to MACS2 a value for statistical significance # can be either directly indicated, but if not, it's determined by @@ -1537,10 +1840,14 @@ def macs2_call_peaks( if include_significance is None: include_significance = broad - cmd = self.tools.macs2 + " callpeak -t {0}".format(treatment_bams if type(treatment_bams) is str else " ".join(treatment_bams)) + cmd = self.tools.macs2 + " callpeak -t {0}".format( + treatment_bams if type(treatment_bams) is str else " ".join(treatment_bams) + ) if control_bams is not None: - cmd += " -c {0}".format(control_bams if type(control_bams) is str else " ".join(control_bams)) + cmd += " -c {0}".format( + control_bams if type(control_bams) is str else " ".join(control_bams) + ) if paired: cmd += " -f BAMPE " @@ -1561,26 +1868,46 @@ def macs2_call_peaks( cmd += " --qvalue {}".format(qvalue) else: cmd += " --pvalue {}".format(pvalue or 0.00001) - cmd += " -g {0} -n {1} --outdir {2}".format(sizes[genome], sample_name, output_dir) + cmd += " -g {0} -n {1} --outdir {2}".format( + sizes[genome], sample_name, output_dir + ) return cmd def macs2_call_peaks_atacseq(self, treatment_bam, output_dir, sample_name, genome): - genome_sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9} + genome_sizes = { + "hg38": 2.7e9, + "hg19": 2.7e9, + "mm10": 1.87e9, + "dr7": 1.412e9, + "mm9": 1.87e9, + } cmd = self.tools.macs2 + " callpeak -t {0}".format(treatment_bam) - cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(genome_sizes[genome], sample_name, output_dir) + cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format( + genome_sizes[genome], sample_name, output_dir + ) return cmd def macs2_plot_model(self, r_peak_model_file, sample_name, output_dir): # run macs r script cmd1 = "{} {}".format(self.tools.Rscript, r_peak_model_file) # move output plot to sample dir - cmd2 = "mv {0}/{1}_model.pdf {2}/{1}_model.pdf".format(os.getcwd(), sample_name, output_dir) + cmd2 = "mv {0}/{1}_model.pdf {2}/{1}_model.pdf".format( + os.getcwd(), sample_name, output_dir + ) return [cmd1, cmd2] def spp_call_peaks( - self, treatment_bam, control_bam, treatment_name, control_name, - output_dir, broad, cpus, qvalue=None): + self, + treatment_bam, + control_bam, + treatment_name, + control_name, + output_dir, + broad, + cpus, + qvalue=None, + ): """ Build command for R script to call peaks with SPP. @@ -1595,20 +1922,33 @@ def spp_call_peaks( :return str: Command to run. 
""" broad = "TRUE" if broad else "FALSE" - cmd = self.tools.Rscript + " `which spp_peak_calling.R` {0} {1} {2} {3} {4} {5} {6}".format( - treatment_bam, control_bam, treatment_name, control_name, broad, cpus, output_dir + cmd = ( + self.tools.Rscript + + " `which spp_peak_calling.R` {0} {1} {2} {3} {4} {5} {6}".format( + treatment_bam, + control_bam, + treatment_name, + control_name, + broad, + cpus, + output_dir, + ) ) if qvalue is not None: cmd += " {}".format(qvalue) return cmd def bam_to_bed(self, input_bam, output_bed): - cmd = self.tools.bedtools + " bamtobed -i {0} > {1}".format(input_bam, output_bed) + cmd = self.tools.bedtools + " bamtobed -i {0} > {1}".format( + input_bam, output_bed + ) return cmd def zinba_call_peaks(self, treatment_bed, control_bed, cpus, tagmented=False): fragmentLength = 80 if tagmented else 180 - cmd = self.tools.Rscript + " `which zinba.R` -l {0} -t {1} -c {2}".format(fragmentLength, treatment_bed, control_bed) + cmd = self.tools.Rscript + " `which zinba.R` -l {0} -t {1} -c {2}".format( + fragmentLength, treatment_bed, control_bed + ) return cmd def filter_peaks_mappability(self, peaks, alignability, filtered_peaks): @@ -1616,22 +1956,37 @@ def filter_peaks_mappability(self, peaks, alignability, filtered_peaks): cmd += " -a {0} -b {1} > {2} ".format(peaks, alignability, filtered_peaks) return cmd - def homer_find_motifs(self, peak_file, genome, output_dir, size=150, length="8,10,12,14,16", n_motifs=12): + def homer_find_motifs( + self, + peak_file, + genome, + output_dir, + size=150, + length="8,10,12,14,16", + n_motifs=12, + ): cmd = "findMotifsGenome.pl {0} {1} {2}".format(peak_file, genome, output_dir) cmd += " -mask -size {0} -len {1} -S {2}".format(size, length, n_motifs) return cmd def homer_annotate_pPeaks(self, peak_file, genome, motif_file, output_bed): - cmd = "annotatePeaks.pl {0} {1} -mask -mscore -m {2} |".format(peak_file, genome, motif_file) + cmd = "annotatePeaks.pl {0} {1} -mask -mscore -m {2} |".format( + peak_file, genome, motif_file + ) cmd += "tail -n +2 | cut -f 1,5,22 > {3}".format(output_bed) return cmd - def center_peaks_on_motifs(self, peak_file, genome, window_width, motif_file, output_bed): - - cmd = "annotatePeaks.pl {0} {1} -size {2} -center {3} |".format(peak_file, genome, window_width, motif_file) + def center_peaks_on_motifs( + self, peak_file, genome, window_width, motif_file, output_bed + ): + cmd = "annotatePeaks.pl {0} {1} -size {2} -center {3} |".format( + peak_file, genome, window_width, motif_file + ) cmd += " awk -v OFS='\t' '{print $2, $3, $4, $1, $6, $5}' |" cmd += """ awk -v OFS='\t' -F '\t' '{ gsub("0", "+", $6) ; gsub("1", "-", $6) ; print }' |""" - cmd += " fix_bedfile_genome_boundaries.py {0} | sortBed > {1}".format(genome, output_bed) + cmd += " fix_bedfile_genome_boundaries.py {0} | sortBed > {1}".format( + genome, output_bed + ) return cmd def get_read_type(self, bam_file, n=10): @@ -1641,10 +1996,13 @@ def get_read_type(self, bam_file, n=10): :param int n: Number of lines to read from bam file. 
:return str, int: tuple of read type and read length """ + from collections.abc import Counter + try: - p = subprocess.Popen([self.tools.samtools, 'view', bam_file], - stdout=subprocess.PIPE) + p = subprocess.Popen( + [self.tools.samtools, "view", bam_file], stdout=subprocess.PIPE + ) # Count paired alignments paired = 0 read_length = Counter() @@ -1661,19 +2019,28 @@ def get_read_type(self, bam_file, n=10): # Get most abundant read read_length read_length = sorted(read_length)[-1] # If at least half is paired, return True - if paired > (n / 2.): + if paired > (n / 2.0): return "PE", read_length else: return "SE", read_length - def parse_bowtie_stats(self, stats_file): """ Parses Bowtie2 stats file, returns series with values. :param str stats_file: Bowtie2 output file with alignment statistics. """ import pandas as pd - stats = pd.Series(index=["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"]) + + stats = pd.Series( + index=[ + "readCount", + "unpaired", + "unaligned", + "unique", + "multiple", + "alignmentRate", + ] + ) try: with open(stats_file) as handle: content = handle.readlines() # list of strings per line @@ -1681,27 +2048,46 @@ def parse_bowtie_stats(self, stats_file): return stats # total reads try: - line = [i for i in range(len(content)) if " reads; of these:" in content[i]][0] + line = [ + i for i in range(len(content)) if " reads; of these:" in content[i] + ][0] stats["readCount"] = re.sub("\D.*", "", content[line]) if 7 > len(content) > 2: - line = [i for i in range(len(content)) if "were unpaired; of these:" in content[i]][0] + line = [ + i + for i in range(len(content)) + if "were unpaired; of these:" in content[i] + ][0] stats["unpaired"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) else: - line = [i for i in range(len(content)) if "were paired; of these:" in content[i]][0] - stats["unpaired"] = stats["readCount"] - int(re.sub("\D", "", re.sub("\(.*", "", content[line]))) - line = [i for i in range(len(content)) if "aligned 0 times" in content[i]][0] + line = [ + i + for i in range(len(content)) + if "were paired; of these:" in content[i] + ][0] + stats["unpaired"] = stats["readCount"] - int( + re.sub("\D", "", re.sub("\(.*", "", content[line])) + ) + line = [i for i in range(len(content)) if "aligned 0 times" in content[i]][ + 0 + ] stats["unaligned"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if "aligned exactly 1 time" in content[i]][0] + line = [ + i for i in range(len(content)) if "aligned exactly 1 time" in content[i] + ][0] stats["unique"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if "aligned >1 times" in content[i]][0] + line = [i for i in range(len(content)) if "aligned >1 times" in content[i]][ + 0 + ] stats["multiple"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if "overall alignment rate" in content[i]][0] + line = [ + i for i in range(len(content)) if "overall alignment rate" in content[i] + ][0] stats["alignmentRate"] = re.sub("\%.*", "", content[line]).strip() except IndexError: pass return stats - def parse_duplicate_stats(self, stats_file): """ Parses sambamba markdup output, returns series with values. @@ -1709,6 +2095,7 @@ def parse_duplicate_stats(self, stats_file): :param str stats_file: sambamba output file with duplicate statistics. 
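`get_read_type` peeks at the first `n` alignments to decide single- vs. paired-end and the dominant read length, and `parse_bowtie_stats` turns a bowtie2 log into a pandas Series. One caveat on the hunk above: `Counter` is provided by `collections`, not `collections.abc`, so the added import raises `ImportError` as written; callers on current Python need the plain `collections` import instead. A usage sketch with illustrative paths:

```python
read_type, read_length = ngstk.get_read_type("aligned/sample.bam", n=100)
paired_end = read_type == "PE"

stats = ngstk.parse_bowtie_stats("aligned/sample.aln_stats.txt")
pm.pipestat.report(values={"Alignment_rate": stats["alignmentRate"]})
```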
""" import pandas as pd + series = pd.Series() try: with open(stats_file) as handle: @@ -1716,17 +2103,32 @@ def parse_duplicate_stats(self, stats_file): except: return series try: - line = [i for i in range(len(content)) if "single ends (among them " in content[i]][0] + line = [ + i + for i in range(len(content)) + if "single ends (among them " in content[i] + ][0] series["single-ends"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if " end pairs... done in " in content[i]][0] - series["paired-ends"] = re.sub("\D", "", re.sub("\.\.\..*", "", content[line])) - line = [i for i in range(len(content)) if " duplicates, sorting the list... done in " in content[i]][0] - series["duplicates"] = re.sub("\D", "", re.sub("\.\.\..*", "", content[line])) + line = [ + i + for i in range(len(content)) + if " end pairs... done in " in content[i] + ][0] + series["paired-ends"] = re.sub( + "\D", "", re.sub("\.\.\..*", "", content[line]) + ) + line = [ + i + for i in range(len(content)) + if " duplicates, sorting the list... done in " in content[i] + ][0] + series["duplicates"] = re.sub( + "\D", "", re.sub("\.\.\..*", "", content[line]) + ) except IndexError: pass return series - def parse_qc(self, qc_file): """ Parse phantompeakqualtools (spp) QC table and return quality metrics. @@ -1735,10 +2137,13 @@ def parse_qc(self, qc_file): contains sample quality measurements. """ import pandas as pd + series = pd.Series() try: with open(qc_file) as handle: - line = handle.readlines()[0].strip().split("\t") # list of strings per line + line = ( + handle.readlines()[0].strip().split("\t") + ) # list of strings per line series["NSC"] = line[-3] series["RSC"] = line[-2] series["qualityTag"] = line[-1] @@ -1746,7 +2151,6 @@ def parse_qc(self, qc_file): pass return series - def get_peak_number(self, sample): """ Counts number of peaks from a sample's peak file. @@ -1758,7 +2162,6 @@ def get_peak_number(self, sample): sample["peakNumber"] = re.sub("\D.*", "", out) return sample - def get_frip(self, sample): """ Calculates the fraction of reads in peaks for a given sample. @@ -1766,6 +2169,7 @@ def get_frip(self, sample): :param pipelines.Sample sample: Sample object with "peaks" attribute. """ import pandas as pd + with open(sample.frip, "r") as handle: content = handle.readlines() reads_in_peaks = int(re.sub("\D", "", content[0])) diff --git a/pypiper/pipeline.py b/pypiper/pipeline.py index 69474c22..88c61734 100644 --- a/pypiper/pipeline.py +++ b/pypiper/pipeline.py @@ -4,16 +4,22 @@ import glob import os import sys + from collections.abc import Iterable, Mapping -from .exceptions import \ - IllegalPipelineDefinitionError, IllegalPipelineExecutionError, \ - UnknownPipelineStageError +from .exceptions import ( + IllegalPipelineDefinitionError, + IllegalPipelineExecutionError, + UnknownPipelineStageError, +) from .manager import PipelineManager from .stage import Stage -from .utils import \ - checkpoint_filepath, flag_name, parse_stage_name, translate_stage_name - +from .utils import ( + checkpoint_filepath, + flag_name, + parse_stage_name, + translate_stage_name, +) __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -22,7 +28,6 @@ __all__ = ["Pipeline", "UnknownPipelineStageError"] - class Pipeline(object): """ Generic pipeline framework. @@ -45,35 +50,40 @@ class Pipeline(object): :raise pypiper.IllegalPipelineDefinitionError: Definition of collection of stages must be non-empty. 
""" - + __metaclass__ = abc.ABCMeta - - def __init__(self, name=None, manager=None, outfolder=None, args=None, - **pl_mgr_kwargs): + def __init__( + self, name=None, manager=None, outfolder=None, args=None, **pl_mgr_kwargs + ): super(Pipeline, self).__init__() try: self.name = name or manager.name except AttributeError: raise TypeError( - "If a pipeline manager isn't provided to create " - "{}, a name is required.".format(Pipeline.__name__)) + "If a pipeline manager isn't provided to create " + "{}, a name is required.".format(Pipeline.__name__) + ) else: if not self.name: raise ValueError( "Invalid name, possible inferred from pipeline manager: " - "{} ({})".format(self.name, type(self.name))) + "{} ({})".format(self.name, type(self.name)) + ) # Determine the PipelineManager. if manager: self.manager = manager if outfolder: - print("Ignoring explicit output folder ({}) and using that of " - "pipeline manager ({})".format(outfolder, - manager.outfolder)) + print( + "Ignoring explicit output folder ({}) and using that of " + "pipeline manager ({})".format(outfolder, manager.outfolder) + ) if name and name != manager.name: - print("Warning: name for pipeline ('{}') doesn't match that " - "of the given manager ('{}')".format(name, manager.name)) + print( + "Warning: name for pipeline ('{}') doesn't match that " + "of the given manager ('{}')".format(name, manager.name) + ) elif outfolder: # We're guaranteed by the upfront exception block around # name setting that we'll either have set the name for this @@ -81,10 +91,13 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, # protected from passing a null name argument to the pipeline # manager's constructor. self.manager = PipelineManager( - self.name, outfolder, args=args, **pl_mgr_kwargs) + self.name, outfolder, args=args, **pl_mgr_kwargs + ) else: - raise TypeError("To create a {} instance, 'manager' or 'outfolder' " - "is required".format(self.__class__.__name__)) + raise TypeError( + "To create a {} instance, 'manager' or 'outfolder' " + "is required".format(self.__class__.__name__) + ) # Require that checkpoints be overwritten. self.manager.overwrite_checkpoints = True @@ -94,14 +107,19 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, # stage names are handled, parsed, and translated. self._unordered = _is_unordered(self.stages()) if self._unordered: - print("NOTICE: Unordered definition of stages for " - "pipeline {}".format(self.name)) + print( + "NOTICE: Unordered definition of stages for " + "pipeline {}".format(self.name) + ) # Get to a sequence of pairs of key (possibly in need of translation) # and actual callable. Key is stage name and value is either stage # callable or an already-made stage object. - stages = self.stages().items() \ - if isinstance(self.stages(), Mapping) else self.stages() + stages = ( + self.stages().items() + if isinstance(self.stages(), Mapping) + else self.stages() + ) # Stage spec. parser handles callable validation. name_stage_pairs = [_parse_stage_spec(s) for s in stages] @@ -122,16 +140,18 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, self._stages = [] for name, stage in name_stage_pairs: - # Use external translator to further confound redefinition. internal_name = translate_stage_name(name) # Check that there's not a checkpoint name collision. if internal_name in _internal_to_external: already_mapped = _internal_to_external[internal_name] - errmsg = "Duplicate stage name resolution (stage names are too " \ - "similar.) 
'{}' and '{}' both resolve to '{}'".\ - format(name, already_mapped, internal_name) + errmsg = ( + "Duplicate stage name resolution (stage names are too " + "similar.) '{}' and '{}' both resolve to '{}'".format( + name, already_mapped, internal_name + ) + ) raise IllegalPipelineDefinitionError(errmsg) # Store the stage name translations and the stage itself. @@ -141,7 +161,6 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, self.skipped, self.executed = None, None - @property def outfolder(self): """ @@ -151,7 +170,6 @@ def outfolder(self): """ return self.manager.outfolder - @abc.abstractmethod def stages(self): """ @@ -161,7 +179,6 @@ def stages(self): """ pass - @property def stage_names(self): """ @@ -173,7 +190,6 @@ class author (i.e., not necessarily those that are used for the """ return [parse_stage_name(s) for s in self._stages] - def checkpoint(self, stage, msg=""): """ Touch checkpoint file for given stage and provide timestamp message. @@ -188,8 +204,8 @@ def checkpoint(self, stage, msg=""): # pipeline completes, so fix the 'finished' parameter to the manager's # timestamp method to be True. return self.manager.timestamp( - message=msg, checkpoint=stage.checkpoint_name, finished=True) - + message=msg, checkpoint=stage.checkpoint_name, finished=True + ) def completed_stage(self, stage): """ @@ -203,12 +219,10 @@ def completed_stage(self, stage): check_path = checkpoint_filepath(stage, self.manager) return os.path.exists(check_path) - def halt(self, **kwargs): - """ Halt the pipeline """ + """Halt the pipeline""" self.manager.halt(**kwargs) - def list_flags(self, only_name=False): """ Determine the flag files associated with this pipeline. @@ -223,7 +237,6 @@ def list_flags(self, only_name=False): else: return paths - def run(self, start_point=None, stop_before=None, stop_after=None): """ Run the pipeline, optionally specifying start and/or stop points. @@ -249,7 +262,8 @@ def run(self, start_point=None, stop_before=None, stop_after=None): if stop_before and stop_after: raise IllegalPipelineExecutionError( - "Cannot specify both inclusive and exclusive stops.") + "Cannot specify both inclusive and exclusive stops." + ) if stop_before: stop = stop_before @@ -271,8 +285,10 @@ def run(self, start_point=None, stop_before=None, stop_after=None): # Permit order-agnostic pipelines, but warn. if self._unordered and (start_point or stop_before or stop_after): - print("WARNING: Starting and stopping points are nonsense for " - "pipeline with unordered stages.") + print( + "WARNING: Starting and stopping points are nonsense for " + "pipeline with unordered stages." + ) # TODO: consider context manager based on start/stop points. @@ -283,7 +299,8 @@ def run(self, start_point=None, stop_before=None, stop_after=None): assert stop_index <= len(self._stages) if start_index >= stop_index: raise IllegalPipelineExecutionError( - "Cannot start pipeline at or after stopping point") + "Cannot start pipeline at or after stopping point" + ) # TODO: consider storing just stage name rather than entire stage. # TODO (cont.): the bad case for whole-Stage is if associated data @@ -297,7 +314,6 @@ def run(self, start_point=None, stop_before=None, stop_after=None): skip_mode = True for stage in self._stages[start_index:stop_index]: - # TODO: Note that there's no way to tell whether a non-checkpointed # TODO (cont.) Stage has been completed, and thus this seek # TODO (cont.) 
operation will find the first Stage, starting @@ -330,15 +346,15 @@ def run(self, start_point=None, stop_before=None, stop_after=None): self.halt(raise_error=False) def wrapup(self): - """ Final mock stage to run after final one finishes. """ + """Final mock stage to run after final one finishes.""" self.manager.complete() def _reset(self): - """ Scrub decks with respect to Stage status/label tracking. """ + """Scrub decks with respect to Stage status/label tracking.""" self.skipped, self.executed = [], [] def _start_index(self, start=None): - """ Seek to the first stage to run. """ + """Seek to the first stage to run.""" if start is None: return 0 start_stage = translate_stage_name(start) @@ -374,7 +390,6 @@ def _stop_index(self, stop_point, inclusive): return stop_index + 1 if inclusive else stop_index - def _is_unordered(collection): """ Determine whether a collection appears to be unordered. @@ -390,14 +405,11 @@ def _is_unordered(collection): illogical to investigate whether it's ordered. """ if not isinstance(collection, Iterable): - raise TypeError("Non-iterable alleged collection: {}". - format(type(collection))) - + raise TypeError("Non-iterable alleged collection: {}".format(type(collection))) return isinstance(collection, set) or isinstance(collection, dict) - def _parse_stage_spec(stage_spec): """ Handle alternate Stage specifications, returning name and Stage. @@ -413,9 +425,11 @@ def _parse_stage_spec(stage_spec): """ # The logic used here, a message to a user about how to specify Stage. - req_msg = "Stage specification must be either a {0} itself, a " \ - "(, {0}) pair, or a callable with a __name__ attribute " \ - "(e.g., a non-anonymous function)".format(Stage.__name__) + req_msg = ( + "Stage specification must be either a {0} itself, a " + "(, {0}) pair, or a callable with a __name__ attribute " + "(e.g., a non-anonymous function)".format(Stage.__name__) + ) # Simplest case is stage itself. if isinstance(stage_spec, Stage): diff --git a/pypiper/stage.py b/pypiper/stage.py index 29f83c08..6f1d551a 100644 --- a/pypiper/stage.py +++ b/pypiper/stage.py @@ -11,16 +11,13 @@ __all__ = ["Stage"] - class Stage(object): """ Single stage/phase of a pipeline; a logical processing "unit". A stage is a collection of commands that is checkpointed. """ - - def __init__(self, func, f_args=None, f_kwargs=None, - name=None, checkpoint=True): + def __init__(self, func, f_args=None, f_kwargs=None, name=None, checkpoint=True): """ A function, perhaps with arguments, defines the stage. @@ -39,7 +36,6 @@ def __init__(self, func, f_args=None, f_kwargs=None, self.name = name or func.__name__ self.checkpoint = checkpoint - @property def checkpoint_name(self): """ @@ -50,37 +46,42 @@ def checkpoint_name(self): """ return translate_stage_name(self.name) if self.checkpoint else None - def run(self, *args, **kwargs): - """ Alternate form for direct call; execute stage. """ + """Alternate form for direct call; execute stage.""" self(*args, **kwargs) - def __call__(self, *args, **update_kwargs): - """ Execute the stage, allowing updates to args/kwargs. 
""" + """Execute the stage, allowing updates to args/kwargs.""" kwargs = copy.deepcopy(self.f_kwargs) kwargs.update(update_kwargs) args = args or self.f_args self.f(*args, **kwargs) - def __eq__(self, other): - return isinstance(other, Stage) and \ - self.f.__name__ == other.f.__name__ and \ - ({k: v for k, v in self.__dict__.items() if k != "f"} == - {k: v for k, v in other.__dict__.items() if k != "f"}) - + return ( + isinstance(other, Stage) + and self.f.__name__ == other.f.__name__ + and ( + {k: v for k, v in self.__dict__.items() if k != "f"} + == {k: v for k, v in other.__dict__.items() if k != "f"} + ) + ) def __ne__(self, other): return not (self == other) - def __repr__(self): - return "{klass} '{n}': f={f}, args={pos}, kwargs={kwd}, " \ - "checkpoint={check}".format(klass=self.__class__.__name__, - f=self.f, n=self.name, pos=self.f_args, kwd=self.f_kwargs, - check=self.checkpoint) - + return ( + "{klass} '{n}': f={f}, args={pos}, kwargs={kwd}, " + "checkpoint={check}".format( + klass=self.__class__.__name__, + f=self.f, + n=self.name, + pos=self.f_args, + kwd=self.f_kwargs, + check=self.checkpoint, + ) + ) def __str__(self): return "{}: '{}'".format(self.__class__.__name__, self.name) diff --git a/pypiper/utils.py b/pypiper/utils.py index 43eb789b..2c5ac753 100644 --- a/pypiper/utils.py +++ b/pypiper/utils.py @@ -1,27 +1,29 @@ """ Shared utilities """ -from collections.abc import Iterable, Mapping, Sequence import os -import sys import re -from subprocess import PIPE +import sys + +from collections.abc import Iterable, Mapping, Sequence from shlex import split +from subprocess import PIPE -if sys.version_info < (3, ): +if sys.version_info < (3,): CHECK_TEXT_TYPES = (str, unicode) from inspect import getargspec as get_fun_sig else: - CHECK_TEXT_TYPES = (str, ) + CHECK_TEXT_TYPES = (str,) from inspect import getfullargspec as get_fun_sig from ubiquerg import expandpath, is_command_callable -from .const import \ - CHECKPOINT_EXTENSION, PIPELINE_CHECKPOINT_DELIMITER, \ - STAGE_NAME_SPACE_REPLACEMENT +from .const import ( + CHECKPOINT_EXTENSION, + PIPELINE_CHECKPOINT_DELIMITER, + STAGE_NAME_SPACE_REPLACEMENT, +) from .flags import FLAGS - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -29,15 +31,24 @@ # What to export/attach to pypiper package namespace. # Conceptually, reserve this for functions expected to be used in other # packages, and import from utils within pypiper for other functions. -__all__ = ["add_pypiper_args", "build_command", "check_all_commands", - "determine_uncallable", "get_first_value", "head", "logger_via_cli"] +__all__ = [ + "add_pypiper_args", + "build_command", + "check_all_commands", + "determine_uncallable", + "get_first_value", + "head", + "logger_via_cli", + "result_formatter_markdown", +] CHECKPOINT_SPECIFICATIONS = ["start_point", "stop_before", "stop_after"] -def add_pypiper_args(parser, groups=("pypiper", ), args=None, - required=None, all_args=False): +def add_pypiper_args( + parser, groups=("pypiper",), args=None, required=None, all_args=False +): """ Use this to add standardized pypiper arguments to your python pipeline. 
@@ -57,7 +68,8 @@ def add_pypiper_args(parser, groups=("pypiper", ), args=None, pypiper arguments added """ args_to_add = _determine_args( - argument_groups=groups, arguments=args, use_all_args=all_args) + argument_groups=groups, arguments=args, use_all_args=all_args + ) parser = _add_args(parser, args_to_add, required) return parser @@ -81,8 +93,7 @@ def build_command(chunks): """ if not chunks: - raise ValueError( - "No command parts: {} ({})".format(chunks, type(chunks))) + raise ValueError("No command parts: {} ({})".format(chunks, type(chunks))) if isinstance(chunks, str): return chunks @@ -149,8 +160,7 @@ def checkpoint_filename(checkpoint, pipeline_name=None): except AttributeError: base = translate_stage_name(checkpoint) if pipeline_name: - base = "{}{}{}".format( - pipeline_name, PIPELINE_CHECKPOINT_DELIMITER, base) + base = "{}{}{}".format(pipeline_name, PIPELINE_CHECKPOINT_DELIMITER, base) return base + CHECKPOINT_EXTENSION @@ -178,7 +188,8 @@ def checkpoint_filepath(checkpoint, pm): else: raise ValueError( "Absolute checkpoint path '{}' is not in pipeline output " - "folder '{}'".format(checkpoint, pm.outfolder)) + "folder '{}'".format(checkpoint, pm.outfolder) + ) _, ext = os.path.splitext(checkpoint) if ext == CHECKPOINT_EXTENSION: return pipeline_filepath(pm, filename=checkpoint) @@ -226,9 +237,12 @@ def check_shell_asterisk(cmd): def check_all_commands( - cmds, - get_bad_result=lambda bads: Exception("{} uncallable commands: {}".format(len(bads), bads)), - handle=None): + cmds, + get_bad_result=lambda bads: Exception( + "{} uncallable commands: {}".format(len(bads), bads) + ), + handle=None, +): """ Determine whether all commands are callable @@ -246,10 +260,12 @@ def check_all_commands( if not bads: return True if handle is None: + def handle(res): if isinstance(res, Exception): raise res print("Command check result: {}".format(res)) + elif not hasattr(handle, "__call__") or not 1 == len(get_fun_sig(handle).args): raise TypeError("Command check error handler must be a one-arg function") handle(get_bad_result(bads)) @@ -257,12 +273,17 @@ def handle(res): def determine_uncallable( - commands, transformations=( - (lambda f: isinstance(f, str) and - os.path.isfile(expandpath(f)) and - expandpath(f).endswith(".jar"), - lambda f: "java -jar {}".format(expandpath(f))), - ), accumulate=False): + commands, + transformations=( + ( + lambda f: isinstance(f, str) + and os.path.isfile(expandpath(f)) + and expandpath(f).endswith(".jar"), + lambda f: "java -jar {}".format(expandpath(f)), + ), + ), + accumulate=False, +): """ Determine which commands are not callable. 
@@ -282,23 +303,41 @@ def determine_uncallable( """ commands = [commands] if isinstance(commands, str) else commands if transformations: - trans = transformations.values() if isinstance(transformations, Mapping) else transformations - if not isinstance(transformations, Iterable) or isinstance(transformations, str) or \ - not all(map(lambda func_pair: isinstance(func_pair, tuple) and len(func_pair) == 2, trans)): + trans = ( + transformations.values() + if isinstance(transformations, Mapping) + else transformations + ) + if ( + not isinstance(transformations, Iterable) + or isinstance(transformations, str) + or not all( + map( + lambda func_pair: isinstance(func_pair, tuple) + and len(func_pair) == 2, + trans, + ) + ) + ): raise TypeError( "Transformations argument should be a collection of pairs; got " - "{} ({})".format(transformations, type(transformations).__name__)) + "{} ({})".format(transformations, type(transformations).__name__) + ) if accumulate: + def finalize(cmd): for p, t in transformations: if p(cmd): cmd = t(cmd) return cmd + else: if not isinstance(transformations, (tuple, list)): raise Exception( "If transformations are unordered, non-accumulation of " - "effects may lead to nondeterministic behavior.") + "effects may lead to nondeterministic behavior." + ) + def finalize(cmd): print("Transformations: {}".format(transformations)) for p, t in transformations: @@ -308,14 +347,16 @@ def finalize(cmd): else: finalize = lambda cmd: cmd - return [(orig, used) for orig, used in - map(lambda c: (c, finalize(c)), commands) - if not is_command_callable(used)] + return [ + (orig, used) + for orig, used in map(lambda c: (c, finalize(c)), commands) + if not is_command_callable(used) + ] def split_by_pipes_nonnested(cmd): """ - Split the command by shell pipes, but preserve contents in + Split the command by shell pipes, but preserve contents in parentheses and braces. :param str cmd: Command to investigate. @@ -323,7 +364,7 @@ def split_by_pipes_nonnested(cmd): """ # for posterity, this one will do parens only: re.compile(r'(?:[^|(]|\([^)]*\))+') # r = re.compile(r'(?:[^|({]|[\({][^)}]*[\)}])+') - r = re.compile(r'(?:[^|(]|\([^)]*\)+|\{[^}]*\})') + r = re.compile(r"(?:[^|(]|\([^)]*\)+|\{[^}]*\})") return r.findall(cmd) @@ -332,15 +373,15 @@ def split_by_pipes_nonnested(cmd): def split_by_pipes(cmd): """ - Split the command by shell pipes, but preserve contents in + Split the command by shell pipes, but preserve contents in parentheses and braces. Also handles nested parens and braces. :param str cmd: Command to investigate. :return list: List of sub commands to be linked - """ + """ # Build a simple finite state machine to split on pipes, while - # handling nested braces or parentheses. + # handling nested braces or parentheses. 
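The `split_by_pipes` helper in this hunk is documented as a small finite state machine that splits a command on pipes while respecting nested parentheses and braces. The standalone sketch below restates that technique in isolation so the intent is easy to follow; it is an illustrative approximation, not pypiper's implementation, and the name `split_top_level_pipes` is invented here.

def split_top_level_pipes(cmd):
    """Split a shell command on '|' characters that are not nested inside
    parentheses or braces (illustrative sketch only, not pypiper's code)."""
    parts, buf, depth = [], [], 0
    for ch in cmd:
        # Track nesting depth for both () and {} groupings.
        if ch in "({":
            depth += 1
        elif ch in ")}":
            depth = max(depth - 1, 0)
        if ch == "|" and depth == 0:
            parts.append("".join(buf).strip())
            buf = []
        else:
            buf.append(ch)
    parts.append("".join(buf).strip())
    return parts


# Only the outermost pipe is treated as a splitting point.
print(split_top_level_pipes("cat <(zcat a.gz | head) | wc -l"))
# -> ['cat <(zcat a.gz | head)', 'wc -l']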
stack_brace = [] stack_paren = [] cmdlist = [] @@ -386,10 +427,10 @@ def check_shell_pipes(cmd): def strip_braced_txt(cmd): curly_braces = True while curly_braces: - SRE_match_obj = re.search(r'\{(.*?)}',cmd) + SRE_match_obj = re.search(r"\{(.*?)}", cmd) if not SRE_match_obj is None: - cmd = cmd[:SRE_match_obj.start()] + cmd[(SRE_match_obj.end()+1):] - if re.search(r'\{(.*?)}',cmd) is None: + cmd = cmd[: SRE_match_obj.start()] + cmd[(SRE_match_obj.end() + 1) :] + if re.search(r"\{(.*?)}", cmd) is None: curly_braces = False else: curly_braces = False @@ -460,7 +501,7 @@ def get_proc_name(cmd): if isinstance(cmd, Iterable) and not isinstance(cmd, str): cmd = " ".join(cmd) - return cmd.split()[0].replace('(', '').replace(')', '') + return cmd.split()[0].replace("(", "").replace(")", "") def get_first_value(param, param_pools, on_missing=None, error=True): @@ -506,7 +547,8 @@ def get_first_value(param, param_pools, on_missing=None, error=True): raise TypeError( "Any callable passed as the action to take when a requested " "parameter is missing should accept that parameter and return " - "a value.") + "a value." + ) return on_missing @@ -581,7 +623,7 @@ def is_sam_or_bam(file_name): :param str file_name: Name/path of file to check as SAM-formatted. :return bool: Whether file appears to be SAM-formatted - """ + """ _, ext = os.path.splitext(file_name) return ext in [".bam", ".sam"] @@ -595,7 +637,9 @@ def logger_via_cli(opts, **kwargs): :return logging.Logger: newly created and configured logger """ from copy import deepcopy + import logmuse + kwds = deepcopy(kwargs) # By default, don't require the logging options to have been added to the parser. kwds.setdefault("strict", False) @@ -617,6 +661,7 @@ def make_lock_name(original_path, path_base_folder): :return str: Name or perhaps relative (to the base folder path indicated) path to lock file """ + def make_name(p): if p: return p.replace(path_base_folder, "").replace(os.sep, "__") @@ -628,8 +673,11 @@ def make_name(p): elif isinstance(original_path, Sequence): result = [make_name(p) for p in original_path] return [x for x in result if x] - raise TypeError("Neither string nor other sequence type: {} ({})". - format(original_path, type(original_path))) + raise TypeError( + "Neither string nor other sequence type: {} ({})".format( + original_path, type(original_path) + ) + ) def is_multi_target(target): @@ -645,8 +693,11 @@ def is_multi_target(target): elif isinstance(target, Sequence): return len(target) > 1 else: - raise TypeError("Could not interpret argument as a target: {} ({})". 
- format(target, type(target))) + raise TypeError( + "Could not interpret argument as a target: {} ({})".format( + target, type(target) + ) + ) def parse_cmd(cmd, shell): @@ -657,12 +708,18 @@ def parse_cmd(cmd, shell): :param bool shell: if the command should be run in the shell rather that in a subprocess :return list[dict]: list of dicts of commands """ + def _make_dict(command): - a, s = (command, True) if check_shell(command, shell) else (split(command), False) + a, s = ( + (command, True) if check_shell(command, shell) else (split(command), False) + ) return dict(args=a, stdout=PIPE, shell=s) - return [_make_dict(c) for c in split_by_pipes(cmd)] if not shell and check_shell_pipes(cmd) \ + return ( + [_make_dict(c) for c in split_by_pipes(cmd)] + if not shell and check_shell_pipes(cmd) else [dict(args=cmd, stdout=None, shell=True)] + ) def parse_cores(cores, pm, default): @@ -730,16 +787,16 @@ def pipeline_filepath(pm, filename=None, suffix=None): """ if filename is None and suffix is None: - raise TypeError("Provide filename and/or suffix to create " - "path to a pipeline file.") + raise TypeError( + "Provide filename and/or suffix to create " "path to a pipeline file." + ) filename = (filename or pm.name) + (suffix or "") # Note that Pipeline and PipelineManager define the same outfolder. # In fact, a Pipeline just references its manager's outfolder. # So we can handle argument of either type to pm parameter. - return filename if os.path.isabs(filename) \ - else os.path.join(pm.outfolder, filename) + return filename if os.path.isabs(filename) else os.path.join(pm.outfolder, filename) def translate_stage_name(stage): @@ -801,8 +858,8 @@ def _determine_args(argument_groups, arguments, use_all_args=False): from collections.abc import Iterable - from logmuse import LOGGING_CLI_OPTDATA + # Define the argument groups. args_by_group = { "pypiper": ["recover", "new-start", "dirty", "force-follow", "testmode"] @@ -810,10 +867,17 @@ def _determine_args(argument_groups, arguments, use_all_args=False): "config": ["config"], "checkpoint": ["stop-before", "stop-after"], "resource": ["mem", "cores"], - "looper": ["config", "output-parent", "mem", "cores"], + "looper": ["config", "output-parent", "mem", "cores", "pipeline-name"], "common": ["input", "sample-name"], "ngs": ["sample-name", "input", "input2", "genome", "single-or-paired"], - "logmuse": [*LOGGING_CLI_OPTDATA] + "logmuse": [*LOGGING_CLI_OPTDATA], + "pipestat": [ + "pipestat-namespace", + "pipestat-record-id", + "pipestat-schema", + "pipestat-results-file", + "pipestat-config", + ], } # Handle various types of group specifications. @@ -861,6 +925,20 @@ def default_pipeline_config(pipeline_filepath): return os.path.splitext(os.path.basename(pipeline_filepath))[0] + ".yaml" +def default_pipestat_output_schema(pipeline_filepath): + """ + Determine the default filepath for a pipeline's pipestat output schema. + + :param str pipeline_filepath: path to a pipeline + :return str: default filepath for a pipeline's pipestat output schema. + """ + pipestat_results_schema = os.path.join( + os.path.dirname(pipeline_filepath), "pipestat_output_schema.yaml" + ) + print(f"Using default schema: {pipestat_results_schema}") + return pipestat_results_schema if os.path.exists(pipestat_results_schema) else None + + def _add_args(parser, args, required): """ Add new arguments to an ArgumentParser. @@ -879,68 +957,127 @@ def _add_args(parser, args, required): # Define the arguments. 
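The `args_by_group` table here gains a `pipestat` group and a `pipeline-name` looper option. Below is a hedged sketch of how a pipeline script might request those groups through the public `add_pypiper_args` helper; the group and option names are taken from the table in this diff, are only available in a pypiper version that includes this changeset, and the sample values are invented.

from argparse import ArgumentParser

import pypiper

parser = ArgumentParser(description="toy pipeline CLI")
# Request the standard pypiper flags plus the looper and new pipestat groups.
parser = pypiper.add_pypiper_args(parser, groups=("pypiper", "looper", "pipestat"))

# Dashes in option names become underscores on the parsed namespace.
opts = parser.parse_args(["-O", "/tmp/toy_output", "--pipestat-record-id", "sample1"])
print(opts.output_parent, opts.pipestat_record_id)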
argument_data = { - "testmode": - ("-T", {"action": "store_true", - "help": "Only print commands, don't run"}), - "recover": - ("-R", {"action": "store_true", - "help": "Overwrite locks to recover from previous failed run"}), - "new-start": - ("-N", {"action": "store_true", - "help": "Overwrite all results to start a fresh run"}), - "dirty": - ("-D", {"action": "store_true", - "help": "Don't auto-delete intermediate files"}), - "force-follow": - ("-F", {"action": "store_true", - "help": "Always run 'follow' commands"}), - "start-point": - {"help": "Name of pipeline stage at which to begin"}, - "stop-before": - {"help": "Name of pipeline stage at which to stop " - "(exclusive, i.e. not run)"}, - "stop-after": - {"help": "Name of pipeline stage at which to stop " - "(inclusive, i.e. run)"}, - "config": - ("-C", {"dest": "config_file", "metavar": "CONFIG_FILE", - "default": default_config, - "help": "Pipeline configuration file (YAML). " - "Relative paths are with respect to the " - "pipeline script."}), - "sample-name": - ("-S", {"metavar": "SAMPLE_NAME", - "help": "Name for sample to run"}), - "output-parent": - ("-O", {"metavar": "PARENT_OUTPUT_FOLDER", - "help": "Parent output directory of project"}), - "cores": - ("-P", {"type": int, "default": 1, "metavar": "NUMBER_OF_CORES", - "help": "Number of cores for parallelized processes"}), - "mem": - ("-M", {"default": "4000", "metavar": "MEMORY_LIMIT", - "help": "Memory limit for processes accepting such. " - "Default units are megabytes unless specified " - "using the suffix [K|M|G|T]."}), - "input": - ("-I", {"nargs": "+", "metavar": "INPUT_FILES", - "help": "One or more primary input files"}), - "input2": - ("-I2", {"nargs": "*", "metavar": "INPUT_FILES2", - "help": "Secondary input files, such as read2"}), - "genome": - ("-G", {"dest": "genome_assembly", - "help": "Identifier for genome assembly"}), - "single-or-paired": - ("-Q", {"default": "single", - "help": "Single- or paired-end sequencing protocol"}) + "testmode": ( + "-T", + {"action": "store_true", "help": "Only print commands, don't run"}, + ), + "recover": ( + "-R", + { + "action": "store_true", + "help": "Overwrite locks to recover from previous failed run", + }, + ), + "new-start": ( + "-N", + { + "action": "store_true", + "help": "Overwrite all results to start a fresh run", + }, + ), + "dirty": ( + "-D", + {"action": "store_true", "help": "Don't auto-delete intermediate files"}, + ), + "force-follow": ( + "-F", + {"action": "store_true", "help": "Always run 'follow' commands"}, + ), + "start-point": {"help": "Name of pipeline stage at which to begin"}, + "stop-before": { + "help": "Name of pipeline stage at which to stop " + "(exclusive, i.e. not run)" + }, + "stop-after": { + "help": "Name of pipeline stage at which to stop " "(inclusive, i.e. run)" + }, + "config": ( + "-C", + { + "dest": "config_file", + "metavar": "CONFIG_FILE", + "default": default_config, + "help": "Pipeline configuration file (YAML). 
" + "Relative paths are with respect to the " + "pipeline script.", + }, + ), + "pipeline-name": {"metavar": "PIPELINE_NAME", "help": "Name of the pipeline"}, + "sample-name": ( + "-S", + {"metavar": "SAMPLE_NAME", "help": "Name for sample to run"}, + ), + "output-parent": ( + "-O", + { + "metavar": "PARENT_OUTPUT_FOLDER", + "help": "Parent output directory of project", + }, + ), + "cores": ( + "-P", + { + "type": int, + "default": 1, + "metavar": "NUMBER_OF_CORES", + "help": "Number of cores for parallelized processes", + }, + ), + "mem": ( + "-M", + { + "default": "4000", + "metavar": "MEMORY_LIMIT", + "help": "Memory limit for processes accepting such. " + "Default units are megabytes unless specified " + "using the suffix [K|M|G|T].", + }, + ), + "input": ( + "-I", + { + "nargs": "+", + "metavar": "INPUT_FILES", + "help": "One or more primary input files", + }, + ), + "input2": ( + "-I2", + { + "nargs": "*", + "metavar": "INPUT_FILES2", + "help": "Secondary input files, such as read2", + }, + ), + "genome": ( + "-G", + {"dest": "genome_assembly", "help": "Identifier for genome assembly"}, + ), + "single-or-paired": ( + "-Q", + {"default": "single", "help": "Single- or paired-end sequencing protocol"}, + ), + "pipestat-namespace": { + "help": "Namespace to report into. This will be the DB table name " + "if using DB as the object back-end" + }, + "pipestat-record-id": {"help": "Record identifier to report for"}, + "pipestat-schema": { + "help": "Path to the output schema that formalizes the " "results structure" + }, + "pipestat-config": {"help": "Path to the configuration file"}, + "pipestat-results-file": { + "help": "YAML file to report into, if file is used as " + "the object back-end" + }, } - + from logmuse import LOGGING_CLI_OPTDATA + argument_data.update(LOGGING_CLI_OPTDATA) if len(required) > 0: - required_named = parser.add_argument_group('required named arguments') + required_named = parser.add_argument_group("required named arguments") # Configure the parser for each argument. for arg in args: @@ -958,15 +1095,26 @@ def _add_args(parser, args, required): raise TypeError( "Option name must map to dict or two-tuple (short " "name and dict) of argument command-line argument " - "specification data.") + "specification data." 
+ ) argdata["required"] = arg in required long_opt = "--{}".format(arg) - opts = (short_opt, long_opt) if short_opt else (long_opt, ) + opts = (short_opt, long_opt) if short_opt else (long_opt,) if arg in required: required_named.add_argument(*opts, **argdata) else: parser.add_argument(*opts, **argdata) return parser + + +def result_formatter_markdown(pipeline_name, sample_name, res_id, value) -> str: + """ + Returns Markdown formatted value as string + """ + + message_markdown = "\n> `{key}`\t{value}\t_RES_".format(key=res_id, value=value) + + return message_markdown diff --git a/requirements/requirements-docs.txt b/requirements/requirements-docs.txt index ad53e8dc..4471914e 100644 --- a/requirements/requirements-docs.txt +++ b/requirements/requirements-docs.txt @@ -2,4 +2,5 @@ mkdocs>=1.0 markdown-include pydoc-markdown piper -https://github.com/databio/mkdocs-databio/archive/master.zip +pipestat>=0.4.0 +https://github.com/databio/mkdocs-databio/archive/master.zip \ No newline at end of file diff --git a/requirements/requirements-ngstk.txt b/requirements/requirements-ngstk.txt index 80184784..226df0a0 100644 --- a/requirements/requirements-ngstk.txt +++ b/requirements/requirements-ngstk.txt @@ -2,3 +2,4 @@ numpy pandas pysam yacman +pipestat>=0.1.0 \ No newline at end of file diff --git a/requirements/requirements-pypiper.txt b/requirements/requirements-pypiper.txt index fba597d0..886be3e3 100644 --- a/requirements/requirements-pypiper.txt +++ b/requirements/requirements-pypiper.txt @@ -4,3 +4,4 @@ psutil pandas ubiquerg>=0.4.5 yacman +pipestat>=0.4.0 diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 38cfaf32..f4b7458c 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,7 +1,7 @@ mock==2.0.0 -pytest>=4.2.1 -hypothesis -coveralls>=1.1 -pytest-cov==2.6.1 +pytest>=4.6.9 +pytest-cov>=2.8.1 +hypothesis==4.38.0 +coveralls veracitools pytest-remotedata diff --git a/setup.py b/setup.py index a689838b..d4850712 100644 --- a/setup.py +++ b/setup.py @@ -7,24 +7,22 @@ try: from setuptools import setup - if sys.version_info < (2, 7): - extra['install_requires'] = ['argparse'] except ImportError: from distutils.core import setup - if sys.version_info < (2, 7): - extra['dependencies'] = ['argparse'] def read_reqs_file(reqs_name): - """ Read requirements file for given requirements group. """ + """Read requirements file for given requirements group.""" path_reqs_file = os.path.join( - "requirements", "requirements-{}.txt".format(reqs_name)) - with open(path_reqs_file, 'r') as reqs_file: - return [pkg.rstrip() for pkg in reqs_file.readlines() - if not pkg.startswith("#")] + "requirements", "requirements-{}.txt".format(reqs_name) + ) + with open(path_reqs_file, "r") as reqs_file: + return [ + pkg.rstrip() for pkg in reqs_file.readlines() if not pkg.startswith("#") + ] -with open(os.path.join("pypiper", "_version.py"), 'r') as versionfile: +with open(os.path.join("pypiper", "_version.py"), "r") as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") @@ -34,8 +32,9 @@ def read_reqs_file(reqs_name): test_reqs = read_reqs_file("test") # Allow specification of desired features, which implies dependencies. -addl_reqs = {bundle_name: read_reqs_file(bundle_name) - for bundle_name in ["ngstk", "plot"]} +addl_reqs = { + bundle_name: read_reqs_file(bundle_name) for bundle_name in ["ngstk", "plot"] +} # Complete collection of user requirements. 
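The `result_formatter_markdown` helper added a few hunks above builds a one-line Markdown blockquote from a result ID and value (the pipeline and sample names are accepted but not interpolated into the template), presumably so it can be handed to pipestat-backed result reporting as a formatter. A small usage sketch with made-up argument values:

from pypiper.utils import result_formatter_markdown

line = result_formatter_markdown(
    pipeline_name="toy_pipeline",   # accepted, but not used in the template
    sample_name="sample1",          # accepted, but not used in the template
    res_id="number_of_things",
    value=42,
)
print(repr(line))  # '\n> `number_of_things`\t42\t_RES_'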
addl_reqs["all"] = list({pkg for bundle in addl_reqs.values() for pkg in bundle}) @@ -47,29 +46,30 @@ def read_reqs_file(reqs_name): long_description = f.read() setup( - name='piper', - packages=['pypiper'], + name="piper", + packages=["pypiper"], install_requires=basic_reqs, version=version, - description='A lightweight python toolkit for gluing together restartable, robust command line pipelines', + description="A lightweight python toolkit for gluing together restartable, robust command line pipelines", long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 2.7", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ], - author='Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', - author_email='nathan@code.databio.org, jklughammer@cemm.oeaw.ac.at, arendeiro@cemm.oeaw.ac.at', - url='https://github.com/databio/pypiper/', + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], + author="Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + author_email="nathan@code.databio.org, jklughammer@cemm.oeaw.ac.at, arendeiro@cemm.oeaw.ac.at", + url="https://github.com/databio/pypiper/", license="BSD2", - test_suite="tests", # python setup.py test - tests_require=test_reqs, # Test-specific package dependencies + test_suite="tests", # python setup.py test + tests_require=test_reqs, # Test-specific package dependencies # Extra package if doing `python setup.py test` - setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), + setup_requires=( + ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] + ), extras_require=addl_reqs, # Version-specific items **extra ) - diff --git a/tests/Data/default_pipestat_output_schema.yaml b/tests/Data/default_pipestat_output_schema.yaml new file mode 100644 index 00000000..55dec57e --- /dev/null +++ b/tests/Data/default_pipestat_output_schema.yaml @@ -0,0 +1,9 @@ +#NOTE: +# This is output schema can be customized for your specific pipeline. 
+#See here for more details: +# https://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format +pipeline_name: default_pipeline_name +samples: + number_of_things: + type: integer + description: "Number of things" \ No newline at end of file diff --git a/tests/Data/sample_output_schema.yaml b/tests/Data/sample_output_schema.yaml new file mode 100644 index 00000000..131cb3f8 --- /dev/null +++ b/tests/Data/sample_output_schema.yaml @@ -0,0 +1,24 @@ +pipeline_name: test_pipe +samples: + number_of_things: + type: integer + description: "Number of things" + percentage_of_things: + type: number + description: "Percentage of things" + name_of_something: + type: string + description: "Name of something" + switch_value: + type: boolean + description: "Is the switch on or off" + output_file: + type: file + description: "This a path to the output file" + output_image: + type: image + description: "This a path to the output image" + md5sum: + type: string + description: "MD5SUM of an object" + highlight: true diff --git a/tests/conftest.py b/tests/conftest.py index 08db22b8..2269408e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,14 +1,13 @@ """ Fixtures and configuration visible to all tests """ import copy -from functools import partial import os +from functools import partial import pytest from pypiper import Pipeline, PipelineManager, Stage - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -31,10 +30,10 @@ FILE_TEXT_PAIRS = list(zip(FILENAMES, CONTENTS)) - @pytest.fixture def get_pipe_manager(tmpdir): - """ Provide safe creation of pipeline manager, with multi=True. """ + """Provide safe creation of pipeline manager, with multi=True.""" + def get_mgr(**kwargs): if "outfolder" in kwargs: kwd_args = kwargs @@ -42,52 +41,48 @@ def get_mgr(**kwargs): kwd_args = copy.deepcopy(kwargs) kwd_args["outfolder"] = tmpdir.strpath return PipelineManager(multi=True, **kwd_args) - return get_mgr + return get_mgr @pytest.fixture def pl_mgr(request, get_pipe_manager): - """ Provide a PipelineManager and ensure that it's stopped. """ + """Provide a PipelineManager and ensure that it's stopped.""" pm = get_pipe_manager(name=TEST_PIPE_NAME) + def _ensure_stopped(): pm.stop_pipeline() + request.addfinalizer(_ensure_stopped) return pm - @pytest.fixture def dummy_pipe(pl_mgr): - """ Provide a basic Pipeline instance for a test case. 
""" + """Provide a basic Pipeline instance for a test case.""" return DummyPipeline(pl_mgr) - def write_file1(folder): _write(*FILE_TEXT_PAIRS[0], folder=folder) - def write_file2(folder): _write(*FILE_TEXT_PAIRS[1], folder=folder) - def write_file3(folder): _write(*FILE_TEXT_PAIRS[2], folder=folder) - def _write(filename, content, folder=None): path = os.path.join(folder, filename) - with open(path, 'w') as f: + with open(path, "w") as f: f.write(content) - class DummyPipeline(Pipeline): - """ Basic pipeline implementation for tests """ + """Basic pipeline implementation for tests""" def __init__(self, manager): super(DummyPipeline, self).__init__(TEST_PIPE_NAME, manager=manager) diff --git a/tests/helpers.py b/tests/helpers.py index 16f842c6..3a91a88e 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,18 +1,18 @@ """ Helpers for tests """ -from functools import partial import glob import os +from functools import partial + import pytest + from pypiper import Pipeline from pypiper.utils import checkpoint_filepath - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def assert_equal_dirpath(p1, p2): """ Assert that a pair of folder paths has two equal members. @@ -23,7 +23,6 @@ def assert_equal_dirpath(p1, p2): assert p1.rstrip(os.sep) == p2.rstrip(os.sep) - def fetch_checkpoint_files(pm): """ Fetch all of a manager's checkpoint file paths. @@ -37,7 +36,6 @@ def fetch_checkpoint_files(pm): return glob.glob(pattern) - def named_param(argnames, argvalues): """ Improve pytest's native labeling of test case parameterization. @@ -53,15 +51,19 @@ def named_param(argnames, argvalues): :return functools.partial: Parameterize version of parametrize, with values and ids fixed. """ - return partial(pytest.mark.parametrize( - argnames=argnames, argvalues=argvalues, - ids=lambda val: "{}={}".format(argnames, val))) - + return partial( + pytest.mark.parametrize( + argnames=argnames, + argvalues=argvalues, + ids=lambda val: "{}={}".format(argnames, val), + ) + ) class SafeTestPipeline(Pipeline): - """ Pipeline for tests that protects against bad file descriptor. """ + """Pipeline for tests that protects against bad file descriptor.""" + def __init__(self, *args, **kwargs): - kwd_args = {"multi": True} # Like interactive mode. + kwd_args = {"multi": True} # Like interactive mode. kwd_args.update(kwargs) super(SafeTestPipeline, self).__init__(*args, **kwd_args) diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 28be5db0..9f72e657 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -1,27 +1,24 @@ """ Test configuration for Pipeline tests. """ import os + import pytest + from pypiper import Stage from tests.helpers import SafeTestPipeline - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - READ_ALIGNER_FILENAME = "aligner.lst" PEAK_CALLER_FILENAME = "caller.lst" - def pytest_generate_tests(metafunc): - """ Dynamic test case parameterization. """ + """Dynamic test case parameterization.""" if "pl_name" in metafunc.fixturenames: - metafunc.parametrize( - "pl_name", [read_aligner.__name__, call_peaks.__name__]) - + metafunc.parametrize("pl_name", [read_aligner.__name__, call_peaks.__name__]) # Dummy functions used as elements of pipeline stages() collections. @@ -41,10 +38,8 @@ def call_peaks(): pass - class FunctionNameWriterPipeline(SafeTestPipeline): - """ Basic pipeline that writes to file the names of its functions. 
""" - + """Basic pipeline that writes to file the names of its functions.""" def __init__(self, name, outfolder, filename, functions): """ @@ -62,9 +57,7 @@ def __init__(self, name, outfolder, filename, functions): self.name_output_file = filename self.functions = functions # Get the stages() benefit of superclass extension. - super(FunctionNameWriterPipeline, self).__init__( - name=name, outfolder=outfolder) - + super(FunctionNameWriterPipeline, self).__init__(name=name, outfolder=outfolder) def write_name(self, func): """ @@ -73,12 +66,11 @@ def write_name(self, func): :param callable func: Name of function to write to the output file. """ outpath = os.path.join(self.outfolder, self.name_output_file) - with open(outpath, 'a') as f: + with open(outpath, "a") as f: f.write(func.__name__ + os.linesep) - def run(self, **kwargs): - """ Start with clean output file, then use superclass method. """ + """Start with clean output file, then use superclass method.""" # Ensure that we start with a clean file since the nature of the # operations performed (sequential file writes) creates desire to # open output file in append mode rather than write mode. @@ -87,30 +79,26 @@ def run(self, **kwargs): os.unlink(output_file) super(FunctionNameWriterPipeline, self).run(**kwargs) - def stages(self): - """ Sequence of operations to perform. """ - return [Stage(self.write_name, (f,), name=f.__name__) - for f in self.functions] - + """Sequence of operations to perform.""" + return [Stage(self.write_name, (f,), name=f.__name__) for f in self.functions] # Functions and fixtures + def get_read_aligner(outfolder): - """ Create a dummy 'read aligner' pipeline. """ + """Create a dummy 'read aligner' pipeline.""" return FunctionNameWriterPipeline( - "read-aligner", outfolder, - READ_ALIGNER_FILENAME, [merge_input, qc, align_reads]) - + "read-aligner", outfolder, READ_ALIGNER_FILENAME, [merge_input, qc, align_reads] + ) def get_peak_caller(outfolder): - """ Create a dummy 'peak caller' pipeline. """ + """Create a dummy 'peak caller' pipeline.""" return FunctionNameWriterPipeline( - "peak-caller", outfolder, - PEAK_CALLER_FILENAME, [align_reads, call_peaks]) - + "peak-caller", outfolder, PEAK_CALLER_FILENAME, [align_reads, call_peaks] + ) def get_pipeline(name, outfolder): @@ -129,15 +117,13 @@ def get_pipeline(name, outfolder): raise ValueError("Unknown pipeline request: '{}'".format(name)) - @pytest.fixture def read_aligner(tmpdir): - """ Provide test case with a read aligner pipeline instance. """ + """Provide test case with a read aligner pipeline instance.""" return get_read_aligner(outfolder=tmpdir.strpath) - @pytest.fixture def peak_caller(tmpdir): - """ Provide test case with a 'PeakCaller' pipeline instance. """ + """Provide test case with a 'PeakCaller' pipeline instance.""" return get_peak_caller(outfolder=tmpdir.strpath) diff --git a/tests/pipeline/test_multi_pipeline_sample.py b/tests/pipeline/test_multi_pipeline_sample.py index 4d3e7503..f8874d36 100644 --- a/tests/pipeline/test_multi_pipeline_sample.py +++ b/tests/pipeline/test_multi_pipeline_sample.py @@ -1,18 +1,18 @@ """ Tests for case in which multiple pipelines process a single sample. 
""" import os + from pypiper.utils import checkpoint_filepath from tests.helpers import fetch_checkpoint_files, named_param -from .conftest import get_peak_caller, get_pipeline, get_read_aligner +from .conftest import get_peak_caller, get_pipeline, get_read_aligner __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_checkpoints_are_pipeline_unique(tmpdir): - """ Names of checkpoint files depend on both stage and pipeline. """ + """Names of checkpoint files depend on both stage and pipeline.""" # Note: conceptually, this tests an underlying mechanistic aspect of the # checkpointing system. @@ -38,10 +38,12 @@ def test_checkpoints_are_pipeline_unique(tmpdir): call_peaks.run() # We expect a different checkpoint file for each stage of each pipeline. - align_reads_expected = {checkpoint_filepath(s.name, align_reads) - for s in align_reads.stages()} - call_peaks_expected = {checkpoint_filepath(s.name, call_peaks) - for s in call_peaks.stages()} + align_reads_expected = { + checkpoint_filepath(s.name, align_reads) for s in align_reads.stages() + } + call_peaks_expected = { + checkpoint_filepath(s.name, call_peaks) for s in call_peaks.stages() + } # Pipeline names are unique here, and each checkpoint name includes # pipeline name for disambiguation, so even a pair of pipelines with a @@ -52,8 +54,9 @@ def test_checkpoints_are_pipeline_unique(tmpdir): # When not setting start/stop parameters and beginning with no checkpoint # files in place, each pipeline generates its full set of checkpoint files. expected_checkpoints = align_reads_expected | call_peaks_expected - observed_checkpoints = set(fetch_checkpoint_files(align_reads)) | \ - set(fetch_checkpoint_files(call_peaks)) + observed_checkpoints = set(fetch_checkpoint_files(align_reads)) | set( + fetch_checkpoint_files(call_peaks) + ) # Verify satisfaction of expectation. try: @@ -68,9 +71,8 @@ def test_checkpoints_are_pipeline_unique(tmpdir): raise - def test_pipeline_checkpoint_respect_sensitivity_and_specificity(tmpdir): - """ Pipeline respects only its own checkpoint(s) for stage skipping. """ + """Pipeline respects only its own checkpoint(s) for stage skipping.""" # Note: conceptually, this is more of an effect- or outcome-based test # of the checkpointing system with respect to stage skipping. @@ -80,22 +82,18 @@ def test_pipeline_checkpoint_respect_sensitivity_and_specificity(tmpdir): align_reads_stage_names = [s.name for s in align_reads.stages()] call_peaks_stage_names = [s.name for s in call_peaks.stages()] - assert {"align_reads"} == \ - set(align_reads_stage_names) & set(call_peaks_stage_names) + assert {"align_reads"} == set(align_reads_stage_names) & set(call_peaks_stage_names) # Set up the checkpoints for the read alignment pipeline by allowing it # to execute once. 
align_reads.run() - assert os.path.isfile(checkpoint_filepath( - "align_reads", align_reads.manager)) - peaks_align_check_fpath = \ - checkpoint_filepath("align_reads", call_peaks.manager) + assert os.path.isfile(checkpoint_filepath("align_reads", align_reads.manager)) + peaks_align_check_fpath = checkpoint_filepath("align_reads", call_peaks.manager) assert not os.path.isfile(peaks_align_check_fpath) call_peaks.run() exp_lines = [func + os.linesep for func in call_peaks_stage_names] - call_peaks_outpath = os.path.join( - call_peaks.outfolder, call_peaks.name_output_file) - with open(call_peaks_outpath, 'r') as f: + call_peaks_outpath = os.path.join(call_peaks.outfolder, call_peaks.name_output_file) + with open(call_peaks_outpath, "r") as f: obs_lines = f.readlines() assert exp_lines == obs_lines diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index ec774971..295a885b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,43 +1,54 @@ """ Tests for the Pipeline data type """ -from functools import partial import glob import os +from functools import partial + import pytest + from pypiper import Pipeline from pypiper.manager import COMPLETE_FLAG, PAUSE_FLAG, RUN_FLAG -from pypiper.pipeline import \ - checkpoint_filepath, IllegalPipelineDefinitionError, \ - IllegalPipelineExecutionError, UnknownPipelineStageError +from pypiper.pipeline import ( + IllegalPipelineDefinitionError, + IllegalPipelineExecutionError, + UnknownPipelineStageError, + checkpoint_filepath, +) from pypiper.stage import Stage -from pypiper.utils import \ - flag_name, pipeline_filepath, checkpoint_filename, translate_stage_name +from pypiper.utils import ( + checkpoint_filename, + flag_name, + pipeline_filepath, + translate_stage_name, +) +from tests.conftest import ( + CONTENTS, + FILE1_NAME, + FILE_TEXT_PAIRS, + FILENAMES, + OUTPUT_SUFFIX, + TEST_PIPE_NAME, + write_file1, + write_file2, + write_file3, +) from tests.helpers import named_param -from tests.conftest import \ - write_file1, write_file2, write_file3, \ - CONTENTS, FILENAMES, FILE1_NAME, FILE_TEXT_PAIRS, \ - OUTPUT_SUFFIX, TEST_PIPE_NAME - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - BASIC_ACTIONS = [write_file1, write_file2, write_file3] STAGE_SPECS = ["stage", "name", "function"] - def pytest_generate_tests(metafunc): - """ Dynamic creation and parameterization of cases in this module. """ - if "test_type" in metafunc.fixturenames and \ - metafunc.cls == MostBasicPipelineTests: + """Dynamic creation and parameterization of cases in this module.""" + if "test_type" in metafunc.fixturenames and metafunc.cls == MostBasicPipelineTests: metafunc.parametrize( - argnames="test_type", - argvalues=["effects", "stage_labels", - "checkpoints", "pipe_flag"]) - + argnames="test_type", + argvalues=["effects", "stage_labels", "checkpoints", "pipe_flag"], + ) @pytest.fixture @@ -61,7 +72,6 @@ def stage(request): return _parse_stage(s, spec_type) - def _parse_stage(s, spec_type): """ Do a type transformation on a Stage function. @@ -81,42 +91,37 @@ def _parse_stage(s, spec_type): return s - class EmptyStagesPipeline(Pipeline): - """ Illegal (via empty stages) Pipeline definition. 
""" + """Illegal (via empty stages) Pipeline definition.""" def __init__(self, manager): - super(EmptyStagesPipeline, self).__init__( - TEST_PIPE_NAME, manager=manager) + super(EmptyStagesPipeline, self).__init__(TEST_PIPE_NAME, manager=manager) def stages(self): return [] class NameCollisionPipeline(Pipeline): - """ Illegal (via empty stages) Pipeline definition. """ + """Illegal (via empty stages) Pipeline definition.""" def __init__(self, manager): - super(NameCollisionPipeline, self).__init__( - TEST_PIPE_NAME, manager=manager) + super(NameCollisionPipeline, self).__init__(TEST_PIPE_NAME, manager=manager) def stages(self): name = "write file1" - return [("write file1", write_file1), - (translate_stage_name(name), write_file1)] - + return [("write file1", write_file1), (translate_stage_name(name), write_file1)] class RunPipelineCornerCaseTests: - """ Tests for exceptional cases of pipeline execution. """ - + """Tests for exceptional cases of pipeline execution.""" @named_param(argnames="point", argvalues=BASIC_ACTIONS) @named_param(argnames="spec_type", argvalues=STAGE_SPECS) @named_param(argnames="inclusive", argvalues=[False, True]) def test_start_point_equals_stop( - self, dummy_pipe, point, spec_type, stage, inclusive): - """ Start=stop is only permitted if stop should be run. """ + self, dummy_pipe, point, spec_type, stage, inclusive + ): + """Start=stop is only permitted if stop should be run.""" _assert_pipeline_initialization(dummy_pipe) @@ -130,80 +135,83 @@ def test_start_point_equals_stop( with pytest.raises(IllegalPipelineExecutionError): dummy_pipe.run(start_point=stage, stop_before=stage) - @pytest.mark.parametrize( - argnames=["start_point", "stop"], - argvalues=[(write_file2, write_file1), - (write_file3, write_file2), - (write_file3, write_file1)]) + argnames=["start_point", "stop"], + argvalues=[ + (write_file2, write_file1), + (write_file3, write_file2), + (write_file3, write_file1), + ], + ) @pytest.mark.parametrize(argnames="spec_type", argvalues=STAGE_SPECS) @pytest.mark.parametrize( - argnames="stop_type", argvalues=["stop_before", "stop_after"]) + argnames="stop_type", argvalues=["stop_before", "stop_after"] + ) def test_start_point_after_stop( - self, dummy_pipe, start_point, stop, stop_type, spec_type): - """ Regardless of specification type, start > stop is prohibited. """ + self, dummy_pipe, start_point, stop, stop_type, spec_type + ): + """Regardless of specification type, start > stop is prohibited.""" start_point = _parse_stage(start_point, spec_type) stop = _parse_stage(stop, spec_type) with pytest.raises(IllegalPipelineExecutionError): dummy_pipe.run(**{"start_point": start_point, stop_type: stop}) - @named_param( - argnames="undefined_stage", - argvalues=["unsupported-pipeline-stage", "unknown_phase"]) - @named_param(argnames="stage_point", - argvalues=["start_point", "stop_before", "stop_after"]) + argnames="undefined_stage", + argvalues=["unsupported-pipeline-stage", "unknown_phase"], + ) + @named_param( + argnames="stage_point", argvalues=["start_point", "stop_before", "stop_after"] + ) def test_unknown_stage(self, dummy_pipe, undefined_stage, stage_point): - """ Start specification must be of known stage name. 
""" + """Start specification must be of known stage name.""" with pytest.raises(UnknownPipelineStageError): dummy_pipe.run(**{stage_point: undefined_stage}) - @named_param(argnames="stop_before", argvalues=BASIC_ACTIONS) @named_param(argnames="stop_after", argvalues=BASIC_ACTIONS) @named_param(argnames="spec_type", argvalues=STAGE_SPECS) def test_stop_before_and_stop_after( - self, dummy_pipe, stop_before, stop_after, spec_type): - """ Inclusive and exclusive stop cannot both be provided. """ + self, dummy_pipe, stop_before, stop_after, spec_type + ): + """Inclusive and exclusive stop cannot both be provided.""" inclusive_stop = _parse_stage(stop_after, spec_type) exclusive_stop = _parse_stage(stop_before, spec_type) kwargs = {"stop_before": exclusive_stop, "stop_after": inclusive_stop} with pytest.raises(IllegalPipelineExecutionError): dummy_pipe.run(**kwargs) - def test_empty_stages_is_prohibited(self, pl_mgr): - """ Pipeline must have non-empty stages """ + """Pipeline must have non-empty stages""" with pytest.raises(IllegalPipelineDefinitionError): EmptyStagesPipeline(manager=pl_mgr) - def test_stage_name_collision_is_prohibited(self, pl_mgr): - """ Each stage needs unique translation, used for checkpoint file. """ + """Each stage needs unique translation, used for checkpoint file.""" with pytest.raises(IllegalPipelineDefinitionError): NameCollisionPipeline(manager=pl_mgr) - class MostBasicPipelineTests: - """ Test pipeline defined with notion of 'absolute minimum' config. """ - + """Test pipeline defined with notion of 'absolute minimum' config.""" def test_runs_through_full(self, dummy_pipe, test_type): - """ The entire basic pipeline should execute. """ + """The entire basic pipeline should execute.""" # Start with clean output folder. _assert_pipeline_initialization(dummy_pipe) # Make the call under test. dummy_pipe.run(start_point=None, stop_before=None, stop_after=None) - + if test_type == "effects": # We're interested in existence and content of targets. exp_files, _ = zip(*FILE_TEXT_PAIRS) _assert_output(dummy_pipe, exp_files) - fpath_text_pairs = [(pipeline_filepath(dummy_pipe, fname), content) - for fname, content in FILE_TEXT_PAIRS] + fpath_text_pairs = [ + (pipeline_filepath(dummy_pipe, fname), content) + for fname, content in FILE_TEXT_PAIRS + ] for fpath, content in fpath_text_pairs: _assert_expected_content(fpath, content) @@ -214,8 +222,11 @@ def test_runs_through_full(self, dummy_pipe, test_type): try: assert os.path.isfile(chkpt_fpath) except AssertionError: - print("Stage '{}' file doesn't exist: '{}'".format( - stage.name, chkpt_fpath)) + print( + "Stage '{}' file doesn't exist: '{}'".format( + stage.name, chkpt_fpath + ) + ) raise elif test_type == "stage_labels": @@ -229,28 +240,26 @@ def test_runs_through_full(self, dummy_pipe, test_type): else: raise ValueError("Unknown test type: {}".format(test_type)) - def test_skip_completed(self, dummy_pipe, test_type): - """ Pre-completed stage(s) are skipped. """ + """Pre-completed stage(s) are skipped.""" _assert_pipeline_initialization(dummy_pipe) first_stage = dummy_pipe.stages()[0] first_stage_chkpt_fpath = checkpoint_filepath(first_stage, dummy_pipe) - open(first_stage_chkpt_fpath, 'w').close() + open(first_stage_chkpt_fpath, "w").close() assert os.path.isfile(first_stage_chkpt_fpath) exp_skips = [first_stage] exp_execs = dummy_pipe.stages()[1:] # This should neither exist nor be created. 
- first_stage_outfile = pipeline_filepath( - dummy_pipe.manager, filename=FILE1_NAME) + first_stage_outfile = pipeline_filepath(dummy_pipe.manager, filename=FILE1_NAME) assert not os.path.isfile(first_stage_outfile) - + # Do the action. dummy_pipe.run() - + if test_type == "effects": # We should not have generated the first stage's output file. # That notion is covered in the outfiles assertion. @@ -274,22 +283,21 @@ def test_skip_completed(self, dummy_pipe, test_type): else: raise ValueError("Unknown test type: '{}'".format(test_type)) - - @named_param(argnames="start_index", - argvalues=range(len(BASIC_ACTIONS) - 1)) - @named_param(argnames="start_spec_type", - argvalues=["stage", "function", "name"]) + @named_param(argnames="start_index", argvalues=range(len(BASIC_ACTIONS) - 1)) + @named_param(argnames="start_spec_type", argvalues=["stage", "function", "name"]) def test_execution_allows_specific_starting_point( - self, dummy_pipe, test_type, start_index, start_spec_type): - """ A pipeline may be started from an arbitrary checkpoint. """ + self, dummy_pipe, test_type, start_index, start_spec_type + ): + """A pipeline may be started from an arbitrary checkpoint.""" _assert_pipeline_initialization(dummy_pipe) s = _parse_stage(BASIC_ACTIONS[start_index], start_spec_type) dummy_pipe.run(start_point=s) if test_type == "effects": exp_files = FILENAMES[start_index:] _assert_output(dummy_pipe, exp_files) - fpaths = [pipeline_filepath(dummy_pipe.manager, filename=fn) - for fn in exp_files] + fpaths = [ + pipeline_filepath(dummy_pipe.manager, filename=fn) for fn in exp_files + ] for fp, content in zip(fpaths, CONTENTS[start_index:]): _assert_expected_content(fp, content) elif test_type == "checkpoints": @@ -298,17 +306,18 @@ def test_execution_allows_specific_starting_point( elif test_type == "stage_labels": # Ensure match between skipped and executed stage expectations # and observations. - _assert_stage_states(dummy_pipe, BASIC_ACTIONS[:start_index], - BASIC_ACTIONS[start_index:]) + _assert_stage_states( + dummy_pipe, BASIC_ACTIONS[:start_index], BASIC_ACTIONS[start_index:] + ) elif test_type == "pipe_flag": _assert_pipeline_completed(dummy_pipe) else: raise ValueError("Unknown test type: '{}'".format(test_type)) - def test_all_checkpoints_after_first_executed_are_overwritten( - self, dummy_pipe, test_type): - """ Potential for dependent results means execution is contiguous. """ + self, dummy_pipe, test_type + ): + """Potential for dependent results means execution is contiguous.""" # Start fresh. 
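The skip semantics asserted by `test_skip_completed` and `test_all_checkpoints_after_first_executed_are_overwritten` reduce to one rule: stages are skipped only up to the first one whose checkpoint file is missing, and everything from that point on is (re)executed even if later checkpoints exist. The toy model below restates that rule on plain lists; it illustrates the expected behavior and is not pypiper's implementation.

def plan_stage_execution(stage_names, stages_with_checkpoint):
    """Toy model: skip leading stages that have checkpoints, then run the
    rest contiguously (later checkpoints get overwritten on rerun)."""
    for i, name in enumerate(stage_names):
        if name not in stages_with_checkpoint:
            return stage_names[:i], stage_names[i:]
    return list(stage_names), []


skipped, executed = plan_stage_execution(
    ["write_file1", "write_file2", "write_file3"],
    stages_with_checkpoint={"write_file1", "write_file3"},
)
print(skipped)   # ['write_file1']
print(executed)  # ['write_file2', 'write_file3'] -- file3 reruns despite its checkpoint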
_assert_pipeline_initialization(dummy_pipe) @@ -317,7 +326,7 @@ def test_all_checkpoints_after_first_executed_are_overwritten( fpath_time_pairs = [] for s in BASIC_ACTIONS[1:]: check_fpath = checkpoint_filepath(s, dummy_pipe.manager) - open(check_fpath, 'w').close() + open(check_fpath, "w").close() fpath_time_pairs.append((check_fpath, os.path.getmtime(check_fpath))) assert os.path.isfile(check_fpath) @@ -334,19 +343,19 @@ def test_all_checkpoints_after_first_executed_are_overwritten( elif test_type == "checkpoints": _assert_checkpoints(dummy_pipe, BASIC_ACTIONS) elif test_type == "stage_labels": - _assert_stage_states(dummy_pipe, expected_skipped=[], - expected_executed=BASIC_ACTIONS) + _assert_stage_states( + dummy_pipe, expected_skipped=[], expected_executed=BASIC_ACTIONS + ) elif test_type == "pipe_flag": _assert_pipeline_completed(dummy_pipe) else: raise ValueError("Unknown test type: {}".format(test_type)) - @named_param(argnames="stop_index", argvalues=range(1, len(BASIC_ACTIONS))) @named_param(argnames="spec_type", argvalues=STAGE_SPECS) @named_param(argnames="stop_type", argvalues=["stop_before", "stop_after"]) def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): - """ A pipeline is capable of halting at/after a specified stage. """ + """A pipeline is capable of halting at/after a specified stage.""" # Negative control / pretest. _assert_pipeline_initialization(dummy_pipe) @@ -366,8 +375,9 @@ def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): if test_type == "effects": exp_files = FILENAMES[:stop_index] _assert_output(dummy_pipe, exp_files) - fpaths = [pipeline_filepath(dummy_pipe.manager, filename=fn) - for fn in exp_files] + fpaths = [ + pipeline_filepath(dummy_pipe.manager, filename=fn) for fn in exp_files + ] for fp, content in zip(fpaths, CONTENTS[:stop_index]): _assert_expected_content(fp, content) @@ -375,11 +385,12 @@ def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): _assert_checkpoints(dummy_pipe, BASIC_ACTIONS[:stop_index]) elif test_type == "stage_labels": _assert_stage_states( - dummy_pipe, expected_skipped=BASIC_ACTIONS[stop_index:], - expected_executed=BASIC_ACTIONS[:stop_index]) + dummy_pipe, + expected_skipped=BASIC_ACTIONS[stop_index:], + expected_executed=BASIC_ACTIONS[:stop_index], + ) elif test_type == "pipe_flag": - if (stop_index == len(BASIC_ACTIONS)) and \ - (stop_type == "stop_after"): + if (stop_index == len(BASIC_ACTIONS)) and (stop_type == "stop_after"): _assert_pipeline_completed(dummy_pipe) else: _assert_pipeline_halted(dummy_pipe) @@ -387,20 +398,20 @@ def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): raise ValueError("Unknown test type: '{}'".format(test_type)) - @named_param( - argnames="spec_type", - argvalues=["filename", "filepath", "stage", "stage_name"]) + argnames="spec_type", argvalues=["filename", "filepath", "stage", "stage_name"] +) @named_param(argnames="completed", argvalues=[False, True]) def test_stage_completion_determination(dummy_pipe, spec_type, completed): - """ Pipeline responds to variety of request forms of checkpoint status. """ + """Pipeline responds to variety of request forms of checkpoint status.""" # Allow dummy stage definition and determination of filename. 
def dummy_test_func(): pass chkpt_name = checkpoint_filename( - dummy_test_func.__name__, pipeline_name=dummy_pipe.name) + dummy_test_func.__name__, pipeline_name=dummy_pipe.name + ) chkpt_fpath = checkpoint_filepath(chkpt_name, dummy_pipe.manager) # Determine how to request the checkpoint completion status. @@ -417,15 +428,18 @@ def dummy_test_func(): # Touch the checkpoint file iff we're positively testing completion. if completed: - open(chkpt_fpath, 'w').close() + open(chkpt_fpath, "w").close() # Check the completion status for concordance with expectation. # Print a bit of info to inform hypotheses about the source of a # hypothetical test error/failure. outfolder_contents = os.listdir(dummy_pipe.outfolder) print("Pipeline outfolder contents: {}".format(outfolder_contents)) - print("Contents as pipeline files: {}".format( - [pipeline_filepath(dummy_pipe.manager, f) for f in outfolder_contents])) + print( + "Contents as pipeline files: {}".format( + [pipeline_filepath(dummy_pipe.manager, f) for f in outfolder_contents] + ) + ) print("Checking completion status: {} ({})".format(s, type(s))) observed_completion = dummy_pipe.completed_stage(s) if completed: @@ -434,7 +448,6 @@ def dummy_test_func(): assert not observed_completion - def _assert_checkpoints(pl, exp_stages): """ Assert equivalence between expected and observed checkpoint files. @@ -449,7 +462,6 @@ def _assert_checkpoints(pl, exp_stages): assert set(exp_fpaths) == set(obs_fpaths) - def _assert_expected_content(fpath, content): """ Determine whether a filepath has the expected content. @@ -460,12 +472,11 @@ def _assert_expected_content(fpath, content): """ assert os.path.isfile(fpath) exp_content = content.split(os.linesep) - with open(fpath, 'r') as f: + with open(fpath, "r") as f: obs_content = [l.rstrip(os.linesep) for l in f.readlines()] assert exp_content == obs_content - def _assert_output(pl, expected_filenames): """ Assert equivalence--with respect to presence only--between expected @@ -476,20 +487,21 @@ def _assert_output(pl, expected_filenames): :param Iterable[str] expected_filenames: :return: """ - obs_outfiles = glob.glob(pipeline_filepath( - pl.manager, "*{}".format(OUTPUT_SUFFIX))) + obs_outfiles = glob.glob(pipeline_filepath(pl.manager, "*{}".format(OUTPUT_SUFFIX))) assert len(expected_filenames) == len(obs_outfiles) expected_filepaths = [] for fname in expected_filenames: - fpath = fname if os.path.isabs(fname) else \ - pipeline_filepath(pl.manager, filename=fname) + fpath = ( + fname + if os.path.isabs(fname) + else pipeline_filepath(pl.manager, filename=fname) + ) expected_filepaths.append(fpath) assert set(expected_filepaths) == set(obs_outfiles) - def _assert_pipeline_status(pl, flag): - """ Assert, based on flag file presence, that a pipeline's completed. """ + """Assert, based on flag file presence, that a pipeline's completed.""" flags = glob.glob(pipeline_filepath(pl.manager, filename=flag_name("*"))) assert 1 == len(flags) exp_flag = pipeline_filepath(pl, suffix="_" + flag_name(flag)) @@ -500,13 +512,10 @@ def _assert_pipeline_status(pl, flag): raise - -_assert_pipeline_completed = partial( - _assert_pipeline_status, flag=COMPLETE_FLAG) +_assert_pipeline_completed = partial(_assert_pipeline_status, flag=COMPLETE_FLAG) _assert_pipeline_halted = partial(_assert_pipeline_status, flag=PAUSE_FLAG) - def _assert_pipeline_initialization(pl): """ Assert that a test case begins with output folder in expected state. 
@@ -514,21 +523,21 @@ def _assert_pipeline_initialization(pl): :param pypiper.Pipeline pl: Pipeline instance for test case. """ # TODO: implement. - suffices = {"_commands.sh", "_profile.tsv", - "_{}".format(flag_name(RUN_FLAG))} - exp_init_contents = \ - [pipeline_filepath(pl.manager, suffix=s) for s in suffices] - obs_init_contents = [pipeline_filepath(pl.manager, filename=n) - for n in os.listdir(pl.outfolder)] + suffices = {"_commands.sh", "_profile.tsv", "_{}".format(flag_name(RUN_FLAG))} + exp_init_contents = [pipeline_filepath(pl.manager, suffix=s) for s in suffices] + obs_init_contents = [ + pipeline_filepath(pl.manager, filename=n) for n in os.listdir(pl.outfolder) + ] assert len(exp_init_contents) == len(obs_init_contents) assert set(exp_init_contents) == set(obs_init_contents) - def _assert_stage_states(pl, expected_skipped, expected_executed): - """ Assert equivalence between expected and observed stage states. """ + """Assert equivalence between expected and observed stage states.""" + def _ensure_stage(s): return s if isinstance(s, Stage) else Stage(s) + expected_skipped = [_ensure_stage(s) for s in expected_skipped] expected_executed = [_ensure_stage(s) for s in expected_executed] assert expected_skipped == pl.skipped diff --git a/tests/pipeline/test_pipeline_checkpoint.py b/tests/pipeline/test_pipeline_checkpoint.py index 9bbadab8..1267143f 100644 --- a/tests/pipeline/test_pipeline_checkpoint.py +++ b/tests/pipeline/test_pipeline_checkpoint.py @@ -5,17 +5,17 @@ from pypiper.utils import checkpoint_filepath from tests.helpers import fetch_checkpoint_files, named_param -from .conftest import get_pipeline +from .conftest import get_pipeline __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( - pl_name, tmpdir): - """ Pipeline can skip past its stage(s) for which checkpoint exists. """ + pl_name, tmpdir +): + """Pipeline can skip past its stage(s) for which checkpoint exists.""" # Create the pipeline. pipeline = get_pipeline(pl_name, tmpdir.strpath) @@ -27,8 +27,9 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( pipeline.run() # Verify that we created each of the checkpoints. - expected = [checkpoint_filepath(f.__name__, pipeline.manager) - for f in pipeline.functions] + expected = [ + checkpoint_filepath(f.__name__, pipeline.manager) for f in pipeline.functions + ] observed = fetch_checkpoint_files(pipeline.manager) assert set(expected) == set(observed) @@ -37,8 +38,7 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( # Remove the checkpoint for the final stage. last_aligner_stage = pipeline.functions[-1] - last_aligner_checkfile = checkpoint_filepath( - last_aligner_stage, pipeline.manager) + last_aligner_checkfile = checkpoint_filepath(last_aligner_stage, pipeline.manager) os.unlink(last_aligner_checkfile) # Verify removal of final stage checkpoint file. @@ -59,9 +59,11 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( observed = fetch_checkpoint_files(pipeline.manager) exp = set(expected) obs = set(observed) - assert set(expected) == set(observed), \ - "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format( - exp - obs, exp & obs, obs - exp) + assert set(expected) == set( + observed + ), "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format( + exp - obs, exp & obs, obs - exp + ) # Verify the we didn't recreate the checkpoint file for each skipped stage. 
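# ---- Illustrative sketch (not part of the diff above): a minimal two-stage Pipeline
# ---- of the kind get_pipeline() builds. Re-running it skips any stage whose
# ---- checkpoint file already exists, which is the behavior verified here. Names are
# ---- invented, and the test suite itself uses a SafeTestPipeline wrapper rather
# ---- than subclassing Pipeline directly.
import tempfile

from pypiper import Pipeline


def write_header():
    print("header")


def write_body():
    print("body")


class TwoStagePipeline(Pipeline):
    def stages(self):
        return [write_header, write_body]


pl = TwoStagePipeline(name="two-stage", outfolder=tempfile.mkdtemp())
pl.run()  # executes both stages, writing one checkpoint file per stage
pl.run()  # both checkpoint files now exist, so both stages are skipped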
for f in expected[:-1]: @@ -71,15 +73,15 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( # Verify the we did in fact recreate the checkpoint file for the stage # that was rerun. - assert os.path.getmtime(last_aligner_checkfile) > \ - timestamps[last_aligner_checkfile], \ - "Recreated checkpoint file ('{}') should be newer than original".\ - format(last_aligner_checkfile) - + assert ( + os.path.getmtime(last_aligner_checkfile) > timestamps[last_aligner_checkfile] + ), "Recreated checkpoint file ('{}') should be newer than original".format( + last_aligner_checkfile + ) def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir): - """ The pipeline skips execution of stages with extant checkpoint. """ + """The pipeline skips execution of stages with extant checkpoint.""" # Create the pipeline, then check creation of output file. pipeline = get_pipeline(pl_name, tmpdir.strpath) @@ -89,15 +91,16 @@ def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir): assert os.path.isfile(output_file) # Validate pipeline effects (output file content). - with open(output_file, 'r') as f: + with open(output_file, "r") as f: lines = f.readlines() assert [s.name + os.linesep for s in pipeline.stages()] == lines # Verify presence of checkpoint files to support our expectation about # which stages should be skipped and which should be run during the second # time through the pipeline's execution. - exp_cp_fpaths = set(checkpoint_filepath(s.name, pipeline.manager) - for s in pipeline.stages()) + exp_cp_fpaths = set( + checkpoint_filepath(s.name, pipeline.manager) for s in pipeline.stages() + ) assert exp_cp_fpaths == set(fetch_checkpoint_files(pipeline.manager)) final_stage = pipeline.stages()[-1] final_stage_fpath = checkpoint_filepath(final_stage.name, pipeline.manager) @@ -105,24 +108,26 @@ def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir): # Verify the effect of the second execution of the pipeline. pipeline.run() - with open(output_file, 'r') as f: + with open(output_file, "r") as f: lines = f.readlines() assert [final_stage.name + os.linesep] == lines @named_param("overwrite", [False, True]) def test_pipeline_reruns_downstream_stages_according_to_parameterization( - overwrite, pl_name, tmpdir): - """ Pipeline overwrites downstream stages unless configured otherwise. """ + overwrite, pl_name, tmpdir +): + """Pipeline overwrites downstream stages unless configured otherwise.""" pl = get_pipeline(pl_name, tmpdir.strpath) # Create checkpoint file for each stage. stage_names = [s.name for s in pl.stages()] - assert 1 < len(stage_names), \ - "Need pipeline with at least two stages to run this test." + assert 1 < len( + stage_names + ), "Need pipeline with at least two stages to run this test." for s_name in stage_names: - open(checkpoint_filepath(s_name, pl.manager), 'w').close() + open(checkpoint_filepath(s_name, pl.manager), "w").close() # Remove the checkpoint file for the penultimate stage. 
penultimate_stage = stage_names[-2] @@ -140,6 +145,6 @@ def test_pipeline_reruns_downstream_stages_according_to_parameterization( exp_stages.append(stage_names[-1]) exp_lines = [func + os.linesep for func in stage_names[-2:]] outpath = os.path.join(pl.outfolder, pl.name_output_file) - with open(outpath, 'r') as f: + with open(outpath, "r") as f: obs_lines = f.readlines() assert exp_lines == obs_lines diff --git a/tests/pipeline/test_pipeline_constructor.py b/tests/pipeline/test_pipeline_constructor.py index aba414fd..5de22bf6 100644 --- a/tests/pipeline/test_pipeline_constructor.py +++ b/tests/pipeline/test_pipeline_constructor.py @@ -1,17 +1,16 @@ """ Tests for construction of a Pipeline """ import pytest -from pypiper import Pipeline, PipelineManager, Stage -from tests.helpers import assert_equal_dirpath, named_param, SafeTestPipeline +from pypiper import Pipeline, PipelineManager, Stage +from tests.helpers import SafeTestPipeline, assert_equal_dirpath, named_param __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_pipeline_requires_stages_definition(tmpdir): - """ To create a pipeline, define stages (execution steps). """ + """To create a pipeline, define stages (execution steps).""" class NoStagesPipeline(SafeTestPipeline): pass @@ -25,65 +24,54 @@ class NoStagesPipeline(SafeTestPipeline): _MinimalPipeline(name=name, outfolder=tmpdir.strpath) - class JustManagerArgument: - """ A pipeline can be created with just a manager argument. """ - + """A pipeline can be created with just a manager argument.""" NAME_HOOK = "pl_mgr_name" - @pytest.fixture def pl_mgr(self, request, get_pipe_manager): - """ Provide each of this class's test cases with pipeline manager. """ + """Provide each of this class's test cases with pipeline manager.""" if self.NAME_HOOK in request.fixturenames: name = request.getfixturevalue(self.NAME_HOOK) else: name = "test-pipe" return get_pipe_manager(name=name) - - @named_param( - argnames=NAME_HOOK, argvalues=["arbitrary-pipeline", "DummyPipe"]) + @named_param(argnames=NAME_HOOK, argvalues=["arbitrary-pipeline", "DummyPipe"]) def test_pipeline_adopts_manager_name(self, pl_mgr_name, pl_mgr): - """ If given just a manager, a pipeline uses the manager name. """ + """If given just a manager, a pipeline uses the manager name.""" pl = Pipeline(manager=pl_mgr) assert pl_mgr_name == pl_mgr.name assert pl_mgr_name == pl.name - def test_pipeline_adopts_manager_output_folder(self, pl_mgr): - """ Pipeline uses manager output folder if given just manager. """ + """Pipeline uses manager output folder if given just manager.""" pl = Pipeline(manager=pl_mgr) assert pl_mgr.outfolder == pl.outfolder - class MinimalArgumentsWithoutManagerTests: - """ Tests for pipeline constructor argument provision without manager. """ - + """Tests for pipeline constructor argument provision without manager.""" def test_pipeline_creates_manager(self, tmpdir): - """ If not passed a pipeline manager, a pipeline creates one. """ + """If not passed a pipeline manager, a pipeline creates one.""" empty = _MinimalPipeline(name="minimal", outfolder=tmpdir.strpath) assert isinstance(empty.manager, PipelineManager) - @named_param("pipe_name", ["test-pipe", "DummyPipeline"]) def test_manager_adopts_pipeline_name(self, pipe_name, tmpdir): - """ Autogenerated pipeline manager uses pipeline's name. 
""" + """Autogenerated pipeline manager uses pipeline's name.""" pl = _MinimalPipeline(name=pipe_name, outfolder=tmpdir.strpath) assert pipe_name == pl.name assert pl.name == pl.manager.name - def test_manager_adopts_pipeline_output_folder(self, tmpdir): - """ Autogenerated pipeline manager uses pipeline's output folder. """ + """Autogenerated pipeline manager uses pipeline's output folder.""" pl = _MinimalPipeline(name="test-pipe", outfolder=tmpdir.strpath) assert_equal_dirpath(tmpdir.strpath, pl.outfolder) - class ConceptuallyOverlappingArgumentsTests: """ Test cases in which pipeline's argument space is overspecified. @@ -100,19 +88,15 @@ class ConceptuallyOverlappingArgumentsTests: """ - - def test_same_name_for_manager_and_pipeline( - self, tmpdir, get_pipe_manager): - """ Pipeline name and manager with matching name is unproblematic. """ + def test_same_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager): + """Pipeline name and manager with matching name is unproblematic.""" name = "test-pipe" pm = get_pipe_manager(name=name, outfolder=tmpdir.strpath) pl = _MinimalPipeline(name=name, manager=pm) assert name == pl.manager.name - - def test_different_name_for_manager_and_pipeline( - self, tmpdir, get_pipe_manager): - """ If given, pipeline favors its own name over manager's. """ + def test_different_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager): + """If given, pipeline favors its own name over manager's.""" manager_name = "manager" pipeline_name = "pipeline" pm = get_pipe_manager(name=manager_name, outfolder=tmpdir.strpath) @@ -120,19 +104,17 @@ def test_different_name_for_manager_and_pipeline( assert pipeline_name == pl.name assert manager_name == pl.manager.name - - @named_param( - "output_folder", argvalues=["test-output", "testing-output-folder"]) + @named_param("output_folder", argvalues=["test-output", "testing-output-folder"]) def test_pipeline_ignores_outfolder_if_manager_is_passed( - self, output_folder, tmpdir, get_pipe_manager): - """ Manager's output folder trumps explicit output folder. """ + self, output_folder, tmpdir, get_pipe_manager + ): + """Manager's output folder trumps explicit output folder.""" pm = get_pipe_manager(name="test-pipe", outfolder=tmpdir.strpath) pl = _MinimalPipeline(manager=pm, outfolder=output_folder) assert_equal_dirpath(tmpdir.strpath, pl.outfolder) - def test_name_outfolder_and_manager(self, tmpdir, get_pipe_manager): - """ Tests provision of all three primary pipeline arguments. """ + """Tests provision of all three primary pipeline arguments.""" name = "test-pipe" pm = get_pipe_manager(name=name, outfolder=tmpdir.strpath) pl = _MinimalPipeline(name=name, manager=pm, outfolder=tmpdir.strpath) @@ -141,58 +123,53 @@ def test_name_outfolder_and_manager(self, tmpdir, get_pipe_manager): assert pm == pl.manager - def test_pipeline_requires_either_manager_or_outfolder(): - """ Pipeline must be passed pipeline manager or output folder. """ + """Pipeline must be passed pipeline manager or output folder.""" with pytest.raises(TypeError): _MinimalPipeline() - def test_empty_pipeline_manager_name_and_no_explicit_pipeline_name( - tmpdir, get_pipe_manager): - """ If no name's passed to pipeline, the manager must have valid name. 
""" + tmpdir, get_pipe_manager +): + """If no name's passed to pipeline, the manager must have valid name.""" pm = get_pipe_manager(name="", outfolder=tmpdir.strpath) with pytest.raises(ValueError): _MinimalPipeline(manager=pm) - class AnonymousFunctionStageTests: - """ Tests for anonymous function as a pipeline stage. """ - + """Tests for anonymous function as a pipeline stage.""" def test_anonymous_stage_without_name_is_prohibited(self, tmpdir): - """ Anonymous function as Stage must be paired with name. """ + """Anonymous function as Stage must be paired with name.""" with pytest.raises(TypeError): _AnonymousStageWithoutNamePipeline( - name="test-pipe", outfolder=tmpdir.strpath) - + name="test-pipe", outfolder=tmpdir.strpath + ) def test_anonymous_stage_with_name_is_permitted(self, tmpdir): - """ Anonymous function as Stage must be paired with name. """ - _AnonymousStageWithNamePipeline( - name="test-pipe", outfolder=tmpdir.strpath) - + """Anonymous function as Stage must be paired with name.""" + _AnonymousStageWithNamePipeline(name="test-pipe", outfolder=tmpdir.strpath) class _AnonymousStageWithoutNamePipeline(SafeTestPipeline): - """ Anonymous function as stage is prohibited unless paired with name. """ + """Anonymous function as stage is prohibited unless paired with name.""" + def stages(self): return [lambda: None] - class _AnonymousStageWithNamePipeline(SafeTestPipeline): - """ Anonymous function as Stage is allowed if wrapped with a name. """ + """Anonymous function as Stage is allowed if wrapped with a name.""" + def stages(self): return [("NullStage", lambda: None)] - @pytest.fixture def empty_pipeline(request): - """ Provide test case with minimal pipeline instance. """ + """Provide test case with minimal pipeline instance.""" if "pipe_name" in request.fixturenames: name = request.getfixturevalue("pipe_name") else: @@ -200,15 +177,13 @@ def empty_pipeline(request): return _MinimalPipeline(name) - class _MinimalPipeline(SafeTestPipeline): - """ Minimal pipeline declaration. """ + """Minimal pipeline declaration.""" def stages(self): - """ Sham stages definition. """ + """Sham stages definition.""" return [_do_nothing] - def _do_nothing(): return diff --git a/tests/pipeline_manager/test_halt.py b/tests/pipeline_manager/test_halt.py index 6be733a3..824ecc31 100644 --- a/tests/pipeline_manager/test_halt.py +++ b/tests/pipeline_manager/test_halt.py @@ -1,19 +1,19 @@ """ Tests for effects of pipeline manager's halt() function. """ import os + import pytest + from pypiper.exceptions import PipelineHalt from pypiper.flags import COMPLETE_FLAG, PAUSE_FLAG from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_halt_state(get_pipe_manager): - """ Requesting a halt alters manager state. """ + """Requesting a halt alters manager state.""" pm = get_pipe_manager(name="test-pipe") assert pm._active pm.halt(raise_error=False) @@ -21,9 +21,8 @@ def test_halt_state(get_pipe_manager): assert not pm._active - def test_halt_file(get_pipe_manager): - """ Requesting a halt produces a particular flag file. 
""" + """Requesting a halt produces a particular flag file.""" pm = get_pipe_manager(name="TestPM") path_halt_file = pm._flag_file_path(PAUSE_FLAG) assert not os.path.isfile(path_halt_file) @@ -31,10 +30,9 @@ def test_halt_file(get_pipe_manager): assert os.path.isfile(path_halt_file) - @named_param("raise_error", [False, True, None]) def test_halt_exceptionality(get_pipe_manager, raise_error): - """ Halting is conditionally exceptional """ + """Halting is conditionally exceptional""" pm = get_pipe_manager(name="halt-error") if raise_error is None: # Default is exceptional. @@ -47,12 +45,10 @@ def test_halt_exceptionality(get_pipe_manager, raise_error): pm.halt(raise_error=False) - @named_param("raise_error", [False, True]) @named_param("test_type", argvalues=["halt_flag", "complete_flag"]) -def test_halt_status_supersedes_completed( - get_pipe_manager, raise_error, test_type): - """ Halting pipeline replaces completed flag with halt flag. """ +def test_halt_status_supersedes_completed(get_pipe_manager, raise_error, test_type): + """Halting pipeline replaces completed flag with halt flag.""" # Create manager and completion flag. pm = get_pipe_manager(name="halt-status-flag") diff --git a/tests/pipeline_manager/test_manager_constructor.py b/tests/pipeline_manager/test_manager_constructor.py index 2ff2a9a8..0792bf1f 100644 --- a/tests/pipeline_manager/test_manager_constructor.py +++ b/tests/pipeline_manager/test_manager_constructor.py @@ -1,47 +1,40 @@ """ Test effects of construction of a pipeline manager. """ import argparse + import pytest + from pypiper.manager import CHECKPOINT_SPECIFICATIONS from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def pytest_generate_tests(metafunc): - """ Dynamic test case generation for this module's test cases. """ + """Dynamic test case generation for this module's test cases.""" if "spec_type" in metafunc.fixturenames: - metafunc.parametrize( - argnames="spec_type", argvalues=["cmdl", "ctor"]) + metafunc.parametrize(argnames="spec_type", argvalues=["cmdl", "ctor"]) - -@named_param( - "checkpoint_type", argvalues=["curr_checkpoint", "prev_checkpoint"]) -def test_manager_starts_in_null_checkpoint_state( - get_pipe_manager, checkpoint_type): - """ A pipeline manager begins with null checkpoint states. """ +@named_param("checkpoint_type", argvalues=["curr_checkpoint", "prev_checkpoint"]) +def test_manager_starts_in_null_checkpoint_state(get_pipe_manager, checkpoint_type): + """A pipeline manager begins with null checkpoint states.""" pm = get_pipe_manager(name="ctor-checkpoint-state") assert getattr(pm, checkpoint_type) is None - class ManagerConstructorCheckpointSpecificationTests: - """ Tests for manager's constructor's ability to parse and set - checkpoint specifications, which can determine aspects of control flow. """ - + """Tests for manager's constructor's ability to parse and set + checkpoint specifications, which can determine aspects of control flow.""" def test_no_checkpoint_specifications(self, get_pipe_manager): - """ A manager may be constructed without any checkpoint provision. """ + """A manager may be constructed without any checkpoint provision.""" get_pipe_manager(name="test-pipe") - @named_param("start_point", ["filter_reads", "align_reads"]) def test_just_start(self, get_pipe_manager, spec_type, start_point): - """ Starting point may be set from command-line or ctor keyword. 
""" + """Starting point may be set from command-line or ctor keyword.""" spec_data = {"start_point": start_point} if spec_type == "cmdl": kwargs = {"args": argparse.Namespace(**spec_data)} @@ -50,12 +43,10 @@ def test_just_start(self, get_pipe_manager, spec_type, start_point): pm = get_pipe_manager(name="start-test", **kwargs) assert start_point == pm.start_point - @named_param("stop_type", ["stop_before", "stop_after"]) @named_param("stop_point", ["align_reads", "call_peaks"]) - def test_just_stop(self, get_pipe_manager, - spec_type, stop_type, stop_point): - """ Particular stopping type is set correctly. """ + def test_just_stop(self, get_pipe_manager, spec_type, stop_type, stop_point): + """Particular stopping type is set correctly.""" spec_data = {stop_type: stop_point} if spec_type == "cmdl": kwargs = {"args": argparse.Namespace(**spec_data)} @@ -64,13 +55,13 @@ def test_just_stop(self, get_pipe_manager, pm = get_pipe_manager(name="stop-test", **kwargs) assert stop_point == getattr(pm, stop_type) - @named_param("start_point", ["merge_input", "filter_reads"]) @named_param("stop_point", ["align_reads", "calc_stats"]) @named_param("stop_type", ["stop_before", "stop_after"]) - def test_start_and_stop(self, get_pipe_manager, spec_type, - stop_type, start_point, stop_point): - """ Specifying both start and stop works just fine. """ + def test_start_and_stop( + self, get_pipe_manager, spec_type, stop_type, start_point, stop_point + ): + """Specifying both start and stop works just fine.""" spec_data = {"start_point": start_point, stop_type: stop_point} if spec_type == "cmdl": kwargs = {"args": argparse.Namespace(**spec_data)} @@ -80,15 +71,19 @@ def test_start_and_stop(self, get_pipe_manager, spec_type, assert start_point == pm.start_point assert stop_point == getattr(pm, stop_type) - @named_param("stop_before", ["align_reads", "call_peaks"]) @named_param("stop_after", ["fastqc", "align_reads"]) @named_param("stop_before_type", ["cmdl", "ctor"]) @named_param("stop_after_type", ["cmdl", "ctor"]) def test_both_stop_modes_is_prohibited( - self, get_pipe_manager, stop_before_type, - stop_after_type, stop_before, stop_after): - """ Provision of both prospective and retrospective stop is bad. """ + self, + get_pipe_manager, + stop_before_type, + stop_after_type, + stop_before, + stop_after, + ): + """Provision of both prospective and retrospective stop is bad.""" raw_kwargs = {"stop_before": stop_before, "stop_after": stop_after} cmdl_kwargs = {} if stop_before_type == "cmdl": @@ -99,18 +94,25 @@ def test_both_stop_modes_is_prohibited( with pytest.raises(TypeError): get_pipe_manager(name="test-double-stop", args=args, **raw_kwargs) - @pytest.mark.parametrize( argnames=["start_point", "stop_point"], - argvalues=[("fastqc", "align_reads"), ("align_reads", "call_peaks")]) + argvalues=[("fastqc", "align_reads"), ("align_reads", "call_peaks")], + ) @pytest.mark.parametrize( argnames=["start_spec_type", "stop_spec_type"], - argvalues=[("cmdl", "ctor"), ("ctor", "cmdl")]) + argvalues=[("cmdl", "ctor"), ("ctor", "cmdl")], + ) @named_param("stop_type", ["stop_before", "stop_after"]) def test_complementary_specification_modes( - self, get_pipe_manager, start_spec_type, stop_spec_type, - stop_type, start_point, stop_point): - """ Command-line and keyword specifications can harmonize. 
""" + self, + get_pipe_manager, + start_spec_type, + stop_spec_type, + stop_type, + start_point, + stop_point, + ): + """Command-line and keyword specifications can harmonize.""" raw_kwargs = {"start_point": start_point, stop_type: stop_point} cmdl_kwargs = {} if start_spec_type == "cmdl": @@ -118,33 +120,40 @@ def test_complementary_specification_modes( if stop_spec_type == "cmdl": cmdl_kwargs[stop_type] = raw_kwargs.pop(stop_type) args = argparse.Namespace(**cmdl_kwargs) - pm = get_pipe_manager(name="complementary-test", - args=args, **raw_kwargs) + pm = get_pipe_manager(name="complementary-test", args=args, **raw_kwargs) assert start_point == pm.start_point assert stop_point == getattr(pm, stop_type) - @named_param( "check_specs", - [["start_point"], ["stop_before"], ["stop_after"], - ["start_point", "stop_before"], ["start_point", "stop_after"]]) + [ + ["start_point"], + ["stop_before"], + ["stop_after"], + ["start_point", "stop_before"], + ["start_point", "stop_after"], + ], + ) def test_command_line_beats_constructor_keyword( - self, get_pipe_manager, check_specs): - """ Command-line specification is favored over constructor keyword. """ + self, get_pipe_manager, check_specs + ): + """Command-line specification is favored over constructor keyword.""" # Declare values to use for respective specification modes. - cmdl_values = {"start_point": "merge_input", - "stop_before": "call_peaks", - "stop_after": "align_reads"} - ctor_values = {"start_point": "fastqc", - "stop_before": "align_reads", - "stop_after": "filter_reads"} + cmdl_values = { + "start_point": "merge_input", + "stop_before": "call_peaks", + "stop_after": "align_reads", + } + ctor_values = { + "start_point": "fastqc", + "stop_before": "align_reads", + "stop_after": "filter_reads", + } # Create specifications based on current test case parameterization. - cmdl_kwargs ={cp_spec: cmdl_values[cp_spec] - for cp_spec in check_specs} - ctor_kwargs = {cp_spec: ctor_values[cp_spec] - for cp_spec in check_specs} + cmdl_kwargs = {cp_spec: cmdl_values[cp_spec] for cp_spec in check_specs} + ctor_kwargs = {cp_spec: ctor_values[cp_spec] for cp_spec in check_specs} args = argparse.Namespace(**cmdl_kwargs) # Build the pipeline manager. diff --git a/tests/pipeline_manager/test_manager_state.py b/tests/pipeline_manager/test_manager_state.py index fb86e9a7..0ae219e4 100644 --- a/tests/pipeline_manager/test_manager_state.py +++ b/tests/pipeline_manager/test_manager_state.py @@ -1,33 +1,33 @@ """ Tests related to pipeline manager state. """ import os + import pytest + from pypiper.utils import checkpoint_filepath, pipeline_filepath from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_starts_running(get_pipe_manager): - """ A PipelineManager begins running during its construction. """ + """A PipelineManager begins running during its construction.""" pm = get_pipe_manager(name="TestPM") assert pm._active + # Parameters governing execution: # 1 -- checkpoint existence # 3 -- halt state (.halted) class ExecutionSkippingTests: - """ Tests for cases in which command execution should be skipped. """ - + """Tests for cases in which command execution should be skipped.""" @named_param("start_point", ["align_reads", "make_call"]) def test_skips_to_start(self, get_pipe_manager, start_point): - """ The pipeline manager can skip to a starting point. """ + """The pipeline manager can skip to a starting point.""" # Initialize the manager. 
pm = get_pipe_manager(name="StartTestPM", start_point=start_point) @@ -48,8 +48,11 @@ def test_skips_to_start(self, get_pipe_manager, start_point): os.makedirs(fastqc_folder) fastqc_zipfile = os.path.join(fastqc_folder, "qc.zip") fastqc_rawfile = os.path.join(fastqc_folder, "qc.txt") - cmds = ["fastqc", "touch {}".format(fastqc_rawfile), - "touch {}".format(fastqc_zipfile)] + cmds = [ + "fastqc", + "touch {}".format(fastqc_rawfile), + "touch {}".format(fastqc_zipfile), + ] pm.run(cmds, target=fastqc_zipfile) assert not os.path.isfile(fastqc_zipfile) assert not os.path.isfile(fastqc_rawfile) @@ -62,11 +65,9 @@ def test_skips_to_start(self, get_pipe_manager, start_point): pm.run(cmd, target=path_first_file) assert os.path.isfile(path_first_file) - @named_param("num_skips", argvalues=[1, 2, 3]) - def test_skips_execution_if_in_unstarted_state( - self, get_pipe_manager, num_skips): - """ Pipeline manager skips command execution if not in active state. """ + def test_skips_execution_if_in_unstarted_state(self, get_pipe_manager, num_skips): + """Pipeline manager skips command execution if not in active state.""" pm = get_pipe_manager(name="skip-execs") pm._active = False @@ -92,10 +93,9 @@ def test_skips_execution_if_in_unstarted_state( # We break the loop once we've made a call in active state. assert os.path.isfile(testfile) - @named_param("num_skips", argvalues=[1, 2, 3]) def test_respects_checkpoints(self, get_pipe_manager, num_skips): - """ Manager can skip pipeline to where it's not yet checkpointed. """ + """Manager can skip pipeline to where it's not yet checkpointed.""" pm = get_pipe_manager(name="respect-checkpoints") @@ -121,27 +121,22 @@ def test_respects_checkpoints(self, get_pipe_manager, num_skips): try: assert not os.path.isfile(outfile) except AssertionError: - print("Have run {} stage(s) of {} skip(s)". - format(i + 1, num_skips)) - print("Current manager checkpoint: {}". - format(pm.curr_checkpoint)) + print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips)) + print("Current manager checkpoint: {}".format(pm.curr_checkpoint)) raise else: # We should have created the output file. try: assert os.path.isfile(outfile) except AssertionError: - print("Have run {} stage(s) of {} skip(s)". - format(i + 1, num_skips)) - print("Current manager checkpoint: {}". - format(pm.curr_checkpoint)) + print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips)) + print("Current manager checkpoint: {}".format(pm.curr_checkpoint)) print("Active? {}".format(pm._active)) raise - @named_param("halt_index", [1, 2, 3]) def test_respects_halt(self, get_pipe_manager, halt_index): - """ The pipeline manager skips execution if it's in halted state. """ + """The pipeline manager skips execution if it's in halted state.""" pm = get_pipe_manager(name="respects-halt") targets = ["file{}.txt".format(i) for i in range(1, 5)] for i, t in enumerate(targets): diff --git a/tests/pipeline_manager/test_pipeline_manager.py b/tests/pipeline_manager/test_pipeline_manager.py index 3f5d1510..df71e1a6 100755 --- a/tests/pipeline_manager/test_pipeline_manager.py +++ b/tests/pipeline_manager/test_pipeline_manager.py @@ -9,20 +9,17 @@ import unittest import pypiper -from pypiper.utils import pipeline_filepath from pypiper.exceptions import SubprocessError - +from pypiper.utils import pipeline_filepath __author__ = "Nathan Sheffield" __email__ = "nathan@code.databio.org" - class PipelineManagerTests(unittest.TestCase): - """ Tests for pypiper's PipelineManager. 
""" - - OUTFOLDER = "pipeline_output" + """Tests for pypiper's PipelineManager.""" + OUTFOLDER = "tests/Data/pipeline_output" @classmethod def _clean(cls): @@ -31,73 +28,68 @@ def _clean(cls): print("Removing " + d) shutil.rmtree(d) - def setUp(self): - """ Start each test case with two pipeline managers. """ + """Start each test case with two pipeline managers.""" print("Setting up...") # Create a fixture self.pp = pypiper.PipelineManager( - "sample_pipeline", outfolder=self.OUTFOLDER, multi=True) + "sample_pipeline", outfolder=self.OUTFOLDER, multi=True + ) self.pp2 = pypiper.PipelineManager( - "sample_pipeline2", outfolder=self.OUTFOLDER, multi=True) + "sample_pipeline2", outfolder=self.OUTFOLDER, multi=True + ) self.pp3 = pypiper.PipelineManager( - "sample_pipeline3", outfolder=self.OUTFOLDER + "3", multi=True) - + "sample_pipeline3", outfolder=self.OUTFOLDER + "3", multi=True + ) def tearDown(self): - """ Scrub the decks after each test case completes. """ + """Scrub the decks after each test case completes.""" print("Tearing down...") self.pp.stop_pipeline() self.pp2.stop_pipeline() self.pp3.stop_pipeline() print("Removing " + self.pp.outfolder) - #shutil.rmtree(self.pp.outfolder) - #shutil.rmtree(self.pp3.outfolder) + # shutil.rmtree(self.pp.outfolder) + # shutil.rmtree(self.pp3.outfolder) self._clean() del self.pp del self.pp2 del self.pp3 - def _isFile(self, filename): - """ Determine if the first manager has this file. """ + """Determine if the first manager has this file.""" filepath = pipeline_filepath(self.pp, filename=filename) return os.path.isfile(filepath) - def _assertFile(self, filename): - """ Assert that the named file exists for first pipeline manager. """ + """Assert that the named file exists for first pipeline manager.""" try: assert self._isFile(filename) except AssertionError: outfolder_contents = os.listdir(self.pp.outfolder) - print("Pipeline outfolder contents:\n{}".format( - "\n".join(outfolder_contents))) + print( + "Pipeline outfolder contents:\n{}".format("\n".join(outfolder_contents)) + ) raise - def _assertNotFile(self, filename): - """ Assert that given file doesn't exist for first manager. """ + """Assert that given file doesn't exist for first manager.""" assert not self._isFile(filename) - def _assertLines(self, expected, observed): - """ Assert equality between collections of lines. """ + """Assert equality between collections of lines.""" if isinstance(observed, str) and os.path.isfile(observed): - with open(observed, 'r') as f: + with open(observed, "r") as f: observed = f.readlines() self.assertListEqual(expected, [l.strip() for l in observed]) - @classmethod def tearDownClass(cls): - """ Ensure folder/file cleanup upon test class completion. 
""" + """Ensure folder/file cleanup upon test class completion.""" cls._clean() - def test_me(self): - print("Testing initialization...") # Names @@ -108,11 +100,11 @@ def test_me(self): self.assertTrue(os.path.isdir(self.pp.outfolder)) print("Testing status flags...") - self.pp._set_status_flag("testing") - self._assertFile("sample_pipeline_testing.flag") + self.pp._set_status_flag("completed") + self._assertFile("sample_pipeline_DEFAULT_SAMPLE_NAME_completed.flag") self.pp._set_status_flag("running") - self._assertNotFile("sample_pipeline_testing.flag") - self._assertFile("sample_pipeline_running.flag") + self._assertNotFile("sample_pipeline_DEFAULT_SAMPLE_NAME_testing.flag") + self._assertFile("sample_pipeline_DEFAULT_SAMPLE_NAME_running.flag") print("Testing waiting for locks...") self.pp2.wait = False @@ -137,7 +129,7 @@ def test_me(self): target = pipeline_filepath(self.pp, filename="tgt") if os.path.isfile(target): # for repeat runs. os.remove(target) - + self.pp.run("echo first > " + target, target, shell=True) # Should not run self.pp.run("echo second > " + target, target, shell=True) @@ -146,22 +138,25 @@ def test_me(self): self._assertLines(["first"], lines) print("Execute a targetless command...") - self.pp.run("echo third > " + target, - target=None, lock_name="test", shell=True) + self.pp.run("echo third > " + target, target=None, lock_name="test", shell=True) with open(target) as f: lines = f.readlines() self._assertLines(["third"], lines) # Test reporting results self.pp.report_result("key1", "abc") - self.pp.report_result("key2", "def", "shared") + self.pp.report_result("key2", "def") key1 = self.pp.get_stat("key1") - self.assertEqual(key1, 'abc') + self.assertEqual(key1, "abc") - key1 = self.pp2.get_stat("key1") # should fail + try: + key1 = self.pp2.get_stat("key1") # should fail + except KeyError: + key1 = None self.assertEqual(key1, None) - key2 = self.pp2.get_stat("key2") # should succeed - self.assertEqual(key2, 'def') + # We can no longer group based on 'shared' annotations. 
+ # key2 = self.pp2.get_stat("key2") # should succeed + # self.assertEqual(key2, "def") print("Test intermediate file cleanup...") tgt1 = pipeline_filepath(self.pp, filename="tgt1.temp") @@ -174,7 +169,21 @@ def test_me(self): tgt9 = pipeline_filepath(self.pp, filename="tgt9.cond") tgt10 = pipeline_filepath(self.pp, filename="tgt10.txt") - self.pp.run("touch " + tgt1 + " " + tgt2 + " " + tgt3 + " " + tgt4 + " " + tgt5 + " " + tgt6, lock_name="test") + self.pp.run( + "touch " + + tgt1 + + " " + + tgt2 + + " " + + tgt3 + + " " + + tgt4 + + " " + + tgt5 + + " " + + tgt6, + lock_name="test", + ) self.pp.run("touch " + tgt8 + " " + tgt9, lock_name="test") # In global dirty mode, even non-manual clean files should not be deleted: @@ -183,7 +192,9 @@ def test_me(self): self.pp.clean_add(pipeline_filepath(self.pp, filename="*.temp")) self.pp.clean_add(tgt4) self.pp.clean_add(tgt5, conditional=True) - self.pp.clean_add(pipeline_filepath(self.pp, filename="*.cond"), conditional=True) + self.pp.clean_add( + pipeline_filepath(self.pp, filename="*.cond"), conditional=True + ) self.pp._cleanup() self.assertTrue(os.path.isfile(tgt1)) @@ -200,7 +211,7 @@ def test_me(self): cwd = os.getcwd() self.pp.clean_add(tgt6_abs) - os.chdir("pipeline_output") + os.chdir("tests/Data/pipeline_output") self.pp.outfolder = "../" + ofolder self.pp.cleanup_file = "../" + cfile self.pp.clean_add(tgt6_abs) @@ -214,23 +225,21 @@ def test_me(self): print(lines) + self.assertTrue(lines[2] == "rm tgt3.temp\n") + self.assertTrue(lines[10] == "rm tgt6.txt\n") + # lines is only 0-10 so the below code will error. + # self.assertTrue(lines[11] == "rm tgt6.txt\n") - self.assertTrue(lines[2] == 'rm tgt3.temp\n') - self.assertTrue(lines[10] == 'rm tgt6.txt\n') - self.assertTrue(lines[11] == 'rm tgt6.txt\n') - - - - - - self.pp.report_object("Test figure", os.path.join("fig", "fig.jpg")) + self.pp.report_result("Test figure", os.path.join("fig", "fig.jpg")) # But in regular mode, they should be deleted: - self.pp.dirty=False + self.pp.dirty = False self.pp.clean_add(pipeline_filepath(self.pp, filename="*.temp")) self.pp.clean_add(tgt4) self.pp.clean_add(tgt5, conditional=True) - self.pp.clean_add(pipeline_filepath(self.pp, filename="*.cond"), conditional=True) + self.pp.clean_add( + pipeline_filepath(self.pp, filename="*.cond"), conditional=True + ) self.pp._cleanup() self.assertFalse(os.path.isfile(tgt1)) @@ -242,7 +251,6 @@ def test_me(self): self.pp.run("touch " + tgt7, tgt7) self.pp.clean_add(tgt7, manual=True) - self.pp.run("touch " + tgt10, target=tgt10, clean=True) # Conditional delete should not delete tgt5 @@ -250,7 +258,7 @@ def test_me(self): self.assertTrue(os.path.isfile(tgt5)) self.assertTrue(os.path.isfile(tgt8)) self.assertTrue(os.path.isfile(tgt9)) - self.assertTrue(os.path.isfile(tgt10)) # auto cleanup + self.assertTrue(os.path.isfile(tgt10)) # auto cleanup # Stopping pp2 should cause tgt5 to be deleted self.pp2.stop_pipeline() @@ -290,14 +298,13 @@ def test_me(self): with self.assertRaises(KeyboardInterrupt): self.pp._signal_int_handler(None, None) - sleep_lock = pipeline_filepath(self.pp, filename="lock.sleep") - #subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) + # subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) self.pp._create_file(sleep_lock) cmd = "echo hello" self.pp.run(cmd, lock_name="sleep") - #subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) + # subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) print("Test new start") if os.path.isfile(target): # for repeat runs. 
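# ---- Illustrative sketch (not part of the diff above): the result-reporting and
# ---- cleanup patterns exercised by test_me(). Names are invented. With the pipestat
# ---- backend, get_stat() for a key this pipeline never reported is expected to raise
# ---- KeyError (as the updated test now assumes for a second manager), and files
# ---- registered via clean_add() are removed once the pipeline finishes cleanly.
import os
import tempfile

import pypiper

pm = pypiper.PipelineManager("demo-pipe", outfolder=tempfile.mkdtemp(), multi=True)

# Report a result, then read it back.
pm.report_result("key1", "abc")
print(pm.get_stat("key1"))  # "abc"
try:
    pm.get_stat("never_reported")
except KeyError:
    print("unreported keys raise KeyError")

# Register intermediate files for cleanup.
scratch = os.path.join(pm.outfolder, "intermediate.temp")
report = os.path.join(pm.outfolder, "summary.cond")
pm.run("touch {} {}".format(scratch, report), target=report)
pm.clean_add(scratch)                   # remove when the pipeline completes
pm.clean_add(report, conditional=True)  # remove only if no other pipeline is still running here
pm.stop_pipeline()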
@@ -318,7 +325,7 @@ def test_me(self): print("Test dual target") self.pp.new_start = False if os.path.isfile(tgt1): - os.remove(tgt1) + os.remove(tgt1) self.pp.run("touch " + tgt6, tgt6) self.assertTrue(os.path.isfile(tgt6)) # if target exists, should not run @@ -332,12 +339,14 @@ def test_me(self): self.assertFalse(os.path.isfile(tgt5)) self.pp.run("touch " + tgt5, [tgt1, tgt6]) self.assertFalse(os.path.isfile(tgt5)) + self.pp.pipestat.clear_status(self.pp.name, flag_names=["failed"]) + self.pp2.pipestat.clear_status(self.pp2.name, flag_names=["failed"]) + self.pp3.pipestat.clear_status(self.pp3.name, flag_names=["failed"]) def _make_pipe_filepath(pm, filename): return os.path.join(pm.outfolder, filename) - -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/pipeline_manager/test_pipeline_manager_timestamp.py b/tests/pipeline_manager/test_pipeline_manager_timestamp.py index 2f870cf8..18cb7177 100644 --- a/tests/pipeline_manager/test_pipeline_manager_timestamp.py +++ b/tests/pipeline_manager/test_pipeline_manager_timestamp.py @@ -2,13 +2,13 @@ import os import sys + import pytest from pypiper.exceptions import PipelineHalt from pypiper.utils import checkpoint_filepath from tests.helpers import fetch_checkpoint_files, named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -17,9 +17,8 @@ FILES_TEST = "files" - def pytest_generate_tests(metafunc): - """ Dynamic test case generation for this module. """ + """Dynamic test case generation for this module.""" if "retrospective" in metafunc.fixturenames: metafunc.parametrize("retrospective", [False, True]) if "test_type" in metafunc.fixturenames: @@ -28,16 +27,15 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("raise_error", [False, True]) - def test_timestamp_requires_no_arguments(get_pipe_manager): - """ A call to timestamp() requires no arguments. """ + """A call to timestamp() requires no arguments.""" pm = get_pipe_manager(name="TestPM") pm.timestamp() @pytest.mark.skip def test_timestamp_message(get_pipe_manager, capsys): - """ Tests for the message component of a timestamp() call. """ + """Tests for the message component of a timestamp() call.""" name = "TestPM" pm = get_pipe_manager(name=name) logfile = pm.pipeline_log_file @@ -55,15 +53,13 @@ def test_timestamp_message(get_pipe_manager, capsys): # The stdout capture with capsys comes through as a single unicode block. # With the move to logger, this test is no longer capturing the output - assert message_content in str(out), \ - "Missing timestamp message ('{}') in message(s)".\ - format(message_content) - + assert message_content in str( + out + ), "Missing timestamp message ('{}') in message(s)".format(message_content) class TimestampHaltingTests: - """ Tests for a manager's ability to halt a pipeline. """ - + """Tests for a manager's ability to halt a pipeline.""" # Note that the tests here are not truly logically independent from the # functionality of the manager's halt() method. The assertions made here @@ -73,10 +69,8 @@ class TimestampHaltingTests: # the mock, but here that seems to inject a level of complexity for which # the cost exceeds the benefit of the logical independence that it confers. - - def test_halts_if_hitting_exclusive_halt_point( - self, get_pipe_manager, raise_error): - """ Halt point may be specified prospectively. 
""" + def test_halts_if_hitting_exclusive_halt_point(self, get_pipe_manager, raise_error): + """Halt point may be specified prospectively.""" # Create manager, set halt point, and check that it's running. halt_name = "phase3" @@ -102,9 +96,8 @@ def test_halts_if_hitting_exclusive_halt_point( print("STATUS: {}".format(pm.status)) raise - def test_halts_if_halt_on_next(self, get_pipe_manager, raise_error): - """ If in particular state, managed pipeline halts on timestamp(). """ + """If in particular state, managed pipeline halts on timestamp().""" pm = get_pipe_manager(name="TestPM") pm.halt_on_next = True if raise_error: @@ -114,9 +107,8 @@ def test_halts_if_halt_on_next(self, get_pipe_manager, raise_error): pm.timestamp("testing", raise_error=False) assert pm.halted - def test_correctly_sets_halt_on_next(self, get_pipe_manager): - """ Of critical importance to timestamp's checkpointing functionality + """Of critical importance to timestamp's checkpointing functionality is its ability to alter the manager's state such that it triggers a halt on the subsequent timestamp() call. This allows timestamp() to be used in a prospective fashion while still preserving the ability to @@ -125,7 +117,7 @@ def test_correctly_sets_halt_on_next(self, get_pipe_manager): timestamp() before beginning a conceptual block of processing logic, yet still (behave as though) stopping just after completion of execution of a defined stopping point. Essentially, the timestamp() - calls can be prospective yet mixed with a retrospective halt point. """ + calls can be prospective yet mixed with a retrospective halt point.""" # Establish manager and perform initial control assertions. pm = get_pipe_manager(name="TestPM") @@ -143,15 +135,12 @@ def test_correctly_sets_halt_on_next(self, get_pipe_manager): assert pm.halt_on_next - class TimestampStatusTypeTests: - """ Tests for the type of status that a timestamp() call represents. """ - + """Tests for the type of status that a timestamp() call represents.""" - def test_initial_timestamp_checkpoint_file( - self, get_pipe_manager, retrospective): - """ Initial checkpointed timestamp writes checkpoint file if and only - if it's a retrospective timestamp. """ + def test_initial_timestamp_checkpoint_file(self, get_pipe_manager, retrospective): + """Initial checkpointed timestamp writes checkpoint file if and only + if it's a retrospective timestamp.""" pm = get_pipe_manager(name="init-timestamp-file") stage_name = "align_reads" pm.timestamp(checkpoint=stage_name, finished=retrospective) @@ -161,13 +150,12 @@ def test_initial_timestamp_checkpoint_file( else: assert not os.path.isfile(check_fpath) - - @named_param("which_checkpoint_state", - ["curr_checkpoint", "prev_checkpoint"]) + @named_param("which_checkpoint_state", ["curr_checkpoint", "prev_checkpoint"]) def test_initial_timestamp_states( - self, get_pipe_manager, retrospective, which_checkpoint_state): - """ Which checkpoint state is updated by a checkpointed timestamp - call depends upon the perspective of the call. """ + self, get_pipe_manager, retrospective, which_checkpoint_state + ): + """Which checkpoint state is updated by a checkpointed timestamp + call depends upon the perspective of the call.""" # Create the manager and make the timestamp call. 
pm = get_pipe_manager(name="InitialTimestampState") @@ -188,10 +176,8 @@ def test_initial_timestamp_states( else: assert prev_exp == getattr(pm, "prev_checkpoint") - - def test_two_prospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ Prospective timestamp generates file for previous checkpoint. """ + def test_two_prospective_checkpointed_timestamps(self, test_type, stage_pair, pm): + """Prospective timestamp generates file for previous checkpoint.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=False) @@ -205,10 +191,8 @@ def test_two_prospective_checkpointed_timestamps( assert stage1 == pm.prev_checkpoint assert stage2 == pm.curr_checkpoint - - def test_two_retrospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ Retrospective timestamp generates file for current checkpoint. """ + def test_two_retrospective_checkpointed_timestamps(self, test_type, stage_pair, pm): + """Retrospective timestamp generates file for current checkpoint.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=True) @@ -222,11 +206,11 @@ def test_two_retrospective_checkpointed_timestamps( assert stage2 == pm.prev_checkpoint assert pm.curr_checkpoint is None - def test_prospective_then_retrospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ If a prospective checkpointed timestamp is followed by a - retrospective one, there's only a file for the retrospective one. """ + self, test_type, stage_pair, pm + ): + """If a prospective checkpointed timestamp is followed by a + retrospective one, there's only a file for the retrospective one.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=False) @@ -243,10 +227,10 @@ def test_prospective_then_retrospective_checkpointed_timestamps( assert stage2 == pm.prev_checkpoint assert pm.curr_checkpoint is None - def test_retrospective_the_prospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ Test retrospective timestamp followed by prospective one. """ + self, test_type, stage_pair, pm + ): + """Test retrospective timestamp followed by prospective one.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=True) @@ -261,14 +245,12 @@ def test_retrospective_the_prospective_checkpointed_timestamps( assert pm.prev_checkpoint is None assert stage2 == pm.curr_checkpoint - @pytest.fixture def stage_pair(self): - """ Provide test case with a pair of stage names to use. """ + """Provide test case with a pair of stage names to use.""" return "merge_input", "quality_control" - @pytest.fixture def pm(self, get_pipe_manager): - """ Provide test case with a basic, test-safe pipeline manager. 
""" + """Provide test case with a basic, test-safe pipeline manager.""" return get_pipe_manager(name="checkpointed-timestamp-pair") diff --git a/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py b/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py index c8daf530..90cc05de 100644 --- a/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py +++ b/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py @@ -9,14 +9,13 @@ from pypiper.stage import Stage from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - class DummyPM(PipelineManager): - """ Simple override of true PipelineManager, for __init__ simplicity """ + """Simple override of true PipelineManager, for __init__ simplicity""" + def __init__(self, name, outfolder): self.name = name self.outfolder = outfolder @@ -29,17 +28,17 @@ def __init__(self, name, outfolder): self.curr_checkpoint = None - class PipelineMangerTimestampCheckpointFilePathTests: - """ Tests for determination of checkpoint filepath. """ - + """Tests for determination of checkpoint filepath.""" - @named_param(argnames=["name1", "name2"], - argvalues=[("chipseq", "ATACseq"), ("rnaKallisto", "wgbs")]) - @named_param(argnames="spec_type", - argvalues=["stage_name", "stage", "function"]) + @named_param( + argnames=["name1", "name2"], + argvalues=[("chipseq", "ATACseq"), ("rnaKallisto", "wgbs")], + ) + @named_param(argnames="spec_type", argvalues=["stage_name", "stage", "function"]) def test_distinguishes_pipelines_within_outfolder( - self, name1, name2, spec_type, tmpdir): + self, name1, name2, spec_type, tmpdir + ): """ Checkpoint files within sample folder include pipeline name. @@ -66,8 +65,9 @@ def stage_spec(): if spec_type == "function": return trim_reads elif spec_type not in ["stage", "stage_name"]: - raise ValueError("Unrecognized stage specification type: {}". - format(spec_type)) + raise ValueError( + "Unrecognized stage specification type: {}".format(spec_type) + ) else: s = Stage(trim_reads) return s.name if spec_type == "stage_name" else s @@ -86,25 +86,29 @@ def stage_spec(): # Find the checkpoints; there should only be one. checkpoint_pattern = os.path.join( - outfolder, "{}_*{}".format(name1, CHECKPOINT_EXTENSION)) + outfolder, "{}_*{}".format(name1, CHECKPOINT_EXTENSION) + ) checkpoints = glob.glob(checkpoint_pattern) assert 1 == len(checkpoints) assert 1 == len(glob.glob(all_checkpoints_pattern)) # Check that we have the expected checkpoint. - exp_chkpt_fpath = os.path.join(outfolder, "{}_{}".format( - name1, checkpoint_name + CHECKPOINT_EXTENSION)) + exp_chkpt_fpath = os.path.join( + outfolder, "{}_{}".format(name1, checkpoint_name + CHECKPOINT_EXTENSION) + ) assert exp_chkpt_fpath == checkpoints[0] # Create a second checkpoint with the same stage, but with a manager # of a different name. 
plm2.timestamp(checkpoint=stage_spec(), finished=True) checkpoint_pattern = os.path.join( - outfolder, "{}_*{}".format(name2, CHECKPOINT_EXTENSION)) + outfolder, "{}_*{}".format(name2, CHECKPOINT_EXTENSION) + ) checkpoints = glob.glob(checkpoint_pattern) assert 1 == len(checkpoints) all_checkpoints = glob.glob(all_checkpoints_pattern) assert 2 == len(all_checkpoints) - exp_chkpt_fpath_2 = os.path.join(outfolder, "{}_{}".format( - name2, checkpoint_name + CHECKPOINT_EXTENSION)) + exp_chkpt_fpath_2 = os.path.join( + outfolder, "{}_{}".format(name2, checkpoint_name + CHECKPOINT_EXTENSION) + ) assert {exp_chkpt_fpath, exp_chkpt_fpath_2} == set(all_checkpoints) diff --git a/tests/pipeline_manager/test_set_status_flag.py b/tests/pipeline_manager/test_set_status_flag.py index c67c9cb9..8b9d84f5 100644 --- a/tests/pipeline_manager/test_set_status_flag.py +++ b/tests/pipeline_manager/test_set_status_flag.py @@ -6,15 +6,22 @@ from pypiper.flags import __all__ as ALL_FLAGS from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - -@named_param(argnames="status", argvalues=ALL_FLAGS) +@named_param( + argnames="status", + argvalues=[ + RUN_FLAG, + COMPLETE_FLAG, + FAIL_FLAG, + PAUSE_FLAG, + WAIT_FLAG, + ], +) def test_set_status_flag_is_idempotent(get_pipe_manager, status): - """ Calls to manager's status flag setter are idempotent. """ + """Calls to manager's status flag setter are idempotent.""" pm = get_pipe_manager(name="TestPM") pm._set_status_flag(status) assert status == pm.status @@ -22,15 +29,20 @@ def test_set_status_flag_is_idempotent(get_pipe_manager, status): assert status == pm.status - @pytest.mark.parametrize( argnames=["init_state", "new_state"], - argvalues=[(WAIT_FLAG, RUN_FLAG), (WAIT_FLAG, COMPLETE_FLAG), - (WAIT_FLAG, FAIL_FLAG), (RUN_FLAG, COMPLETE_FLAG), - (RUN_FLAG, PAUSE_FLAG), (RUN_FLAG, FAIL_FLAG), - (FAIL_FLAG, RUN_FLAG)]) + argvalues=[ + (WAIT_FLAG, RUN_FLAG), + (WAIT_FLAG, COMPLETE_FLAG), + (WAIT_FLAG, FAIL_FLAG), + (RUN_FLAG, COMPLETE_FLAG), + (RUN_FLAG, PAUSE_FLAG), + (RUN_FLAG, FAIL_FLAG), + (FAIL_FLAG, RUN_FLAG), + ], +) def test_changes_status_state(get_pipe_manager, init_state, new_state): - """ Manager setting status flag changes is internal status/state. """ + """Manager setting status flag changes is internal status/state.""" pm = get_pipe_manager(name="test-pipe") assert pm.status == RUN_FLAG pm._set_status_flag(init_state) diff --git a/tests/test_packaging.py b/tests/test_packaging.py index 2e5bf819..5ef2ecb6 100644 --- a/tests/test_packaging.py +++ b/tests/test_packaging.py @@ -1,18 +1,26 @@ """ Validate what's available directly on the top-level import. """ -import pytest from inspect import isfunction +import pytest + __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" -@pytest.mark.parametrize(["obj_name", "typecheck"], [ - ("add_logging_options", isfunction), ("check_all_commands", isfunction), - ("determine_uncallable", isfunction), ("logger_via_cli", isfunction)]) +@pytest.mark.parametrize( + ["obj_name", "typecheck"], + [ + ("add_logging_options", isfunction), + ("check_all_commands", isfunction), + ("determine_uncallable", isfunction), + ("logger_via_cli", isfunction), + ], +) def test_top_level_exports(obj_name, typecheck): - """ At package level, validate object availability and type. 
""" + """At package level, validate object availability and type.""" import pypiper + try: obj = getattr(pypiper, obj_name) except AttributeError: diff --git a/tests/test_pipeline_filepath.py b/tests/test_pipeline_filepath.py index 12a5a874..e8d496fd 100644 --- a/tests/test_pipeline_filepath.py +++ b/tests/test_pipeline_filepath.py @@ -1,10 +1,11 @@ """ Tests for utility functions """ import os + import mock import pytest -from pypiper.utils import pipeline_filepath +from pypiper.utils import pipeline_filepath __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -14,7 +15,6 @@ SUFFICES = [".txt", "_results.csv", ".stats.tsv", "-data.json"] - @pytest.fixture def pl_mgr(request, tmpdir): """ @@ -34,26 +34,25 @@ def pl_mgr(request, tmpdir): # Set output folder and name attributes for mocked PipelineManager. mock_mgr = mock.Mock(outfolder=tmpdir.strpath) - type(mock_mgr).name = pipe_name # Circumvent 'name' keyword on Mock. + type(mock_mgr).name = pipe_name # Circumvent 'name' keyword on Mock. return mock_mgr - def test_requires_filename_or_suffix(pl_mgr): - """ Either filename or suffix is required to build a path. """ + """Either filename or suffix is required to build a path.""" with pytest.raises(TypeError): pipeline_filepath(pl_mgr) - @pytest.mark.parametrize(argnames="pipe_name", argvalues=PIPELINE_NAMES) @pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES) @pytest.mark.parametrize( - argnames="test_type", - argvalues=["has_pipe_name", "has_suffix", "full_path"]) + argnames="test_type", argvalues=["has_pipe_name", "has_suffix", "full_path"] +) def test_uses_pipeline_name_if_no_filename( - pipe_name, suffix, test_type, pl_mgr, tmpdir): - """ Pipeline name is proxy for filename if just suffix is given. """ + pipe_name, suffix, test_type, pl_mgr, tmpdir +): + """Pipeline name is proxy for filename if just suffix is given.""" observed = pipeline_filepath(pl_mgr, suffix=suffix) @@ -74,12 +73,11 @@ def test_uses_pipeline_name_if_no_filename( @pytest.mark.parametrize( - argnames="filename", - argvalues=["testfile" + suffix for suffix in SUFFICES]) -@pytest.mark.parametrize( - argnames="test_type", argvalues=["filename", "filepath"]) + argnames="filename", argvalues=["testfile" + suffix for suffix in SUFFICES] +) +@pytest.mark.parametrize(argnames="test_type", argvalues=["filename", "filepath"]) def test_direct_filename(tmpdir, filename, pl_mgr, test_type): - """ When given, filename is used instead of pipeline name. """ + """When given, filename is used instead of pipeline name.""" fullpath = pipeline_filepath(pl_mgr, filename=filename) if test_type == "filename": _, observed = os.path.split(fullpath) @@ -91,12 +89,10 @@ def test_direct_filename(tmpdir, filename, pl_mgr, test_type): raise ValueError("Unrecognized test type: '{}'".format(test_type)) -@pytest.mark.parametrize( - argnames="filename", argvalues=["output", "testfile"]) +@pytest.mark.parametrize(argnames="filename", argvalues=["output", "testfile"]) @pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES) -def test_suffix_is_appended_to_filename_if_both_are_provided( - pl_mgr, filename, suffix): - """ Suffix is appended to filename if both are provided. 
""" +def test_suffix_is_appended_to_filename_if_both_are_provided(pl_mgr, filename, suffix): + """Suffix is appended to filename if both are provided.""" expected = filename + suffix fullpath = pipeline_filepath(pl_mgr, filename=filename, suffix=suffix) _, observed = os.path.split(fullpath) diff --git a/tests/utils_tests/test_check_command_callability.py b/tests/utils_tests/test_check_command_callability.py index 517b1d45..b3e8ecb5 100644 --- a/tests/utils_tests/test_check_command_callability.py +++ b/tests/utils_tests/test_check_command_callability.py @@ -1,26 +1,42 @@ """ Tests for checking a collection of commands for callability """ -import mock import os + +import mock import pytest -from pypiper import utils as piper_utils from ubiquerg import powerset from veracitools import ExpectContext +from pypiper import utils as piper_utils + __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" -EXTENSIONS = [".py", ".rb", ".sh", ".java", ".jar", ".pl", ".o", ".R", ".r", - ".cpp", ".c", ".hs", ".scala", ".class"] +EXTENSIONS = [ + ".py", + ".rb", + ".sh", + ".java", + ".jar", + ".pl", + ".o", + ".R", + ".r", + ".cpp", + ".c", + ".hs", + ".scala", + ".class", +] def _touch(f): - """ 'touch' the given file. + """'touch' the given file. :param str f: filepath to create """ - with open(f, 'w'): + with open(f, "w"): print("touch: {}".format(f)) @@ -31,31 +47,37 @@ def _make_exec(f): :param str f: path to create """ import subprocess + _touch(f) subprocess.check_call(["chmod", "+x", f]) def pytest_generate_tests(metafunc): - """ Dynamic test case generation and parameterization for this module """ + """Dynamic test case generation and parameterization for this module""" if "str_list_monad" in metafunc.fixturenames: metafunc.parametrize("str_list_monad", [lambda s: s, lambda s: [s]]) @pytest.mark.skip(reason="test is broken") @pytest.mark.parametrize("filename", ["testfile" + x for x in EXTENSIONS]) -@pytest.mark.parametrize(["setup", "pretest", "exp_miss"], [ - (lambda _: None, - lambda f: not os.path.exists(f), - lambda _: True), - (_touch, - lambda f: os.path.isfile(f) and not os.access(f, os.X_OK), - lambda f: not f.endswith(".jar")), - (_make_exec, - lambda f: os.path.isfile(f) and os.access(f, os.X_OK), - lambda _: False) -]) +@pytest.mark.parametrize( + ["setup", "pretest", "exp_miss"], + [ + (lambda _: None, lambda f: not os.path.exists(f), lambda _: True), + ( + _touch, + lambda f: os.path.isfile(f) and not os.access(f, os.X_OK), + lambda f: not f.endswith(".jar"), + ), + ( + _make_exec, + lambda f: os.path.isfile(f) and os.access(f, os.X_OK), + lambda _: False, + ), + ], +) def test_callability_checker_defaults(tmpdir, filename, setup, pretest, exp_miss): - """ Verify behavior of callability checker with default parameterization. """ + """Verify behavior of callability checker with default parameterization.""" cmd = os.path.join(tmpdir.strpath, filename) setup(cmd) assert pretest(cmd) @@ -71,42 +93,53 @@ def test_callability_checker_defaults(tmpdir, filename, setup, pretest, exp_miss @pytest.mark.parametrize( - ["uncall_result", "expectation"], - [([], True), ([("noncmd", "noncmd")], TypeError)]) + ["uncall_result", "expectation"], [([], True), ([("noncmd", "noncmd")], TypeError)] +) @pytest.mark.parametrize("handler", [lambda: True, "not-a-function"]) def test_check_all_bad_handler_is_type_error_iff_uncallability_exists( - uncall_result, str_list_monad, handler, expectation): - """ Invalid handler evaluation is conditional having >= 1 uncallable command. 
""" + uncall_result, str_list_monad, handler, expectation +): + """Invalid handler evaluation is conditional having >= 1 uncallable command.""" cmd = "noncmd" - with mock.patch.object(piper_utils, "determine_uncallable", - return_value=uncall_result), \ - ExpectContext(expectation, piper_utils.check_all_commands) as check: + with mock.patch.object( + piper_utils, "determine_uncallable", return_value=uncall_result + ), ExpectContext(expectation, piper_utils.check_all_commands) as check: check(cmds=str_list_monad(cmd), handle=handler) -@pytest.mark.parametrize(["create_result", "expected"], [ - (lambda bads: Exception("{} bad commands: {}".format(len(bads), bads)), Exception), - (lambda bads: "{} bad commands: {}".format(len(bads), bads), False) -]) +@pytest.mark.parametrize( + ["create_result", "expected"], + [ + ( + lambda bads: Exception("{} bad commands: {}".format(len(bads), bads)), + Exception, + ), + (lambda bads: "{} bad commands: {}".format(len(bads), bads), False), + ], +) def test_check_all_result_is_conjunctive(create_result, expected, str_list_monad): - """ Even one uncallable means result is False or an Exception occurs. """ + """Even one uncallable means result is False or an Exception occurs.""" cmd = "noncmd" - with mock.patch.object(piper_utils, "determine_uncallable", - return_value=[(cmd, cmd)]), \ - ExpectContext(expected, piper_utils.check_all_commands) as check: + with mock.patch.object( + piper_utils, "determine_uncallable", return_value=[(cmd, cmd)] + ), ExpectContext(expected, piper_utils.check_all_commands) as check: check(cmds=str_list_monad(cmd), get_bad_result=create_result) @pytest.mark.parametrize("commands", ["man", "ls", ["man", "ls"]]) @pytest.mark.parametrize( ["transforms", "expectation"], - [(arg, lambda res: isinstance(res, list)) for arg in [None, []]] + - [(arg, TypeError) for arg in [1, "a"]]) + [(arg, lambda res: isinstance(res, list)) for arg in [None, []]] + + [(arg, TypeError) for arg in [1, "a"]], +) def test_check_all_requires_iterable_transformations_argument( - commands, transforms, expectation): - """ If transformations arg is non-null, it must be iterable. 
""" + commands, transforms, expectation +): + """If transformations arg is non-null, it must be iterable.""" + def call(): return piper_utils.determine_uncallable(commands, transformations=transforms) + if isinstance(expectation, type) and issubclass(expectation, Exception): with pytest.raises(expectation): call() @@ -115,31 +148,42 @@ def call(): @pytest.mark.parametrize( - "commands", powerset(["ls", "picard.jar", "$ENVVAR"], nonempty=True)) + "commands", powerset(["ls", "picard.jar", "$ENVVAR"], nonempty=True) +) def test_transformation_accumulation(commands): - """ Accumulation of transformations works as expected """ + """Accumulation of transformations works as expected""" mapjar = lambda c: "java -jar {}".format(c) envjar = "env.jar" - transforms = [(lambda c: c == "$ENVVAR", lambda _: envjar), - (lambda c: c.endswith(".jar"), mapjar)] + transforms = [ + (lambda c: c == "$ENVVAR", lambda _: envjar), + (lambda c: c.endswith(".jar"), mapjar), + ] exps = {"ls": "ls", "picard.jar": mapjar("picard.jar"), "$ENVVAR": mapjar(envjar)} with mock.patch.object(piper_utils, "is_command_callable", return_value=False): res = piper_utils.determine_uncallable( - commands, transformations=transforms, accumulate=True) + commands, transformations=transforms, accumulate=True + ) expectation = [(c, exps[c]) for c in commands] print("EXPECTED: {}".format(expectation)) print("OBSERVED: {}".format(res)) assert expectation == res -@pytest.mark.parametrize("transforms", [ - {(lambda _: True, lambda c: c), (lambda _: False, lambda c: c)}, - {"id": (lambda _: True, lambda c: c), - "java": (lambda c: c.endswith(".jar"), lambda c: "java -jar {}".format(c))} -]) +@pytest.mark.parametrize( + "transforms", + [ + {(lambda _: True, lambda c: c), (lambda _: False, lambda c: c)}, + { + "id": (lambda _: True, lambda c: c), + "java": (lambda c: c.endswith(".jar"), lambda c: "java -jar {}".format(c)), + }, + ], +) def test_non_accumulative_but_unordered_transformation_is_exceptional(transforms): with pytest.raises(Exception) as err_ctx: piper_utils.determine_uncallable("ls", transformations=transforms) - exp_msg = "If transformations are unordered, non-accumulation of " \ - "effects may lead to nondeterministic behavior." + exp_msg = ( + "If transformations are unordered, non-accumulation of " + "effects may lead to nondeterministic behavior." + ) assert str(err_ctx.value) == exp_msg diff --git a/tests/utils_tests/test_head_util.py b/tests/utils_tests/test_head_util.py index 4f55a922..232c6312 100644 --- a/tests/utils_tests/test_head_util.py +++ b/tests/utils_tests/test_head_util.py @@ -2,10 +2,12 @@ import random import string + import pytest -from hypothesis import given, strategies as st -from pypiper.utils import head +from hypothesis import given +from hypothesis import strategies as st +from pypiper.utils import head __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -14,12 +16,17 @@ NUMBERS_AND_LETTERS = list(string.ascii_letters) + list(range(-9, 10)) # Strategy for generating a pretty arbitrary atomic -ATOMICS = st.deferred(lambda: st.booleans() | st.characters() | st.integers() | - st.floats(allow_nan=False) | st.text()) +ATOMICS = st.deferred( + lambda: st.booleans() + | st.characters() + | st.integers() + | st.floats(allow_nan=False) + | st.text() +) def pytest_generate_tests(metafunc): - """ Test case generation/parameterization for this module. 
""" + """Test case generation/parameterization for this module.""" if "seqtype" in metafunc.fixturenames: metafunc.parametrize("seqtype", [tuple, list]) if "iter_cast" in metafunc.fixturenames: @@ -27,40 +34,48 @@ def pytest_generate_tests(metafunc): if "h" in metafunc.fixturenames and "xs" in metafunc.fixturenames: metafunc.parametrize( ["h", "xs"], - [(random.choice(NUMBERS_AND_LETTERS), - [random.choice(NUMBERS_AND_LETTERS) - for _ in range(random.randint(5, 10))]) for _ in range(10)]) + [ + ( + random.choice(NUMBERS_AND_LETTERS), + [ + random.choice(NUMBERS_AND_LETTERS) + for _ in range(random.randint(5, 10)) + ], + ) + for _ in range(10) + ], + ) @given(obj=ATOMICS) def test_head_atomic(obj): - """ head() of an atomic object is the object itself. """ + """head() of an atomic object is the object itself.""" assert obj == head(obj) def test_head_empty_string(): - """ Empty string is exception to exceptional-ness of empty collection. """ + """Empty string is exception to exceptional-ness of empty collection.""" assert "" == head("") @pytest.mark.parametrize("coll", [dict(), set(), tuple(), list()]) def test_head_empty_collection(coll): - """ Request for first element from an empty Iterable is exceptional. """ + """Request for first element from an empty Iterable is exceptional.""" with pytest.raises(ValueError): head(coll) def test_head_nonempty_sequential_collection(h, xs, seqtype, iter_cast): - """ Verify accuracy of request for first element from nonempty Iterable. """ + """Verify accuracy of request for first element from nonempty Iterable.""" c = seqtype([h]) + seqtype(xs) assert h == head(iter_cast(c)) def test_head_nonempty_set(): - """ Verify that head of nonempty set is non-exceptional. """ + """Verify that head of nonempty set is non-exceptional.""" head({-1, 0, 1}) def test_head_nonempty_dict(): - """ Verify that head of nonempty dictionary is non-exceptional. """ + """Verify that head of nonempty dictionary is non-exceptional.""" head({"a": 1, "b": 2})