From b3b96bb8790655dd670ed6f4811600a1ec0649de Mon Sep 17 00:00:00 2001 From: Florian Halbritter Date: Wed, 8 Jan 2020 13:31:26 +0100 Subject: [PATCH 01/25] add missing . --- pypiper/ngstk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py index fff2e554..9977325d 100755 --- a/pypiper/ngstk.py +++ b/pypiper/ngstk.py @@ -759,7 +759,7 @@ def count_unique_mapped_reads(self, file_name, paired_end): if ext == ".sam": param = "-S -F4" - elif ext == "bam": + elif ext == ".bam": param = "-F4" else: raise ValueError("Not a SAM or BAM: '{}'".format(file_name)) From fc004fdda722aed99e707b985d8732dacbb2f69e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 8 Jan 2020 11:44:27 -0500 Subject: [PATCH 02/25] update testing packages requirements; https://github.com/databio/pypiper/pull/169#issuecomment-572043640 --- requirements/reqs-test.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements/reqs-test.txt b/requirements/reqs-test.txt index 0d929f21..728ba7cf 100644 --- a/requirements/reqs-test.txt +++ b/requirements/reqs-test.txt @@ -1,6 +1,6 @@ mock==2.0.0 -pytest>=4.2.1 -hypothesis -coveralls>=1.1 -pytest-cov==2.6.1 -veracitools +pytest>=4.6.9 +pytest-cov>=2.8.1 +hypothesis==4.38.0 +coveralls +veracitools \ No newline at end of file From 885ad39bb6f606ed3b542f1b37c84d2b1944661e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 8 Jan 2020 11:48:31 -0500 Subject: [PATCH 03/25] dont test py3.4 --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f345282e..2308e86f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python python: - "2.7" - - "3.4" - "3.5" - "3.6" os: From ce6d2ef19b23341081ced1858b9c6fc5a9ac2439 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 12 Jan 2021 10:24:11 -0500 Subject: [PATCH 04/25] pipestat integration --- pypiper/manager.py | 60 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index 6c5b8467..1dfe8b94 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -23,6 +23,7 @@ import time import pandas as _pd +from pipestat import PipestatManager, PipestatError from attmap import AttMapEcho from hashlib import md5 import logmuse @@ -96,15 +97,19 @@ class PipelineManager(object): protect from a case in which a restart begins upstream of a stage for which a checkpoint file already exists, but that depends on the upstream stage and thus should be rerun if it's "parent" is rerun. + :param pipestat.PipestatManager: pipestat manager object to use for + reporting pipeline results :raise TypeError: if start or stop point(s) are provided both directly and via args namespace, or if both stopping types (exclusive/prospective and inclusive/retrospective) are provided. 
""" def __init__( - self, name, outfolder, version=None, args=None, multi=False, - dirty=False, recover=False, new_start=False, force_follow=False, - cores=1, mem="1000M", config_file=None, output_parent=None, - overwrite_checkpoints=False, logger_kwargs=None, **kwargs): + self, name, outfolder, version=None, args=None, multi=False, + dirty=False, recover=False, new_start=False, force_follow=False, + cores=1, mem="1000M", config_file=None, output_parent=None, + overwrite_checkpoints=False, logger_kwargs=None, + pipestat_manager=None, **kwargs + ): # Params defines the set of options that could be updated via # command line args to a pipeline run, that can be forwarded @@ -142,8 +147,10 @@ def __init__( checkpoint = args_dict.pop(optname, None) setattr(self, optname, checkpoint) if self.stop_before and self.stop_after: - raise TypeError("Cannot specify both pre-stop and post-stop; " - "got '{}' and '{}'".format(self.stop_before, self.stop_after)) + raise TypeError( + "Cannot specify both pre-stop and post-stop; " + "got '{}' and '{}'".format(self.stop_before, self.stop_after) + ) # Update this manager's parameters with non-checkpoint-related # command-line parameterization. @@ -169,7 +176,6 @@ def __init__( self.output_parent = params['output_parent'] self.testmode = params['testmode'] - # Set up logger logger_kwargs = logger_kwargs or {} default_logname = ".".join([__name__, self.__class__.__name__, self.name]) @@ -203,7 +209,7 @@ def __init__( # total memory limit provided. # This will give a little breathing room for non-heap java memory use. - if not params['mem'].endswith(('K','M','G','T')): + if not params['mem'].endswith(('K', 'M', 'G', 'T')): self.mem = params['mem'] + "M" else: # Assume the memory is in megabytes. @@ -341,7 +347,20 @@ def __init__( self.debug("No config file") self.config = None + self._pipestat_manager = pipestat_manager + + @property + def pipestat(self): + """ + PipestatManager - object to use for pipeline results reporting + Depending on the object configuration it can report to + a YAML-formatted file or PostgreSQL database. Please refer to pipestat + documentation for more details: http://pipestat.databio.org/ + + :return pipestat.PipestatManager: object to use for results reporting + """ + return self._pipestat_manager @property def _completed(self): @@ -380,6 +399,31 @@ def _has_exit_status(self): """ return self._completed or self.halted or self._failed + def setup_default_pipestat(self, schema_path, namespace=None, + record_identifier=None, results_file_path=None): + """ + A convenience method for ad hoc PipestatManager instantiation. + + Requires only a pipestat-like schema to get a functional PipestatManager + for reporting to a YAML-formatted file. 
+ + :param str schema_path: path to the pipestat-like schema + :param str namespace: namespace to write into, default: pipeline name + :param record_identifier: recordID to report for, default: pipeline name + :param str results_file_path: YAML file to reoprt into, defaults to a + pipeline-named file in the standard pipeline output directory + """ + if self.pipestat is not None: + raise PipestatError(f"{PipestatManager.__name__} is already " + f"initialized:\n{str(self.pipestat)}") + self._pipestat_manager = PipestatManager( + schema_path=schema_path, + name=namespace or self.name, + record_identifier=record_identifier or self.name, + results_file_path=results_file_path or pipeline_filepath( + self, suffix="_results_pipestat.yaml") + ) + def _ignore_interrupts(self): """ Ignore interrupt and termination signals. Used as a pre-execution From cde37916d9ffb26973d24f627d13a5660ffdbf6b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 15 Jan 2021 16:02:28 -0500 Subject: [PATCH 05/25] multiple changes: - set up pipestatmanager based on pipelinemanager init args - add pipestat argument group --- pypiper/manager.py | 14 ++++++++++++-- pypiper/utils.py | 20 ++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index 1dfe8b94..cefb07c8 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -108,7 +108,8 @@ def __init__( dirty=False, recover=False, new_start=False, force_follow=False, cores=1, mem="1000M", config_file=None, output_parent=None, overwrite_checkpoints=False, logger_kwargs=None, - pipestat_manager=None, **kwargs + pipestat_namespace=None, pipestat_record_id=None, pipestat_schema=None, + pipestat_results_file=None, pipestat_config=None, **kwargs ): # Params defines the set of options that could be updated via @@ -347,7 +348,16 @@ def __init__( self.debug("No config file") self.config = None - self._pipestat_manager = pipestat_manager + # pipesatat setup + potential_namespace = getattr(self, "sample_name", self.name) + self._pipestat_manager = PipestatManager( + namespace=pipestat_namespace or potential_namespace, + record_identifier=pipestat_record_id or potential_namespace, + schema_path=pipestat_schema, + results_file_path=pipestat_results_file or pipeline_filepath( + self, filename="pipestat_results.yaml"), + config=pipestat_config + ) @property def pipestat(self): diff --git a/pypiper/utils.py b/pypiper/utils.py index 67851aab..2959d14f 100644 --- a/pypiper/utils.py +++ b/pypiper/utils.py @@ -816,7 +816,10 @@ def _determine_args(argument_groups, arguments, use_all_args=False): "looper": ["config", "output-parent", "mem", "cores"], "common": ["input", "sample-name"], "ngs": ["sample-name", "input", "input2", "genome", "single-or-paired"], - "logmuse": LOGGING_CLI_OPTDATA.keys() + "logmuse": LOGGING_CLI_OPTDATA.keys(), + "pipestat": ["pipestat-namespace", "pipestat-record-id", + "pipestat-schema", "pipestat-results-file", + "pipestat-config"] } # Handle various types of group specifications. @@ -936,7 +939,20 @@ def _add_args(parser, args, required): "help": "Identifier for genome assembly"}), "single-or-paired": ("-Q", {"default": "single", - "help": "Single- or paired-end sequencing protocol"}) + "help": "Single- or paired-end sequencing protocol"}), + "pipestat-namespace": + {"help": "Namespace to report into. 
This will be the DB table name " + "if using DB as the object back-end"}, + "pipestat-record-id": + {"help": "Record identifier to report for"}, + "pipestat-schema": + {"help": "Path to the output schema that formalizes the " + "results structure"}, + "pipestat-config": + {"help": "Path to the configuration file"}, + "pipestat-results-file": + {"help": "YAML file to report into, if file is used as " + "the object back-end"} } from logmuse import LOGGING_CLI_OPTDATA From 7257ecaf05483e7623311aca0227030200434d4b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 20 Jan 2021 14:28:01 -0500 Subject: [PATCH 06/25] add pipeline-name to looper arg group --- pypiper/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pypiper/utils.py b/pypiper/utils.py index 2959d14f..3eda9ddb 100644 --- a/pypiper/utils.py +++ b/pypiper/utils.py @@ -813,7 +813,7 @@ def _determine_args(argument_groups, arguments, use_all_args=False): "config": ["config"], "checkpoint": ["stop-before", "stop-after"], "resource": ["mem", "cores"], - "looper": ["config", "output-parent", "mem", "cores"], + "looper": ["config", "output-parent", "mem", "cores", "pipeline-name"], "common": ["input", "sample-name"], "ngs": ["sample-name", "input", "input2", "genome", "single-or-paired"], "logmuse": LOGGING_CLI_OPTDATA.keys(), @@ -914,6 +914,8 @@ def _add_args(parser, args, required): "help": "Pipeline configuration file (YAML). " "Relative paths are with respect to the " "pipeline script."}), + "pipeline-name": + {"metavar": "PIPELINE_NAME", "help": "Name of the pipeline"}, "sample-name": ("-S", {"metavar": "SAMPLE_NAME", "help": "Name for sample to run"}), From 4b66aa80de40ee2eac2c32bd0f6eeebe70261d29 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 25 Jan 2021 09:55:42 -0500 Subject: [PATCH 07/25] update pipestatmanager configuration --- pypiper/manager.py | 9 +++++++-- requirements/reqs-ngstk.txt | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index cefb07c8..65d6ed4d 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -350,12 +350,17 @@ def __init__( # pipesatat setup potential_namespace = getattr(self, "sample_name", self.name) + + # don't force default pipestat_results_file value unless + # pipestat config not provided + if pipestat_config is None and pipestat_results_file is None: + pipestat_results_file = pipeline_filepath( + self, filename="pipestat_results.yaml") self._pipestat_manager = PipestatManager( namespace=pipestat_namespace or potential_namespace, record_identifier=pipestat_record_id or potential_namespace, schema_path=pipestat_schema, - results_file_path=pipestat_results_file or pipeline_filepath( - self, filename="pipestat_results.yaml"), + results_file_path=pipestat_results_file, config=pipestat_config ) diff --git a/requirements/reqs-ngstk.txt b/requirements/reqs-ngstk.txt index 80184784..0c62f1a7 100644 --- a/requirements/reqs-ngstk.txt +++ b/requirements/reqs-ngstk.txt @@ -1,4 +1,4 @@ numpy pandas pysam -yacman +yacman \ No newline at end of file From 0916e3e3397c2eb026848b217028bc8c1afcef80 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 1 Feb 2021 12:41:28 -0500 Subject: [PATCH 08/25] pipestat setup optional --- pypiper/manager.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index 65d6ed4d..67ed2c88 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -348,21 +348,22 @@ def __init__( 
self.debug("No config file") self.config = None - # pipesatat setup - potential_namespace = getattr(self, "sample_name", self.name) - - # don't force default pipestat_results_file value unless - # pipestat config not provided - if pipestat_config is None and pipestat_results_file is None: - pipestat_results_file = pipeline_filepath( - self, filename="pipestat_results.yaml") - self._pipestat_manager = PipestatManager( - namespace=pipestat_namespace or potential_namespace, - record_identifier=pipestat_record_id or potential_namespace, - schema_path=pipestat_schema, - results_file_path=pipestat_results_file, - config=pipestat_config - ) + if pipestat_schema is not None: + # pipesatat setup + potential_namespace = getattr(self, "sample_name", self.name) + + # don't force default pipestat_results_file value unless + # pipestat config not provided + if pipestat_config is None and pipestat_results_file is None: + pipestat_results_file = pipeline_filepath( + self, filename="pipestat_results.yaml") + self._pipestat_manager = PipestatManager( + namespace=pipestat_namespace or potential_namespace, + record_identifier=pipestat_record_id or potential_namespace, + schema_path=pipestat_schema, + results_file_path=pipestat_results_file, + config=pipestat_config + ) @property def pipestat(self): @@ -375,7 +376,7 @@ def pipestat(self): :return pipestat.PipestatManager: object to use for results reporting """ - return self._pipestat_manager + return getattr(self, "_pipestat_manager", None) @property def _completed(self): From 7f0a682b1b813b68279e6f986617cfc82f494584 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 4 May 2021 14:15:13 -0400 Subject: [PATCH 09/25] reformat, update PipestatManafer configuration --- docs/conf.py | 214 ++-- example_pipelines/basic.py | 6 +- example_pipelines/count_reads.py | 55 +- example_pipelines/hello_pypiper.py | 3 +- example_pipelines/logmuse_example.py | 63 +- init_interactive.py | 4 +- pypiper/__init__.py | 10 +- pypiper/const.py | 2 +- pypiper/exceptions.py | 46 +- pypiper/flags.py | 3 +- pypiper/folder_context.py | 16 +- pypiper/manager.py | 1009 +++++++++++------ pypiper/ngstk.py | 941 ++++++++++----- pypiper/pipeline.py | 132 ++- pypiper/stage.py | 43 +- pypiper/utils.py | 378 +++--- setup.py | 60 +- tests/conftest.py | 25 +- tests/helpers.py | 24 +- tests/pipeline/conftest.py | 52 +- tests/pipeline/test_multi_pipeline_sample.py | 38 +- tests/pipeline/test_pipeline.py | 263 ++--- tests/pipeline/test_pipeline_checkpoint.py | 59 +- tests/pipeline/test_pipeline_constructor.py | 101 +- tests/pipeline_manager/test_halt.py | 18 +- .../test_manager_constructor.py | 115 +- tests/pipeline_manager/test_manager_state.py | 43 +- .../pipeline_manager/test_pipeline_manager.py | 108 +- .../test_pipeline_manager_timestamp.py | 88 +- ...e_manager_timestamp_checkpoint_filepath.py | 42 +- .../pipeline_manager/test_set_status_flag.py | 21 +- tests/test_packaging.py | 18 +- tests/test_pipeline_filepath.py | 36 +- .../test_check_command_callability.py | 144 ++- tests/utils_tests/test_head_util.py | 43 +- 35 files changed, 2570 insertions(+), 1653 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 27ec7815..f796b3df 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,66 +12,72 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../../')) +# sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath("../../")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'pypiper' -copyright = u'2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' +project = u"pypiper" +copyright = u"2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = open(os.path.join("..", "..", "pypiper", "_version.py")).read().strip().split(" ")[-1].strip('"') +version = ( + open(os.path.join("..", "..", "pypiper", "_version.py")) + .read() + .strip() + .split(" ")[-1] + .strip('"') +) # The full version, including alpha/beta/rc tags. release = version # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -79,27 +85,27 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. 
-#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- @@ -115,122 +121,125 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. 
-htmlhelp_basename = 'pypiperdoc' +htmlhelp_basename = "pypiperdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'pypiper.tex', u'pypiper Documentation', - u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', 'manual'), + ( + "index", + "pypiper.tex", + u"pypiper Documentation", + u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -238,12 +247,17 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pypiper', u'pypiper Documentation', - [u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'], 1) + ( + "index", + "pypiper", + u"pypiper Documentation", + [u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"], + 1, + ) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -252,93 +266,99 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pypiper', u'pypiper Documentation', - u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', 'pypiper', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "pypiper", + u"pypiper Documentation", + u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + "pypiper", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. 
-epub_title = u'pypiper' -epub_author = u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' -epub_publisher = u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' -epub_copyright = u'2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro' +epub_title = u"pypiper" +epub_author = u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_publisher = u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_copyright = u"2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" # The basename for the epub file. It defaults to the project name. -#epub_basename = u'pypiper' +# epub_basename = u'pypiper' # The HTML theme for the epub output. Since the default themes are not optimized # for small screen space, using the same theme for HTML and epub output is # usually not wise. This defaults to 'epub', a theme designed to save visual # space. -#epub_theme = 'epub' +# epub_theme = 'epub' # The language of the text. It defaults to the language option # or en if the language is not set. -#epub_language = '' +# epub_language = '' # The scheme of the identifier. Typical schemes are ISBN or URL. -#epub_scheme = '' +# epub_scheme = '' # The unique identifier of the text. This can be a ISBN number # or the project homepage. -#epub_identifier = '' +# epub_identifier = '' # A unique identification for the text. -#epub_uid = '' +# epub_uid = '' # A tuple containing the cover image and cover page html template filenames. -#epub_cover = () +# epub_cover = () # A sequence of (type, uri, title) tuples for the guide element of content.opf. -#epub_guide = () +# epub_guide = () # HTML files that should be inserted before the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_pre_files = [] +# epub_pre_files = [] # HTML files shat should be inserted after the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_post_files = [] +# epub_post_files = [] # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # The depth of the table of contents in toc.ncx. -#epub_tocdepth = 3 +# epub_tocdepth = 3 # Allow duplicate toc entries. -#epub_tocdup = True +# epub_tocdup = True # Choose between 'default' and 'includehidden'. -#epub_tocscope = 'default' +# epub_tocscope = 'default' # Fix unsupported image types using the PIL. -#epub_fix_images = False +# epub_fix_images = False # Scale large images. -#epub_max_image_width = 0 +# epub_max_image_width = 0 # How to display URL addresses: 'footnote', 'no', or 'inline'. -#epub_show_urls = 'inline' +# epub_show_urls = 'inline' # If false, no index is generated. -#epub_use_index = True +# epub_use_index = True # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None} +intersphinx_mapping = {"http://docs.python.org/": None} diff --git a/example_pipelines/basic.py b/example_pipelines/basic.py index d4c7bd55..34a0d377 100755 --- a/example_pipelines/basic.py +++ b/example_pipelines/basic.py @@ -8,13 +8,13 @@ # First, make sure you can import the pypiper package import os + import pypiper # Create a PipelineManager instance (don't forget to name it!) # This starts the pipeline. -pm = pypiper.PipelineManager(name="BASIC", - outfolder="pipeline_output/") +pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/") # Now just build shell command strings, and use the run function # to execute them in order. 
run needs 2 things: a command, and the @@ -57,5 +57,5 @@ # Now, stop the pipeline to complete gracefully. pm.stop_pipeline() -# Observe your outputs in the pipeline_output folder +# Observe your outputs in the pipeline_output folder # to see what you've created. diff --git a/example_pipelines/count_reads.py b/example_pipelines/count_reads.py index c9703da9..f7648dec 100755 --- a/example_pipelines/count_reads.py +++ b/example_pipelines/count_reads.py @@ -9,25 +9,32 @@ __license__ = "GPL3" __version__ = "0.1" -from argparse import ArgumentParser -import os, re -import sys +import os +import re import subprocess +import sys +from argparse import ArgumentParser + import yaml + import pypiper parser = ArgumentParser( description="A pipeline to count the number of reads and file size. Accepts" - " BAM, fastq, or fastq.gz files.") + " BAM, fastq, or fastq.gz files." +) # First, add standard arguments from Pypiper. # groups="pypiper" will add all the arguments that pypiper uses, # and adding "common" adds arguments for --input and --sample--name # and "output_parent". You can read more about your options for standard # arguments in the pypiper docs (section "command-line arguments") -parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs"], - args=["output-parent", "config"], - required=['sample-name', 'output-parent']) +parser = pypiper.add_pypiper_args( + parser, + groups=["pypiper", "common", "ngs"], + args=["output-parent", "config"], + required=["sample-name", "output-parent"], +) # Add any pipeline-specific arguments if you like here. @@ -42,16 +49,14 @@ else: args.paired_end = False -# args for `output_parent` and `sample_name` were added by the standard -# `add_pypiper_args` function. +# args for `output_parent` and `sample_name` were added by the standard +# `add_pypiper_args` function. # A good practice is to make an output folder for each sample, housed under # the parent output folder, like this: outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) # Create a PipelineManager object and start the pipeline -pm = pypiper.PipelineManager(name="count", - outfolder=outfolder, - args=args) +pm = pypiper.PipelineManager(name="count", outfolder=outfolder, args=args) # NGSTk is a "toolkit" that comes with pypiper, providing some functions # for dealing with genome sequence data. You can read more about toolkits in the @@ -75,15 +80,12 @@ # and convert these to fastq files. 
local_input_files = ngstk.merge_or_link( - [args.input, args.input2], - raw_folder, - args.sample_name) + [args.input, args.input2], raw_folder, args.sample_name +) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( - local_input_files, - args.sample_name, - args.paired_end, - fastq_folder) + local_input_files, args.sample_name, args.paired_end, fastq_folder +) # Now we'll use another NGSTk function to grab the file size from the input files @@ -95,10 +97,17 @@ n_input_files = len(list(filter(bool, local_input_files))) -raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end)) - for input_file in local_input_files]) / n_input_files - -# Finally, we use the report_result() function to print the output and +raw_reads = ( + sum( + [ + int(ngstk.count_reads(input_file, args.paired_end)) + for input_file in local_input_files + ] + ) + / n_input_files +) + +# Finally, we use the report_result() function to print the output and # log the key-value pair in the standard stats.tsv file pm.report_result("Raw_reads", str(raw_reads)) diff --git a/example_pipelines/hello_pypiper.py b/example_pipelines/hello_pypiper.py index 2824a142..88abecfd 100755 --- a/example_pipelines/hello_pypiper.py +++ b/example_pipelines/hello_pypiper.py @@ -1,7 +1,8 @@ #!/usr/bin/env python import pypiper -outfolder = "hello_pypiper_results" # Choose a folder for your results + +outfolder = "hello_pypiper_results" # Choose a folder for your results # Create a PipelineManager, the workhorse of pypiper pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder) diff --git a/example_pipelines/logmuse_example.py b/example_pipelines/logmuse_example.py index 91fe73f2..61d8cc97 100755 --- a/example_pipelines/logmuse_example.py +++ b/example_pipelines/logmuse_example.py @@ -9,52 +9,57 @@ __license__ = "GPL3" __version__ = "0.1" -from argparse import ArgumentParser -import os, re -import sys +import os +import re import subprocess +import sys +from argparse import ArgumentParser + import yaml -import pypiper +import pypiper def build_argparser(): parser = ArgumentParser( description="A pipeline to count the number of reads and file size. Accepts" - " BAM, fastq, or fastq.gz files.") + " BAM, fastq, or fastq.gz files." + ) # First, add standard arguments from Pypiper. # groups="pypiper" will add all the arguments that pypiper uses, # and adding "common" adds arguments for --input and --sample--name # and "output_parent". You can read more about your options for standard # arguments in the pypiper docs (section "command-line arguments") - parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs", "logmuse"], - args=["output-parent", "config"], - required=['sample-name', 'output-parent']) + parser = pypiper.add_pypiper_args( + parser, + groups=["pypiper", "common", "ngs", "logmuse"], + args=["output-parent", "config"], + required=["sample-name", "output-parent"], + ) # Add any pipeline-specific arguments if you like here. - # args for `output_parent` and `sample_name` were added by the standard - # `add_pypiper_args` function. + # args for `output_parent` and `sample_name` were added by the standard + # `add_pypiper_args` function. 
return parser + def run_pipeline(): # A good practice is to make an output folder for each sample, housed under # the parent output folder, like this: outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) # Create a PipelineManager object and start the pipeline - pm = pypiper.PipelineManager(name="logmuse-test", - outfolder=outfolder, - args=args) + pm = pypiper.PipelineManager(name="logmuse-test", outfolder=outfolder, args=args) pm.info("Getting started!") # NGSTk is a "toolkit" that comes with pypiper, providing some functions # for dealing with genome sequence data. You can read more about toolkits in the # documentation - files = [str(x) + ".tmp" for x in range(1,20)] + files = [str(x) + ".tmp" for x in range(1, 20)] pm.run("touch " + " ".join(files), target=files, clean=True) @@ -76,30 +81,32 @@ def run_pipeline(): # and convert these to fastq files. local_input_files = ngstk.merge_or_link( - [args.input, args.input2], - raw_folder, - args.sample_name) + [args.input, args.input2], raw_folder, args.sample_name + ) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( - local_input_files, - args.sample_name, - args.paired_end, - fastq_folder) - + local_input_files, args.sample_name, args.paired_end, fastq_folder + ) # Now we'll use another NGSTk function to grab the file size from the input files # pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) - # And then count the number of reads in the file n_input_files = len(list(filter(bool, local_input_files))) - raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end)) - for input_file in local_input_files]) / n_input_files - - # Finally, we use the report_result() function to print the output and + raw_reads = ( + sum( + [ + int(ngstk.count_reads(input_file, args.paired_end)) + for input_file in local_input_files + ] + ) + / n_input_files + ) + + # Finally, we use the report_result() function to print the output and # log the key-value pair in the standard stats.tsv file pm.report_result("Raw_reads", str(raw_reads)) @@ -107,7 +114,7 @@ def run_pipeline(): pm.stop_pipeline() -if __name__ == '__main__': +if __name__ == "__main__": try: parser = build_argparser() args = parser.parse_args() diff --git a/init_interactive.py b/init_interactive.py index b63e4fb5..15dfab1f 100644 --- a/init_interactive.py +++ b/init_interactive.py @@ -1,14 +1,12 @@ """ Create dummy PipelineManager and NGSTk instance for interactive session. """ import os -from pypiper import PipelineManager -from pypiper import NGSTk +from pypiper import NGSTk, PipelineManager __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - pm = PipelineManager(name="interactive", outfolder=os.path.expanduser("~")) tk = NGSTk(pm=pm) diff --git a/pypiper/__init__.py b/pypiper/__init__.py index 6a1802d1..3076285e 100644 --- a/pypiper/__init__.py +++ b/pypiper/__init__.py @@ -1,10 +1,10 @@ +# Implicitly re-export so logmuse usage by pipeline author routes through here. +from logmuse import add_logging_options + from ._version import __version__ +from .exceptions import * from .manager import * from .ngstk import * -from .utils import * from .pipeline import * -from .exceptions import * from .stage import * - -# Implicitly re-export so logmuse usage by pipeline author routes through here. 
-from logmuse import add_logging_options +from .utils import * diff --git a/pypiper/const.py b/pypiper/const.py index 5f2d66e8..0159ddac 100644 --- a/pypiper/const.py +++ b/pypiper/const.py @@ -4,4 +4,4 @@ CHECKPOINT_EXTENSION = ".checkpoint" PIPELINE_CHECKPOINT_DELIMITER = "_" STAGE_NAME_SPACE_REPLACEMENT = "-" -PROFILE_COLNAMES = ['pid', 'hash', 'cid', 'runtime', 'mem', 'cmd', 'lock'] +PROFILE_COLNAMES = ["pid", "hash", "cid", "runtime", "mem", "cmd", "lock"] diff --git a/pypiper/exceptions.py b/pypiper/exceptions.py index 33e3a10c..063b3641 100644 --- a/pypiper/exceptions.py +++ b/pypiper/exceptions.py @@ -4,41 +4,46 @@ __email__ = "vreuter@virginia.edu" -__all__ = ["PipelineError", "PipelineHalt", "IllegalPipelineDefinitionError", - "IllegalPipelineExecutionError", "MissingCheckpointError", - "UnknownPipelineStageError", "UnsupportedFiletypeException", - "SubprocessError"] - - +__all__ = [ + "PipelineError", + "PipelineHalt", + "IllegalPipelineDefinitionError", + "IllegalPipelineExecutionError", + "MissingCheckpointError", + "UnknownPipelineStageError", + "UnsupportedFiletypeException", + "SubprocessError", +] class PipelineError(Exception): - """ General pipeline error. """ + """General pipeline error.""" + pass + class SubprocessError(Exception): pass + class IllegalPipelineDefinitionError(PipelineError): pass - class IllegalPipelineExecutionError(PipelineError): - """ Represent cases of illogical start/stop run() declarations. """ - pass + """Represent cases of illogical start/stop run() declarations.""" + pass class MissingCheckpointError(Exception): - """ Represent case of expected but absent checkpoint file. """ + """Represent case of expected but absent checkpoint file.""" def __init__(self, checkpoint, filepath): msg = "{}: '{}'".format(checkpoint, filepath) super(MissingCheckpointError, self).__init__(msg) - class UnknownPipelineStageError(Exception): """ Triggered by use of unknown/undefined name for a pipeline stage. @@ -47,7 +52,6 @@ class UnknownPipelineStageError(Exception): :param pypiper.Pipeline pipeline: Pipeline for which the stage is unknown/undefined. """ - def __init__(self, stage_name, pipeline=None): message = stage_name if pipeline is not None: @@ -57,12 +61,12 @@ def __init__(self, stage_name, pipeline=None): # Just don't contextualize the error with known stages. pass else: - message = "{}; defined stages: {}". \ - format(message, ", ".join(map(str, stages))) + message = "{}; defined stages: {}".format( + message, ", ".join(map(str, stages)) + ) super(UnknownPipelineStageError, self).__init__(message) - class PipelineHalt(Exception): """ Execution-stopping exception for halting a pipeline. @@ -74,6 +78,7 @@ class PipelineHalt(Exception): PipelineManager's halt method raise this exception. """ + def __init__(self, checkpoint=None, finished=None): if checkpoint is None: super(PipelineHalt, self).__init__() @@ -81,8 +86,9 @@ def __init__(self, checkpoint=None, finished=None): if isinstance(checkpoint, str): last_stage_done = checkpoint else: - last_stage_done = getattr(checkpoint, "name", None) or \ - getattr(checkpoint, "__name__", None) + last_stage_done = getattr(checkpoint, "name", None) or getattr( + checkpoint, "__name__", None + ) if not last_stage_done: super(PipelineHalt, self).__init__() else: @@ -95,9 +101,9 @@ def __init__(self, checkpoint=None, finished=None): super(PipelineHalt, self).__init__(msg) - class UnsupportedFiletypeException(Exception): - """ Restrict filetype domain. 
""" + """Restrict filetype domain.""" + # Use superclass ctor to allow file name/path or extension to pass # through as the message for why this error is occurring. pass diff --git a/pypiper/flags.py b/pypiper/flags.py index 09e3fb85..21e97d27 100644 --- a/pypiper/flags.py +++ b/pypiper/flags.py @@ -8,5 +8,4 @@ PAUSE_FLAG = "partial" FLAGS = [RUN_FLAG, COMPLETE_FLAG, FAIL_FLAG, WAIT_FLAG, PAUSE_FLAG] -__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS", - "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"] +__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS", "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"] diff --git a/pypiper/folder_context.py b/pypiper/folder_context.py index 360d6c0c..77828af5 100644 --- a/pypiper/folder_context.py +++ b/pypiper/folder_context.py @@ -2,14 +2,12 @@ import os - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - class FolderContext(object): - """ Context manager for temporarily changing directory. """ + """Context manager for temporarily changing directory.""" def __init__(self, folder): """ @@ -18,18 +16,18 @@ def __init__(self, folder): :param str folder: Path to set as new working directory """ if not os.path.isdir(folder): - raise ValueError( - "Requested temp entry to non-folder: {}".format(folder)) + raise ValueError("Requested temp entry to non-folder: {}".format(folder)) self._prevdir = os.getcwd() self._currdir = folder def __enter__(self): - """ Make the working directory switch. """ + """Make the working directory switch.""" os.chdir(self._currdir) def __exit__(self, exc_type, exc_val, exc_tb): - """ Switch back to the previous working directory. """ + """Switch back to the previous working directory.""" if not os.path.isdir(self._prevdir): - raise RuntimeError("Return path is no longer a directory: {}". - format(self._prevdir)) + raise RuntimeError( + "Return path is no longer a directory: {}".format(self._prevdir) + ) os.chdir(self._prevdir) diff --git a/pypiper/manager.py b/pypiper/manager.py index 67ed2c88..af0cad78 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -8,36 +8,47 @@ """ import atexit -from collections import Iterable import datetime import errno import glob import os import platform -import psutil import re import shlex # for splitting commands like a shell does import signal import subprocess import sys import time -import pandas as _pd - -from pipestat import PipestatManager, PipestatError -from attmap import AttMapEcho +import warnings +from collections import Iterable from hashlib import md5 + +import __main__ import logmuse +import pandas as _pd +import psutil +from attmap import AttMapEcho +from pipestat import PipestatError, PipestatManager from yacman import load_yaml + +from ._version import __version__ +from .const import PROFILE_COLNAMES from .exceptions import PipelineHalt, SubprocessError from .flags import * -from .utils import \ - check_shell, checkpoint_filepath, clear_flags, default_pipeline_config, \ - flag_name, get_proc_name, is_multi_target, logger_via_cli, make_lock_name, \ - parse_cmd, pipeline_filepath, CHECKPOINT_SPECIFICATIONS -from .const import PROFILE_COLNAMES -from ._version import __version__ -import __main__ - +from .utils import ( + CHECKPOINT_SPECIFICATIONS, + check_shell, + checkpoint_filepath, + clear_flags, + default_pipeline_config, + flag_name, + get_proc_name, + is_multi_target, + logger_via_cli, + make_lock_name, + parse_cmd, + pipeline_filepath, +) __all__ = ["PipelineManager"] @@ -85,7 +96,7 @@ class PipelineManager(object): even if the preceding command is not run. 
By default, following functions are only run if the preceding command is run. :param int cores: number of processors to use, default 1 - :param str mem: amount of memory to use. Default units are megabytes unless + :param str mem: amount of memory to use. Default units are megabytes unless specified using the suffix [K|M|G|T]." :param str config_file: path to pipeline configuration file, optional :param str output_parent: path to folder in which output folder will live @@ -103,13 +114,30 @@ class PipelineManager(object): via args namespace, or if both stopping types (exclusive/prospective and inclusive/retrospective) are provided. """ + def __init__( - self, name, outfolder, version=None, args=None, multi=False, - dirty=False, recover=False, new_start=False, force_follow=False, - cores=1, mem="1000M", config_file=None, output_parent=None, - overwrite_checkpoints=False, logger_kwargs=None, - pipestat_namespace=None, pipestat_record_id=None, pipestat_schema=None, - pipestat_results_file=None, pipestat_config=None, **kwargs + self, + name, + outfolder, + version=None, + args=None, + multi=False, + dirty=False, + recover=False, + new_start=False, + force_follow=False, + cores=1, + mem="1000M", + config_file=None, + output_parent=None, + overwrite_checkpoints=False, + logger_kwargs=None, + pipestat_namespace=None, + pipestat_record_id=None, + pipestat_schema=None, + pipestat_results_file=None, + pipestat_config=None, + **kwargs, ): # Params defines the set of options that could be updated via @@ -120,15 +148,15 @@ def __init__( # Establish default params params = { - 'dirty': dirty, - 'recover': recover, - 'new_start': new_start, - 'force_follow': force_follow, - 'config_file': config_file, - 'output_parent': output_parent, - 'cores': cores, - 'mem': mem, - 'testmode': False + "dirty": dirty, + "recover": recover, + "new_start": new_start, + "force_follow": force_follow, + "config_file": config_file, + "output_parent": output_parent, + "cores": cores, + "mem": mem, + "testmode": False, } # Transform the command-line namespace into a Mapping. @@ -169,13 +197,13 @@ def __init__( # Pipeline settings self.name = name self.tee = None - self.overwrite_locks = params['recover'] - self.new_start = params['new_start'] - self.force_follow = params['force_follow'] - self.dirty = params['dirty'] - self.cores = params['cores'] - self.output_parent = params['output_parent'] - self.testmode = params['testmode'] + self.overwrite_locks = params["recover"] + self.new_start = params["new_start"] + self.force_follow = params["force_follow"] + self.dirty = params["dirty"] + self.cores = params["cores"] + self.output_parent = params["output_parent"] + self.testmode = params["testmode"] # Set up logger logger_kwargs = logger_kwargs or {} @@ -210,11 +238,11 @@ def __init__( # total memory limit provided. # This will give a little breathing room for non-heap java memory use. - if not params['mem'].endswith(('K', 'M', 'G', 'T')): - self.mem = params['mem'] + "M" + if not params["mem"].endswith(("K", "M", "G", "T")): + self.mem = params["mem"] + "M" else: # Assume the memory is in megabytes. - self.mem = params['mem'] + self.mem = params["mem"] self.javamem = str(int(int(self.mem[:-1]) * 0.95)) + self.mem[-1:] @@ -239,27 +267,22 @@ def __init__( self.pl_version = version # Set relative output_parent directory to absolute # not necessary after all. . . 
- #if self.output_parent and not os.path.isabs(self.output_parent): + # if self.output_parent and not os.path.isabs(self.output_parent): # self.output_parent = os.path.join(os.getcwd(), self.output_parent) # File paths: - self.outfolder = os.path.join(outfolder, '') # trailing slash + self.outfolder = os.path.join(outfolder, "") # trailing slash self.pipeline_log_file = pipeline_filepath(self, suffix="_log.md") - self.pipeline_profile_file = \ - pipeline_filepath(self, suffix="_profile.tsv") + self.pipeline_profile_file = pipeline_filepath(self, suffix="_profile.tsv") # Stats and figures are general and so lack the pipeline name. - self.pipeline_stats_file = \ - pipeline_filepath(self, filename="stats.tsv") - self.pipeline_figures_file = \ - pipeline_filepath(self, filename="figures.tsv") - self.pipeline_objects_file = \ - pipeline_filepath(self, filename="objects.tsv") + self.pipeline_stats_file = pipeline_filepath(self, filename="stats.tsv") + self.pipeline_figures_file = pipeline_filepath(self, filename="figures.tsv") + self.pipeline_objects_file = pipeline_filepath(self, filename="objects.tsv") # Record commands used and provide manual cleanup script. - self.pipeline_commands_file = \ - pipeline_filepath(self, suffix="_commands.sh") + self.pipeline_commands_file = pipeline_filepath(self, suffix="_commands.sh") self.cleanup_file = pipeline_filepath(self, suffix="_cleanup.sh") # Pipeline status variables @@ -270,7 +293,7 @@ def __init__( self.locks = [] self.running_procs = {} self.completed_procs = {} - + self.wait = True # turn off for debugging # Initialize status and flags @@ -297,6 +320,22 @@ def __init__( signal.signal(signal.SIGINT, self._signal_int_handler) signal.signal(signal.SIGTERM, self._signal_term_handler) + # pipesatat setup + potential_namespace = getattr(self, "sample_name", self.name) + + # don't force default pipestat_results_file value unless + # pipestat config not provided + if pipestat_config is None and pipestat_results_file is None: + pipestat_results_file = pipeline_filepath( + self, filename="pipestat_results.yaml" + ) + self._pipestat_manager = PipestatManager( + namespace=pipestat_namespace or potential_namespace, + record_identifier=pipestat_record_id or potential_namespace, + schema_path=pipestat_schema, + results_file_path=pipestat_results_file, + config=pipestat_config, + ) self.start_pipeline(args, multi) # Handle config file if it exists @@ -337,8 +376,9 @@ def __init__( default_config = default_pipeline_config(sys.argv[0]) if os.path.isfile(default_config): config_to_load = default_config - self.debug("Using default pipeline config file: {}". - format(config_to_load)) + self.debug( + "Using default pipeline config file: {}".format(config_to_load) + ) # Finally load the config we found. 
if config_to_load is not None: @@ -348,27 +388,10 @@ def __init__( self.debug("No config file") self.config = None - if pipestat_schema is not None: - # pipesatat setup - potential_namespace = getattr(self, "sample_name", self.name) - - # don't force default pipestat_results_file value unless - # pipestat config not provided - if pipestat_config is None and pipestat_results_file is None: - pipestat_results_file = pipeline_filepath( - self, filename="pipestat_results.yaml") - self._pipestat_manager = PipestatManager( - namespace=pipestat_namespace or potential_namespace, - record_identifier=pipestat_record_id or potential_namespace, - schema_path=pipestat_schema, - results_file_path=pipestat_results_file, - config=pipestat_config - ) - @property def pipestat(self): """ - PipestatManager - object to use for pipeline results reporting + `pipestat.PipestatManager` object to use for pipeline results reporting and status management Depending on the object configuration it can report to a YAML-formatted file or PostgreSQL database. Please refer to pipestat @@ -376,7 +399,14 @@ def pipestat(self): :return pipestat.PipestatManager: object to use for results reporting """ - return getattr(self, "_pipestat_manager", None) + try: + return getattr(self, "_pipestat_manager") + except AttributeError: + raise PipestatError( + f"{PipestatManager.__name__} has not been configured for this pipeline run. " + f"Provide an output schema to the {PipelineManager.__name__} object " + f"in order to initialize it." + ) @property def _completed(self): @@ -385,7 +415,8 @@ def _completed(self): :return bool: Whether the managed pipeline is in a completed state. """ - return self.status == COMPLETE_FLAG + return self.pipestat.get_status() == COMPLETE_FLAG + # return self.status == COMPLETE_FLAG @property def _failed(self): @@ -394,7 +425,8 @@ def _failed(self): :return bool: Whether the managed pipeline is in a failed state. """ - return self.status == FAIL_FLAG + self.pipestat.get_status() == FAIL_FLAG + # return self.status == FAIL_FLAG @property def halted(self): @@ -403,7 +435,8 @@ def halted(self): :return bool: Whether the managed pipeline is in a paused/halted state. """ - return self.status == PAUSE_FLAG + self.pipestat.get_status() == PAUSE_FLAG + # return self.status == PAUSE_FLAG @property def _has_exit_status(self): @@ -415,30 +448,26 @@ def _has_exit_status(self): """ return self._completed or self.halted or self._failed - def setup_default_pipestat(self, schema_path, namespace=None, - record_identifier=None, results_file_path=None): - """ - A convenience method for ad hoc PipestatManager instantiation. - - Requires only a pipestat-like schema to get a functional PipestatManager - for reporting to a YAML-formatted file. 
- - :param str schema_path: path to the pipestat-like schema - :param str namespace: namespace to write into, default: pipeline name - :param record_identifier: recordID to report for, default: pipeline name - :param str results_file_path: YAML file to reoprt into, defaults to a - pipeline-named file in the standard pipeline output directory - """ - if self.pipestat is not None: - raise PipestatError(f"{PipestatManager.__name__} is already " - f"initialized:\n{str(self.pipestat)}") - self._pipestat_manager = PipestatManager( - schema_path=schema_path, - name=namespace or self.name, - record_identifier=record_identifier or self.name, - results_file_path=results_file_path or pipeline_filepath( - self, suffix="_results_pipestat.yaml") - ) + # def setup_default_pipestat(self, schema_path): + # """ + # A convenience method for ad hoc PipestatManager instantiation. + # + # Requires only a pipestat-like schema to get a functional PipestatManager + # for reporting to a YAML-formatted file. + # + # :param str schema_path: path to the pipestat-like schema + # """ + # if self.pipestat is not None: + # raise PipestatError( + # f"{PipestatManager.__name__} is already " + # f"initialized:\n{str(self.pipestat)}" + # ) + # self._pipestat_manager = PipestatManager( + # schema_path=schema_path, + # namespace=self.name, + # record_identifier=self.name, + # results_file_path=pipeline_filepath(self, suffix="_results_pipestat.yaml"), + # ) def _ignore_interrupts(self): """ @@ -459,16 +488,18 @@ def start_pipeline(self, args=None, multi=False): # By default, Pypiper will mirror every operation so it is displayed both # on sys.stdout **and** to a log file. Unfortunately, interactive python sessions - # ruin this by interfering with stdout. So, for interactive mode, we do not enable + # ruin this by interfering with stdout. So, for interactive mode, we do not enable # the tee subprocess, sending all output to screen only. # Starting multiple PipelineManagers in the same script has the same problem, and # must therefore be run in interactive_mode. interactive_mode = multi or not hasattr(__main__, "__file__") if interactive_mode: - self.warning("Warning: You're running an interactive python session. " - "This works, but pypiper cannot tee the output, so results " - "are only logged to screen.") + self.warning( + "Warning: You're running an interactive python session. " + "This works, but pypiper cannot tee the output, so results " + "are only logged to screen." + ) else: sys.stdout = Unbuffered(sys.stdout) # sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # Unbuffer output @@ -483,10 +514,12 @@ def start_pipeline(self, args=None, multi=False): # manually (in the exit handler). # a for append to file - + tee = subprocess.Popen( - ["tee", "-a", self.pipeline_log_file], stdin=subprocess.PIPE, - preexec_fn=self._ignore_interrupts) + ["tee", "-a", self.pipeline_log_file], + stdin=subprocess.PIPE, + preexec_fn=self._ignore_interrupts, + ) # If the pipeline is terminated with SIGTERM/SIGINT, # make sure we kill this spawned tee subprocess as well. 
@@ -516,29 +549,83 @@ def start_pipeline(self, args=None, multi=False): try: # pypiper dir ppd = os.path.dirname(os.path.realpath(__file__)) - gitvars['pypiper_dir'] = ppd - gitvars['pypiper_hash'] = subprocess.check_output("cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pypiper_date'] = subprocess.check_output("cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pypiper_diff'] = subprocess.check_output("cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pypiper_branch'] = subprocess.check_output("cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True).decode().strip() + gitvars["pypiper_dir"] = ppd + gitvars["pypiper_hash"] = ( + subprocess.check_output( + "cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pypiper_date"] = ( + subprocess.check_output( + "cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pypiper_diff"] = ( + subprocess.check_output( + "cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True + ) + .decode() + .strip() + ) + gitvars["pypiper_branch"] = ( + subprocess.check_output( + "cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True + ) + .decode() + .strip() + ) except Exception: pass try: # pipeline dir pld = os.path.dirname(os.path.realpath(sys.argv[0])) - gitvars['pipe_dir'] = pld - gitvars['pipe_hash'] = subprocess.check_output("cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pipe_date'] = subprocess.check_output("cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pipe_diff'] = subprocess.check_output("cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True).decode().strip() - gitvars['pipe_branch'] = subprocess.check_output("cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True).decode().strip() + gitvars["pipe_dir"] = pld + gitvars["pipe_hash"] = ( + subprocess.check_output( + "cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pipe_date"] = ( + subprocess.check_output( + "cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", + shell=True, + ) + .decode() + .strip() + ) + gitvars["pipe_diff"] = ( + subprocess.check_output( + "cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True + ) + .decode() + .strip() + ) + gitvars["pipe_branch"] = ( + subprocess.check_output( + "cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True + ) + .decode() + .strip() + ) except Exception: pass - + # Print out a header section in the pipeline log: # Wrap things in backticks to prevent markdown from interpreting underscores as emphasis. 
# print("----------------------------------------") self.info("### Pipeline run code and environment:\n") - self.info("* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`") + self.info( + "* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`" + ) self.info("* " + "Compute host".rjust(20) + ": " + platform.node()) self.info("* " + "Working dir".rjust(20) + ": " + os.getcwd()) self.info("* " + "Outfolder".rjust(20) + ": " + self.outfolder) @@ -548,25 +635,75 @@ def start_pipeline(self, args=None, multi=False): self.info("\n### Version log:\n") self.info("* " + "Python version".rjust(20) + ": " + platform.python_version()) try: - self.info("* " + "Pypiper dir".rjust(20) + ": " + "`" + gitvars['pypiper_dir'].strip() + "`") + self.info( + "* " + + "Pypiper dir".rjust(20) + + ": " + + "`" + + gitvars["pypiper_dir"].strip() + + "`" + ) self.info("* " + "Pypiper version".rjust(20) + ": " + __version__) - self.info("* " + "Pypiper hash".rjust(20) + ": " + str(gitvars['pypiper_hash'])) - self.info("* " + "Pypiper branch".rjust(20) + ": " + str(gitvars['pypiper_branch'])) - self.info("* " + "Pypiper date".rjust(20) + ": " + str(gitvars['pypiper_date'])) - if gitvars['pypiper_diff']: - self.info("* " + "Pypiper diff".rjust(20) + ": " + str(gitvars['pypiper_diff'])) + self.info( + "* " + "Pypiper hash".rjust(20) + ": " + str(gitvars["pypiper_hash"]) + ) + self.info( + "* " + + "Pypiper branch".rjust(20) + + ": " + + str(gitvars["pypiper_branch"]) + ) + self.info( + "* " + "Pypiper date".rjust(20) + ": " + str(gitvars["pypiper_date"]) + ) + if gitvars["pypiper_diff"]: + self.info( + "* " + + "Pypiper diff".rjust(20) + + ": " + + str(gitvars["pypiper_diff"]) + ) except KeyError: # It is ok if keys aren't set, it means pypiper isn't in a git repo. pass try: - self.info("* " + "Pipeline dir".rjust(20) + ": " + "`" + gitvars['pipe_dir'].strip() + "`") - self.info("* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version)) - self.info("* " + "Pipeline hash".rjust(20) + ": " + str(gitvars['pipe_hash']).strip()) - self.info("* " + "Pipeline branch".rjust(20) + ": " + str(gitvars['pipe_branch']).strip()) - self.info("* " + "Pipeline date".rjust(20) + ": " + str(gitvars['pipe_date']).strip()) - if (gitvars['pipe_diff'] != ""): - self.info("* " + "Pipeline diff".rjust(20) + ": " + str(gitvars['pipe_diff']).strip()) + self.info( + "* " + + "Pipeline dir".rjust(20) + + ": " + + "`" + + gitvars["pipe_dir"].strip() + + "`" + ) + self.info( + "* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version) + ) + self.info( + "* " + + "Pipeline hash".rjust(20) + + ": " + + str(gitvars["pipe_hash"]).strip() + ) + self.info( + "* " + + "Pipeline branch".rjust(20) + + ": " + + str(gitvars["pipe_branch"]).strip() + ) + self.info( + "* " + + "Pipeline date".rjust(20) + + ": " + + str(gitvars["pipe_date"]).strip() + ) + if gitvars["pipe_diff"] != "": + self.info( + "* " + + "Pipeline diff".rjust(20) + + ": " + + str(gitvars["pipe_diff"]).strip() + ) except KeyError: # It is ok if keys aren't set, it means the pipeline isn't a git repo. 
pass @@ -578,17 +715,28 @@ def start_pipeline(self, args=None, multi=False): valtext = "`{}`".format(val) self.info("* {}: {}".format(argtext.rjust(20), valtext)) self.info("\n----------------------------------------\n") - self._set_status_flag(RUN_FLAG) + # self._set_status_flag(RUN_FLAG) + self.pipestat.set_status(status_identifier="running") # Record the start in PIPE_profile and PIPE_commands output files so we # can trace which run they belong to with open(self.pipeline_commands_file, "a") as myfile: - myfile.write("# Pipeline started at " + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + "\n\n") + myfile.write( + "# Pipeline started at " + + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + + "\n\n" + ) with open(self.pipeline_profile_file, "a") as myfile: - myfile.write("# Pipeline started at " + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) - + "\n\n" + "# " + "\t".join(PROFILE_COLNAMES) + "\n") + myfile.write( + "# Pipeline started at " + + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + + "\n\n" + + "# " + + "\t".join(PROFILE_COLNAMES) + + "\n" + ) def _set_status_flag(self, status): """ @@ -613,8 +761,7 @@ def _set_status_flag(self, status): prev_status = self.status self.status = status self._create_file(self._flag_file_path()) - self.debug("\nChanged status from {} to {}.".format( - prev_status, self.status)) + self.debug("\nChanged status from {} to {}.".format(prev_status, self.status)) def _flag_file_path(self, status=None): """ @@ -626,14 +773,23 @@ def _flag_file_path(self, status=None): :param str status: flag file type to create, default to current status :return str: path to flag file of indicated or current status. """ - flag_file_name = "{}_{}".format( - self.name, flag_name(status or self.status)) + flag_file_name = "{}_{}".format(self.name, flag_name(status or self.status)) return pipeline_filepath(self, filename=flag_file_name) ################################### # Process calling functions ################################### - def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=False, follow=None, container=None): + def run( + self, + cmd, + target=None, + lock_name=None, + shell=None, + nofail=False, + clean=False, + follow=None, + container=None, + ): """ The primary workhorse function of PipelineManager, this runs a command. @@ -671,8 +827,11 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= if not self._active: cmds = [cmd] if isinstance(cmd, str) else cmd cmds_text = [c if isinstance(c, str) else " ".join(c) for c in cmds] - self.info("Pipeline is inactive; skipping {} command(s):\n{}". 
- format(len(cmds), "\n".join(cmds_text))) + self.info( + "Pipeline is inactive; skipping {} command(s):\n{}".format( + len(cmds), "\n".join(cmds_text) + ) + ) return 0 # Short-circuit if the checkpoint file exists and the manager's not @@ -680,11 +839,13 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= if self.curr_checkpoint is not None: check_fpath = checkpoint_filepath(self.curr_checkpoint, self) if os.path.isfile(check_fpath) and not self.overwrite_checkpoints: - self.info("Checkpoint file exists for '{}' ('{}'), and the {} has " - "been configured to not overwrite checkpoints; " - "skipping command '{}'".format( - self.curr_checkpoint, check_fpath, - self.__class__.__name__, cmd)) + self.info( + "Checkpoint file exists for '{}' ('{}'), and the {} has " + "been configured to not overwrite checkpoints; " + "skipping command '{}'".format( + self.curr_checkpoint, check_fpath, self.__class__.__name__, cmd + ) + ) return 0 # TODO: consider making the logic such that locking isn't implied, or @@ -694,21 +855,26 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= # Therefore, a targetless command that you want # to lock must specify a lock_name manually. if target is None and lock_name is None: - self.fail_pipeline(Exception( - "You must provide either a target or a lock_name.")) + self.fail_pipeline( + Exception("You must provide either a target or a lock_name.") + ) # Downstream code requires target to be a list, so convert if only # a single item was given if not is_multi_target(target) and target is not None: target = [target] - # Downstream code requires a list of locks; convert + # Downstream code requires a list of locks; convert if isinstance(lock_name, str): lock_name = [lock_name] - + # Default lock_name (if not provided) is based on the target file name, # but placed in the parent pipeline outfolder - self.debug("Lock_name {}; target '{}', outfolder '{}'".format(lock_name, target, self.outfolder)) + self.debug( + "Lock_name {}; target '{}', outfolder '{}'".format( + lock_name, target, self.outfolder + ) + ) lock_name = lock_name or make_lock_name(target, self.outfolder) lock_files = [self._make_lock_path(ln) for ln in lock_name] @@ -720,8 +886,11 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean= call_follow = lambda: None elif not hasattr(follow, "__call__"): # Warn about non-callable argument to follow-up function. - self.warning("Follow-up function is not callable and won't be used: {}". - format(type(follow))) + self.warning( + "Follow-up function is not callable and won't be used: {}".format( + type(follow) + ) + ) call_follow = lambda: None else: # Wrap the follow-up function so that the log shows what's going on. @@ -732,7 +901,6 @@ def call_follow(): follow() self.in_follow = False - # The while=True loop here is unlikely to be triggered, and is just a # wrapper to prevent race conditions; the lock_file must be created by # the current loop. If not, we loop again and then re-do the tests. @@ -744,18 +912,22 @@ def call_follow(): # is found that needs to be recovered or overwritten. It instructs us to # ignore lock files on the next iteration. local_recover = False - local_newstart = False + local_newstart = False proceed_through_locks = False while True: ##### Tests block # Base case: All targets exists and not set to overwrite targets break loop, don't run process. 
# os.path.exists returns True for either a file or directory; .isfile is file-only - if target is not None and all([os.path.exists(t) for t in target]) \ - and not any([os.path.isfile(l) for l in lock_files]) \ - and not local_newstart: + if ( + target is not None + and all([os.path.exists(t) for t in target]) + and not any([os.path.isfile(l) for l in lock_files]) + and not local_newstart + ): for tgt in target: - if os.path.exists(tgt): self.info("Target exists: `" + tgt + "` ") + if os.path.exists(tgt): + self.info("Target exists: `" + tgt + "` ") if self.new_start: self.info("New start mode; run anyway. ") # Set the local_newstart flag so the command will run anyway. @@ -772,11 +944,17 @@ def call_follow(): for c in cmd: count = len(parse_cmd(c, shell)) self.proc_count += count - self.debug(increment_info_pattern.format(str(c), count, self.proc_count)) + self.debug( + increment_info_pattern.format( + str(c), count, self.proc_count + ) + ) else: count = len(parse_cmd(cmd, shell)) self.proc_count += count - self.debug(increment_info_pattern.format(str(cmd), count, self.proc_count)) + self.debug( + increment_info_pattern.format(str(cmd), count, self.proc_count) + ) break # Do not run command # Scenario 1: Lock file exists, but we're supposed to overwrite target; Run process. @@ -789,8 +967,10 @@ def call_follow(): self.info("Overwriting target...") proceed_through_locks = True elif os.path.isfile(recover_file): - self.info("Found dynamic recovery file ({}); " - "overwriting target...".format(recover_file)) + self.info( + "Found dynamic recovery file ({}); " + "overwriting target...".format(recover_file) + ) # remove the lock file which will then be promptly re-created for the current run. local_recover = True proceed_through_locks = True @@ -802,8 +982,7 @@ def call_follow(): # time (to see if the target exists now) continue - - # If you get to this point, the target doesn't exist, and the lock_file doesn't exist + # If you get to this point, the target doesn't exist, and the lock_file doesn't exist # (or we should overwrite). create the lock (if you can) # Initialize lock in master lock list for lock_file in lock_files: @@ -815,10 +994,13 @@ def call_follow(): self._create_file_racefree(lock_file) # Create lock except OSError as e: if e.errno == errno.EEXIST: # File already exists - self.info("Lock file created after test! Looping again: {}".format( - lock_file)) + self.info( + "Lock file created after test! Looping again: {}".format( + lock_file + ) + ) - # Since a lock file was created by a different source, + # Since a lock file was created by a different source, # we need to reset this flag to re-check the locks. proceed_through_locks = False continue # Go back to start @@ -827,22 +1009,30 @@ def call_follow(): # If you make it past these tests, we should proceed to run the process. if target is not None: - self.info("Target to produce: {} ".format(",".join(['`'+x+'`' for x in target]))) + self.info( + "Target to produce: {} ".format( + ",".join(["`" + x + "`" for x in target]) + ) + ) else: self.info("Targetless command, running... 
") if isinstance(cmd, list): # Handle command lists for cmd_i in cmd: - list_ret, maxmem = \ - self.callprint(cmd_i, shell, lock_file, nofail, container) + list_ret, maxmem = self.callprint( + cmd_i, shell, lock_file, nofail, container + ) maxmem = max(maxmem) if isinstance(maxmem, Iterable) else maxmem - local_maxmem = max(local_maxmem, maxmem) - list_ret = max(list_ret) if isinstance(list_ret, Iterable) else list_ret + local_maxmem = max(local_maxmem, maxmem) + list_ret = ( + max(list_ret) if isinstance(list_ret, Iterable) else list_ret + ) process_return_code = max(process_return_code, list_ret) else: # Single command (most common) - process_return_code, local_maxmem = \ - self.callprint(cmd, shell, lock_file, nofail, container) # Run command + process_return_code, local_maxmem = self.callprint( + cmd, shell, lock_file, nofail, container + ) # Run command if isinstance(process_return_code, list): process_return_code = max(process_return_code) @@ -866,7 +1056,7 @@ def checkprint(self, cmd, shell=None, nofail=False): """ Just like callprint, but checks output -- so you can get a variable in python corresponding to the return value of the command you call. - This is equivalent to running subprocess.check_output() + This is equivalent to running subprocess.check_output() instead of subprocess.call(). :param str | Iterable[str] cmd: Bash command(s) to be run. :param bool | str shell: If command requires should be run in its own shell. Optional. @@ -890,9 +1080,11 @@ def checkprint(self, cmd, shell=None, nofail=False): if not shell: if likely_shell: - self.debug("Should this command run in a shell instead of directly in a subprocess?") + self.debug( + "Should this command run in a shell instead of directly in a subprocess?" + ) cmd = shlex.split(cmd) - + try: return subprocess.check_output(cmd, shell=shell).decode().strip() except Exception as e: @@ -901,7 +1093,7 @@ def checkprint(self, cmd, shell=None, nofail=False): def _attend_process(self, proc, sleeptime): """ Waits on a process for a given time to see if it finishes, returns True - if it's still running after the given time or False as soon as it + if it's still running after the given time or False as soon as it returns. 
:param psutil.Popen proc: Process object opened by psutil.Popen() @@ -952,10 +1144,12 @@ def get_mem_child_sum(proc): if children: mem_sum += sum([x.memory_info().rss for x in children]) # return in gigs - return mem_sum/1e9 + return mem_sum / 1e9 except (psutil.NoSuchProcess, psutil.ZombieProcess) as e: self.warning(e) - self.warning("Warning: couldn't add memory use for process: {}".format(proc.pid)) + self.warning( + "Warning: couldn't add memory use for process: {}".format(proc.pid) + ) return 0 def display_memory(memval): @@ -970,7 +1164,11 @@ def make_hash(o): try: hsh = md5(str(o).encode("utf-8")).hexdigest()[:10] except Exception as e: - self.debug("Could not create hash for '{}', caught exception: {}".format(str(o), e.__class__.__name__)) + self.debug( + "Could not create hash for '{}', caught exception: {}".format( + str(o), e.__class__.__name__ + ) + ) hsh = None return hsh @@ -1003,7 +1201,7 @@ def make_hash(o): "container": container, "p": processes[-1], "args_hash": make_hash(conc_cmd), - "local_proc_id": self.process_counter() + "local_proc_id": self.process_counter(), } self._report_command(cmd, [x.pid for x in processes]) @@ -1029,16 +1227,22 @@ def proc_wrapup(i): current_pid = processes[i].pid info = "PID: {pid};\tCommand: {cmd};\tReturn code: {ret};\tMemory used: {mem}".format( - pid=current_pid, + pid=current_pid, cmd=self.running_procs[current_pid]["proc_name"], ret=processes[i].returncode, - mem=display_memory(local_maxmems[i])) - + mem=display_memory(local_maxmems[i]), + ) + # report process profile - self._report_profile(self.running_procs[current_pid]["proc_name"], lock_file, - time.time() - self.running_procs[current_pid]["start_time"], local_maxmems[i], - current_pid, self.running_procs[current_pid]["args_hash"], - self.running_procs[current_pid]["local_proc_id"]) + self._report_profile( + self.running_procs[current_pid]["proc_name"], + lock_file, + time.time() - self.running_procs[current_pid]["start_time"], + local_maxmems[i], + current_pid, + self.running_procs[current_pid]["args_hash"], + self.running_procs[current_pid]["local_proc_id"], + ) # Remove this as a running subprocess self.running_procs[current_pid]["info"] = info @@ -1051,29 +1255,37 @@ def proc_wrapup(i): returncodes[i] = returncode return info - sleeptime = .0001 - + sleeptime = 0.0001 + while running_processes: self.debug("running") for i in running_processes: - local_maxmems[i] = max(local_maxmems[i], (get_mem_child_sum(processes[i]))) + local_maxmems[i] = max( + local_maxmems[i], (get_mem_child_sum(processes[i])) + ) self.peak_memory = max(self.peak_memory, local_maxmems[i]) self.debug(processes[i]) if not self._attend_process(processes[i], sleeptime): proc_wrapup_text[i] = proc_wrapup(i) - # the sleeptime is extremely short at the beginning and gets longer exponentially + # the sleeptime is extremely short at the beginning and gets longer exponentially # (+ constant to prevent copious checks at the very beginning) # = more precise mem tracing for short processes - sleeptime = min((sleeptime + 0.25) * 3, 60/len(processes)) + sleeptime = min((sleeptime + 0.25) * 3, 60 / len(processes)) # All jobs are done, print a final closing and job info stop_time = time.time() proc_message = "Command completed. {info}" - info = "Elapsed time: " + str(datetime.timedelta(seconds=self.time_elapsed(start_time))) + "." - info += " Running peak memory: {pipe}.".format(pipe=display_memory(self.peak_memory)) + info = ( + "Elapsed time: " + + str(datetime.timedelta(seconds=self.time_elapsed(start_time))) + + "." 
+ ) + info += " Running peak memory: {pipe}.".format( + pipe=display_memory(self.peak_memory) + ) # if len(proc_wrapup_text) == 1: - # info += " {}".format(proc_wrapup_text[0]) + # info += " {}".format(proc_wrapup_text[0]) for i in completed_processes: info += " \n {}".format(self.completed_procs[processes[i].pid]["info"]) @@ -1084,7 +1296,9 @@ def proc_wrapup(i): for rc in returncodes: if rc != 0: - msg = "Subprocess returned nonzero result. Check above output for details" + msg = ( + "Subprocess returned nonzero result. Check above output for details" + ) self._triage_error(SubprocessError(msg), nofail) return [returncodes, local_maxmems] @@ -1119,7 +1333,7 @@ def _wait_for_process(self, p, shell=False): :param bool shell: If command requires should be run in its own shell. Optional. Default: False. """ local_maxmem = -1 - sleeptime = .5 + sleeptime = 0.5 while p.poll() is None: if not shell: local_maxmem = max(local_maxmem, self._memory_usage(p.pid) / 1e6) @@ -1128,7 +1342,7 @@ def _wait_for_process(self, p, shell=False): sleeptime = min(sleeptime + 5, 60) self.peak_memory = max(self.peak_memory, local_maxmem) - + del self.running_procs[p.pid] info = "Process " + str(p.pid) + " returned: (" + str(p.returncode) + ")." @@ -1147,7 +1361,7 @@ def _wait_for_lock(self, lock_file): :param str lock_file: Lock file to wait upon. """ - sleeptime = .5 + sleeptime = 0.5 first_message_flag = False long_message_flag = False dot_count = 0 @@ -1156,12 +1370,15 @@ def _wait_for_lock(self, lock_file): while os.path.isfile(lock_file): if first_message_flag is False: self.timestamp("Waiting for file lock: " + lock_file) - self.warning("This indicates that another process may be executing this " + self.warning( + "This indicates that another process may be executing this " "command, or the pipeline was not properly shut down. If the " "pipeline was not properly shut down last time, " "you should restart it in 'recover' mode (-R) to indicate that " - "this step should be restarted.") - self._set_status_flag(WAIT_FLAG) + "this step should be restarted." + ) + # self._set_status_flag(WAIT_FLAG) + self.pipestat.set_status(status_identifier="waiting") first_message_flag = True else: sys.stdout.write(".") @@ -1181,7 +1398,8 @@ def _wait_for_lock(self, lock_file): if first_message_flag: self.timestamp("File unlocked.") - self._set_status_flag(RUN_FLAG) + # self._set_status_flag(RUN_FLAG) + self.pipestat.set_status(status_identifier="running") ################################### # Logging functions @@ -1205,8 +1423,7 @@ def critical(self, msg, *args, **kwargs): def fatal(self, msg, *args, **kwargs): self._logger.fatal(msg, *args, **kwargs) - def timestamp(self, message="", checkpoint=None, - finished=False, raise_error=True): + def timestamp(self, message="", checkpoint=None, finished=False, raise_error=True): """ Print message, time, and time elapsed, perhaps creating checkpoint. @@ -1249,7 +1466,9 @@ def timestamp(self, message="", checkpoint=None, self.curr_checkpoint = checkpoint self._checkpoint(self.prev_checkpoint) # Handle the two halting conditions. - if (finished and checkpoint == self.stop_after) or (not finished and checkpoint == self.stop_before): + if (finished and checkpoint == self.stop_after) or ( + not finished and checkpoint == self.stop_before + ): self.halt(checkpoint, finished, raise_error=raise_error) # Determine if we've started executing. 
elif checkpoint == self.start_point: @@ -1263,13 +1482,17 @@ def timestamp(self, message="", checkpoint=None, elapsed = self.time_elapsed(self.last_timestamp) t = time.strftime("%m-%d %H:%M:%S") if checkpoint is None: - msg = "{m} ({t}) elapsed: {delta_t} _TIME_".\ - format(m=message, t=t, delta_t=elapsed) + msg = "{m} ({t}) elapsed: {delta_t} _TIME_".format( + m=message, t=t, delta_t=elapsed + ) else: - msg = "{m} ({t}) ({status} {stage}) elapsed: {delta_t} _TIME_".\ - format(m=message, t=t, - status="finished" if finished else "starting", - stage=checkpoint, delta_t=elapsed) + msg = "{m} ({t}) ({status} {stage}) elapsed: {delta_t} _TIME_".format( + m=message, + t=t, + status="finished" if finished else "starting", + stage=checkpoint, + delta_t=elapsed, + ) if re.match("^###", message): msg = "\n{}\n".format(msg) self.info(msg) @@ -1284,25 +1507,39 @@ def time_elapsed(time_since): """ return round(time.time() - time_since, 0) - def _report_profile(self, command, lock_name, elapsed_time, memory, pid, args_hash, proc_count): + def _report_profile( + self, command, lock_name, elapsed_time, memory, pid, args_hash, proc_count + ): """ Writes a string to self.pipeline_profile_file. """ - rel_lock_name = lock_name if lock_name is None else os.path.relpath(lock_name, self.outfolder) - message_raw = str(pid) + "\t" + \ - str(args_hash) + "\t" + \ - str(proc_count) + "\t" + \ - str(datetime.timedelta(seconds=round(elapsed_time, 2))) + "\t " + \ - str(round(memory, 4)) + "\t" + \ - str(command) + "\t" + \ - str(rel_lock_name) + rel_lock_name = ( + lock_name + if lock_name is None + else os.path.relpath(lock_name, self.outfolder) + ) + message_raw = ( + str(pid) + + "\t" + + str(args_hash) + + "\t" + + str(proc_count) + + "\t" + + str(datetime.timedelta(seconds=round(elapsed_time, 2))) + + "\t " + + str(round(memory, 4)) + + "\t" + + str(command) + + "\t" + + str(rel_lock_name) + ) with open(self.pipeline_profile_file, "a") as myfile: myfile.write(message_raw + "\n") def report_result(self, key, value, annotation=None, nolog=False): """ Writes a string to self.pipeline_stats_file. - + :param str key: name (key) of the stat :param str annotation: By default, the stats will be annotated with the pipeline name, so you can tell which pipeline records which stats. @@ -1321,10 +1558,12 @@ def report_result(self, key, value, annotation=None, nolog=False): # keep the value in memory: self.stats_dict[key] = value message_raw = "{key}\t{value}\t{annotation}".format( - key=key, value=value, annotation=annotation) + key=key, value=value, annotation=annotation + ) message_markdown = "\n> `{key}`\t{value}\t{annotation}\t_RES_".format( - key=key, value=value, annotation=annotation) + key=key, value=value, annotation=annotation + ) if not nolog: self.info(message_markdown) @@ -1333,7 +1572,9 @@ def report_result(self, key, value, annotation=None, nolog=False): # in case multiple pipelines write to the same file. self._safe_write_to_file(self.pipeline_stats_file, message_raw) - def report_object(self, key, filename, anchor_text=None, anchor_image=None, annotation=None): + def report_object( + self, key, filename, anchor_text=None, anchor_image=None, annotation=None + ): """ Writes a string to self.pipeline_objects_file. Used to report figures and others. 
@@ -1363,22 +1604,38 @@ def report_object(self, key, filename, anchor_text=None, anchor_image=None, anno # better to use a relative path in this file # convert any absolute paths into relative paths - relative_filename = os.path.relpath(filename, self.outfolder) \ - if os.path.isabs(filename) else filename + relative_filename = ( + os.path.relpath(filename, self.outfolder) + if os.path.isabs(filename) + else filename + ) if anchor_image: - relative_anchor_image = os.path.relpath(anchor_image, self.outfolder) \ - if os.path.isabs(anchor_image) else anchor_image + relative_anchor_image = ( + os.path.relpath(anchor_image, self.outfolder) + if os.path.isabs(anchor_image) + else anchor_image + ) else: relative_anchor_image = "None" - message_raw = "{key}\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format( - key=key, filename=relative_filename, anchor_text=anchor_text, - anchor_image=relative_anchor_image, annotation=annotation) + message_raw = ( + "{key}\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format( + key=key, + filename=relative_filename, + anchor_text=anchor_text, + anchor_image=relative_anchor_image, + annotation=annotation, + ) + ) message_markdown = "> `{key}`\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}\t_OBJ_".format( - key=key, filename=relative_filename, anchor_text=anchor_text, - anchor_image=relative_anchor_image, annotation=annotation) + key=key, + filename=relative_filename, + anchor_text=anchor_text, + anchor_image=relative_anchor_image, + annotation=annotation, + ) self.warning(message_markdown) @@ -1388,6 +1645,11 @@ def _safe_write_to_file(self, file, message): """ Writes a string to a file safely (with file locks). """ + warnings.warn( + "This function may be removed in future release. " + "The recommended way to report pipeline results is using PipelineManager.pipestat.report().", + category=DeprecationWarning, + ) target = file lock_name = make_lock_name(target, self.outfolder) lock_file = self._make_lock_path(lock_name) @@ -1410,13 +1672,13 @@ def _safe_write_to_file(self, file, message): os.remove(lock_file) self.locks.remove(lock_file) - + # If you make it to the end of the while loop, you're done break def _report_command(self, cmd, procs=None): """ - Writes a command to both stdout and to the commands log file + Writes a command to both stdout and to the commands log file (self.pipeline_commands_file). :param str cmd: command to report @@ -1445,22 +1707,22 @@ def _report_command(self, cmd, procs=None): @staticmethod def _create_file(file): """ - Creates a file, but will not fail if the file already exists. - This is vulnerable to race conditions; use this for cases where it + Creates a file, but will not fail if the file already exists. + This is vulnerable to race conditions; use this for cases where it doesn't matter if this process is the one that created the file. :param str file: File to create. """ - with open(file, 'w') as fout: - fout.write('') + with open(file, "w") as fout: + fout.write("") @staticmethod def _create_file_racefree(file): """ Creates a file, but fails if the file already exists. - - This function will thus only succeed if this process actually creates - the file; if the file already exists, it will cause an OSError, + + This function will thus only succeed if this process actually creates + the file; if the file already exists, it will cause an OSError, solving race conditions. :param str file: File to create. 
@@ -1471,15 +1733,18 @@ def _create_file_racefree(file): @staticmethod def _ensure_lock_prefix(lock_name_base): - """ Ensure that an alleged lock file is correctly prefixed. """ - return lock_name_base if lock_name_base.startswith(LOCK_PREFIX) \ - else LOCK_PREFIX + lock_name_base + """Ensure that an alleged lock file is correctly prefixed.""" + return ( + lock_name_base + if lock_name_base.startswith(LOCK_PREFIX) + else LOCK_PREFIX + lock_name_base + ) def _make_lock_path(self, lock_name_base): """ Create path to lock file with given name as base. - - :param str lock_name_base: Lock file name, designed to not be prefixed + + :param str lock_name_base: Lock file name, designed to not be prefixed with the lock file designation, but that's permitted. :return str: Path to the lock file. """ @@ -1496,8 +1761,8 @@ def _make_lock_path(self, lock_name_base): def _recoverfile_from_lockfile(self, lockfile): """ Create path to recovery file with given name as base. - - :param str lockfile: Name of file on which to base this path, + + :param str lockfile: Name of file on which to base this path, perhaps already prefixed with the designation of a lock file. :return str: Path to recovery file. """ @@ -1513,7 +1778,7 @@ def make_sure_path_exists(path): Creates all directories in a path if it does not exist. :param str path: Path to create. - :raises Exception: if the path creation attempt hits an error with + :raises Exception: if the path creation attempt hits an error with a code indicating a cause other than pre-existence. """ try: @@ -1533,36 +1798,41 @@ def _refresh_stats(self): """ # regex identifies all possible stats files. - #regex = self.outfolder + "*_stats.tsv" - #stats_files = glob.glob(regex) - #stats_files.insert(self.pipeline_stats_file) # last one is the current pipeline - #for stats_file in stats_files: + # regex = self.outfolder + "*_stats.tsv" + # stats_files = glob.glob(regex) + # stats_files.insert(self.pipeline_stats_file) # last one is the current pipeline + # for stats_file in stats_files: stats_file = self.pipeline_stats_file if os.path.isfile(self.pipeline_stats_file): - with open(stats_file, 'r') as stat_file: + with open(stats_file, "r") as stat_file: for line in stat_file: try: # Someone may have put something that's not 3 columns in the stats file # if so, shame on him, but we can just ignore it. - key, value, annotation = line.split('\t') + key, value, annotation = line.split("\t") except ValueError: - self.warning("WARNING: Each row in a stats file is expected to have 3 columns") - - if annotation.rstrip() == self.name or annotation.rstrip() == "shared": + self.warning( + "WARNING: Each row in a stats file is expected to have 3 columns" + ) + + if ( + annotation.rstrip() == self.name + or annotation.rstrip() == "shared" + ): self.stats_dict[key] = value.strip() - #if os.path.isfile(self.pipeline_stats_file): + # if os.path.isfile(self.pipeline_stats_file): def get_stat(self, key): """ Returns a stat that was previously reported. This is necessary for reporting new stats that are - derived from two stats, one of which may have been reported by an earlier run. For example, + derived from two stats, one of which may have been reported by an earlier run. 
For example, if you first use report_result to report (number of trimmed reads), and then in a later stage - want to report alignment rate, then this second stat (alignment rate) will require knowing the + want to report alignment rate, then this second stat (alignment rate) will require knowing the first stat (number of trimmed reads); however, that may not have been calculated in the current pipeline run, so we must retrieve it from the stats.tsv output file. This command will retrieve such previously reported stats if they were not already calculated in the current pipeline run. - :param key: key of stat to retrieve + :param key: key of stat to retrieve """ try: @@ -1622,9 +1892,12 @@ def _checkpoint(self, stage): # be expected to characterize the extension of a file name/path. base, ext = os.path.splitext(stage) if ext and "." not in base: - self.warning("WARNING: '{}' looks like it may be the name or path of " - "a file; for such a checkpoint, use touch_checkpoint.". - format(stage)) + self.warning( + "WARNING: '{}' looks like it may be the name or path of " + "a file; for such a checkpoint, use touch_checkpoint.".format( + stage + ) + ) else: if not is_checkpoint: self.warning("Not a checkpoint: {}".format(stage)) @@ -1656,9 +1929,12 @@ def _touch_checkpoint(self, check_file): other_folder = os.path.join(folder, "") this_folder = os.path.join(self.outfolder, "") if other_folder != this_folder: - errmsg = "Path provided as checkpoint file isn't in pipeline " \ - "output folder. '{}' is not in '{}'".format( - check_file, self.outfolder) + errmsg = ( + "Path provided as checkpoint file isn't in pipeline " + "output folder. '{}' is not in '{}'".format( + check_file, self.outfolder + ) + ) raise ValueError(errmsg) fpath = check_file else: @@ -1667,14 +1943,14 @@ def _touch_checkpoint(self, check_file): # Create/update timestamp for checkpoint, but base return value on # whether the action was a simple update or a novel creation. already_exists = os.path.isfile(fpath) - open(fpath, 'w').close() + open(fpath, "w").close() action = "Updated" if already_exists else "Created" self.info("{} checkpoint file: '{}'".format(action, fpath)) return already_exists def complete(self): - """ Stop a completely finished pipeline. 
""" + """Stop a completely finished pipeline.""" self.stop_pipeline(status=COMPLETE_FLAG) def fail_pipeline(self, exc, dynamic_recover=False): @@ -1712,7 +1988,8 @@ def fail_pipeline(self, exc, dynamic_recover=False): total_time = datetime.timedelta(seconds=self.time_elapsed(self.starttime)) self.info("Total time: " + str(total_time)) self.info("Failure reason: " + str(exc)) - self._set_status_flag(FAIL_FLAG) + # self._set_status_flag(FAIL_FLAG) + self.pipestat.set_status(status_identifier="failed") if isinstance(exc, str): exc = RuntimeError(exc) @@ -1743,16 +2020,21 @@ def get_elapsed_time(self): :return int: sum of runtimes in seconds """ if os.path.isfile(self.pipeline_profile_file): - df = _pd.read_csv(self.pipeline_profile_file, sep="\t", comment="#", names=PROFILE_COLNAMES) + df = _pd.read_csv( + self.pipeline_profile_file, + sep="\t", + comment="#", + names=PROFILE_COLNAMES, + ) try: - df['runtime'] = _pd.to_timedelta(df['runtime']) + df["runtime"] = _pd.to_timedelta(df["runtime"]) except ValueError: # return runtime estimate # this happens if old profile style is mixed with the new one # and the columns do not match return self.time_elapsed(self.starttime) - unique_df = df[~df.duplicated('cid', keep='last').values] - return sum(unique_df['runtime'].apply(lambda x: x.total_seconds())) + unique_df = df[~df.duplicated("cid", keep="last").values] + return sum(unique_df["runtime"].apply(lambda x: x.total_seconds())) return self.time_elapsed(self.starttime) def stop_pipeline(self, status=COMPLETE_FLAG): @@ -1761,30 +2043,39 @@ def stop_pipeline(self, status=COMPLETE_FLAG): This is the "healthy" pipeline completion function. The normal pipeline completion function, to be run by the pipeline - at the end of the script. It sets status flag to completed and records + at the end of the script. It sets status flag to completed and records some time and memory statistics to the log file. """ - self._set_status_flag(status) + # self._set_status_flag(status) + self.pipestat.set_status(status_identifier=status) self._cleanup() - elapsed_time_this_run = str(datetime.timedelta(seconds=self.time_elapsed(self.starttime))) - self.report_result("Time", - elapsed_time_this_run, - nolog=True) - self.report_result("Success", - time.strftime("%m-%d-%H:%M:%S"), - nolog=True) + elapsed_time_this_run = str( + datetime.timedelta(seconds=self.time_elapsed(self.starttime)) + ) + self.report_result("Time", elapsed_time_this_run, nolog=True) + self.report_result("Success", time.strftime("%m-%d-%H:%M:%S"), nolog=True) self.info("\n### Pipeline completed. 
Epilogue") # print("* " + "Total elapsed time".rjust(20) + ": " # + str(datetime.timedelta(seconds=self.time_elapsed(self.starttime)))) - self.info("* " + "Elapsed time (this run)".rjust(30) + ": " + - elapsed_time_this_run) - self.info("* " + "Total elapsed time (all runs)".rjust(30) + ": " + - str(datetime.timedelta(seconds=round(self.get_elapsed_time())))) - self.info("* " + "Peak memory (this run)".rjust(30) + ": " + - str(round(self.peak_memory, 4)) + " GB") - # self.info("* " + "Total peak memory (all runs)".rjust(30) + ": " + - # str(round(self.peak_memory, 4)) + " GB") + self.info( + "* " + "Elapsed time (this run)".rjust(30) + ": " + elapsed_time_this_run + ) + self.info( + "* " + + "Total elapsed time (all runs)".rjust(30) + + ": " + + str(datetime.timedelta(seconds=round(self.get_elapsed_time()))) + ) + self.info( + "* " + + "Peak memory (this run)".rjust(30) + + ": " + + str(round(self.peak_memory, 4)) + + " GB" + ) + # self.info("* " + "Total peak memory (all runs)".rjust(30) + ": " + + # str(round(self.peak_memory, 4)) + " GB") if self.halted: return @@ -1805,7 +2096,7 @@ def _signal_term_handler(self, signal, frame): """ signal_type = "SIGTERM" self._generic_signal_handler(signal_type) - + def _generic_signal_handler(self, signal_type): """ Function for handling both SIGTERM and SIGINT @@ -1824,7 +2115,7 @@ def _generic_signal_handler(self, signal_type): # passed directly to the tee subprocess, so I could handle that on # my own; hence, now I believe I no longer need to do this. I'm # leaving this code here as a relic in case something comes up. - #with open(self.pipeline_log_file, "a") as myfile: + # with open(self.pipeline_log_file, "a") as myfile: # myfile.write(message + "\n") def _signal_int_handler(self, signal, frame): @@ -1859,7 +2150,7 @@ def _exit_handler(self): self.fail_pipeline(Exception("Pipeline failure. See details above.")) if self.tee: - self.tee.kill() + self.tee.kill() def _terminate_running_subprocesses(self): @@ -1870,9 +2161,18 @@ def _terminate_running_subprocesses(self): # Close the preformat tag that we opened when the process was spawned. # record profile of any running processes before killing elapsed_time = time.time() - self.running_procs[pid]["start_time"] - process_peak_mem = self._memory_usage(pid, container=proc_dict["container"])/1e6 - self._report_profile(self.running_procs[pid]["proc_name"], None, elapsed_time, process_peak_mem, pid, - self.running_procs[pid]["args_hash"], self.running_procs[pid]["local_proc_id"]) + process_peak_mem = ( + self._memory_usage(pid, container=proc_dict["container"]) / 1e6 + ) + self._report_profile( + self.running_procs[pid]["proc_name"], + None, + elapsed_time, + process_peak_mem, + pid, + self.running_procs[pid]["args_hash"], + self.running_procs[pid]["local_proc_id"], + ) self._kill_child_process(pid, proc_dict["proc_name"]) del self.running_procs[pid] @@ -1902,10 +2202,10 @@ def pskill(proc_pid, sig=signal.SIGINT): if proc_name: proc_string = " ({proc_name})".format(proc_name=proc_name) - # First a gentle kill + # First a gentle kill sys.stdout.flush() still_running = self._attend_process(psutil.Process(child_pid), 0) - sleeptime = .25 + sleeptime = 0.25 time_waiting = 0 while still_running and time_waiting < 3: @@ -1933,9 +2233,12 @@ def pskill(proc_pid, sig=signal.SIGINT): if still_running: # still running!? - self.warning("Child process {child_pid}{proc_string} never responded" - "I just can't take it anymore. 
I don't know what to do...".format(child_pid=child_pid, - proc_string=proc_string)) + self.warning( + "Child process {child_pid}{proc_string} never responded" + "I just can't take it anymore. I don't know what to do...".format( + child_pid=child_pid, proc_string=proc_string + ) + ) else: if time_waiting > 0: note = "terminated after {time} sec".format(time=int(time_waiting)) @@ -1943,12 +2246,13 @@ def pskill(proc_pid, sig=signal.SIGINT): note = "was already terminated" msg = "Child process {child_pid}{proc_string} {note}.".format( - child_pid=child_pid, proc_string=proc_string, note=note) + child_pid=child_pid, proc_string=proc_string, note=note + ) self.info(msg) @staticmethod def _atexit_register(*args): - """ Convenience alias to register exit functions without having to import atexit in the pipeline. """ + """Convenience alias to register exit functions without having to import atexit in the pipeline.""" atexit.register(*args) def get_container(self, image, mounts): @@ -2014,11 +2318,17 @@ def clean_add(self, regex, conditional=False, manual=False): try: with open(self.cleanup_file, "a") as myfile: if os.path.isabs(filename): - relative_filename = os.path.relpath(filename, self.outfolder) + relative_filename = os.path.relpath( + filename, self.outfolder + ) absolute_filename = filename else: - relative_filename = os.path.relpath(filename, self.outfolder) - absolute_filename = os.path.abspath(os.path.join(self.outfolder, relative_filename)) + relative_filename = os.path.relpath( + filename, self.outfolder + ) + absolute_filename = os.path.abspath( + os.path.join(self.outfolder, relative_filename) + ) if os.path.isfile(absolute_filename): # print("Adding file to cleanup: {}".format(filename)) myfile.write("rm " + relative_filename + "\n") @@ -2029,9 +2339,15 @@ def clean_add(self, regex, conditional=False, manual=False): # and the directory itself myfile.write("rmdir " + relative_filename + "\n") else: - self.info("File not added to cleanup: {}".format(relative_filename)) + self.info( + "File not added to cleanup: {}".format( + relative_filename + ) + ) except Exception as e: - self.error("Error in clean_add on path {}: {}".format(filename, str(e))) + self.error( + "Error in clean_add on path {}: {}".format(filename, str(e)) + ) elif conditional: self.cleanup_list_conditional.append(regex) else: @@ -2058,9 +2374,11 @@ def _cleanup(self, dry_run=False): n_to_clean_cond = len(self.cleanup_list_conditional) if n_to_clean + n_to_clean_cond > 0: - self.info("Starting cleanup: {} files; {} conditional files for cleanup".format( - n_to_clean, - n_to_clean_cond)) + self.info( + "Starting cleanup: {} files; {} conditional files for cleanup".format( + n_to_clean, n_to_clean_cond + ) + ) else: self.debug("No files to clean.") @@ -2094,9 +2412,12 @@ def _cleanup(self, dry_run=False): if n_to_clean_cond > 0: run_flag = flag_name(RUN_FLAG) - flag_files = [fn for fn in glob.glob(self.outfolder + flag_name("*")) - if COMPLETE_FLAG not in os.path.basename(fn) - and not "{}_{}".format(self.name, run_flag) == os.path.basename(fn)] + flag_files = [ + fn + for fn in glob.glob(self.outfolder + flag_name("*")) + if COMPLETE_FLAG not in os.path.basename(fn) + and not "{}_{}".format(self.name, run_flag) == os.path.basename(fn) + ] if len(flag_files) == 0 and not dry_run: self.info("\nCleaning up conditional list. . 
.") for expr in self.cleanup_list_conditional: @@ -2115,9 +2436,14 @@ def _cleanup(self, dry_run=False): except: pass else: - self.info("\nConditional flag found: " + str([os.path.basename(i) for i in flag_files])) - self.info("\nThese conditional files were left in place:\n\n- " + - "\n- ".join(self.cleanup_list_conditional)) + self.info( + "\nConditional flag found: " + + str([os.path.basename(i) for i in flag_files]) + ) + self.info( + "\nThese conditional files were left in place:\n\n- " + + "\n- ".join(self.cleanup_list_conditional) + ) # Produce a cleanup script. no_cleanup_script = [] for cleandir in self.cleanup_list_conditional: @@ -2131,10 +2457,13 @@ def _cleanup(self, dry_run=False): clean_script.write("rmdir " + clean_item + "\n") except Exception as e: no_cleanup_script.append(cleandir) - if no_cleanup_script: - self.warning('\n\nCould not produce cleanup script for item(s):\n\n- ' + '\n- '.join(no_cleanup_script)) + if no_cleanup_script: + self.warning( + "\n\nCould not produce cleanup script for item(s):\n\n- " + + "\n- ".join(no_cleanup_script) + ) - def _memory_usage(self, pid='self', category="hwm", container=None): + def _memory_usage(self, pid="self", category="hwm", container=None): """ Memory usage of the process in kilobytes. @@ -2147,8 +2476,8 @@ def _memory_usage(self, pid='self', category="hwm", container=None): cmd = "docker stats " + container + " --format '{{.MemUsage}}' --no-stream" mem_use_str = subprocess.check_output(cmd, shell=True).decode() - mem_num = re.findall('[\d\.]+', mem_use_str.split("/")[0])[0] - mem_scale = re.findall('[A-Za-z]+', mem_use_str.split("/")[0])[0] + mem_num = re.findall("[\d\.]+", mem_use_str.split("/")[0])[0] + mem_scale = re.findall("[A-Za-z]+", mem_use_str.split("/")[0])[0] mem_num = float(mem_num) if mem_scale == "GiB": @@ -2163,13 +2492,13 @@ def _memory_usage(self, pid='self', category="hwm", container=None): # Thanks Martin Geisler: status = None - result = {'peak': 0, 'rss': 0, 'hwm': 0} - + result = {"peak": 0, "rss": 0, "hwm": 0} + try: # This will only work on systems with a /proc file system # (like Linux). # status = open('/proc/self/status') - proc_spot = '/proc/%s/status' % pid + proc_spot = "/proc/%s/status" % pid status = open(proc_spot) for line in status: parts = line.split() @@ -2186,13 +2515,17 @@ def _memory_usage(self, pid='self', category="hwm", container=None): return result[category] def _triage_error(self, e, nofail): - """ Print a message and decide what to do about an error. """ + """Print a message and decide what to do about an error.""" if not nofail: self.fail_pipeline(e) elif self._failed: - self.info("This is a nofail process, but the pipeline was terminated for other reasons, so we fail.") + self.info( + "This is a nofail process, but the pipeline was terminated for other reasons, so we fail." + ) raise e else: self.error(e) - self.error("ERROR: Subprocess returned nonzero result, but pipeline is continuing because nofail=True") + self.error( + "ERROR: Subprocess returned nonzero result, but pipeline is continuing because nofail=True" + ) # TODO: return nonzero, or something. . .? 
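A minimal usage sketch of the pipestat-backed reporting path that the manager.py changes above introduce, assuming the manager can be handed a pre-built PipestatManager. The PipestatManager keyword arguments mirror those used in this file; the schema path, namespace, record identifier, reported key, and the pipestat_manager= wiring are illustrative assumptions, and exact pipestat signatures may differ between releases.

    import pypiper
    from pipestat import PipestatManager

    # Assumed example names; a real pipeline would supply its own schema and sample.
    psm = PipestatManager(
        namespace="count_lines",
        record_identifier="sample1",
        schema_path="pipestat_output_schema.yaml",
        results_file_path="pipestat_results.yaml",
    )
    pm = pypiper.PipelineManager(
        name="count_lines", outfolder="output", pipestat_manager=psm
    )

    pm.start_pipeline()  # status is set to "running" through pipestat
    pm.run("wc -l input.txt > output/lines.txt", target="output/lines.txt")
    # Replaces the stats-file path (pm.report_result); writes to the pipestat backend.
    pm.pipestat.report(values={"number_of_lines": 42})
    pm.stop_pipeline()   # status is set to "completed" through pipestat

Reporting through pm.pipestat keeps result storage (YAML file or PostgreSQL) decoupled from the pipeline code, which is the motivation for deprecating the tab-separated stats file written by _safe_write_to_file.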
diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py index 9977325d..53d1ad38 100755 --- a/pypiper/ngstk.py +++ b/pypiper/ngstk.py @@ -1,11 +1,13 @@ """ Broadly applicable NGS processing/analysis functionality """ +import errno import os import re import subprocess -import errno + from attmap import AttMapEcho from yacman import load_yaml + from .exceptions import UnsupportedFiletypeException from .utils import is_fastq, is_gzipped_fastq, is_sam_or_bam @@ -43,7 +45,8 @@ def __init__(self, config_file=None, pm=None): # parse yaml into the project's attributes # self.add_entries(**config) super(NGSTk, self).__init__( - None if config_file is None else load_yaml(config_file)) + None if config_file is None else load_yaml(config_file) + ) # Keep a link to the pipeline manager, if one is provided. # if None is provided, instantiate "tools" and "parameters" with empty AttMaps @@ -63,12 +66,15 @@ def __init__(self, config_file=None, pm=None): self.parameters = AttMapEcho() # If pigz is available, use that. Otherwise, default to gzip. - if hasattr(self.pm, "cores") and self.pm.cores > 1 and self.check_command("pigz"): + if ( + hasattr(self.pm, "cores") + and self.pm.cores > 1 + and self.check_command("pigz") + ): self.ziptool_cmd = "pigz -f -p {}".format(self.pm.cores) else: self.ziptool_cmd = "gzip -f" - def _ensure_folders(self, *paths): """ Ensure that paths to folder(s) exist. @@ -90,7 +96,6 @@ def _ensure_folders(self, *paths): # Otherwise, just ensure that we have path to file's folder. self.make_dir(fpath if ext else p) - @property def ziptool(self): """ @@ -100,7 +105,6 @@ def ziptool(self): """ return self.ziptool_cmd - def make_dir(self, path): """ Forge path to directory, creating intermediates as needed. @@ -113,12 +117,10 @@ def make_dir(self, path): if exception.errno != errno.EEXIST: raise - def make_sure_path_exists(self, path): - """ Alias for make_dir """ + """Alias for make_dir""" self.make_dir(path) - # Borrowed from looper def check_command(self, command): """ @@ -126,7 +128,9 @@ def check_command(self, command): """ # Use `command` to see if command is callable, store exit code - code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)) + code = os.system( + "command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command) + ) # If exit code is not 0, report which command failed and return False, else return True if code != 0: @@ -135,7 +139,6 @@ def check_command(self, command): else: return True - def get_file_size(self, filenames): """ Get size of all files in string (space-separated) in megabytes (Mb). @@ -149,10 +152,15 @@ def get_file_size(self, filenames): if type(filenames) is list: return sum([self.get_file_size(filename) for filename in filenames]) - return round(sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) / (1024 ** 2), 4) - + return round( + sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) + / (1024 ** 2), + 4, + ) - def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicates="True"): + def mark_duplicates( + self, aligned_file, out_file, metrics_file, remove_duplicates="True" + ): cmd = self.tools.java if self.pm.javamem: # If a memory restriction exists. 
cmd += " -Xmx" + self.pm.javamem @@ -163,9 +171,9 @@ def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicate cmd += " REMOVE_DUPLICATES=" + remove_duplicates return cmd - - def bam2fastq(self, input_bam, output_fastq, - output_fastq2=None, unpaired_fastq=None): + def bam2fastq( + self, input_bam, output_fastq, output_fastq2=None, unpaired_fastq=None + ): """ Create command to convert BAM(s) to FASTQ(s). @@ -185,7 +193,6 @@ def bam2fastq(self, input_bam, output_fastq, cmd += " UNPAIRED_FASTQ={0}".format(unpaired_fastq) return cmd - def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end): """ Build command to convert BAM file to FASTQ file(s) (R1/R2). @@ -209,11 +216,10 @@ def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end): cmd += " VALIDATION_STRINGENCY=SILENT" return cmd - def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False): """ - This converts bam file to fastq files, but using awk. As of 2016, this is much faster - than the standard way of doing this using Picard, and also much faster than the + This converts bam file to fastq files, but using awk. As of 2016, this is much faster + than the standard way of doing this using Picard, and also much faster than the bedtools implementation as well; however, it does no sanity checks and assumes the reads (for paired data) are all paired (no singletons), in the correct order. :param bool zipmode: Should the output be zipped? @@ -222,29 +228,27 @@ def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False): fq1 = out_fastq_pre + "_R1.fastq" fq2 = out_fastq_pre + "_R2.fastq" - if zipmode: fq1 = fq1 + ".gz" fq2 = fq2 + ".gz" - fq1_target = " | \"" + self.ziptool + " -c > " + fq1 + '"' - fq2_target = " | \"" + self.ziptool + " -c > " + fq2 + '"' + fq1_target = ' | "' + self.ziptool + " -c > " + fq1 + '"' + fq2_target = ' | "' + self.ziptool + " -c > " + fq2 + '"' else: fq1_target = ' > "' + fq1 + '"' fq2_target = ' > "' + fq2 + '"' - + if paired_end: cmd = self.tools.samtools + " view " + bam_file + " | awk '" - cmd += r'{ if (NR%2==1) print "@"$1"/1\n"$10"\n+\n"$11' + fq1_target + ';' - cmd += r' else print "@"$1"/2\n"$10"\n+\n"$11' + fq2_target + '; }' + cmd += r'{ if (NR%2==1) print "@"$1"/1\n"$10"\n+\n"$11' + fq1_target + ";" + cmd += r' else print "@"$1"/2\n"$10"\n+\n"$11' + fq2_target + "; }" cmd += "'" # end the awk command else: fq2 = None cmd = self.tools.samtools + " view " + bam_file + " | awk '" - cmd += r'{ print "@"$1"\n"$10"\n+\n"$11' + fq1_target + '; }' + cmd += r'{ print "@"$1"\n"$10"\n+\n"$11' + fq1_target + "; }" cmd += "'" return cmd, fq1, fq2 - def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end): """ Converts bam to fastq; A version using bedtools @@ -252,14 +256,20 @@ def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end): self.make_sure_path_exists(os.path.dirname(out_fastq_pre)) fq1 = out_fastq_pre + "_R1.fastq" fq2 = None - cmd = self.tools.bedtools + " bamtofastq -i " + bam_file + " -fq " + fq1 + ".fastq" + cmd = ( + self.tools.bedtools + + " bamtofastq -i " + + bam_file + + " -fq " + + fq1 + + ".fastq" + ) if paired_end: fq2 = out_fastq_pre + "_R2.fastq" cmd += " -fq2 " + fq2 return cmd, fq1, fq2 - def get_input_ext(self, input_file): """ Get the extension of the input_file. 
Assumes you're using either @@ -272,12 +282,13 @@ def get_input_ext(self, input_file): elif input_file.endswith(".fastq") or input_file.endswith(".fq"): input_ext = ".fastq" else: - errmsg = "'{}'; this pipeline can only deal with .bam, .fastq, " \ - "or .fastq.gz files".format(input_file) + errmsg = ( + "'{}'; this pipeline can only deal with .bam, .fastq, " + "or .fastq.gz files".format(input_file) + ) raise UnsupportedFiletypeException(errmsg) return input_ext - def merge_or_link(self, input_args, raw_folder, local_base="sample"): """ Standardizes various input possibilities by converting either .bam, @@ -312,8 +323,7 @@ class of inputs (which can in turn be a string or a list). else: local_base_extended = local_base if input_arg: - out = self.merge_or_link( - input_arg, raw_folder, local_base_extended) + out = self.merge_or_link(input_arg, raw_folder, local_base_extended) print("Local input file: '{}'".format(out)) # Make sure file exists: @@ -343,7 +353,8 @@ class of inputs (which can in turn be a string or a list). self.pm.run( "ln -sf " + input_arg + " " + local_input_abs, target=local_input_abs, - shell=True) + shell=True, + ) # return the local (linked) filename absolute path return local_input_abs @@ -365,11 +376,11 @@ class of inputs (which can in turn be a string or a list). if all([self.get_input_ext(x) == ".fastq.gz" for x in input_args]): sample_merged_gz = local_base + ".merged.fastq.gz" output_merge_gz = os.path.join(raw_folder, sample_merged_gz) - #cmd1 = self.ziptool + "-d -c " + " ".join(input_args) + " > " + output_merge - #cmd2 = self.ziptool + " " + output_merge - #self.pm.run([cmd1, cmd2], output_merge_gz) + # cmd1 = self.ziptool + "-d -c " + " ".join(input_args) + " > " + output_merge + # cmd2 = self.ziptool + " " + output_merge + # self.pm.run([cmd1, cmd2], output_merge_gz) # you can save yourself the decompression/recompression: - cmd = "cat " + " ".join(input_args) + " > " + output_merge_gz + cmd = "cat " + " ".join(input_args) + " > " + output_merge_gz self.pm.run(cmd, output_merge_gz) return output_merge_gz @@ -383,13 +394,20 @@ class of inputs (which can in turn be a string or a list). # At this point, we don't recognize the input file types or they # do not match. raise NotImplementedError( - "Input files must be of the same type, and can only " - "merge bam or fastq.") - + "Input files must be of the same type, and can only " + "merge bam or fastq." + ) def input_to_fastq( - self, input_file, sample_name, paired_end, fastq_folder, - output_file=None, multiclass=False, zipmode=False): + self, + input_file, + sample_name, + paired_end, + fastq_folder, + output_file=None, + multiclass=False, + zipmode=False, + ): """ Builds a command to convert input file to fastq, for various inputs. 
@@ -424,10 +442,15 @@ def input_to_fastq( output_file = [] for in_i, in_arg in enumerate(input_file): output = fastq_prefix + "_R" + str(in_i + 1) + ".fastq" - result_cmd, uf, result_file = \ - self.input_to_fastq(in_arg, sample_name, paired_end, - fastq_folder, output, multiclass=True, - zipmode=zipmode) + result_cmd, uf, result_file = self.input_to_fastq( + in_arg, + sample_name, + paired_end, + fastq_folder, + output, + multiclass=True, + zipmode=zipmode, + ) cmd.append(result_cmd) output_file.append(result_file) @@ -444,8 +467,10 @@ def input_to_fastq( if input_ext == ".bam": print("Found .bam file") - #cmd = self.bam_to_fastq(input_file, fastq_prefix, paired_end) - cmd, fq1, fq2 = self.bam_to_fastq_awk(input_file, fastq_prefix, paired_end, zipmode) + # cmd = self.bam_to_fastq(input_file, fastq_prefix, paired_end) + cmd, fq1, fq2 = self.bam_to_fastq_awk( + input_file, fastq_prefix, paired_end, zipmode + ) # pm.run(cmd, output_file, follow=check_fastq) if fq2: output_file = [fq1, fq2] @@ -455,20 +480,24 @@ def input_to_fastq( print("Found .fastq.gz file") if paired_end and not multiclass: if zipmode: - raise NotImplementedError("Can't use zipmode on interleaved fastq data.") + raise NotImplementedError( + "Can't use zipmode on interleaved fastq data." + ) # For paired-end reads in one fastq file, we must split the # file into 2. The pipeline author will need to include this - # python script in the scripts directory. + # python script in the scripts directory. # TODO: make this self-contained in pypiper. This is a rare # use case these days, as fastq files are almost never # interleaved anymore. - script_path = os.path.join( - self.tools.scripts_dir, "fastq_split.py") + script_path = os.path.join(self.tools.scripts_dir, "fastq_split.py") cmd = self.tools.python + " -u " + script_path cmd += " -i " + input_file cmd += " -o " + fastq_prefix # Must also return the set of output files - output_file = [fastq_prefix + "_R1.fastq", fastq_prefix + "_R2.fastq"] + output_file = [ + fastq_prefix + "_R1.fastq", + fastq_prefix + "_R2.fastq", + ] else: if zipmode: # we do nothing! @@ -477,7 +506,9 @@ def input_to_fastq( else: # For single-end reads, we just unzip the fastq.gz file. # or, paired-end reads that were already split. - cmd = self.ziptool + " -d -c " + input_file + " > " + output_file + cmd = ( + self.ziptool + " -d -c " + input_file + " > " + output_file + ) # a non-shell version # cmd1 = "gunzip --force " + input_file # cmd2 = "mv " + os.path.splitext(input_file)[0] + " " + output_file @@ -491,7 +522,6 @@ def input_to_fastq( return [cmd, fastq_prefix, output_file] - def check_fastq(self, input_files, output_files, paired_end): """ Returns a follow sanity-check function to be run after a fastq conversion. @@ -510,8 +540,9 @@ def check_fastq(self, input_files, output_files, paired_end): # This is AFTER merge, so if there are multiple files it means the # files were split into read1/read2; therefore I must divide by number # of files for final reads. 
- def temp_func(input_files=input_files, output_files=output_files, - paired_end=paired_end): + def temp_func( + input_files=input_files, output_files=output_files, paired_end=paired_end + ): if type(input_files) != list: input_files = [input_files] @@ -521,14 +552,21 @@ def temp_func(input_files=input_files, output_files=output_files, n_input_files = len(filter(bool, input_files)) n_output_files = len(filter(bool, output_files)) - total_reads = sum([int(self.count_reads(input_file, paired_end)) - for input_file in input_files]) + total_reads = sum( + [ + int(self.count_reads(input_file, paired_end)) + for input_file in input_files + ] + ) raw_reads = int(total_reads / n_input_files) self.pm.report_result("Raw_reads", str(raw_reads)) total_fastq_reads = sum( - [int(self.count_reads(output_file, paired_end)) - for output_file in output_files]) + [ + int(self.count_reads(output_file, paired_end)) + for output_file in output_files + ] + ) fastq_reads = int(total_fastq_reads / n_output_files) self.pm.report_result("Fastq_reads", fastq_reads) @@ -536,20 +574,23 @@ def temp_func(input_files=input_files, output_files=output_files, # We can only assess pass filter reads in bam files with flags. if input_ext == ".bam": num_failed_filter = sum( - [int(self.count_fail_reads(f, paired_end)) - for f in input_files]) + [int(self.count_fail_reads(f, paired_end)) for f in input_files] + ) pf_reads = int(raw_reads) - num_failed_filter self.pm.report_result("PF_reads", str(pf_reads)) if fastq_reads != int(raw_reads): - raise Exception("Fastq conversion error? Number of input reads " - "doesn't number of output reads.") + raise Exception( + "Fastq conversion error? Number of input reads " + "doesn't number of output reads." + ) return fastq_reads return temp_func - - def check_trim(self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None): + def check_trim( + self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None + ): """ Build function to evaluate read trimming, and optionally run fastqc. @@ -581,7 +622,8 @@ def temp_func(): print("Can't calculate trim loss rate without raw read result.") else: self.pm.report_result( - "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2)) + "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2) + ) # Also run a fastqc (if installed/requested) if fastqc_folder: @@ -602,7 +644,6 @@ def temp_func(): return temp_func - def validate_bam(self, input_bam): """ Wrapper for Picard's ValidateSamFile. @@ -615,7 +656,6 @@ def validate_bam(self, input_bam): cmd += " INPUT=" + input_bam return cmd - def merge_bams(self, input_bams, merged_bam, in_sorted="TRUE", tmp_dir=None): """ Combine multiple files into one. @@ -653,27 +693,25 @@ def merge_bams(self, input_bams, merged_bam, in_sorted="TRUE", tmp_dir=None): cmd += " TMP_DIR=" + tmp_dir return cmd - - + def merge_bams_samtools(self, input_bams, merged_bam): - cmd = self.tools.samtools + " merge -f " + cmd = self.tools.samtools + " merge -f " cmd += " -@ " + str(self.pm.cores) - cmd += " " + merged_bam + " " + cmd += " " + merged_bam + " " cmd += " ".join(input_bams) return cmd - def merge_fastq(self, inputs, output, run=False, remove_inputs=False): """ Merge FASTQ files (zipped or not) into one. - + :param Iterable[str] inputs: Collection of paths to files to merge. :param str output: Path to single output file. :param bool run: Whether to run the command. :param bool remove_inputs: Whether to keep the original files. 
- :return NoneType | str: Null if running the command, otherwise the + :return NoneType | str: Null if running the command, otherwise the command itself - :raise ValueError: Raise ValueError if the call is such that + :raise ValueError: Raise ValueError if the call is such that inputs are to be deleted but command is not run. """ if remove_inputs and not run: @@ -687,14 +725,16 @@ def merge_fastq(self, inputs, output, run=False, remove_inputs=False): else: return cmd - def count_lines(self, file_name): """ Uses the command-line utility wc to count the number of lines in a file. For MacOS, must strip leading whitespace from wc. :param str file_name: name of file whose lines are to be counted """ - x = subprocess.check_output("wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", shell=True) + x = subprocess.check_output( + "wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", + shell=True, + ) return x.decode().strip() def count_lines_zip(self, file_name): @@ -703,7 +743,13 @@ def count_lines_zip(self, file_name): For compressed files. :param file: file_name """ - x = subprocess.check_output(self.ziptool + " -d -c " + file_name + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", shell=True) + x = subprocess.check_output( + self.ziptool + + " -d -c " + + file_name + + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", + shell=True, + ) return x.decode().strip() def get_chrs_from_bam(self, file_name): @@ -711,7 +757,13 @@ def get_chrs_from_bam(self, file_name): Uses samtools to grab the chromosomes from the header that are contained in this bam file. """ - x = subprocess.check_output(self.tools.samtools + " view -H " + file_name + " | grep '^@SQ' | cut -f2| sed s'/SN://'", shell=True) + x = subprocess.check_output( + self.tools.samtools + + " view -H " + + file_name + + " | grep '^@SQ' | cut -f2| sed s'/SN://'", + shell=True, + ) # Chromosomes will be separated by newlines; split into list to return return x.decode().split() @@ -735,14 +787,25 @@ def count_unique_reads(self, file_name, paired_end): if file_name.endswith("bam"): param = "" if paired_end: - r1 = self.samtools_view(file_name, param=param + " -f64", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") - r2 = self.samtools_view(file_name, param=param + " -f128", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + r1 = self.samtools_view( + file_name, + param=param + " -f64", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) + r2 = self.samtools_view( + file_name, + param=param + " -f128", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) else: - r1 = self.samtools_view(file_name, param=param + "", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + r1 = self.samtools_view( + file_name, + param=param + "", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) r2 = 0 return int(r1) + int(r2) - def count_unique_mapped_reads(self, file_name, paired_end): """ For a bam or sam file with paired or or single-end reads, returns the @@ -764,16 +827,27 @@ def count_unique_mapped_reads(self, file_name, paired_end): else: raise ValueError("Not a SAM or BAM: '{}'".format(file_name)) - if paired_end: - r1 = self.samtools_view(file_name, param=param + " -f64", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") - r2 = self.samtools_view(file_name, param=param + " -f128", postpend=" | cut -f1 | sort -k1,1 -u | wc 
-l | sed -E 's/^[[:space:]]+//'") + if paired_end: + r1 = self.samtools_view( + file_name, + param=param + " -f64", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) + r2 = self.samtools_view( + file_name, + param=param + " -f128", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) else: - r1 = self.samtools_view(file_name, param=param + "", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'") + r1 = self.samtools_view( + file_name, + param=param + "", + postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'", + ) r2 = 0 return int(r1) + int(r2) - def count_flag_reads(self, file_name, flag, paired_end): """ Counts the number of reads with the specified flag. @@ -791,7 +865,6 @@ def count_flag_reads(self, file_name, flag, paired_end): param += " -S" return self.samtools_view(file_name, param=param) - def count_multimapping_reads(self, file_name, paired_end): """ Counts the number of reads that mapped to multiple locations. Warning: @@ -807,7 +880,6 @@ def count_multimapping_reads(self, file_name, paired_end): """ return int(self.count_flag_reads(file_name, 256, paired_end)) - def count_uniquelymapping_reads(self, file_name, paired_end): """ Counts the number of reads that mapped to a unique position. @@ -820,7 +892,6 @@ def count_uniquelymapping_reads(self, file_name, paired_end): param += " -S" return self.samtools_view(file_name, param=param) - def count_fail_reads(self, file_name, paired_end): """ Counts the number of reads that failed platform/vendor quality checks. @@ -831,7 +902,6 @@ def count_fail_reads(self, file_name, paired_end): """ return int(self.count_flag_reads(file_name, 512, paired_end)) - def samtools_view(self, file_name, param, postpend=""): """ Run samtools view, with flexible parameters and post-processing. @@ -843,13 +913,11 @@ def samtools_view(self, file_name, param, postpend=""): :param str postpend: String to append to the samtools command; useful to add cut, sort, wc operations to the samtools view output. """ - cmd = "{} view {} {} {}".format( - self.tools.samtools, param, file_name, postpend) + cmd = "{} view {} {} {}".format(self.tools.samtools, param, file_name, postpend) # in python 3, check_output returns a byte string which causes issues. # with python 3.6 we could use argument: "encoding='UTF-8'"" return subprocess.check_output(cmd, shell=True).decode().strip() - def count_reads(self, file_name, paired_end): """ Count reads in a file. @@ -874,13 +942,14 @@ def count_reads(self, file_name, paired_end): param_text = "-c" if ext == ".bam" else "-c -S" return self.samtools_view(file_name, param=param_text) else: - num_lines = self.count_lines_zip(file_name) \ - if is_gzipped_fastq(file_name) \ - else self.count_lines(file_name) + num_lines = ( + self.count_lines_zip(file_name) + if is_gzipped_fastq(file_name) + else self.count_lines(file_name) + ) divisor = 2 if paired_end else 4 return int(num_lines) / divisor - def count_concordant(self, aligned_bam): """ Count only reads that "aligned concordantly exactly 1 time." 
@@ -889,9 +958,8 @@ def count_concordant(self, aligned_bam): """ cmd = self.tools.samtools + " view " + aligned_bam + " | " cmd += "grep 'YT:Z:CP'" + " | uniq -u | wc -l | sed -E 's/^[[:space:]]+//'" - - return subprocess.check_output(cmd, shell=True).decode().strip() + return subprocess.check_output(cmd, shell=True).decode().strip() def count_mapped_reads(self, file_name, paired_end): """ @@ -912,35 +980,84 @@ def count_mapped_reads(self, file_name, paired_end): return self.samtools_view(file_name, param="-c -F4 -S") return -1 - def sam_conversions(self, sam_file, depth=True): """ Convert sam files to bam files, then sort and index them for later use. :param bool depth: also calculate coverage over each position """ - cmd = self.tools.samtools + " view -bS " + sam_file + " > " + sam_file.replace(".sam", ".bam") + "\n" - cmd += self.tools.samtools + " sort " + sam_file.replace(".sam", ".bam") + " -o " + sam_file.replace(".sam", "_sorted.bam") + "\n" - cmd += self.tools.samtools + " index " + sam_file.replace(".sam", "_sorted.bam") + "\n" + cmd = ( + self.tools.samtools + + " view -bS " + + sam_file + + " > " + + sam_file.replace(".sam", ".bam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " sort " + + sam_file.replace(".sam", ".bam") + + " -o " + + sam_file.replace(".sam", "_sorted.bam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " index " + + sam_file.replace(".sam", "_sorted.bam") + + "\n" + ) if depth: - cmd += self.tools.samtools + " depth " + sam_file.replace(".sam", "_sorted.bam") + " > " + sam_file.replace(".sam", "_sorted.depth") + "\n" + cmd += ( + self.tools.samtools + + " depth " + + sam_file.replace(".sam", "_sorted.bam") + + " > " + + sam_file.replace(".sam", "_sorted.depth") + + "\n" + ) return cmd - def bam_conversions(self, bam_file, depth=True): """ Sort and index bam files for later use. 
:param bool depth: also calculate coverage over each position """ - cmd = self.tools.samtools + " view -h " + bam_file + " > " + bam_file.replace(".bam", ".sam") + "\n" - cmd += self.tools.samtools + " sort " + bam_file + " -o " + bam_file.replace(".bam", "_sorted.bam") + "\n" - cmd += self.tools.samtools + " index " + bam_file.replace(".bam", "_sorted.bam") + "\n" + cmd = ( + self.tools.samtools + + " view -h " + + bam_file + + " > " + + bam_file.replace(".bam", ".sam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " sort " + + bam_file + + " -o " + + bam_file.replace(".bam", "_sorted.bam") + + "\n" + ) + cmd += ( + self.tools.samtools + + " index " + + bam_file.replace(".bam", "_sorted.bam") + + "\n" + ) if depth: - cmd += self.tools.samtools + " depth " + bam_file.replace(".bam", "_sorted.bam") + " > " + bam_file.replace(".bam", "_sorted.depth") + "\n" + cmd += ( + self.tools.samtools + + " depth " + + bam_file.replace(".bam", "_sorted.bam") + + " > " + + bam_file.replace(".bam", "_sorted.depth") + + "\n" + ) return cmd - def fastqc(self, file, output_dir): """ Create command to run fastqc on a FASTQ file @@ -959,9 +1076,9 @@ def fastqc(self, file, output_dir): if not os.path.isabs(output_dir) and pm is not None: output_dir = os.path.join(pm.outfolder, output_dir) self.make_sure_path_exists(output_dir) - return "{} --noextract --outdir {} {}".\ - format(self.tools.fastqc, output_dir, file) - + return "{} --noextract --outdir {} {}".format( + self.tools.fastqc, output_dir, file + ) def fastqc_rename(self, input_bam, output_dir, sample_name): """ @@ -984,20 +1101,29 @@ def fastqc_rename(self, input_bam, output_dir, sample_name): cmd1 = self.fastqc(input_bam, output_dir) cmds.append(cmd1) cmd2 = "if [[ ! -s {1}_fastqc.html ]]; then mv {0}_fastqc.html {1}_fastqc.html; mv {0}_fastqc.zip {1}_fastqc.zip; fi".format( - os.path.join(output_dir, initial), os.path.join(output_dir, sample_name)) + os.path.join(output_dir, initial), os.path.join(output_dir, sample_name) + ) cmds.append(cmd2) return cmds - def samtools_index(self, bam_file): """Index a bam file.""" cmd = self.tools.samtools + " index {0}".format(bam_file) return cmd - def slurm_header( - self, job_name, output, queue="shortq", n_tasks=1, time="10:00:00", - cpus_per_task=8, mem_per_cpu=2000, nodes=1, user_mail="", mail_type="end"): + self, + job_name, + output, + queue="shortq", + n_tasks=1, + time="10:00:00", + cpus_per_task=8, + mem_per_cpu=2000, + nodes=1, + user_mail="", + mail_type="end", + ): cmd = """ #!/bin/bash #SBATCH --partition={0} #SBATCH --ntasks={1} @@ -1018,50 +1144,65 @@ def slurm_header( date """.format( - queue, n_tasks, time, cpus_per_task, mem_per_cpu, - nodes, job_name, output, mail_type, user_mail) + queue, + n_tasks, + time, + cpus_per_task, + mem_per_cpu, + nodes, + job_name, + output, + mail_type, + user_mail, + ) return cmd - def slurm_footer(self): return " date" - def slurm_submit_job(self, job_file): return os.system("sbatch %s" % job_file) - def remove_file(self, file_name): return "rm {0}".format(file_name) - def move_file(self, old, new): return "mv {0} {1}".format(old, new) - def preseq_curve(self, bam_file, output_prefix): return """ preseq c_curve -B -P -o {0}.yield.txt {1} - """.format(output_prefix, bam_file) - + """.format( + output_prefix, bam_file + ) def preseq_extrapolate(self, bam_file, output_prefix): return """ preseq lc_extrap -v -B -P -e 1e+9 -o {0}.future_yield.txt {1} - """.format(output_prefix, bam_file) - + """.format( + output_prefix, bam_file + ) def preseq_coverage(self, 
bam_file, output_prefix): return """ preseq gc_extrap -o {0}.future_coverage.txt {1} - """.format(output_prefix, bam_file) - + """.format( + output_prefix, bam_file + ) def trimmomatic( - self, input_fastq1, output_fastq1, cpus, adapters, log, - input_fastq2=None, output_fastq1_unpaired=None, - output_fastq2=None, output_fastq2_unpaired=None): + self, + input_fastq1, + output_fastq1, + cpus, + adapters, + log, + input_fastq2=None, + output_fastq1_unpaired=None, + output_fastq2=None, + output_fastq2_unpaired=None, + ): PE = False if input_fastq2 is None else True pe = "PE" if PE else "SE" @@ -1072,17 +1213,26 @@ def trimmomatic( cmd += " {0}".format(input_fastq2) cmd += " {0}".format(output_fastq1) if PE: - cmd += " {0} {1} {2}".format(output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired) + cmd += " {0} {1} {2}".format( + output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired + ) cmd += " ILLUMINACLIP:{0}:1:40:15:8:true".format(adapters) cmd += " LEADING:3 TRAILING:3" cmd += " SLIDINGWINDOW:4:10" cmd += " MINLEN:36" return cmd - def skewer( - self, input_fastq1, output_prefix, output_fastq1, - log, cpus, adapters, input_fastq2=None, output_fastq2=None): + self, + input_fastq1, + output_prefix, + output_fastq1, + log, + cpus, + adapters, + input_fastq2=None, + output_fastq2=None, + ): """ Create commands with which to run skewer. @@ -1117,17 +1267,33 @@ def skewer( cmd2 = "mv {0} {1}".format(output_prefix + "-trimmed.fastq", output_fastq1) cmds.append(cmd2) else: - cmd2 = "mv {0} {1}".format(output_prefix + "-trimmed-pair1.fastq", output_fastq1) + cmd2 = "mv {0} {1}".format( + output_prefix + "-trimmed-pair1.fastq", output_fastq1 + ) cmds.append(cmd2) - cmd3 = "mv {0} {1}".format(output_prefix + "-trimmed-pair2.fastq", output_fastq2) + cmd3 = "mv {0} {1}".format( + output_prefix + "-trimmed-pair2.fastq", output_fastq2 + ) cmds.append(cmd3) cmd4 = "mv {0} {1}".format(output_prefix + "-trimmed.log", log) cmds.append(cmd4) return cmds - def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_insert, cpus, input_fastq2=None): + def bowtie2_map( + self, + input_fastq1, + output_bam, + log, + metrics, + genome_index, + max_insert, + cpus, + input_fastq2=None, + ): # Admits 2000bp-long fragments (--maxins option) - cmd = self.tools.bowtie2 + " --very-sensitive --no-discordant -p {0}".format(cpus) + cmd = self.tools.bowtie2 + " --very-sensitive --no-discordant -p {0}".format( + cpus + ) cmd += " -x {0}".format(genome_index) cmd += " --met-file {0}".format(metrics) if input_fastq2 is None: @@ -1136,15 +1302,24 @@ def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_ cmd += " --maxins {0}".format(max_insert) cmd += " -1 {0}".format(input_fastq1) cmd += " -2 {0}".format(input_fastq2) - cmd += " 2> {0} | samtools view -S -b - | samtools sort -o {1} -".format(log, output_bam) + cmd += " 2> {0} | samtools view -S -b - | samtools sort -o {1} -".format( + log, output_bam + ) return cmd def topHat_map(self, input_fastq, output_dir, genome, transcriptome, cpus): # TODO: # Allow paired input - cmd = self.tools.tophat + " --GTF {0} --b2-L 15 --library-type fr-unstranded --mate-inner-dist 120".format(transcriptome) + cmd = ( + self.tools.tophat + + " --GTF {0} --b2-L 15 --library-type fr-unstranded --mate-inner-dist 120".format( + transcriptome + ) + ) cmd += " --max-multihits 100 --no-coverage-search" - cmd += " --num-threads {0} --output-dir {1} {2} {3}".format(cpus, output_dir, genome, input_fastq) + cmd += " --num-threads {0} --output-dir {1} 
{2} {3}".format( + cpus, output_dir, genome, input_fastq + ) return cmd def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir="."): @@ -1164,33 +1339,50 @@ def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir=" return [cmd1, cmd2, cmd3] def sambamba_remove_duplicates(self, input_bam, output_bam, cpus=16): - cmd = self.tools.sambamba + " markdup -t {0} -r {1} {2}".format(cpus, input_bam, output_bam) + cmd = self.tools.sambamba + " markdup -t {0} -r {1} {2}".format( + cpus, input_bam, output_bam + ) return cmd def get_mitochondrial_reads(self, bam_file, output, cpus=4): - """ - """ + """ """ tmp_bam = bam_file + "tmp_rmMe" cmd1 = self.tools.sambamba + " index -t {0} {1}".format(cpus, bam_file) - cmd2 = self.tools.sambamba + " slice {0} chrM | {1} markdup -t 4 /dev/stdin {2} 2> {3}".format(bam_file, self.tools.sambamba, tmp_bam, output) + cmd2 = ( + self.tools.sambamba + + " slice {0} chrM | {1} markdup -t 4 /dev/stdin {2} 2> {3}".format( + bam_file, self.tools.sambamba, tmp_bam, output + ) + ) cmd3 = "rm {}".format(tmp_bam) return [cmd1, cmd2, cmd3] - def filter_reads(self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30): + def filter_reads( + self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30 + ): """ Remove duplicates, filter for >Q, remove multiple mapping reads. For paired-end reads, keep only proper pairs. """ nodups = re.sub("\.bam$", "", output_bam) + ".nodups.nofilter.bam" - cmd1 = self.tools.sambamba + " markdup -t {0} -r --compression-level=0 {1} {2} 2> {3}".format(cpus, input_bam, nodups, metrics_file) - cmd2 = self.tools.sambamba + ' view -t {0} -f bam --valid'.format(cpus) + cmd1 = ( + self.tools.sambamba + + " markdup -t {0} -r --compression-level=0 {1} {2} 2> {3}".format( + cpus, input_bam, nodups, metrics_file + ) + ) + cmd2 = self.tools.sambamba + " view -t {0} -f bam --valid".format(cpus) if paired: cmd2 += ' -F "not (unmapped or mate_is_unmapped) and proper_pair' else: cmd2 += ' -F "not unmapped' - cmd2 += ' and not (secondary_alignment or supplementary) and mapping_quality >= {0}"'.format(Q) - cmd2 += ' {0} |'.format(nodups) - cmd2 += self.tools.sambamba + " sort -t {0} /dev/stdin -o {1}".format(cpus, output_bam) + cmd2 += ' and not (secondary_alignment or supplementary) and mapping_quality >= {0}"'.format( + Q + ) + cmd2 += " {0} |".format(nodups) + cmd2 += self.tools.sambamba + " sort -t {0} /dev/stdin -o {1}".format( + cpus, output_bam + ) cmd3 = "if [[ -s {0} ]]; then rm {0}; fi".format(nodups) cmd4 = "if [[ -s {0} ]]; then rm {0}; fi".format(nodups + ".bai") return [cmd1, cmd2, cmd3, cmd4] @@ -1203,7 +1395,6 @@ def shift_reads(self, input_bam, genome, output_bam): cmd += " " + self.tools.samtools + " sort -o {0} -".format(output_bam) return cmd - def sort_index_bam(self, input_bam, output_bam): tmp_bam = re.sub("\.bam", ".sorted", input_bam) cmd1 = self.tools.samtools + " sort {0} {1}".format(input_bam, tmp_bam) @@ -1211,12 +1402,10 @@ def sort_index_bam(self, input_bam, output_bam): cmd3 = self.tools.samtools + " index {0}".format(output_bam) return [cmd1, cmd2, cmd3] - def index_bam(self, input_bam): cmd = self.tools.samtools + " index {0}".format(input_bam) return cmd - def run_spp(self, input_bam, output, plot, cpus): """ Run the SPP read peak analysis tool. 
@@ -1229,38 +1418,40 @@ def run_spp(self, input_bam, output, plot, cpus): """ base = "{} {} -rf -savp".format(self.tools.Rscript, self.tools.spp) cmd = base + " -savp={} -s=0:5:500 -c={} -out={} -p={}".format( - plot, input_bam, output, cpus) + plot, input_bam, output, cpus + ) return cmd - def get_fragment_sizes(self, bam_file): try: - import pysam import numpy as np + import pysam except: return frag_sizes = list() - bam = pysam.Samfile(bam_file, 'rb') + bam = pysam.Samfile(bam_file, "rb") for read in bam: if bam.getrname(read.tid) != "chrM" and read.tlen < 1500: frag_sizes.append(read.tlen) bam.close() return np.array(frag_sizes) - - def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smallest_insert=30): + def plot_atacseq_insert_sizes( + self, bam, plot, output_csv, max_insert=1500, smallest_insert=30 + ): """ Heavy inspiration from here: https://github.com/dbrg77/ATAC/blob/master/ATAC_seq_read_length_curve_fitting.ipynb """ try: - import pysam - import numpy as np + import matplotlib import matplotlib.mlab as mlab - from scipy.optimize import curve_fit + import numpy as np + import pysam from scipy.integrate import simps - import matplotlib - matplotlib.use('Agg') + from scipy.optimize import curve_fit + + matplotlib.use("Agg") import matplotlib.pyplot as plt except: print("Necessary Python modules couldn't be loaded.") @@ -1268,6 +1459,7 @@ def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smal try: import seaborn as sns + sns.set_style("whitegrid") except: pass @@ -1275,7 +1467,7 @@ def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smal def get_fragment_sizes(bam, max_insert=1500): frag_sizes = list() - bam = pysam.Samfile(bam, 'rb') + bam = pysam.Samfile(bam, "rb") for i, read in enumerate(bam): if read.tlen < max_insert: @@ -1293,11 +1485,13 @@ def mixture_function(x, *p): nfr = expo(x, 2.9e-02, 2.8e-02) nfr[:smallest_insert] = 0 - return (mlab.normpdf(x, m1, s1) * w1 + - mlab.normpdf(x, m2, s2) * w2 + - mlab.normpdf(x, m3, s3) * w3 + - mlab.normpdf(x, m4, s4) * w4 + - nfr) + return ( + mlab.normpdf(x, m1, s1) * w1 + + mlab.normpdf(x, m2, s2) * w2 + + mlab.normpdf(x, m3, s3) * w3 + + mlab.normpdf(x, m4, s4) * w4 + + nfr + ) def expo(x, q, r): """ @@ -1316,17 +1510,30 @@ def expo(x, q, r): # Parameters are empirical, need to check paramGuess = [ - 200, 50, 0.7, # gaussians - 400, 50, 0.15, - 600, 50, 0.1, - 800, 55, 0.045, - 2.9e-02, 2.8e-02 # exponential + 200, + 50, + 0.7, # gaussians + 400, + 50, + 0.15, + 600, + 50, + 0.1, + 800, + 55, + 0.045, + 2.9e-02, + 2.8e-02, # exponential ] try: popt3, pcov3 = curve_fit( - mixture_function, x[smallest_insert:], y[smallest_insert:], - p0=paramGuess, maxfev=100000) + mixture_function, + x[smallest_insert:], + y[smallest_insert:], + p0=paramGuess, + maxfev=100000, + ) except: print("Nucleosomal fit could not be found.") return @@ -1340,19 +1547,19 @@ def expo(x, q, r): plt.hist(frag_sizes, numBins, histtype="step", ec="k", normed=1, alpha=0.5) # Plot nucleosomal fits - plt.plot(x, mlab.normpdf(x, m1, s1) * w1, 'r-', lw=1.5, label="1st nucleosome") - plt.plot(x, mlab.normpdf(x, m2, s2) * w2, 'g-', lw=1.5, label="2nd nucleosome") - plt.plot(x, mlab.normpdf(x, m3, s3) * w3, 'b-', lw=1.5, label="3rd nucleosome") - plt.plot(x, mlab.normpdf(x, m4, s4) * w4, 'c-', lw=1.5, label="4th nucleosome") + plt.plot(x, mlab.normpdf(x, m1, s1) * w1, "r-", lw=1.5, label="1st nucleosome") + plt.plot(x, mlab.normpdf(x, m2, s2) * w2, "g-", lw=1.5, label="2nd nucleosome") + plt.plot(x, 
mlab.normpdf(x, m3, s3) * w3, "b-", lw=1.5, label="3rd nucleosome") + plt.plot(x, mlab.normpdf(x, m4, s4) * w4, "c-", lw=1.5, label="4th nucleosome") # Plot nucleosome-free fit nfr = expo(x, 2.9e-02, 2.8e-02) nfr[:smallest_insert] = 0 - plt.plot(x, nfr, 'k-', lw=1.5, label="nucleosome-free") + plt.plot(x, nfr, "k-", lw=1.5, label="nucleosome-free") # Plot sum of fits ys = mixture_function(x, *popt3) - plt.plot(x, ys, 'k--', lw=3.5, label="fit sum") + plt.plot(x, ys, "k--", lw=3.5, label="fit sum") plt.legend() plt.xlabel("Fragment size (bp)") @@ -1363,10 +1570,26 @@ def expo(x, q, r): areas = [ ["fraction", "area under curve", "max density"], ["Nucleosome-free fragments", simps(nfr), max(nfr)], - ["1st nucleosome", simps(mlab.normpdf(x, m1, s1) * w1), max(mlab.normpdf(x, m1, s1) * w1)], - ["2nd nucleosome", simps(mlab.normpdf(x, m2, s2) * w1), max(mlab.normpdf(x, m2, s2) * w2)], - ["3rd nucleosome", simps(mlab.normpdf(x, m3, s3) * w1), max(mlab.normpdf(x, m3, s3) * w3)], - ["4th nucleosome", simps(mlab.normpdf(x, m4, s4) * w1), max(mlab.normpdf(x, m4, s4) * w4)] + [ + "1st nucleosome", + simps(mlab.normpdf(x, m1, s1) * w1), + max(mlab.normpdf(x, m1, s1) * w1), + ], + [ + "2nd nucleosome", + simps(mlab.normpdf(x, m2, s2) * w1), + max(mlab.normpdf(x, m2, s2) * w2), + ], + [ + "3rd nucleosome", + simps(mlab.normpdf(x, m3, s3) * w1), + max(mlab.normpdf(x, m3, s3) * w3), + ], + [ + "4th nucleosome", + simps(mlab.normpdf(x, m4, s4) * w1), + max(mlab.normpdf(x, m4, s4) * w4), + ], ] try: @@ -1380,8 +1603,15 @@ def expo(x, q, r): # TODO: parameterize in terms of normalization factor. def bam_to_bigwig( - self, input_bam, output_bigwig, genome_sizes, genome, - tagmented=False, normalize=False, norm_factor=1000): + self, + input_bam, + output_bigwig, + genome_sizes, + genome, + tagmented=False, + normalize=False, + norm_factor=1000, + ): """ Convert a BAM file to a bigWig file. 
@@ -1401,34 +1631,63 @@ def bam_to_bigwig( transient_file = os.path.abspath(re.sub("\.bigWig", "", output_bigwig)) cmd1 = self.tools.bedtools + " bamtobed -i {0} |".format(input_bam) if not tagmented: - cmd1 += " " + self.tools.bedtools + " slop -i stdin -g {0} -s -l 0 -r 130 |".format(genome_sizes) + cmd1 += ( + " " + + self.tools.bedtools + + " slop -i stdin -g {0} -s -l 0 -r 130 |".format(genome_sizes) + ) cmd1 += " fix_bedfile_genome_boundaries.py {0} |".format(genome) - cmd1 += " " + self.tools.genomeCoverageBed + " {0}-bg -g {1} -i stdin > {2}.cov".format( - "-5 " if tagmented else "", - genome_sizes, - transient_file + cmd1 += ( + " " + + self.tools.genomeCoverageBed + + " {0}-bg -g {1} -i stdin > {2}.cov".format( + "-5 " if tagmented else "", genome_sizes, transient_file + ) ) cmds.append(cmd1) if normalize: - cmds.append("""awk 'NR==FNR{{sum+= $4; next}}{{ $4 = ($4 / sum) * {1}; print}}' {0}.cov {0}.cov | sort -k1,1 -k2,2n > {0}.normalized.cov""".format(transient_file, norm_factor)) - cmds.append(self.tools.bedGraphToBigWig + " {0}{1}.cov {2} {3}".format(transient_file, ".normalized" if normalize else "", genome_sizes, output_bigwig)) + cmds.append( + """awk 'NR==FNR{{sum+= $4; next}}{{ $4 = ($4 / sum) * {1}; print}}' {0}.cov {0}.cov | sort -k1,1 -k2,2n > {0}.normalized.cov""".format( + transient_file, norm_factor + ) + ) + cmds.append( + self.tools.bedGraphToBigWig + + " {0}{1}.cov {2} {3}".format( + transient_file, + ".normalized" if normalize else "", + genome_sizes, + output_bigwig, + ) + ) # remove tmp files cmds.append("if [[ -s {0}.cov ]]; then rm {0}.cov; fi".format(transient_file)) if normalize: - cmds.append("if [[ -s {0}.normalized.cov ]]; then rm {0}.normalized.cov; fi".format(transient_file)) + cmds.append( + "if [[ -s {0}.normalized.cov ]]; then rm {0}.normalized.cov; fi".format( + transient_file + ) + ) cmds.append("chmod 755 {0}".format(output_bigwig)) return cmds - - def add_track_to_hub(self, sample_name, track_url, track_hub, colour, five_prime=""): - cmd1 = """echo "track type=bigWig name='{0} {1}' description='{0} {1}'""".format(sample_name, five_prime) - cmd1 += """ height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> {2}""".format(track_url, colour, track_hub) + def add_track_to_hub( + self, sample_name, track_url, track_hub, colour, five_prime="" + ): + cmd1 = ( + """echo "track type=bigWig name='{0} {1}' description='{0} {1}'""".format( + sample_name, five_prime + ) + ) + cmd1 += """ height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> {2}""".format( + track_url, colour, track_hub + ) cmd2 = "chmod 755 {0}".format(track_hub) return [cmd1, cmd2] - def link_to_track_hub(self, track_hub_url, file_name, genome): import textwrap + db = "org" if genome == "hg19" else "db" # different database call for human genome = "human" if genome == "hg19" else genome # change hg19 to human html = """ @@ -1438,35 +1697,56 @@ def link_to_track_hub(self, track_hub_url, file_name, genome): html += """{db}={genome}&hgt.customText={track_hub_url}" /> - """.format(track_hub_url=track_hub_url, genome=genome, db=db) - with open(file_name, 'w') as handle: + """.format( + track_hub_url=track_hub_url, genome=genome, db=db + ) + with open(file_name, "w") as handle: handle.write(textwrap.dedent(html)) - def htseq_count(self, input_bam, gtf, output): sam = input_bam.replace("bam", "sam") cmd1 = "samtools view {0} > {1}".format(input_bam, sam) - cmd2 = "htseq-count -f sam -t exon -i transcript_id -m union {0} {1} > {2}".format(sam, gtf, 
output) + cmd2 = ( + "htseq-count -f sam -t exon -i transcript_id -m union {0} {1} > {2}".format( + sam, gtf, output + ) + ) cmd3 = "rm {0}".format(sam) return [cmd1, cmd2, cmd3] - - def kallisto(self, input_fastq, output_dir, output_bam, transcriptome_index, cpus, input_fastq2=None, size=180, b=200): - cmd1 = self.tools.kallisto + " quant --bias --pseudobam -b {0} -l {1} -i {2} -o {3} -t {4}".format(b, size, transcriptome_index, output_dir, cpus) + def kallisto( + self, + input_fastq, + output_dir, + output_bam, + transcriptome_index, + cpus, + input_fastq2=None, + size=180, + b=200, + ): + cmd1 = ( + self.tools.kallisto + + " quant --bias --pseudobam -b {0} -l {1} -i {2} -o {3} -t {4}".format( + b, size, transcriptome_index, output_dir, cpus + ) + ) if input_fastq2 is None: cmd1 += " --single {0}".format(input_fastq) else: cmd1 += " {0} {1}".format(input_fastq, input_fastq2) cmd1 += " | " + self.tools.samtools + " view -Sb - > {0}".format(output_bam) - cmd2 = self.tools.kallisto + " h5dump -o {0} {0}/abundance.h5".format(output_dir) + cmd2 = self.tools.kallisto + " h5dump -o {0} {0}/abundance.h5".format( + output_dir + ) return [cmd1, cmd2] - def genome_wide_coverage(self, input_bam, genome_windows, output): - cmd = self.tools.bedtools + " coverage -counts -abam {0} -b {1} > {2}".format(input_bam, genome_windows, output) + cmd = self.tools.bedtools + " coverage -counts -abam {0} -b {1} > {2}".format( + input_bam, genome_windows, output + ) return cmd - def calc_frip(self, input_bam, input_bed, threads=4): """ Calculate fraction of reads in peaks. @@ -1483,14 +1763,12 @@ def calc_frip(self, input_bam, input_bed, threads=4): cmd = self.simple_frip(input_bam, input_bed, threads) return subprocess.check_output(cmd.split(" "), shell=True).decode().strip() - def simple_frip(self, input_bam, input_bed, threads=4): cmd = "{} view".format(self.tools.samtools) cmd += " -@ {} -c -L {}".format(threads, input_bed) cmd += " " + input_bam return cmd - def calculate_frip(self, input_bam, input_bed, output, cpus=4): cmd = self.tools.sambamba + " depth region -t {0}".format(cpus) cmd += " -L {0}".format(input_bed) @@ -1498,11 +1776,19 @@ def calculate_frip(self, input_bam, input_bed, output, cpus=4): cmd += " | awk '{{sum+=$5}} END {{print sum}}' > {0}".format(output) return cmd - def macs2_call_peaks( - self, treatment_bams, output_dir, sample_name, genome, - control_bams=None, broad=False, paired=False, - pvalue=None, qvalue=None, include_significance=None): + self, + treatment_bams, + output_dir, + sample_name, + genome, + control_bams=None, + broad=False, + paired=False, + pvalue=None, + qvalue=None, + include_significance=None, + ): """ Use MACS2 to call peaks. @@ -1527,7 +1813,13 @@ def macs2_call_peaks( specified but no value is provided for p-value or q-value. :return str: Command to run. 
""" - sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9} + sizes = { + "hg38": 2.7e9, + "hg19": 2.7e9, + "mm10": 1.87e9, + "dr7": 1.412e9, + "mm9": 1.87e9, + } # Whether to specify to MACS2 a value for statistical significance # can be either directly indicated, but if not, it's determined by @@ -1537,10 +1829,14 @@ def macs2_call_peaks( if include_significance is None: include_significance = broad - cmd = self.tools.macs2 + " callpeak -t {0}".format(treatment_bams if type(treatment_bams) is str else " ".join(treatment_bams)) + cmd = self.tools.macs2 + " callpeak -t {0}".format( + treatment_bams if type(treatment_bams) is str else " ".join(treatment_bams) + ) if control_bams is not None: - cmd += " -c {0}".format(control_bams if type(control_bams) is str else " ".join(control_bams)) + cmd += " -c {0}".format( + control_bams if type(control_bams) is str else " ".join(control_bams) + ) if paired: cmd += " -f BAMPE " @@ -1561,26 +1857,46 @@ def macs2_call_peaks( cmd += " --qvalue {}".format(qvalue) else: cmd += " --pvalue {}".format(pvalue or 0.00001) - cmd += " -g {0} -n {1} --outdir {2}".format(sizes[genome], sample_name, output_dir) + cmd += " -g {0} -n {1} --outdir {2}".format( + sizes[genome], sample_name, output_dir + ) return cmd def macs2_call_peaks_atacseq(self, treatment_bam, output_dir, sample_name, genome): - genome_sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9} + genome_sizes = { + "hg38": 2.7e9, + "hg19": 2.7e9, + "mm10": 1.87e9, + "dr7": 1.412e9, + "mm9": 1.87e9, + } cmd = self.tools.macs2 + " callpeak -t {0}".format(treatment_bam) - cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(genome_sizes[genome], sample_name, output_dir) + cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format( + genome_sizes[genome], sample_name, output_dir + ) return cmd def macs2_plot_model(self, r_peak_model_file, sample_name, output_dir): # run macs r script cmd1 = "{} {}".format(self.tools.Rscript, r_peak_model_file) # move output plot to sample dir - cmd2 = "mv {0}/{1}_model.pdf {2}/{1}_model.pdf".format(os.getcwd(), sample_name, output_dir) + cmd2 = "mv {0}/{1}_model.pdf {2}/{1}_model.pdf".format( + os.getcwd(), sample_name, output_dir + ) return [cmd1, cmd2] def spp_call_peaks( - self, treatment_bam, control_bam, treatment_name, control_name, - output_dir, broad, cpus, qvalue=None): + self, + treatment_bam, + control_bam, + treatment_name, + control_name, + output_dir, + broad, + cpus, + qvalue=None, + ): """ Build command for R script to call peaks with SPP. @@ -1595,20 +1911,33 @@ def spp_call_peaks( :return str: Command to run. 
""" broad = "TRUE" if broad else "FALSE" - cmd = self.tools.Rscript + " `which spp_peak_calling.R` {0} {1} {2} {3} {4} {5} {6}".format( - treatment_bam, control_bam, treatment_name, control_name, broad, cpus, output_dir + cmd = ( + self.tools.Rscript + + " `which spp_peak_calling.R` {0} {1} {2} {3} {4} {5} {6}".format( + treatment_bam, + control_bam, + treatment_name, + control_name, + broad, + cpus, + output_dir, + ) ) if qvalue is not None: cmd += " {}".format(qvalue) return cmd def bam_to_bed(self, input_bam, output_bed): - cmd = self.tools.bedtools + " bamtobed -i {0} > {1}".format(input_bam, output_bed) + cmd = self.tools.bedtools + " bamtobed -i {0} > {1}".format( + input_bam, output_bed + ) return cmd def zinba_call_peaks(self, treatment_bed, control_bed, cpus, tagmented=False): fragmentLength = 80 if tagmented else 180 - cmd = self.tools.Rscript + " `which zinba.R` -l {0} -t {1} -c {2}".format(fragmentLength, treatment_bed, control_bed) + cmd = self.tools.Rscript + " `which zinba.R` -l {0} -t {1} -c {2}".format( + fragmentLength, treatment_bed, control_bed + ) return cmd def filter_peaks_mappability(self, peaks, alignability, filtered_peaks): @@ -1616,22 +1945,38 @@ def filter_peaks_mappability(self, peaks, alignability, filtered_peaks): cmd += " -a {0} -b {1} > {2} ".format(peaks, alignability, filtered_peaks) return cmd - def homer_find_motifs(self, peak_file, genome, output_dir, size=150, length="8,10,12,14,16", n_motifs=12): + def homer_find_motifs( + self, + peak_file, + genome, + output_dir, + size=150, + length="8,10,12,14,16", + n_motifs=12, + ): cmd = "findMotifsGenome.pl {0} {1} {2}".format(peak_file, genome, output_dir) cmd += " -mask -size {0} -len {1} -S {2}".format(size, length, n_motifs) return cmd def homer_annotate_pPeaks(self, peak_file, genome, motif_file, output_bed): - cmd = "annotatePeaks.pl {0} {1} -mask -mscore -m {2} |".format(peak_file, genome, motif_file) + cmd = "annotatePeaks.pl {0} {1} -mask -mscore -m {2} |".format( + peak_file, genome, motif_file + ) cmd += "tail -n +2 | cut -f 1,5,22 > {3}".format(output_bed) return cmd - def center_peaks_on_motifs(self, peak_file, genome, window_width, motif_file, output_bed): + def center_peaks_on_motifs( + self, peak_file, genome, window_width, motif_file, output_bed + ): - cmd = "annotatePeaks.pl {0} {1} -size {2} -center {3} |".format(peak_file, genome, window_width, motif_file) + cmd = "annotatePeaks.pl {0} {1} -size {2} -center {3} |".format( + peak_file, genome, window_width, motif_file + ) cmd += " awk -v OFS='\t' '{print $2, $3, $4, $1, $6, $5}' |" cmd += """ awk -v OFS='\t' -F '\t' '{ gsub("0", "+", $6) ; gsub("1", "-", $6) ; print }' |""" - cmd += " fix_bedfile_genome_boundaries.py {0} | sortBed > {1}".format(genome, output_bed) + cmd += " fix_bedfile_genome_boundaries.py {0} | sortBed > {1}".format( + genome, output_bed + ) return cmd def get_read_type(self, bam_file, n=10): @@ -1642,9 +1987,11 @@ def get_read_type(self, bam_file, n=10): :return str, int: tuple of read type and read length """ from collections import Counter + try: - p = subprocess.Popen([self.tools.samtools, 'view', bam_file], - stdout=subprocess.PIPE) + p = subprocess.Popen( + [self.tools.samtools, "view", bam_file], stdout=subprocess.PIPE + ) # Count paired alignments paired = 0 read_length = Counter() @@ -1661,19 +2008,28 @@ def get_read_type(self, bam_file, n=10): # Get most abundant read read_length read_length = sorted(read_length)[-1] # If at least half is paired, return True - if paired > (n / 2.): + if paired > (n / 2.0): 
return "PE", read_length else: return "SE", read_length - def parse_bowtie_stats(self, stats_file): """ Parses Bowtie2 stats file, returns series with values. :param str stats_file: Bowtie2 output file with alignment statistics. """ import pandas as pd - stats = pd.Series(index=["readCount", "unpaired", "unaligned", "unique", "multiple", "alignmentRate"]) + + stats = pd.Series( + index=[ + "readCount", + "unpaired", + "unaligned", + "unique", + "multiple", + "alignmentRate", + ] + ) try: with open(stats_file) as handle: content = handle.readlines() # list of strings per line @@ -1681,27 +2037,46 @@ def parse_bowtie_stats(self, stats_file): return stats # total reads try: - line = [i for i in range(len(content)) if " reads; of these:" in content[i]][0] + line = [ + i for i in range(len(content)) if " reads; of these:" in content[i] + ][0] stats["readCount"] = re.sub("\D.*", "", content[line]) if 7 > len(content) > 2: - line = [i for i in range(len(content)) if "were unpaired; of these:" in content[i]][0] + line = [ + i + for i in range(len(content)) + if "were unpaired; of these:" in content[i] + ][0] stats["unpaired"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) else: - line = [i for i in range(len(content)) if "were paired; of these:" in content[i]][0] - stats["unpaired"] = stats["readCount"] - int(re.sub("\D", "", re.sub("\(.*", "", content[line]))) - line = [i for i in range(len(content)) if "aligned 0 times" in content[i]][0] + line = [ + i + for i in range(len(content)) + if "were paired; of these:" in content[i] + ][0] + stats["unpaired"] = stats["readCount"] - int( + re.sub("\D", "", re.sub("\(.*", "", content[line])) + ) + line = [i for i in range(len(content)) if "aligned 0 times" in content[i]][ + 0 + ] stats["unaligned"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if "aligned exactly 1 time" in content[i]][0] + line = [ + i for i in range(len(content)) if "aligned exactly 1 time" in content[i] + ][0] stats["unique"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if "aligned >1 times" in content[i]][0] + line = [i for i in range(len(content)) if "aligned >1 times" in content[i]][ + 0 + ] stats["multiple"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if "overall alignment rate" in content[i]][0] + line = [ + i for i in range(len(content)) if "overall alignment rate" in content[i] + ][0] stats["alignmentRate"] = re.sub("\%.*", "", content[line]).strip() except IndexError: pass return stats - def parse_duplicate_stats(self, stats_file): """ Parses sambamba markdup output, returns series with values. @@ -1709,6 +2084,7 @@ def parse_duplicate_stats(self, stats_file): :param str stats_file: sambamba output file with duplicate statistics. """ import pandas as pd + series = pd.Series() try: with open(stats_file) as handle: @@ -1716,17 +2092,32 @@ def parse_duplicate_stats(self, stats_file): except: return series try: - line = [i for i in range(len(content)) if "single ends (among them " in content[i]][0] + line = [ + i + for i in range(len(content)) + if "single ends (among them " in content[i] + ][0] series["single-ends"] = re.sub("\D", "", re.sub("\(.*", "", content[line])) - line = [i for i in range(len(content)) if " end pairs... done in " in content[i]][0] - series["paired-ends"] = re.sub("\D", "", re.sub("\.\.\..*", "", content[line])) - line = [i for i in range(len(content)) if " duplicates, sorting the list... 
done in " in content[i]][0] - series["duplicates"] = re.sub("\D", "", re.sub("\.\.\..*", "", content[line])) + line = [ + i + for i in range(len(content)) + if " end pairs... done in " in content[i] + ][0] + series["paired-ends"] = re.sub( + "\D", "", re.sub("\.\.\..*", "", content[line]) + ) + line = [ + i + for i in range(len(content)) + if " duplicates, sorting the list... done in " in content[i] + ][0] + series["duplicates"] = re.sub( + "\D", "", re.sub("\.\.\..*", "", content[line]) + ) except IndexError: pass return series - def parse_qc(self, qc_file): """ Parse phantompeakqualtools (spp) QC table and return quality metrics. @@ -1735,10 +2126,13 @@ def parse_qc(self, qc_file): contains sample quality measurements. """ import pandas as pd + series = pd.Series() try: with open(qc_file) as handle: - line = handle.readlines()[0].strip().split("\t") # list of strings per line + line = ( + handle.readlines()[0].strip().split("\t") + ) # list of strings per line series["NSC"] = line[-3] series["RSC"] = line[-2] series["qualityTag"] = line[-1] @@ -1746,7 +2140,6 @@ def parse_qc(self, qc_file): pass return series - def get_peak_number(self, sample): """ Counts number of peaks from a sample's peak file. @@ -1758,7 +2151,6 @@ def get_peak_number(self, sample): sample["peakNumber"] = re.sub("\D.*", "", out) return sample - def get_frip(self, sample): """ Calculates the fraction of reads in peaks for a given sample. @@ -1766,6 +2158,7 @@ def get_frip(self, sample): :param pipelines.Sample sample: Sample object with "peaks" attribute. """ import pandas as pd + with open(sample.frip, "r") as handle: content = handle.readlines() reads_in_peaks = int(re.sub("\D", "", content[0])) diff --git a/pypiper/pipeline.py b/pypiper/pipeline.py index 0a4dbc09..aca78687 100644 --- a/pypiper/pipeline.py +++ b/pypiper/pipeline.py @@ -1,23 +1,29 @@ """ Pipeline base class """ import abc -from collections import OrderedDict import glob import os import sys +from collections import OrderedDict + if sys.version_info < (3, 3): from collections import Iterable, Mapping else: from collections.abc import Iterable, Mapping -from .exceptions import \ - IllegalPipelineDefinitionError, IllegalPipelineExecutionError, \ - UnknownPipelineStageError +from .exceptions import ( + IllegalPipelineDefinitionError, + IllegalPipelineExecutionError, + UnknownPipelineStageError, +) from .manager import PipelineManager from .stage import Stage -from .utils import \ - checkpoint_filepath, flag_name, parse_stage_name, translate_stage_name - +from .utils import ( + checkpoint_filepath, + flag_name, + parse_stage_name, + translate_stage_name, +) __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -26,7 +32,6 @@ __all__ = ["Pipeline", "UnknownPipelineStageError"] - class Pipeline(object): """ Generic pipeline framework. @@ -49,35 +54,41 @@ class Pipeline(object): :raise pypiper.IllegalPipelineDefinitionError: Definition of collection of stages must be non-empty. 
""" - + __metaclass__ = abc.ABCMeta - - def __init__(self, name=None, manager=None, outfolder=None, args=None, - **pl_mgr_kwargs): + + def __init__( + self, name=None, manager=None, outfolder=None, args=None, **pl_mgr_kwargs + ): super(Pipeline, self).__init__() try: self.name = name or manager.name except AttributeError: raise TypeError( - "If a pipeline manager isn't provided to create " - "{}, a name is required.".format(Pipeline.__name__)) + "If a pipeline manager isn't provided to create " + "{}, a name is required.".format(Pipeline.__name__) + ) else: if not self.name: raise ValueError( "Invalid name, possible inferred from pipeline manager: " - "{} ({})".format(self.name, type(self.name))) + "{} ({})".format(self.name, type(self.name)) + ) # Determine the PipelineManager. if manager: self.manager = manager if outfolder: - print("Ignoring explicit output folder ({}) and using that of " - "pipeline manager ({})".format(outfolder, - manager.outfolder)) + print( + "Ignoring explicit output folder ({}) and using that of " + "pipeline manager ({})".format(outfolder, manager.outfolder) + ) if name and name != manager.name: - print("Warning: name for pipeline ('{}') doesn't match that " - "of the given manager ('{}')".format(name, manager.name)) + print( + "Warning: name for pipeline ('{}') doesn't match that " + "of the given manager ('{}')".format(name, manager.name) + ) elif outfolder: # We're guaranteed by the upfront exception block around # name setting that we'll either have set the name for this @@ -85,10 +96,13 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, # protected from passing a null name argument to the pipeline # manager's constructor. self.manager = PipelineManager( - self.name, outfolder, args=args, **pl_mgr_kwargs) + self.name, outfolder, args=args, **pl_mgr_kwargs + ) else: - raise TypeError("To create a {} instance, 'manager' or 'outfolder' " - "is required".format(self.__class__.__name__)) + raise TypeError( + "To create a {} instance, 'manager' or 'outfolder' " + "is required".format(self.__class__.__name__) + ) # Require that checkpoints be overwritten. self.manager.overwrite_checkpoints = True @@ -98,14 +112,19 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, # stage names are handled, parsed, and translated. self._unordered = _is_unordered(self.stages()) if self._unordered: - print("NOTICE: Unordered definition of stages for " - "pipeline {}".format(self.name)) + print( + "NOTICE: Unordered definition of stages for " + "pipeline {}".format(self.name) + ) # Get to a sequence of pairs of key (possibly in need of translation) # and actual callable. Key is stage name and value is either stage # callable or an already-made stage object. - stages = self.stages().items() \ - if isinstance(self.stages(), Mapping) else self.stages() + stages = ( + self.stages().items() + if isinstance(self.stages(), Mapping) + else self.stages() + ) # Stage spec. parser handles callable validation. name_stage_pairs = [_parse_stage_spec(s) for s in stages] @@ -133,9 +152,12 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, # Check that there's not a checkpoint name collision. if internal_name in _internal_to_external: already_mapped = _internal_to_external[internal_name] - errmsg = "Duplicate stage name resolution (stage names are too " \ - "similar.) '{}' and '{}' both resolve to '{}'".\ - format(name, already_mapped, internal_name) + errmsg = ( + "Duplicate stage name resolution (stage names are too " + "similar.) 
'{}' and '{}' both resolve to '{}'".format( + name, already_mapped, internal_name + ) + ) raise IllegalPipelineDefinitionError(errmsg) # Store the stage name translations and the stage itself. @@ -145,7 +167,6 @@ def __init__(self, name=None, manager=None, outfolder=None, args=None, self.skipped, self.executed = None, None - @property def outfolder(self): """ @@ -155,7 +176,6 @@ def outfolder(self): """ return self.manager.outfolder - @abc.abstractmethod def stages(self): """ @@ -165,7 +185,6 @@ def stages(self): """ pass - @property def stage_names(self): """ @@ -177,7 +196,6 @@ class author (i.e., not necessarily those that are used for the """ return [parse_stage_name(s) for s in self._stages] - def checkpoint(self, stage, msg=""): """ Touch checkpoint file for given stage and provide timestamp message. @@ -192,8 +210,8 @@ def checkpoint(self, stage, msg=""): # pipeline completes, so fix the 'finished' parameter to the manager's # timestamp method to be True. return self.manager.timestamp( - message=msg, checkpoint=stage.checkpoint_name, finished=True) - + message=msg, checkpoint=stage.checkpoint_name, finished=True + ) def completed_stage(self, stage): """ @@ -207,12 +225,10 @@ def completed_stage(self, stage): check_path = checkpoint_filepath(stage, self.manager) return os.path.exists(check_path) - def halt(self, **kwargs): - """ Halt the pipeline """ + """Halt the pipeline""" self.manager.halt(**kwargs) - def list_flags(self, only_name=False): """ Determine the flag files associated with this pipeline. @@ -227,7 +243,6 @@ def list_flags(self, only_name=False): else: return paths - def run(self, start_point=None, stop_before=None, stop_after=None): """ Run the pipeline, optionally specifying start and/or stop points. @@ -253,7 +268,8 @@ def run(self, start_point=None, stop_before=None, stop_after=None): if stop_before and stop_after: raise IllegalPipelineExecutionError( - "Cannot specify both inclusive and exclusive stops.") + "Cannot specify both inclusive and exclusive stops." + ) if stop_before: stop = stop_before @@ -275,8 +291,10 @@ def run(self, start_point=None, stop_before=None, stop_after=None): # Permit order-agnostic pipelines, but warn. if self._unordered and (start_point or stop_before or stop_after): - print("WARNING: Starting and stopping points are nonsense for " - "pipeline with unordered stages.") + print( + "WARNING: Starting and stopping points are nonsense for " + "pipeline with unordered stages." + ) # TODO: consider context manager based on start/stop points. @@ -287,7 +305,8 @@ def run(self, start_point=None, stop_before=None, stop_after=None): assert stop_index <= len(self._stages) if start_index >= stop_index: raise IllegalPipelineExecutionError( - "Cannot start pipeline at or after stopping point") + "Cannot start pipeline at or after stopping point" + ) # TODO: consider storing just stage name rather than entire stage. # TODO (cont.): the bad case for whole-Stage is if associated data @@ -334,15 +353,15 @@ def run(self, start_point=None, stop_before=None, stop_after=None): self.halt(raise_error=False) def wrapup(self): - """ Final mock stage to run after final one finishes. """ + """Final mock stage to run after final one finishes.""" self.manager.complete() def _reset(self): - """ Scrub decks with respect to Stage status/label tracking. """ + """Scrub decks with respect to Stage status/label tracking.""" self.skipped, self.executed = [], [] def _start_index(self, start=None): - """ Seek to the first stage to run. 
""" + """Seek to the first stage to run.""" if start is None: return 0 start_stage = translate_stage_name(start) @@ -378,7 +397,6 @@ def _stop_index(self, stop_point, inclusive): return stop_index + 1 if inclusive else stop_index - def _is_unordered(collection): """ Determine whether a collection appears to be unordered. @@ -394,12 +412,10 @@ def _is_unordered(collection): illogical to investigate whether it's ordered. """ if not isinstance(collection, Iterable): - raise TypeError("Non-iterable alleged collection: {}". - format(type(collection))) - return isinstance(collection, set) or \ - (isinstance(collection, dict) and - not isinstance(collection, OrderedDict)) - + raise TypeError("Non-iterable alleged collection: {}".format(type(collection))) + return isinstance(collection, set) or ( + isinstance(collection, dict) and not isinstance(collection, OrderedDict) + ) def _parse_stage_spec(stage_spec): @@ -417,9 +433,11 @@ def _parse_stage_spec(stage_spec): """ # The logic used here, a message to a user about how to specify Stage. - req_msg = "Stage specification must be either a {0} itself, a " \ - "(, {0}) pair, or a callable with a __name__ attribute " \ - "(e.g., a non-anonymous function)".format(Stage.__name__) + req_msg = ( + "Stage specification must be either a {0} itself, a " + "(, {0}) pair, or a callable with a __name__ attribute " + "(e.g., a non-anonymous function)".format(Stage.__name__) + ) # Simplest case is stage itself. if isinstance(stage_spec, Stage): diff --git a/pypiper/stage.py b/pypiper/stage.py index 29f83c08..6f1d551a 100644 --- a/pypiper/stage.py +++ b/pypiper/stage.py @@ -11,16 +11,13 @@ __all__ = ["Stage"] - class Stage(object): """ Single stage/phase of a pipeline; a logical processing "unit". A stage is a collection of commands that is checkpointed. """ - - def __init__(self, func, f_args=None, f_kwargs=None, - name=None, checkpoint=True): + def __init__(self, func, f_args=None, f_kwargs=None, name=None, checkpoint=True): """ A function, perhaps with arguments, defines the stage. @@ -39,7 +36,6 @@ def __init__(self, func, f_args=None, f_kwargs=None, self.name = name or func.__name__ self.checkpoint = checkpoint - @property def checkpoint_name(self): """ @@ -50,37 +46,42 @@ def checkpoint_name(self): """ return translate_stage_name(self.name) if self.checkpoint else None - def run(self, *args, **kwargs): - """ Alternate form for direct call; execute stage. """ + """Alternate form for direct call; execute stage.""" self(*args, **kwargs) - def __call__(self, *args, **update_kwargs): - """ Execute the stage, allowing updates to args/kwargs. 
""" + """Execute the stage, allowing updates to args/kwargs.""" kwargs = copy.deepcopy(self.f_kwargs) kwargs.update(update_kwargs) args = args or self.f_args self.f(*args, **kwargs) - def __eq__(self, other): - return isinstance(other, Stage) and \ - self.f.__name__ == other.f.__name__ and \ - ({k: v for k, v in self.__dict__.items() if k != "f"} == - {k: v for k, v in other.__dict__.items() if k != "f"}) - + return ( + isinstance(other, Stage) + and self.f.__name__ == other.f.__name__ + and ( + {k: v for k, v in self.__dict__.items() if k != "f"} + == {k: v for k, v in other.__dict__.items() if k != "f"} + ) + ) def __ne__(self, other): return not (self == other) - def __repr__(self): - return "{klass} '{n}': f={f}, args={pos}, kwargs={kwd}, " \ - "checkpoint={check}".format(klass=self.__class__.__name__, - f=self.f, n=self.name, pos=self.f_args, kwd=self.f_kwargs, - check=self.checkpoint) - + return ( + "{klass} '{n}': f={f}, args={pos}, kwargs={kwd}, " + "checkpoint={check}".format( + klass=self.__class__.__name__, + f=self.f, + n=self.name, + pos=self.f_args, + kwd=self.f_kwargs, + check=self.checkpoint, + ) + ) def __str__(self): return "{}: '{}'".format(self.__class__.__name__, self.name) diff --git a/pypiper/utils.py b/pypiper/utils.py index 3eda9ddb..7e199bfa 100644 --- a/pypiper/utils.py +++ b/pypiper/utils.py @@ -1,27 +1,28 @@ """ Shared utilities """ -from collections import Iterable, Mapping, Sequence import os -import sys import re -from subprocess import PIPE +import sys +from collections import Iterable, Mapping, Sequence from shlex import split +from subprocess import PIPE -if sys.version_info < (3, ): +if sys.version_info < (3,): CHECK_TEXT_TYPES = (str, unicode) from inspect import getargspec as get_fun_sig else: - CHECK_TEXT_TYPES = (str, ) + CHECK_TEXT_TYPES = (str,) from inspect import getfullargspec as get_fun_sig from ubiquerg import expandpath, is_command_callable -from .const import \ - CHECKPOINT_EXTENSION, PIPELINE_CHECKPOINT_DELIMITER, \ - STAGE_NAME_SPACE_REPLACEMENT +from .const import ( + CHECKPOINT_EXTENSION, + PIPELINE_CHECKPOINT_DELIMITER, + STAGE_NAME_SPACE_REPLACEMENT, +) from .flags import FLAGS - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -29,15 +30,23 @@ # What to export/attach to pypiper package namespace. # Conceptually, reserve this for functions expected to be used in other # packages, and import from utils within pypiper for other functions. -__all__ = ["add_pypiper_args", "build_command", "check_all_commands", - "determine_uncallable", "get_first_value", "head", "logger_via_cli"] +__all__ = [ + "add_pypiper_args", + "build_command", + "check_all_commands", + "determine_uncallable", + "get_first_value", + "head", + "logger_via_cli", +] CHECKPOINT_SPECIFICATIONS = ["start_point", "stop_before", "stop_after"] -def add_pypiper_args(parser, groups=("pypiper", ), args=None, - required=None, all_args=False): +def add_pypiper_args( + parser, groups=("pypiper",), args=None, required=None, all_args=False +): """ Use this to add standardized pypiper arguments to your python pipeline. 
@@ -57,7 +66,8 @@ def add_pypiper_args(parser, groups=("pypiper", ), args=None, pypiper arguments added """ args_to_add = _determine_args( - argument_groups=groups, arguments=args, use_all_args=all_args) + argument_groups=groups, arguments=args, use_all_args=all_args + ) parser = _add_args(parser, args_to_add, required) return parser @@ -81,8 +91,7 @@ def build_command(chunks): """ if not chunks: - raise ValueError( - "No command parts: {} ({})".format(chunks, type(chunks))) + raise ValueError("No command parts: {} ({})".format(chunks, type(chunks))) if isinstance(chunks, str): return chunks @@ -149,8 +158,7 @@ def checkpoint_filename(checkpoint, pipeline_name=None): except AttributeError: base = translate_stage_name(checkpoint) if pipeline_name: - base = "{}{}{}".format( - pipeline_name, PIPELINE_CHECKPOINT_DELIMITER, base) + base = "{}{}{}".format(pipeline_name, PIPELINE_CHECKPOINT_DELIMITER, base) return base + CHECKPOINT_EXTENSION @@ -178,7 +186,8 @@ def checkpoint_filepath(checkpoint, pm): else: raise ValueError( "Absolute checkpoint path '{}' is not in pipeline output " - "folder '{}'".format(checkpoint, pm.outfolder)) + "folder '{}'".format(checkpoint, pm.outfolder) + ) _, ext = os.path.splitext(checkpoint) if ext == CHECKPOINT_EXTENSION: return pipeline_filepath(pm, filename=checkpoint) @@ -226,9 +235,12 @@ def check_shell_asterisk(cmd): def check_all_commands( - cmds, - get_bad_result=lambda bads: Exception("{} uncallable commands: {}".format(len(bads), bads)), - handle=None): + cmds, + get_bad_result=lambda bads: Exception( + "{} uncallable commands: {}".format(len(bads), bads) + ), + handle=None, +): """ Determine whether all commands are callable @@ -246,10 +258,12 @@ def check_all_commands( if not bads: return True if handle is None: + def handle(res): if isinstance(res, Exception): raise res print("Command check result: {}".format(res)) + elif not hasattr(handle, "__call__") or not 1 == len(get_fun_sig(handle).args): raise TypeError("Command check error handler must be a one-arg function") handle(get_bad_result(bads)) @@ -257,12 +271,17 @@ def handle(res): def determine_uncallable( - commands, transformations=( - (lambda f: isinstance(f, str) and - os.path.isfile(expandpath(f)) and - expandpath(f).endswith(".jar"), - lambda f: "java -jar {}".format(expandpath(f))), - ), accumulate=False): + commands, + transformations=( + ( + lambda f: isinstance(f, str) + and os.path.isfile(expandpath(f)) + and expandpath(f).endswith(".jar"), + lambda f: "java -jar {}".format(expandpath(f)), + ), + ), + accumulate=False, +): """ Determine which commands are not callable. 
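As a rough illustration of the default transformation above (hypothetical command names and paths; only the .jar-to-"java -jar" rewrite is taken from the code):

    from pypiper.utils import determine_uncallable

    # Plain command names are checked as-is; a string pointing to an existing
    # file that ends in ".jar" is first rewritten to a "java -jar <path>"
    # invocation before the callability check.
    problems = determine_uncallable(["samtools", "/tools/picard.jar", "not-a-real-tool"])
    # Each returned element is a (original, checked-form) pair for commands
    # that were not callable, e.g. ("not-a-real-tool", "not-a-real-tool").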
@@ -282,23 +301,41 @@ def determine_uncallable( """ commands = [commands] if isinstance(commands, str) else commands if transformations: - trans = transformations.values() if isinstance(transformations, Mapping) else transformations - if not isinstance(transformations, Iterable) or isinstance(transformations, str) or \ - not all(map(lambda func_pair: isinstance(func_pair, tuple) and len(func_pair) == 2, trans)): + trans = ( + transformations.values() + if isinstance(transformations, Mapping) + else transformations + ) + if ( + not isinstance(transformations, Iterable) + or isinstance(transformations, str) + or not all( + map( + lambda func_pair: isinstance(func_pair, tuple) + and len(func_pair) == 2, + trans, + ) + ) + ): raise TypeError( "Transformations argument should be a collection of pairs; got " - "{} ({})".format(transformations, type(transformations).__name__)) + "{} ({})".format(transformations, type(transformations).__name__) + ) if accumulate: + def finalize(cmd): for p, t in transformations: if p(cmd): cmd = t(cmd) return cmd + else: if not isinstance(transformations, (tuple, list)): raise Exception( "If transformations are unordered, non-accumulation of " - "effects may lead to nondeterministic behavior.") + "effects may lead to nondeterministic behavior." + ) + def finalize(cmd): print("Transformations: {}".format(transformations)) for p, t in transformations: @@ -308,14 +345,16 @@ def finalize(cmd): else: finalize = lambda cmd: cmd - return [(orig, used) for orig, used in - map(lambda c: (c, finalize(c)), commands) - if not is_command_callable(used)] + return [ + (orig, used) + for orig, used in map(lambda c: (c, finalize(c)), commands) + if not is_command_callable(used) + ] def split_by_pipes_nonnested(cmd): """ - Split the command by shell pipes, but preserve contents in + Split the command by shell pipes, but preserve contents in parentheses and braces. :param str cmd: Command to investigate. @@ -323,7 +362,7 @@ def split_by_pipes_nonnested(cmd): """ # for posterity, this one will do parens only: re.compile(r'(?:[^|(]|\([^)]*\))+') # r = re.compile(r'(?:[^|({]|[\({][^)}]*[\)}])+') - r = re.compile(r'(?:[^|(]|\([^)]*\)+|\{[^}]*\})') + r = re.compile(r"(?:[^|(]|\([^)]*\)+|\{[^}]*\})") return r.findall(cmd) @@ -332,15 +371,15 @@ def split_by_pipes_nonnested(cmd): def split_by_pipes(cmd): """ - Split the command by shell pipes, but preserve contents in + Split the command by shell pipes, but preserve contents in parentheses and braces. Also handles nested parens and braces. :param str cmd: Command to investigate. :return list: List of sub commands to be linked - """ + """ # Build a simple finite state machine to split on pipes, while - # handling nested braces or parentheses. + # handling nested braces or parentheses. 
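    # As a rough illustration of the intended behavior: a command such as
    #   "samtools view -h in.bam | awk '{print $3}' | sort -u"
    # would be expected to split into three elements here, while a pipe that
    # appears inside a parenthesis or brace grouping stays within one element.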
stack_brace = [] stack_paren = [] cmdlist = [] @@ -386,10 +425,10 @@ def check_shell_pipes(cmd): def strip_braced_txt(cmd): curly_braces = True while curly_braces: - SRE_match_obj = re.search(r'\{(.*?)}',cmd) + SRE_match_obj = re.search(r"\{(.*?)}", cmd) if not SRE_match_obj is None: - cmd = cmd[:SRE_match_obj.start()] + cmd[(SRE_match_obj.end()+1):] - if re.search(r'\{(.*?)}',cmd) is None: + cmd = cmd[: SRE_match_obj.start()] + cmd[(SRE_match_obj.end() + 1) :] + if re.search(r"\{(.*?)}", cmd) is None: curly_braces = False else: curly_braces = False @@ -460,7 +499,7 @@ def get_proc_name(cmd): if isinstance(cmd, Iterable) and not isinstance(cmd, str): cmd = " ".join(cmd) - return cmd.split()[0].replace('(', '').replace(')', '') + return cmd.split()[0].replace("(", "").replace(")", "") def get_first_value(param, param_pools, on_missing=None, error=True): @@ -506,7 +545,8 @@ def get_first_value(param, param_pools, on_missing=None, error=True): raise TypeError( "Any callable passed as the action to take when a requested " "parameter is missing should accept that parameter and return " - "a value.") + "a value." + ) return on_missing @@ -581,7 +621,7 @@ def is_sam_or_bam(file_name): :param str file_name: Name/path of file to check as SAM-formatted. :return bool: Whether file appears to be SAM-formatted - """ + """ _, ext = os.path.splitext(file_name) return ext in [".bam", ".sam"] @@ -595,7 +635,9 @@ def logger_via_cli(opts, **kwargs): :return logging.Logger: newly created and configured logger """ from copy import deepcopy + import logmuse + kwds = deepcopy(kwargs) # By default, don't require the logging options to have been added to the parser. kwds.setdefault("strict", False) @@ -617,6 +659,7 @@ def make_lock_name(original_path, path_base_folder): :return str: Name or perhaps relative (to the base folder path indicated) path to lock file """ + def make_name(p): if p: return p.replace(path_base_folder, "").replace(os.sep, "__") @@ -628,8 +671,11 @@ def make_name(p): elif isinstance(original_path, Sequence): result = [make_name(p) for p in original_path] return [x for x in result if x] - raise TypeError("Neither string nor other sequence type: {} ({})". - format(original_path, type(original_path))) + raise TypeError( + "Neither string nor other sequence type: {} ({})".format( + original_path, type(original_path) + ) + ) def is_multi_target(target): @@ -645,8 +691,11 @@ def is_multi_target(target): elif isinstance(target, Sequence): return len(target) > 1 else: - raise TypeError("Could not interpret argument as a target: {} ({})". 
- format(target, type(target))) + raise TypeError( + "Could not interpret argument as a target: {} ({})".format( + target, type(target) + ) + ) def parse_cmd(cmd, shell): @@ -657,12 +706,18 @@ def parse_cmd(cmd, shell): :param bool shell: if the command should be run in the shell rather that in a subprocess :return list[dict]: list of dicts of commands """ + def _make_dict(command): - a, s = (command, True) if check_shell(command, shell) else (split(command), False) + a, s = ( + (command, True) if check_shell(command, shell) else (split(command), False) + ) return dict(args=a, stdout=PIPE, shell=s) - return [_make_dict(c) for c in split_by_pipes(cmd)] if not shell and check_shell_pipes(cmd) \ + return ( + [_make_dict(c) for c in split_by_pipes(cmd)] + if not shell and check_shell_pipes(cmd) else [dict(args=cmd, stdout=None, shell=True)] + ) def parse_cores(cores, pm, default): @@ -730,16 +785,16 @@ def pipeline_filepath(pm, filename=None, suffix=None): """ if filename is None and suffix is None: - raise TypeError("Provide filename and/or suffix to create " - "path to a pipeline file.") + raise TypeError( + "Provide filename and/or suffix to create " "path to a pipeline file." + ) filename = (filename or pm.name) + (suffix or "") # Note that Pipeline and PipelineManager define the same outfolder. # In fact, a Pipeline just references its manager's outfolder. # So we can handle argument of either type to pm parameter. - return filename if os.path.isabs(filename) \ - else os.path.join(pm.outfolder, filename) + return filename if os.path.isabs(filename) else os.path.join(pm.outfolder, filename) def translate_stage_name(stage): @@ -804,12 +859,12 @@ def _determine_args(argument_groups, arguments, use_all_args=False): else: from collections.abc import Iterable - from logmuse import LOGGING_CLI_OPTDATA + # Define the argument groups. args_by_group = { - "pypiper": ["recover", "new-start", "dirty", "force-follow", "testmode"] + - LOGGING_CLI_OPTDATA.keys(), + "pypiper": ["recover", "new-start", "dirty", "force-follow", "testmode"] + + LOGGING_CLI_OPTDATA.keys(), "config": ["config"], "checkpoint": ["stop-before", "stop-after"], "resource": ["mem", "cores"], @@ -817,9 +872,13 @@ def _determine_args(argument_groups, arguments, use_all_args=False): "common": ["input", "sample-name"], "ngs": ["sample-name", "input", "input2", "genome", "single-or-paired"], "logmuse": LOGGING_CLI_OPTDATA.keys(), - "pipestat": ["pipestat-namespace", "pipestat-record-id", - "pipestat-schema", "pipestat-results-file", - "pipestat-config"] + "pipestat": [ + "pipestat-namespace", + "pipestat-record-id", + "pipestat-schema", + "pipestat-results-file", + "pipestat-config", + ], } # Handle various types of group specifications. @@ -885,83 +944,127 @@ def _add_args(parser, args, required): # Define the arguments. argument_data = { - "testmode": - ("-T", {"action": "store_true", - "help": "Only print commands, don't run"}), - "recover": - ("-R", {"action": "store_true", - "help": "Overwrite locks to recover from previous failed run"}), - "new-start": - ("-N", {"action": "store_true", - "help": "Overwrite all results to start a fresh run"}), - "dirty": - ("-D", {"action": "store_true", - "help": "Don't auto-delete intermediate files"}), - "force-follow": - ("-F", {"action": "store_true", - "help": "Always run 'follow' commands"}), - "start-point": - {"help": "Name of pipeline stage at which to begin"}, - "stop-before": - {"help": "Name of pipeline stage at which to stop " - "(exclusive, i.e. 
not run)"}, - "stop-after": - {"help": "Name of pipeline stage at which to stop " - "(inclusive, i.e. run)"}, - "config": - ("-C", {"dest": "config_file", "metavar": "CONFIG_FILE", - "default": default_config, - "help": "Pipeline configuration file (YAML). " - "Relative paths are with respect to the " - "pipeline script."}), - "pipeline-name": - {"metavar": "PIPELINE_NAME", "help": "Name of the pipeline"}, - "sample-name": - ("-S", {"metavar": "SAMPLE_NAME", - "help": "Name for sample to run"}), - "output-parent": - ("-O", {"metavar": "PARENT_OUTPUT_FOLDER", - "help": "Parent output directory of project"}), - "cores": - ("-P", {"type": int, "default": 1, "metavar": "NUMBER_OF_CORES", - "help": "Number of cores for parallelized processes"}), - "mem": - ("-M", {"default": "4000", "metavar": "MEMORY_LIMIT", - "help": "Memory limit for processes accepting such. " - "Default units are megabytes unless specified " - "using the suffix [K|M|G|T]."}), - "input": - ("-I", {"nargs": "+", "metavar": "INPUT_FILES", - "help": "One or more primary input files"}), - "input2": - ("-I2", {"nargs": "*", "metavar": "INPUT_FILES2", - "help": "Secondary input files, such as read2"}), - "genome": - ("-G", {"dest": "genome_assembly", - "help": "Identifier for genome assembly"}), - "single-or-paired": - ("-Q", {"default": "single", - "help": "Single- or paired-end sequencing protocol"}), - "pipestat-namespace": - {"help": "Namespace to report into. This will be the DB table name " - "if using DB as the object back-end"}, - "pipestat-record-id": - {"help": "Record identifier to report for"}, - "pipestat-schema": - {"help": "Path to the output schema that formalizes the " - "results structure"}, - "pipestat-config": - {"help": "Path to the configuration file"}, - "pipestat-results-file": - {"help": "YAML file to report into, if file is used as " - "the object back-end"} + "testmode": ( + "-T", + {"action": "store_true", "help": "Only print commands, don't run"}, + ), + "recover": ( + "-R", + { + "action": "store_true", + "help": "Overwrite locks to recover from previous failed run", + }, + ), + "new-start": ( + "-N", + { + "action": "store_true", + "help": "Overwrite all results to start a fresh run", + }, + ), + "dirty": ( + "-D", + {"action": "store_true", "help": "Don't auto-delete intermediate files"}, + ), + "force-follow": ( + "-F", + {"action": "store_true", "help": "Always run 'follow' commands"}, + ), + "start-point": {"help": "Name of pipeline stage at which to begin"}, + "stop-before": { + "help": "Name of pipeline stage at which to stop " + "(exclusive, i.e. not run)" + }, + "stop-after": { + "help": "Name of pipeline stage at which to stop " "(inclusive, i.e. run)" + }, + "config": ( + "-C", + { + "dest": "config_file", + "metavar": "CONFIG_FILE", + "default": default_config, + "help": "Pipeline configuration file (YAML). " + "Relative paths are with respect to the " + "pipeline script.", + }, + ), + "pipeline-name": {"metavar": "PIPELINE_NAME", "help": "Name of the pipeline"}, + "sample-name": ( + "-S", + {"metavar": "SAMPLE_NAME", "help": "Name for sample to run"}, + ), + "output-parent": ( + "-O", + { + "metavar": "PARENT_OUTPUT_FOLDER", + "help": "Parent output directory of project", + }, + ), + "cores": ( + "-P", + { + "type": int, + "default": 1, + "metavar": "NUMBER_OF_CORES", + "help": "Number of cores for parallelized processes", + }, + ), + "mem": ( + "-M", + { + "default": "4000", + "metavar": "MEMORY_LIMIT", + "help": "Memory limit for processes accepting such. 
" + "Default units are megabytes unless specified " + "using the suffix [K|M|G|T].", + }, + ), + "input": ( + "-I", + { + "nargs": "+", + "metavar": "INPUT_FILES", + "help": "One or more primary input files", + }, + ), + "input2": ( + "-I2", + { + "nargs": "*", + "metavar": "INPUT_FILES2", + "help": "Secondary input files, such as read2", + }, + ), + "genome": ( + "-G", + {"dest": "genome_assembly", "help": "Identifier for genome assembly"}, + ), + "single-or-paired": ( + "-Q", + {"default": "single", "help": "Single- or paired-end sequencing protocol"}, + ), + "pipestat-namespace": { + "help": "Namespace to report into. This will be the DB table name " + "if using DB as the object back-end" + }, + "pipestat-record-id": {"help": "Record identifier to report for"}, + "pipestat-schema": { + "help": "Path to the output schema that formalizes the " "results structure" + }, + "pipestat-config": {"help": "Path to the configuration file"}, + "pipestat-results-file": { + "help": "YAML file to report into, if file is used as " + "the object back-end" + }, } - + from logmuse import LOGGING_CLI_OPTDATA + argument_data.update(LOGGING_CLI_OPTDATA) if len(required) > 0: - required_named = parser.add_argument_group('required named arguments') + required_named = parser.add_argument_group("required named arguments") # Configure the parser for each argument. for arg in args: @@ -979,12 +1082,13 @@ def _add_args(parser, args, required): raise TypeError( "Option name must map to dict or two-tuple (short " "name and dict) of argument command-line argument " - "specification data.") + "specification data." + ) argdata["required"] = arg in required long_opt = "--{}".format(arg) - opts = (short_opt, long_opt) if short_opt else (long_opt, ) + opts = (short_opt, long_opt) if short_opt else (long_opt,) if arg in required: required_named.add_argument(*opts, **argdata) else: diff --git a/setup.py b/setup.py index 52a5a70e..3971a195 100644 --- a/setup.py +++ b/setup.py @@ -7,26 +7,28 @@ try: from setuptools import setup + if sys.version_info < (2, 7): - extra['install_requires'] = ['argparse'] + extra["install_requires"] = ["argparse"] if sys.version_info >= (3,): - extra['use_2to3'] = True + extra["use_2to3"] = True except ImportError: from distutils.core import setup + if sys.version_info < (2, 7): - extra['dependencies'] = ['argparse'] + extra["dependencies"] = ["argparse"] def read_reqs_file(reqs_name): - """ Read requirements file for given requirements group. """ - path_reqs_file = os.path.join( - "requirements", "reqs-{}.txt".format(reqs_name)) - with open(path_reqs_file, 'r') as reqs_file: - return [pkg.rstrip() for pkg in reqs_file.readlines() - if not pkg.startswith("#")] + """Read requirements file for given requirements group.""" + path_reqs_file = os.path.join("requirements", "reqs-{}.txt".format(reqs_name)) + with open(path_reqs_file, "r") as reqs_file: + return [ + pkg.rstrip() for pkg in reqs_file.readlines() if not pkg.startswith("#") + ] -with open(os.path.join("pypiper", "_version.py"), 'r') as versionfile: +with open(os.path.join("pypiper", "_version.py"), "r") as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") @@ -36,8 +38,9 @@ def read_reqs_file(reqs_name): test_reqs = read_reqs_file("test") # Allow specification of desired features, which implies dependencies. 
-addl_reqs = {bundle_name: read_reqs_file(bundle_name) - for bundle_name in ["ngstk", "plot"]} +addl_reqs = { + bundle_name: read_reqs_file(bundle_name) for bundle_name in ["ngstk", "plot"] +} # Complete collection of user requirements. addl_reqs["all"] = list({pkg for bundle in addl_reqs.values() for pkg in bundle}) @@ -48,35 +51,38 @@ def read_reqs_file(reqs_name): # Handle the pypi README formatting. try: import pypandoc - long_description = pypandoc.convert_file('README.md', 'rst') + + long_description = pypandoc.convert_file("README.md", "rst") msg = "\033[032mPandoc conversion succeeded.\033[0m" -except(IOError, ImportError, OSError): +except (IOError, ImportError, OSError): msg = "\033[0;31mWarning: pandoc conversion failed!\033[0m" - long_description = open('README.md').read() + long_description = open("README.md").read() setup( - name='piper', - packages=['pypiper'], + name="piper", + packages=["pypiper"], install_requires=basic_reqs, version=version, - description='A lightweight python toolkit for gluing together restartable, robust command line pipelines', + description="A lightweight python toolkit for gluing together restartable, robust command line pipelines", long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 2.7", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ], - author='Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', - author_email='nathan@code.databio.org, jklughammer@cemm.oeaw.ac.at, arendeiro@cemm.oeaw.ac.at', - url='https://github.com/databio/pypiper/', + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], + author="Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + author_email="nathan@code.databio.org, jklughammer@cemm.oeaw.ac.at, arendeiro@cemm.oeaw.ac.at", + url="https://github.com/databio/pypiper/", license="BSD2", - test_suite="tests", # python setup.py test - tests_require=test_reqs, # Test-specific package dependencies + test_suite="tests", # python setup.py test + tests_require=test_reqs, # Test-specific package dependencies # Extra package if doing `python setup.py test` - setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), + setup_requires=( + ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] + ), extras_require=addl_reqs, # Version-specific items **extra diff --git a/tests/conftest.py b/tests/conftest.py index 08db22b8..2269408e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,14 +1,13 @@ """ Fixtures and configuration visible to all tests """ import copy -from functools import partial import os +from functools import partial import pytest from pypiper import Pipeline, PipelineManager, Stage - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -31,10 +30,10 @@ FILE_TEXT_PAIRS = list(zip(FILENAMES, CONTENTS)) - @pytest.fixture def get_pipe_manager(tmpdir): - """ Provide safe creation of pipeline manager, with multi=True. 
""" + """Provide safe creation of pipeline manager, with multi=True.""" + def get_mgr(**kwargs): if "outfolder" in kwargs: kwd_args = kwargs @@ -42,52 +41,48 @@ def get_mgr(**kwargs): kwd_args = copy.deepcopy(kwargs) kwd_args["outfolder"] = tmpdir.strpath return PipelineManager(multi=True, **kwd_args) - return get_mgr + return get_mgr @pytest.fixture def pl_mgr(request, get_pipe_manager): - """ Provide a PipelineManager and ensure that it's stopped. """ + """Provide a PipelineManager and ensure that it's stopped.""" pm = get_pipe_manager(name=TEST_PIPE_NAME) + def _ensure_stopped(): pm.stop_pipeline() + request.addfinalizer(_ensure_stopped) return pm - @pytest.fixture def dummy_pipe(pl_mgr): - """ Provide a basic Pipeline instance for a test case. """ + """Provide a basic Pipeline instance for a test case.""" return DummyPipeline(pl_mgr) - def write_file1(folder): _write(*FILE_TEXT_PAIRS[0], folder=folder) - def write_file2(folder): _write(*FILE_TEXT_PAIRS[1], folder=folder) - def write_file3(folder): _write(*FILE_TEXT_PAIRS[2], folder=folder) - def _write(filename, content, folder=None): path = os.path.join(folder, filename) - with open(path, 'w') as f: + with open(path, "w") as f: f.write(content) - class DummyPipeline(Pipeline): - """ Basic pipeline implementation for tests """ + """Basic pipeline implementation for tests""" def __init__(self, manager): super(DummyPipeline, self).__init__(TEST_PIPE_NAME, manager=manager) diff --git a/tests/helpers.py b/tests/helpers.py index 16f842c6..3a91a88e 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,18 +1,18 @@ """ Helpers for tests """ -from functools import partial import glob import os +from functools import partial + import pytest + from pypiper import Pipeline from pypiper.utils import checkpoint_filepath - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def assert_equal_dirpath(p1, p2): """ Assert that a pair of folder paths has two equal members. @@ -23,7 +23,6 @@ def assert_equal_dirpath(p1, p2): assert p1.rstrip(os.sep) == p2.rstrip(os.sep) - def fetch_checkpoint_files(pm): """ Fetch all of a manager's checkpoint file paths. @@ -37,7 +36,6 @@ def fetch_checkpoint_files(pm): return glob.glob(pattern) - def named_param(argnames, argvalues): """ Improve pytest's native labeling of test case parameterization. @@ -53,15 +51,19 @@ def named_param(argnames, argvalues): :return functools.partial: Parameterize version of parametrize, with values and ids fixed. """ - return partial(pytest.mark.parametrize( - argnames=argnames, argvalues=argvalues, - ids=lambda val: "{}={}".format(argnames, val))) - + return partial( + pytest.mark.parametrize( + argnames=argnames, + argvalues=argvalues, + ids=lambda val: "{}={}".format(argnames, val), + ) + ) class SafeTestPipeline(Pipeline): - """ Pipeline for tests that protects against bad file descriptor. """ + """Pipeline for tests that protects against bad file descriptor.""" + def __init__(self, *args, **kwargs): - kwd_args = {"multi": True} # Like interactive mode. + kwd_args = {"multi": True} # Like interactive mode. kwd_args.update(kwargs) super(SafeTestPipeline, self).__init__(*args, **kwd_args) diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 28be5db0..9f72e657 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -1,27 +1,24 @@ """ Test configuration for Pipeline tests. 
""" import os + import pytest + from pypiper import Stage from tests.helpers import SafeTestPipeline - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - READ_ALIGNER_FILENAME = "aligner.lst" PEAK_CALLER_FILENAME = "caller.lst" - def pytest_generate_tests(metafunc): - """ Dynamic test case parameterization. """ + """Dynamic test case parameterization.""" if "pl_name" in metafunc.fixturenames: - metafunc.parametrize( - "pl_name", [read_aligner.__name__, call_peaks.__name__]) - + metafunc.parametrize("pl_name", [read_aligner.__name__, call_peaks.__name__]) # Dummy functions used as elements of pipeline stages() collections. @@ -41,10 +38,8 @@ def call_peaks(): pass - class FunctionNameWriterPipeline(SafeTestPipeline): - """ Basic pipeline that writes to file the names of its functions. """ - + """Basic pipeline that writes to file the names of its functions.""" def __init__(self, name, outfolder, filename, functions): """ @@ -62,9 +57,7 @@ def __init__(self, name, outfolder, filename, functions): self.name_output_file = filename self.functions = functions # Get the stages() benefit of superclass extension. - super(FunctionNameWriterPipeline, self).__init__( - name=name, outfolder=outfolder) - + super(FunctionNameWriterPipeline, self).__init__(name=name, outfolder=outfolder) def write_name(self, func): """ @@ -73,12 +66,11 @@ def write_name(self, func): :param callable func: Name of function to write to the output file. """ outpath = os.path.join(self.outfolder, self.name_output_file) - with open(outpath, 'a') as f: + with open(outpath, "a") as f: f.write(func.__name__ + os.linesep) - def run(self, **kwargs): - """ Start with clean output file, then use superclass method. """ + """Start with clean output file, then use superclass method.""" # Ensure that we start with a clean file since the nature of the # operations performed (sequential file writes) creates desire to # open output file in append mode rather than write mode. @@ -87,30 +79,26 @@ def run(self, **kwargs): os.unlink(output_file) super(FunctionNameWriterPipeline, self).run(**kwargs) - def stages(self): - """ Sequence of operations to perform. """ - return [Stage(self.write_name, (f,), name=f.__name__) - for f in self.functions] - + """Sequence of operations to perform.""" + return [Stage(self.write_name, (f,), name=f.__name__) for f in self.functions] # Functions and fixtures + def get_read_aligner(outfolder): - """ Create a dummy 'read aligner' pipeline. """ + """Create a dummy 'read aligner' pipeline.""" return FunctionNameWriterPipeline( - "read-aligner", outfolder, - READ_ALIGNER_FILENAME, [merge_input, qc, align_reads]) - + "read-aligner", outfolder, READ_ALIGNER_FILENAME, [merge_input, qc, align_reads] + ) def get_peak_caller(outfolder): - """ Create a dummy 'peak caller' pipeline. """ + """Create a dummy 'peak caller' pipeline.""" return FunctionNameWriterPipeline( - "peak-caller", outfolder, - PEAK_CALLER_FILENAME, [align_reads, call_peaks]) - + "peak-caller", outfolder, PEAK_CALLER_FILENAME, [align_reads, call_peaks] + ) def get_pipeline(name, outfolder): @@ -129,15 +117,13 @@ def get_pipeline(name, outfolder): raise ValueError("Unknown pipeline request: '{}'".format(name)) - @pytest.fixture def read_aligner(tmpdir): - """ Provide test case with a read aligner pipeline instance. """ + """Provide test case with a read aligner pipeline instance.""" return get_read_aligner(outfolder=tmpdir.strpath) - @pytest.fixture def peak_caller(tmpdir): - """ Provide test case with a 'PeakCaller' pipeline instance. 
""" + """Provide test case with a 'PeakCaller' pipeline instance.""" return get_peak_caller(outfolder=tmpdir.strpath) diff --git a/tests/pipeline/test_multi_pipeline_sample.py b/tests/pipeline/test_multi_pipeline_sample.py index 4d3e7503..f8874d36 100644 --- a/tests/pipeline/test_multi_pipeline_sample.py +++ b/tests/pipeline/test_multi_pipeline_sample.py @@ -1,18 +1,18 @@ """ Tests for case in which multiple pipelines process a single sample. """ import os + from pypiper.utils import checkpoint_filepath from tests.helpers import fetch_checkpoint_files, named_param -from .conftest import get_peak_caller, get_pipeline, get_read_aligner +from .conftest import get_peak_caller, get_pipeline, get_read_aligner __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_checkpoints_are_pipeline_unique(tmpdir): - """ Names of checkpoint files depend on both stage and pipeline. """ + """Names of checkpoint files depend on both stage and pipeline.""" # Note: conceptually, this tests an underlying mechanistic aspect of the # checkpointing system. @@ -38,10 +38,12 @@ def test_checkpoints_are_pipeline_unique(tmpdir): call_peaks.run() # We expect a different checkpoint file for each stage of each pipeline. - align_reads_expected = {checkpoint_filepath(s.name, align_reads) - for s in align_reads.stages()} - call_peaks_expected = {checkpoint_filepath(s.name, call_peaks) - for s in call_peaks.stages()} + align_reads_expected = { + checkpoint_filepath(s.name, align_reads) for s in align_reads.stages() + } + call_peaks_expected = { + checkpoint_filepath(s.name, call_peaks) for s in call_peaks.stages() + } # Pipeline names are unique here, and each checkpoint name includes # pipeline name for disambiguation, so even a pair of pipelines with a @@ -52,8 +54,9 @@ def test_checkpoints_are_pipeline_unique(tmpdir): # When not setting start/stop parameters and beginning with no checkpoint # files in place, each pipeline generates its full set of checkpoint files. expected_checkpoints = align_reads_expected | call_peaks_expected - observed_checkpoints = set(fetch_checkpoint_files(align_reads)) | \ - set(fetch_checkpoint_files(call_peaks)) + observed_checkpoints = set(fetch_checkpoint_files(align_reads)) | set( + fetch_checkpoint_files(call_peaks) + ) # Verify satisfaction of expectation. try: @@ -68,9 +71,8 @@ def test_checkpoints_are_pipeline_unique(tmpdir): raise - def test_pipeline_checkpoint_respect_sensitivity_and_specificity(tmpdir): - """ Pipeline respects only its own checkpoint(s) for stage skipping. """ + """Pipeline respects only its own checkpoint(s) for stage skipping.""" # Note: conceptually, this is more of an effect- or outcome-based test # of the checkpointing system with respect to stage skipping. @@ -80,22 +82,18 @@ def test_pipeline_checkpoint_respect_sensitivity_and_specificity(tmpdir): align_reads_stage_names = [s.name for s in align_reads.stages()] call_peaks_stage_names = [s.name for s in call_peaks.stages()] - assert {"align_reads"} == \ - set(align_reads_stage_names) & set(call_peaks_stage_names) + assert {"align_reads"} == set(align_reads_stage_names) & set(call_peaks_stage_names) # Set up the checkpoints for the read alignment pipeline by allowing it # to execute once. 
align_reads.run() - assert os.path.isfile(checkpoint_filepath( - "align_reads", align_reads.manager)) - peaks_align_check_fpath = \ - checkpoint_filepath("align_reads", call_peaks.manager) + assert os.path.isfile(checkpoint_filepath("align_reads", align_reads.manager)) + peaks_align_check_fpath = checkpoint_filepath("align_reads", call_peaks.manager) assert not os.path.isfile(peaks_align_check_fpath) call_peaks.run() exp_lines = [func + os.linesep for func in call_peaks_stage_names] - call_peaks_outpath = os.path.join( - call_peaks.outfolder, call_peaks.name_output_file) - with open(call_peaks_outpath, 'r') as f: + call_peaks_outpath = os.path.join(call_peaks.outfolder, call_peaks.name_output_file) + with open(call_peaks_outpath, "r") as f: obs_lines = f.readlines() assert exp_lines == obs_lines diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index ec774971..295a885b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,43 +1,54 @@ """ Tests for the Pipeline data type """ -from functools import partial import glob import os +from functools import partial + import pytest + from pypiper import Pipeline from pypiper.manager import COMPLETE_FLAG, PAUSE_FLAG, RUN_FLAG -from pypiper.pipeline import \ - checkpoint_filepath, IllegalPipelineDefinitionError, \ - IllegalPipelineExecutionError, UnknownPipelineStageError +from pypiper.pipeline import ( + IllegalPipelineDefinitionError, + IllegalPipelineExecutionError, + UnknownPipelineStageError, + checkpoint_filepath, +) from pypiper.stage import Stage -from pypiper.utils import \ - flag_name, pipeline_filepath, checkpoint_filename, translate_stage_name +from pypiper.utils import ( + checkpoint_filename, + flag_name, + pipeline_filepath, + translate_stage_name, +) +from tests.conftest import ( + CONTENTS, + FILE1_NAME, + FILE_TEXT_PAIRS, + FILENAMES, + OUTPUT_SUFFIX, + TEST_PIPE_NAME, + write_file1, + write_file2, + write_file3, +) from tests.helpers import named_param -from tests.conftest import \ - write_file1, write_file2, write_file3, \ - CONTENTS, FILENAMES, FILE1_NAME, FILE_TEXT_PAIRS, \ - OUTPUT_SUFFIX, TEST_PIPE_NAME - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - BASIC_ACTIONS = [write_file1, write_file2, write_file3] STAGE_SPECS = ["stage", "name", "function"] - def pytest_generate_tests(metafunc): - """ Dynamic creation and parameterization of cases in this module. """ - if "test_type" in metafunc.fixturenames and \ - metafunc.cls == MostBasicPipelineTests: + """Dynamic creation and parameterization of cases in this module.""" + if "test_type" in metafunc.fixturenames and metafunc.cls == MostBasicPipelineTests: metafunc.parametrize( - argnames="test_type", - argvalues=["effects", "stage_labels", - "checkpoints", "pipe_flag"]) - + argnames="test_type", + argvalues=["effects", "stage_labels", "checkpoints", "pipe_flag"], + ) @pytest.fixture @@ -61,7 +72,6 @@ def stage(request): return _parse_stage(s, spec_type) - def _parse_stage(s, spec_type): """ Do a type transformation on a Stage function. @@ -81,42 +91,37 @@ def _parse_stage(s, spec_type): return s - class EmptyStagesPipeline(Pipeline): - """ Illegal (via empty stages) Pipeline definition. 
""" + """Illegal (via empty stages) Pipeline definition.""" def __init__(self, manager): - super(EmptyStagesPipeline, self).__init__( - TEST_PIPE_NAME, manager=manager) + super(EmptyStagesPipeline, self).__init__(TEST_PIPE_NAME, manager=manager) def stages(self): return [] class NameCollisionPipeline(Pipeline): - """ Illegal (via empty stages) Pipeline definition. """ + """Illegal (via empty stages) Pipeline definition.""" def __init__(self, manager): - super(NameCollisionPipeline, self).__init__( - TEST_PIPE_NAME, manager=manager) + super(NameCollisionPipeline, self).__init__(TEST_PIPE_NAME, manager=manager) def stages(self): name = "write file1" - return [("write file1", write_file1), - (translate_stage_name(name), write_file1)] - + return [("write file1", write_file1), (translate_stage_name(name), write_file1)] class RunPipelineCornerCaseTests: - """ Tests for exceptional cases of pipeline execution. """ - + """Tests for exceptional cases of pipeline execution.""" @named_param(argnames="point", argvalues=BASIC_ACTIONS) @named_param(argnames="spec_type", argvalues=STAGE_SPECS) @named_param(argnames="inclusive", argvalues=[False, True]) def test_start_point_equals_stop( - self, dummy_pipe, point, spec_type, stage, inclusive): - """ Start=stop is only permitted if stop should be run. """ + self, dummy_pipe, point, spec_type, stage, inclusive + ): + """Start=stop is only permitted if stop should be run.""" _assert_pipeline_initialization(dummy_pipe) @@ -130,80 +135,83 @@ def test_start_point_equals_stop( with pytest.raises(IllegalPipelineExecutionError): dummy_pipe.run(start_point=stage, stop_before=stage) - @pytest.mark.parametrize( - argnames=["start_point", "stop"], - argvalues=[(write_file2, write_file1), - (write_file3, write_file2), - (write_file3, write_file1)]) + argnames=["start_point", "stop"], + argvalues=[ + (write_file2, write_file1), + (write_file3, write_file2), + (write_file3, write_file1), + ], + ) @pytest.mark.parametrize(argnames="spec_type", argvalues=STAGE_SPECS) @pytest.mark.parametrize( - argnames="stop_type", argvalues=["stop_before", "stop_after"]) + argnames="stop_type", argvalues=["stop_before", "stop_after"] + ) def test_start_point_after_stop( - self, dummy_pipe, start_point, stop, stop_type, spec_type): - """ Regardless of specification type, start > stop is prohibited. """ + self, dummy_pipe, start_point, stop, stop_type, spec_type + ): + """Regardless of specification type, start > stop is prohibited.""" start_point = _parse_stage(start_point, spec_type) stop = _parse_stage(stop, spec_type) with pytest.raises(IllegalPipelineExecutionError): dummy_pipe.run(**{"start_point": start_point, stop_type: stop}) - @named_param( - argnames="undefined_stage", - argvalues=["unsupported-pipeline-stage", "unknown_phase"]) - @named_param(argnames="stage_point", - argvalues=["start_point", "stop_before", "stop_after"]) + argnames="undefined_stage", + argvalues=["unsupported-pipeline-stage", "unknown_phase"], + ) + @named_param( + argnames="stage_point", argvalues=["start_point", "stop_before", "stop_after"] + ) def test_unknown_stage(self, dummy_pipe, undefined_stage, stage_point): - """ Start specification must be of known stage name. 
""" + """Start specification must be of known stage name.""" with pytest.raises(UnknownPipelineStageError): dummy_pipe.run(**{stage_point: undefined_stage}) - @named_param(argnames="stop_before", argvalues=BASIC_ACTIONS) @named_param(argnames="stop_after", argvalues=BASIC_ACTIONS) @named_param(argnames="spec_type", argvalues=STAGE_SPECS) def test_stop_before_and_stop_after( - self, dummy_pipe, stop_before, stop_after, spec_type): - """ Inclusive and exclusive stop cannot both be provided. """ + self, dummy_pipe, stop_before, stop_after, spec_type + ): + """Inclusive and exclusive stop cannot both be provided.""" inclusive_stop = _parse_stage(stop_after, spec_type) exclusive_stop = _parse_stage(stop_before, spec_type) kwargs = {"stop_before": exclusive_stop, "stop_after": inclusive_stop} with pytest.raises(IllegalPipelineExecutionError): dummy_pipe.run(**kwargs) - def test_empty_stages_is_prohibited(self, pl_mgr): - """ Pipeline must have non-empty stages """ + """Pipeline must have non-empty stages""" with pytest.raises(IllegalPipelineDefinitionError): EmptyStagesPipeline(manager=pl_mgr) - def test_stage_name_collision_is_prohibited(self, pl_mgr): - """ Each stage needs unique translation, used for checkpoint file. """ + """Each stage needs unique translation, used for checkpoint file.""" with pytest.raises(IllegalPipelineDefinitionError): NameCollisionPipeline(manager=pl_mgr) - class MostBasicPipelineTests: - """ Test pipeline defined with notion of 'absolute minimum' config. """ - + """Test pipeline defined with notion of 'absolute minimum' config.""" def test_runs_through_full(self, dummy_pipe, test_type): - """ The entire basic pipeline should execute. """ + """The entire basic pipeline should execute.""" # Start with clean output folder. _assert_pipeline_initialization(dummy_pipe) # Make the call under test. dummy_pipe.run(start_point=None, stop_before=None, stop_after=None) - + if test_type == "effects": # We're interested in existence and content of targets. exp_files, _ = zip(*FILE_TEXT_PAIRS) _assert_output(dummy_pipe, exp_files) - fpath_text_pairs = [(pipeline_filepath(dummy_pipe, fname), content) - for fname, content in FILE_TEXT_PAIRS] + fpath_text_pairs = [ + (pipeline_filepath(dummy_pipe, fname), content) + for fname, content in FILE_TEXT_PAIRS + ] for fpath, content in fpath_text_pairs: _assert_expected_content(fpath, content) @@ -214,8 +222,11 @@ def test_runs_through_full(self, dummy_pipe, test_type): try: assert os.path.isfile(chkpt_fpath) except AssertionError: - print("Stage '{}' file doesn't exist: '{}'".format( - stage.name, chkpt_fpath)) + print( + "Stage '{}' file doesn't exist: '{}'".format( + stage.name, chkpt_fpath + ) + ) raise elif test_type == "stage_labels": @@ -229,28 +240,26 @@ def test_runs_through_full(self, dummy_pipe, test_type): else: raise ValueError("Unknown test type: {}".format(test_type)) - def test_skip_completed(self, dummy_pipe, test_type): - """ Pre-completed stage(s) are skipped. """ + """Pre-completed stage(s) are skipped.""" _assert_pipeline_initialization(dummy_pipe) first_stage = dummy_pipe.stages()[0] first_stage_chkpt_fpath = checkpoint_filepath(first_stage, dummy_pipe) - open(first_stage_chkpt_fpath, 'w').close() + open(first_stage_chkpt_fpath, "w").close() assert os.path.isfile(first_stage_chkpt_fpath) exp_skips = [first_stage] exp_execs = dummy_pipe.stages()[1:] # This should neither exist nor be created. 
- first_stage_outfile = pipeline_filepath( - dummy_pipe.manager, filename=FILE1_NAME) + first_stage_outfile = pipeline_filepath(dummy_pipe.manager, filename=FILE1_NAME) assert not os.path.isfile(first_stage_outfile) - + # Do the action. dummy_pipe.run() - + if test_type == "effects": # We should not have generated the first stage's output file. # That notion is covered in the outfiles assertion. @@ -274,22 +283,21 @@ def test_skip_completed(self, dummy_pipe, test_type): else: raise ValueError("Unknown test type: '{}'".format(test_type)) - - @named_param(argnames="start_index", - argvalues=range(len(BASIC_ACTIONS) - 1)) - @named_param(argnames="start_spec_type", - argvalues=["stage", "function", "name"]) + @named_param(argnames="start_index", argvalues=range(len(BASIC_ACTIONS) - 1)) + @named_param(argnames="start_spec_type", argvalues=["stage", "function", "name"]) def test_execution_allows_specific_starting_point( - self, dummy_pipe, test_type, start_index, start_spec_type): - """ A pipeline may be started from an arbitrary checkpoint. """ + self, dummy_pipe, test_type, start_index, start_spec_type + ): + """A pipeline may be started from an arbitrary checkpoint.""" _assert_pipeline_initialization(dummy_pipe) s = _parse_stage(BASIC_ACTIONS[start_index], start_spec_type) dummy_pipe.run(start_point=s) if test_type == "effects": exp_files = FILENAMES[start_index:] _assert_output(dummy_pipe, exp_files) - fpaths = [pipeline_filepath(dummy_pipe.manager, filename=fn) - for fn in exp_files] + fpaths = [ + pipeline_filepath(dummy_pipe.manager, filename=fn) for fn in exp_files + ] for fp, content in zip(fpaths, CONTENTS[start_index:]): _assert_expected_content(fp, content) elif test_type == "checkpoints": @@ -298,17 +306,18 @@ def test_execution_allows_specific_starting_point( elif test_type == "stage_labels": # Ensure match between skipped and executed stage expectations # and observations. - _assert_stage_states(dummy_pipe, BASIC_ACTIONS[:start_index], - BASIC_ACTIONS[start_index:]) + _assert_stage_states( + dummy_pipe, BASIC_ACTIONS[:start_index], BASIC_ACTIONS[start_index:] + ) elif test_type == "pipe_flag": _assert_pipeline_completed(dummy_pipe) else: raise ValueError("Unknown test type: '{}'".format(test_type)) - def test_all_checkpoints_after_first_executed_are_overwritten( - self, dummy_pipe, test_type): - """ Potential for dependent results means execution is contiguous. """ + self, dummy_pipe, test_type + ): + """Potential for dependent results means execution is contiguous.""" # Start fresh. 
_assert_pipeline_initialization(dummy_pipe) @@ -317,7 +326,7 @@ def test_all_checkpoints_after_first_executed_are_overwritten( fpath_time_pairs = [] for s in BASIC_ACTIONS[1:]: check_fpath = checkpoint_filepath(s, dummy_pipe.manager) - open(check_fpath, 'w').close() + open(check_fpath, "w").close() fpath_time_pairs.append((check_fpath, os.path.getmtime(check_fpath))) assert os.path.isfile(check_fpath) @@ -334,19 +343,19 @@ def test_all_checkpoints_after_first_executed_are_overwritten( elif test_type == "checkpoints": _assert_checkpoints(dummy_pipe, BASIC_ACTIONS) elif test_type == "stage_labels": - _assert_stage_states(dummy_pipe, expected_skipped=[], - expected_executed=BASIC_ACTIONS) + _assert_stage_states( + dummy_pipe, expected_skipped=[], expected_executed=BASIC_ACTIONS + ) elif test_type == "pipe_flag": _assert_pipeline_completed(dummy_pipe) else: raise ValueError("Unknown test type: {}".format(test_type)) - @named_param(argnames="stop_index", argvalues=range(1, len(BASIC_ACTIONS))) @named_param(argnames="spec_type", argvalues=STAGE_SPECS) @named_param(argnames="stop_type", argvalues=["stop_before", "stop_after"]) def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): - """ A pipeline is capable of halting at/after a specified stage. """ + """A pipeline is capable of halting at/after a specified stage.""" # Negative control / pretest. _assert_pipeline_initialization(dummy_pipe) @@ -366,8 +375,9 @@ def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): if test_type == "effects": exp_files = FILENAMES[:stop_index] _assert_output(dummy_pipe, exp_files) - fpaths = [pipeline_filepath(dummy_pipe.manager, filename=fn) - for fn in exp_files] + fpaths = [ + pipeline_filepath(dummy_pipe.manager, filename=fn) for fn in exp_files + ] for fp, content in zip(fpaths, CONTENTS[:stop_index]): _assert_expected_content(fp, content) @@ -375,11 +385,12 @@ def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): _assert_checkpoints(dummy_pipe, BASIC_ACTIONS[:stop_index]) elif test_type == "stage_labels": _assert_stage_states( - dummy_pipe, expected_skipped=BASIC_ACTIONS[stop_index:], - expected_executed=BASIC_ACTIONS[:stop_index]) + dummy_pipe, + expected_skipped=BASIC_ACTIONS[stop_index:], + expected_executed=BASIC_ACTIONS[:stop_index], + ) elif test_type == "pipe_flag": - if (stop_index == len(BASIC_ACTIONS)) and \ - (stop_type == "stop_after"): + if (stop_index == len(BASIC_ACTIONS)) and (stop_type == "stop_after"): _assert_pipeline_completed(dummy_pipe) else: _assert_pipeline_halted(dummy_pipe) @@ -387,20 +398,20 @@ def test_stop(self, dummy_pipe, test_type, stop_index, spec_type, stop_type): raise ValueError("Unknown test type: '{}'".format(test_type)) - @named_param( - argnames="spec_type", - argvalues=["filename", "filepath", "stage", "stage_name"]) + argnames="spec_type", argvalues=["filename", "filepath", "stage", "stage_name"] +) @named_param(argnames="completed", argvalues=[False, True]) def test_stage_completion_determination(dummy_pipe, spec_type, completed): - """ Pipeline responds to variety of request forms of checkpoint status. """ + """Pipeline responds to variety of request forms of checkpoint status.""" # Allow dummy stage definition and determination of filename. 
def dummy_test_func(): pass chkpt_name = checkpoint_filename( - dummy_test_func.__name__, pipeline_name=dummy_pipe.name) + dummy_test_func.__name__, pipeline_name=dummy_pipe.name + ) chkpt_fpath = checkpoint_filepath(chkpt_name, dummy_pipe.manager) # Determine how to request the checkpoint completion status. @@ -417,15 +428,18 @@ def dummy_test_func(): # Touch the checkpoint file iff we're positively testing completion. if completed: - open(chkpt_fpath, 'w').close() + open(chkpt_fpath, "w").close() # Check the completion status for concordance with expectation. # Print a bit of info to inform hypotheses about the source of a # hypothetical test error/failure. outfolder_contents = os.listdir(dummy_pipe.outfolder) print("Pipeline outfolder contents: {}".format(outfolder_contents)) - print("Contents as pipeline files: {}".format( - [pipeline_filepath(dummy_pipe.manager, f) for f in outfolder_contents])) + print( + "Contents as pipeline files: {}".format( + [pipeline_filepath(dummy_pipe.manager, f) for f in outfolder_contents] + ) + ) print("Checking completion status: {} ({})".format(s, type(s))) observed_completion = dummy_pipe.completed_stage(s) if completed: @@ -434,7 +448,6 @@ def dummy_test_func(): assert not observed_completion - def _assert_checkpoints(pl, exp_stages): """ Assert equivalence between expected and observed checkpoint files. @@ -449,7 +462,6 @@ def _assert_checkpoints(pl, exp_stages): assert set(exp_fpaths) == set(obs_fpaths) - def _assert_expected_content(fpath, content): """ Determine whether a filepath has the expected content. @@ -460,12 +472,11 @@ def _assert_expected_content(fpath, content): """ assert os.path.isfile(fpath) exp_content = content.split(os.linesep) - with open(fpath, 'r') as f: + with open(fpath, "r") as f: obs_content = [l.rstrip(os.linesep) for l in f.readlines()] assert exp_content == obs_content - def _assert_output(pl, expected_filenames): """ Assert equivalence--with respect to presence only--between expected @@ -476,20 +487,21 @@ def _assert_output(pl, expected_filenames): :param Iterable[str] expected_filenames: :return: """ - obs_outfiles = glob.glob(pipeline_filepath( - pl.manager, "*{}".format(OUTPUT_SUFFIX))) + obs_outfiles = glob.glob(pipeline_filepath(pl.manager, "*{}".format(OUTPUT_SUFFIX))) assert len(expected_filenames) == len(obs_outfiles) expected_filepaths = [] for fname in expected_filenames: - fpath = fname if os.path.isabs(fname) else \ - pipeline_filepath(pl.manager, filename=fname) + fpath = ( + fname + if os.path.isabs(fname) + else pipeline_filepath(pl.manager, filename=fname) + ) expected_filepaths.append(fpath) assert set(expected_filepaths) == set(obs_outfiles) - def _assert_pipeline_status(pl, flag): - """ Assert, based on flag file presence, that a pipeline's completed. """ + """Assert, based on flag file presence, that a pipeline's completed.""" flags = glob.glob(pipeline_filepath(pl.manager, filename=flag_name("*"))) assert 1 == len(flags) exp_flag = pipeline_filepath(pl, suffix="_" + flag_name(flag)) @@ -500,13 +512,10 @@ def _assert_pipeline_status(pl, flag): raise - -_assert_pipeline_completed = partial( - _assert_pipeline_status, flag=COMPLETE_FLAG) +_assert_pipeline_completed = partial(_assert_pipeline_status, flag=COMPLETE_FLAG) _assert_pipeline_halted = partial(_assert_pipeline_status, flag=PAUSE_FLAG) - def _assert_pipeline_initialization(pl): """ Assert that a test case begins with output folder in expected state. 
@@ -514,21 +523,21 @@ def _assert_pipeline_initialization(pl): :param pypiper.Pipeline pl: Pipeline instance for test case. """ # TODO: implement. - suffices = {"_commands.sh", "_profile.tsv", - "_{}".format(flag_name(RUN_FLAG))} - exp_init_contents = \ - [pipeline_filepath(pl.manager, suffix=s) for s in suffices] - obs_init_contents = [pipeline_filepath(pl.manager, filename=n) - for n in os.listdir(pl.outfolder)] + suffices = {"_commands.sh", "_profile.tsv", "_{}".format(flag_name(RUN_FLAG))} + exp_init_contents = [pipeline_filepath(pl.manager, suffix=s) for s in suffices] + obs_init_contents = [ + pipeline_filepath(pl.manager, filename=n) for n in os.listdir(pl.outfolder) + ] assert len(exp_init_contents) == len(obs_init_contents) assert set(exp_init_contents) == set(obs_init_contents) - def _assert_stage_states(pl, expected_skipped, expected_executed): - """ Assert equivalence between expected and observed stage states. """ + """Assert equivalence between expected and observed stage states.""" + def _ensure_stage(s): return s if isinstance(s, Stage) else Stage(s) + expected_skipped = [_ensure_stage(s) for s in expected_skipped] expected_executed = [_ensure_stage(s) for s in expected_executed] assert expected_skipped == pl.skipped diff --git a/tests/pipeline/test_pipeline_checkpoint.py b/tests/pipeline/test_pipeline_checkpoint.py index 9bbadab8..1267143f 100644 --- a/tests/pipeline/test_pipeline_checkpoint.py +++ b/tests/pipeline/test_pipeline_checkpoint.py @@ -5,17 +5,17 @@ from pypiper.utils import checkpoint_filepath from tests.helpers import fetch_checkpoint_files, named_param -from .conftest import get_pipeline +from .conftest import get_pipeline __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( - pl_name, tmpdir): - """ Pipeline can skip past its stage(s) for which checkpoint exists. """ + pl_name, tmpdir +): + """Pipeline can skip past its stage(s) for which checkpoint exists.""" # Create the pipeline. pipeline = get_pipeline(pl_name, tmpdir.strpath) @@ -27,8 +27,9 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( pipeline.run() # Verify that we created each of the checkpoints. - expected = [checkpoint_filepath(f.__name__, pipeline.manager) - for f in pipeline.functions] + expected = [ + checkpoint_filepath(f.__name__, pipeline.manager) for f in pipeline.functions + ] observed = fetch_checkpoint_files(pipeline.manager) assert set(expected) == set(observed) @@ -37,8 +38,7 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( # Remove the checkpoint for the final stage. last_aligner_stage = pipeline.functions[-1] - last_aligner_checkfile = checkpoint_filepath( - last_aligner_stage, pipeline.manager) + last_aligner_checkfile = checkpoint_filepath(last_aligner_stage, pipeline.manager) os.unlink(last_aligner_checkfile) # Verify removal of final stage checkpoint file. @@ -59,9 +59,11 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( observed = fetch_checkpoint_files(pipeline.manager) exp = set(expected) obs = set(observed) - assert set(expected) == set(observed), \ - "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format( - exp - obs, exp & obs, obs - exp) + assert set(expected) == set( + observed + ), "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format( + exp - obs, exp & obs, obs - exp + ) # Verify the we didn't recreate the checkpoint file for each skipped stage. 
for f in expected[:-1]: @@ -71,15 +73,15 @@ def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective( # Verify the we did in fact recreate the checkpoint file for the stage # that was rerun. - assert os.path.getmtime(last_aligner_checkfile) > \ - timestamps[last_aligner_checkfile], \ - "Recreated checkpoint file ('{}') should be newer than original".\ - format(last_aligner_checkfile) - + assert ( + os.path.getmtime(last_aligner_checkfile) > timestamps[last_aligner_checkfile] + ), "Recreated checkpoint file ('{}') should be newer than original".format( + last_aligner_checkfile + ) def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir): - """ The pipeline skips execution of stages with extant checkpoint. """ + """The pipeline skips execution of stages with extant checkpoint.""" # Create the pipeline, then check creation of output file. pipeline = get_pipeline(pl_name, tmpdir.strpath) @@ -89,15 +91,16 @@ def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir): assert os.path.isfile(output_file) # Validate pipeline effects (output file content). - with open(output_file, 'r') as f: + with open(output_file, "r") as f: lines = f.readlines() assert [s.name + os.linesep for s in pipeline.stages()] == lines # Verify presence of checkpoint files to support our expectation about # which stages should be skipped and which should be run during the second # time through the pipeline's execution. - exp_cp_fpaths = set(checkpoint_filepath(s.name, pipeline.manager) - for s in pipeline.stages()) + exp_cp_fpaths = set( + checkpoint_filepath(s.name, pipeline.manager) for s in pipeline.stages() + ) assert exp_cp_fpaths == set(fetch_checkpoint_files(pipeline.manager)) final_stage = pipeline.stages()[-1] final_stage_fpath = checkpoint_filepath(final_stage.name, pipeline.manager) @@ -105,24 +108,26 @@ def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir): # Verify the effect of the second execution of the pipeline. pipeline.run() - with open(output_file, 'r') as f: + with open(output_file, "r") as f: lines = f.readlines() assert [final_stage.name + os.linesep] == lines @named_param("overwrite", [False, True]) def test_pipeline_reruns_downstream_stages_according_to_parameterization( - overwrite, pl_name, tmpdir): - """ Pipeline overwrites downstream stages unless configured otherwise. """ + overwrite, pl_name, tmpdir +): + """Pipeline overwrites downstream stages unless configured otherwise.""" pl = get_pipeline(pl_name, tmpdir.strpath) # Create checkpoint file for each stage. stage_names = [s.name for s in pl.stages()] - assert 1 < len(stage_names), \ - "Need pipeline with at least two stages to run this test." + assert 1 < len( + stage_names + ), "Need pipeline with at least two stages to run this test." for s_name in stage_names: - open(checkpoint_filepath(s_name, pl.manager), 'w').close() + open(checkpoint_filepath(s_name, pl.manager), "w").close() # Remove the checkpoint file for the penultimate stage. 
penultimate_stage = stage_names[-2] @@ -140,6 +145,6 @@ def test_pipeline_reruns_downstream_stages_according_to_parameterization( exp_stages.append(stage_names[-1]) exp_lines = [func + os.linesep for func in stage_names[-2:]] outpath = os.path.join(pl.outfolder, pl.name_output_file) - with open(outpath, 'r') as f: + with open(outpath, "r") as f: obs_lines = f.readlines() assert exp_lines == obs_lines diff --git a/tests/pipeline/test_pipeline_constructor.py b/tests/pipeline/test_pipeline_constructor.py index aba414fd..5de22bf6 100644 --- a/tests/pipeline/test_pipeline_constructor.py +++ b/tests/pipeline/test_pipeline_constructor.py @@ -1,17 +1,16 @@ """ Tests for construction of a Pipeline """ import pytest -from pypiper import Pipeline, PipelineManager, Stage -from tests.helpers import assert_equal_dirpath, named_param, SafeTestPipeline +from pypiper import Pipeline, PipelineManager, Stage +from tests.helpers import SafeTestPipeline, assert_equal_dirpath, named_param __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_pipeline_requires_stages_definition(tmpdir): - """ To create a pipeline, define stages (execution steps). """ + """To create a pipeline, define stages (execution steps).""" class NoStagesPipeline(SafeTestPipeline): pass @@ -25,65 +24,54 @@ class NoStagesPipeline(SafeTestPipeline): _MinimalPipeline(name=name, outfolder=tmpdir.strpath) - class JustManagerArgument: - """ A pipeline can be created with just a manager argument. """ - + """A pipeline can be created with just a manager argument.""" NAME_HOOK = "pl_mgr_name" - @pytest.fixture def pl_mgr(self, request, get_pipe_manager): - """ Provide each of this class's test cases with pipeline manager. """ + """Provide each of this class's test cases with pipeline manager.""" if self.NAME_HOOK in request.fixturenames: name = request.getfixturevalue(self.NAME_HOOK) else: name = "test-pipe" return get_pipe_manager(name=name) - - @named_param( - argnames=NAME_HOOK, argvalues=["arbitrary-pipeline", "DummyPipe"]) + @named_param(argnames=NAME_HOOK, argvalues=["arbitrary-pipeline", "DummyPipe"]) def test_pipeline_adopts_manager_name(self, pl_mgr_name, pl_mgr): - """ If given just a manager, a pipeline uses the manager name. """ + """If given just a manager, a pipeline uses the manager name.""" pl = Pipeline(manager=pl_mgr) assert pl_mgr_name == pl_mgr.name assert pl_mgr_name == pl.name - def test_pipeline_adopts_manager_output_folder(self, pl_mgr): - """ Pipeline uses manager output folder if given just manager. """ + """Pipeline uses manager output folder if given just manager.""" pl = Pipeline(manager=pl_mgr) assert pl_mgr.outfolder == pl.outfolder - class MinimalArgumentsWithoutManagerTests: - """ Tests for pipeline constructor argument provision without manager. """ - + """Tests for pipeline constructor argument provision without manager.""" def test_pipeline_creates_manager(self, tmpdir): - """ If not passed a pipeline manager, a pipeline creates one. """ + """If not passed a pipeline manager, a pipeline creates one.""" empty = _MinimalPipeline(name="minimal", outfolder=tmpdir.strpath) assert isinstance(empty.manager, PipelineManager) - @named_param("pipe_name", ["test-pipe", "DummyPipeline"]) def test_manager_adopts_pipeline_name(self, pipe_name, tmpdir): - """ Autogenerated pipeline manager uses pipeline's name. 
""" + """Autogenerated pipeline manager uses pipeline's name.""" pl = _MinimalPipeline(name=pipe_name, outfolder=tmpdir.strpath) assert pipe_name == pl.name assert pl.name == pl.manager.name - def test_manager_adopts_pipeline_output_folder(self, tmpdir): - """ Autogenerated pipeline manager uses pipeline's output folder. """ + """Autogenerated pipeline manager uses pipeline's output folder.""" pl = _MinimalPipeline(name="test-pipe", outfolder=tmpdir.strpath) assert_equal_dirpath(tmpdir.strpath, pl.outfolder) - class ConceptuallyOverlappingArgumentsTests: """ Test cases in which pipeline's argument space is overspecified. @@ -100,19 +88,15 @@ class ConceptuallyOverlappingArgumentsTests: """ - - def test_same_name_for_manager_and_pipeline( - self, tmpdir, get_pipe_manager): - """ Pipeline name and manager with matching name is unproblematic. """ + def test_same_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager): + """Pipeline name and manager with matching name is unproblematic.""" name = "test-pipe" pm = get_pipe_manager(name=name, outfolder=tmpdir.strpath) pl = _MinimalPipeline(name=name, manager=pm) assert name == pl.manager.name - - def test_different_name_for_manager_and_pipeline( - self, tmpdir, get_pipe_manager): - """ If given, pipeline favors its own name over manager's. """ + def test_different_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager): + """If given, pipeline favors its own name over manager's.""" manager_name = "manager" pipeline_name = "pipeline" pm = get_pipe_manager(name=manager_name, outfolder=tmpdir.strpath) @@ -120,19 +104,17 @@ def test_different_name_for_manager_and_pipeline( assert pipeline_name == pl.name assert manager_name == pl.manager.name - - @named_param( - "output_folder", argvalues=["test-output", "testing-output-folder"]) + @named_param("output_folder", argvalues=["test-output", "testing-output-folder"]) def test_pipeline_ignores_outfolder_if_manager_is_passed( - self, output_folder, tmpdir, get_pipe_manager): - """ Manager's output folder trumps explicit output folder. """ + self, output_folder, tmpdir, get_pipe_manager + ): + """Manager's output folder trumps explicit output folder.""" pm = get_pipe_manager(name="test-pipe", outfolder=tmpdir.strpath) pl = _MinimalPipeline(manager=pm, outfolder=output_folder) assert_equal_dirpath(tmpdir.strpath, pl.outfolder) - def test_name_outfolder_and_manager(self, tmpdir, get_pipe_manager): - """ Tests provision of all three primary pipeline arguments. """ + """Tests provision of all three primary pipeline arguments.""" name = "test-pipe" pm = get_pipe_manager(name=name, outfolder=tmpdir.strpath) pl = _MinimalPipeline(name=name, manager=pm, outfolder=tmpdir.strpath) @@ -141,58 +123,53 @@ def test_name_outfolder_and_manager(self, tmpdir, get_pipe_manager): assert pm == pl.manager - def test_pipeline_requires_either_manager_or_outfolder(): - """ Pipeline must be passed pipeline manager or output folder. """ + """Pipeline must be passed pipeline manager or output folder.""" with pytest.raises(TypeError): _MinimalPipeline() - def test_empty_pipeline_manager_name_and_no_explicit_pipeline_name( - tmpdir, get_pipe_manager): - """ If no name's passed to pipeline, the manager must have valid name. 
""" + tmpdir, get_pipe_manager +): + """If no name's passed to pipeline, the manager must have valid name.""" pm = get_pipe_manager(name="", outfolder=tmpdir.strpath) with pytest.raises(ValueError): _MinimalPipeline(manager=pm) - class AnonymousFunctionStageTests: - """ Tests for anonymous function as a pipeline stage. """ - + """Tests for anonymous function as a pipeline stage.""" def test_anonymous_stage_without_name_is_prohibited(self, tmpdir): - """ Anonymous function as Stage must be paired with name. """ + """Anonymous function as Stage must be paired with name.""" with pytest.raises(TypeError): _AnonymousStageWithoutNamePipeline( - name="test-pipe", outfolder=tmpdir.strpath) - + name="test-pipe", outfolder=tmpdir.strpath + ) def test_anonymous_stage_with_name_is_permitted(self, tmpdir): - """ Anonymous function as Stage must be paired with name. """ - _AnonymousStageWithNamePipeline( - name="test-pipe", outfolder=tmpdir.strpath) - + """Anonymous function as Stage must be paired with name.""" + _AnonymousStageWithNamePipeline(name="test-pipe", outfolder=tmpdir.strpath) class _AnonymousStageWithoutNamePipeline(SafeTestPipeline): - """ Anonymous function as stage is prohibited unless paired with name. """ + """Anonymous function as stage is prohibited unless paired with name.""" + def stages(self): return [lambda: None] - class _AnonymousStageWithNamePipeline(SafeTestPipeline): - """ Anonymous function as Stage is allowed if wrapped with a name. """ + """Anonymous function as Stage is allowed if wrapped with a name.""" + def stages(self): return [("NullStage", lambda: None)] - @pytest.fixture def empty_pipeline(request): - """ Provide test case with minimal pipeline instance. """ + """Provide test case with minimal pipeline instance.""" if "pipe_name" in request.fixturenames: name = request.getfixturevalue("pipe_name") else: @@ -200,15 +177,13 @@ def empty_pipeline(request): return _MinimalPipeline(name) - class _MinimalPipeline(SafeTestPipeline): - """ Minimal pipeline declaration. """ + """Minimal pipeline declaration.""" def stages(self): - """ Sham stages definition. """ + """Sham stages definition.""" return [_do_nothing] - def _do_nothing(): return diff --git a/tests/pipeline_manager/test_halt.py b/tests/pipeline_manager/test_halt.py index 6be733a3..824ecc31 100644 --- a/tests/pipeline_manager/test_halt.py +++ b/tests/pipeline_manager/test_halt.py @@ -1,19 +1,19 @@ """ Tests for effects of pipeline manager's halt() function. """ import os + import pytest + from pypiper.exceptions import PipelineHalt from pypiper.flags import COMPLETE_FLAG, PAUSE_FLAG from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_halt_state(get_pipe_manager): - """ Requesting a halt alters manager state. """ + """Requesting a halt alters manager state.""" pm = get_pipe_manager(name="test-pipe") assert pm._active pm.halt(raise_error=False) @@ -21,9 +21,8 @@ def test_halt_state(get_pipe_manager): assert not pm._active - def test_halt_file(get_pipe_manager): - """ Requesting a halt produces a particular flag file. 
""" + """Requesting a halt produces a particular flag file.""" pm = get_pipe_manager(name="TestPM") path_halt_file = pm._flag_file_path(PAUSE_FLAG) assert not os.path.isfile(path_halt_file) @@ -31,10 +30,9 @@ def test_halt_file(get_pipe_manager): assert os.path.isfile(path_halt_file) - @named_param("raise_error", [False, True, None]) def test_halt_exceptionality(get_pipe_manager, raise_error): - """ Halting is conditionally exceptional """ + """Halting is conditionally exceptional""" pm = get_pipe_manager(name="halt-error") if raise_error is None: # Default is exceptional. @@ -47,12 +45,10 @@ def test_halt_exceptionality(get_pipe_manager, raise_error): pm.halt(raise_error=False) - @named_param("raise_error", [False, True]) @named_param("test_type", argvalues=["halt_flag", "complete_flag"]) -def test_halt_status_supersedes_completed( - get_pipe_manager, raise_error, test_type): - """ Halting pipeline replaces completed flag with halt flag. """ +def test_halt_status_supersedes_completed(get_pipe_manager, raise_error, test_type): + """Halting pipeline replaces completed flag with halt flag.""" # Create manager and completion flag. pm = get_pipe_manager(name="halt-status-flag") diff --git a/tests/pipeline_manager/test_manager_constructor.py b/tests/pipeline_manager/test_manager_constructor.py index 2ff2a9a8..0792bf1f 100644 --- a/tests/pipeline_manager/test_manager_constructor.py +++ b/tests/pipeline_manager/test_manager_constructor.py @@ -1,47 +1,40 @@ """ Test effects of construction of a pipeline manager. """ import argparse + import pytest + from pypiper.manager import CHECKPOINT_SPECIFICATIONS from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def pytest_generate_tests(metafunc): - """ Dynamic test case generation for this module's test cases. """ + """Dynamic test case generation for this module's test cases.""" if "spec_type" in metafunc.fixturenames: - metafunc.parametrize( - argnames="spec_type", argvalues=["cmdl", "ctor"]) + metafunc.parametrize(argnames="spec_type", argvalues=["cmdl", "ctor"]) - -@named_param( - "checkpoint_type", argvalues=["curr_checkpoint", "prev_checkpoint"]) -def test_manager_starts_in_null_checkpoint_state( - get_pipe_manager, checkpoint_type): - """ A pipeline manager begins with null checkpoint states. """ +@named_param("checkpoint_type", argvalues=["curr_checkpoint", "prev_checkpoint"]) +def test_manager_starts_in_null_checkpoint_state(get_pipe_manager, checkpoint_type): + """A pipeline manager begins with null checkpoint states.""" pm = get_pipe_manager(name="ctor-checkpoint-state") assert getattr(pm, checkpoint_type) is None - class ManagerConstructorCheckpointSpecificationTests: - """ Tests for manager's constructor's ability to parse and set - checkpoint specifications, which can determine aspects of control flow. """ - + """Tests for manager's constructor's ability to parse and set + checkpoint specifications, which can determine aspects of control flow.""" def test_no_checkpoint_specifications(self, get_pipe_manager): - """ A manager may be constructed without any checkpoint provision. """ + """A manager may be constructed without any checkpoint provision.""" get_pipe_manager(name="test-pipe") - @named_param("start_point", ["filter_reads", "align_reads"]) def test_just_start(self, get_pipe_manager, spec_type, start_point): - """ Starting point may be set from command-line or ctor keyword. 
""" + """Starting point may be set from command-line or ctor keyword.""" spec_data = {"start_point": start_point} if spec_type == "cmdl": kwargs = {"args": argparse.Namespace(**spec_data)} @@ -50,12 +43,10 @@ def test_just_start(self, get_pipe_manager, spec_type, start_point): pm = get_pipe_manager(name="start-test", **kwargs) assert start_point == pm.start_point - @named_param("stop_type", ["stop_before", "stop_after"]) @named_param("stop_point", ["align_reads", "call_peaks"]) - def test_just_stop(self, get_pipe_manager, - spec_type, stop_type, stop_point): - """ Particular stopping type is set correctly. """ + def test_just_stop(self, get_pipe_manager, spec_type, stop_type, stop_point): + """Particular stopping type is set correctly.""" spec_data = {stop_type: stop_point} if spec_type == "cmdl": kwargs = {"args": argparse.Namespace(**spec_data)} @@ -64,13 +55,13 @@ def test_just_stop(self, get_pipe_manager, pm = get_pipe_manager(name="stop-test", **kwargs) assert stop_point == getattr(pm, stop_type) - @named_param("start_point", ["merge_input", "filter_reads"]) @named_param("stop_point", ["align_reads", "calc_stats"]) @named_param("stop_type", ["stop_before", "stop_after"]) - def test_start_and_stop(self, get_pipe_manager, spec_type, - stop_type, start_point, stop_point): - """ Specifying both start and stop works just fine. """ + def test_start_and_stop( + self, get_pipe_manager, spec_type, stop_type, start_point, stop_point + ): + """Specifying both start and stop works just fine.""" spec_data = {"start_point": start_point, stop_type: stop_point} if spec_type == "cmdl": kwargs = {"args": argparse.Namespace(**spec_data)} @@ -80,15 +71,19 @@ def test_start_and_stop(self, get_pipe_manager, spec_type, assert start_point == pm.start_point assert stop_point == getattr(pm, stop_type) - @named_param("stop_before", ["align_reads", "call_peaks"]) @named_param("stop_after", ["fastqc", "align_reads"]) @named_param("stop_before_type", ["cmdl", "ctor"]) @named_param("stop_after_type", ["cmdl", "ctor"]) def test_both_stop_modes_is_prohibited( - self, get_pipe_manager, stop_before_type, - stop_after_type, stop_before, stop_after): - """ Provision of both prospective and retrospective stop is bad. """ + self, + get_pipe_manager, + stop_before_type, + stop_after_type, + stop_before, + stop_after, + ): + """Provision of both prospective and retrospective stop is bad.""" raw_kwargs = {"stop_before": stop_before, "stop_after": stop_after} cmdl_kwargs = {} if stop_before_type == "cmdl": @@ -99,18 +94,25 @@ def test_both_stop_modes_is_prohibited( with pytest.raises(TypeError): get_pipe_manager(name="test-double-stop", args=args, **raw_kwargs) - @pytest.mark.parametrize( argnames=["start_point", "stop_point"], - argvalues=[("fastqc", "align_reads"), ("align_reads", "call_peaks")]) + argvalues=[("fastqc", "align_reads"), ("align_reads", "call_peaks")], + ) @pytest.mark.parametrize( argnames=["start_spec_type", "stop_spec_type"], - argvalues=[("cmdl", "ctor"), ("ctor", "cmdl")]) + argvalues=[("cmdl", "ctor"), ("ctor", "cmdl")], + ) @named_param("stop_type", ["stop_before", "stop_after"]) def test_complementary_specification_modes( - self, get_pipe_manager, start_spec_type, stop_spec_type, - stop_type, start_point, stop_point): - """ Command-line and keyword specifications can harmonize. 
""" + self, + get_pipe_manager, + start_spec_type, + stop_spec_type, + stop_type, + start_point, + stop_point, + ): + """Command-line and keyword specifications can harmonize.""" raw_kwargs = {"start_point": start_point, stop_type: stop_point} cmdl_kwargs = {} if start_spec_type == "cmdl": @@ -118,33 +120,40 @@ def test_complementary_specification_modes( if stop_spec_type == "cmdl": cmdl_kwargs[stop_type] = raw_kwargs.pop(stop_type) args = argparse.Namespace(**cmdl_kwargs) - pm = get_pipe_manager(name="complementary-test", - args=args, **raw_kwargs) + pm = get_pipe_manager(name="complementary-test", args=args, **raw_kwargs) assert start_point == pm.start_point assert stop_point == getattr(pm, stop_type) - @named_param( "check_specs", - [["start_point"], ["stop_before"], ["stop_after"], - ["start_point", "stop_before"], ["start_point", "stop_after"]]) + [ + ["start_point"], + ["stop_before"], + ["stop_after"], + ["start_point", "stop_before"], + ["start_point", "stop_after"], + ], + ) def test_command_line_beats_constructor_keyword( - self, get_pipe_manager, check_specs): - """ Command-line specification is favored over constructor keyword. """ + self, get_pipe_manager, check_specs + ): + """Command-line specification is favored over constructor keyword.""" # Declare values to use for respective specification modes. - cmdl_values = {"start_point": "merge_input", - "stop_before": "call_peaks", - "stop_after": "align_reads"} - ctor_values = {"start_point": "fastqc", - "stop_before": "align_reads", - "stop_after": "filter_reads"} + cmdl_values = { + "start_point": "merge_input", + "stop_before": "call_peaks", + "stop_after": "align_reads", + } + ctor_values = { + "start_point": "fastqc", + "stop_before": "align_reads", + "stop_after": "filter_reads", + } # Create specifications based on current test case parameterization. - cmdl_kwargs ={cp_spec: cmdl_values[cp_spec] - for cp_spec in check_specs} - ctor_kwargs = {cp_spec: ctor_values[cp_spec] - for cp_spec in check_specs} + cmdl_kwargs = {cp_spec: cmdl_values[cp_spec] for cp_spec in check_specs} + ctor_kwargs = {cp_spec: ctor_values[cp_spec] for cp_spec in check_specs} args = argparse.Namespace(**cmdl_kwargs) # Build the pipeline manager. diff --git a/tests/pipeline_manager/test_manager_state.py b/tests/pipeline_manager/test_manager_state.py index fb86e9a7..0ae219e4 100644 --- a/tests/pipeline_manager/test_manager_state.py +++ b/tests/pipeline_manager/test_manager_state.py @@ -1,33 +1,33 @@ """ Tests related to pipeline manager state. """ import os + import pytest + from pypiper.utils import checkpoint_filepath, pipeline_filepath from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - def test_starts_running(get_pipe_manager): - """ A PipelineManager begins running during its construction. """ + """A PipelineManager begins running during its construction.""" pm = get_pipe_manager(name="TestPM") assert pm._active + # Parameters governing execution: # 1 -- checkpoint existence # 3 -- halt state (.halted) class ExecutionSkippingTests: - """ Tests for cases in which command execution should be skipped. """ - + """Tests for cases in which command execution should be skipped.""" @named_param("start_point", ["align_reads", "make_call"]) def test_skips_to_start(self, get_pipe_manager, start_point): - """ The pipeline manager can skip to a starting point. """ + """The pipeline manager can skip to a starting point.""" # Initialize the manager. 
pm = get_pipe_manager(name="StartTestPM", start_point=start_point) @@ -48,8 +48,11 @@ def test_skips_to_start(self, get_pipe_manager, start_point): os.makedirs(fastqc_folder) fastqc_zipfile = os.path.join(fastqc_folder, "qc.zip") fastqc_rawfile = os.path.join(fastqc_folder, "qc.txt") - cmds = ["fastqc", "touch {}".format(fastqc_rawfile), - "touch {}".format(fastqc_zipfile)] + cmds = [ + "fastqc", + "touch {}".format(fastqc_rawfile), + "touch {}".format(fastqc_zipfile), + ] pm.run(cmds, target=fastqc_zipfile) assert not os.path.isfile(fastqc_zipfile) assert not os.path.isfile(fastqc_rawfile) @@ -62,11 +65,9 @@ def test_skips_to_start(self, get_pipe_manager, start_point): pm.run(cmd, target=path_first_file) assert os.path.isfile(path_first_file) - @named_param("num_skips", argvalues=[1, 2, 3]) - def test_skips_execution_if_in_unstarted_state( - self, get_pipe_manager, num_skips): - """ Pipeline manager skips command execution if not in active state. """ + def test_skips_execution_if_in_unstarted_state(self, get_pipe_manager, num_skips): + """Pipeline manager skips command execution if not in active state.""" pm = get_pipe_manager(name="skip-execs") pm._active = False @@ -92,10 +93,9 @@ def test_skips_execution_if_in_unstarted_state( # We break the loop once we've made a call in active state. assert os.path.isfile(testfile) - @named_param("num_skips", argvalues=[1, 2, 3]) def test_respects_checkpoints(self, get_pipe_manager, num_skips): - """ Manager can skip pipeline to where it's not yet checkpointed. """ + """Manager can skip pipeline to where it's not yet checkpointed.""" pm = get_pipe_manager(name="respect-checkpoints") @@ -121,27 +121,22 @@ def test_respects_checkpoints(self, get_pipe_manager, num_skips): try: assert not os.path.isfile(outfile) except AssertionError: - print("Have run {} stage(s) of {} skip(s)". - format(i + 1, num_skips)) - print("Current manager checkpoint: {}". - format(pm.curr_checkpoint)) + print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips)) + print("Current manager checkpoint: {}".format(pm.curr_checkpoint)) raise else: # We should have created the output file. try: assert os.path.isfile(outfile) except AssertionError: - print("Have run {} stage(s) of {} skip(s)". - format(i + 1, num_skips)) - print("Current manager checkpoint: {}". - format(pm.curr_checkpoint)) + print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips)) + print("Current manager checkpoint: {}".format(pm.curr_checkpoint)) print("Active? {}".format(pm._active)) raise - @named_param("halt_index", [1, 2, 3]) def test_respects_halt(self, get_pipe_manager, halt_index): - """ The pipeline manager skips execution if it's in halted state. """ + """The pipeline manager skips execution if it's in halted state.""" pm = get_pipe_manager(name="respects-halt") targets = ["file{}.txt".format(i) for i in range(1, 5)] for i, t in enumerate(targets): diff --git a/tests/pipeline_manager/test_pipeline_manager.py b/tests/pipeline_manager/test_pipeline_manager.py index 3f5d1510..0017ab59 100755 --- a/tests/pipeline_manager/test_pipeline_manager.py +++ b/tests/pipeline_manager/test_pipeline_manager.py @@ -9,21 +9,18 @@ import unittest import pypiper -from pypiper.utils import pipeline_filepath from pypiper.exceptions import SubprocessError - +from pypiper.utils import pipeline_filepath __author__ = "Nathan Sheffield" __email__ = "nathan@code.databio.org" - class PipelineManagerTests(unittest.TestCase): - """ Tests for pypiper's PipelineManager. 
""" + """Tests for pypiper's PipelineManager.""" OUTFOLDER = "pipeline_output" - @classmethod def _clean(cls): for d in glob.glob(cls.OUTFOLDER + "*"): @@ -31,71 +28,67 @@ def _clean(cls): print("Removing " + d) shutil.rmtree(d) - def setUp(self): - """ Start each test case with two pipeline managers. """ + """Start each test case with two pipeline managers.""" print("Setting up...") # Create a fixture self.pp = pypiper.PipelineManager( - "sample_pipeline", outfolder=self.OUTFOLDER, multi=True) + "sample_pipeline", outfolder=self.OUTFOLDER, multi=True + ) self.pp2 = pypiper.PipelineManager( - "sample_pipeline2", outfolder=self.OUTFOLDER, multi=True) + "sample_pipeline2", outfolder=self.OUTFOLDER, multi=True + ) self.pp3 = pypiper.PipelineManager( - "sample_pipeline3", outfolder=self.OUTFOLDER + "3", multi=True) - + "sample_pipeline3", outfolder=self.OUTFOLDER + "3", multi=True + ) def tearDown(self): - """ Scrub the decks after each test case completes. """ + """Scrub the decks after each test case completes.""" print("Tearing down...") self.pp.stop_pipeline() self.pp2.stop_pipeline() self.pp3.stop_pipeline() print("Removing " + self.pp.outfolder) - #shutil.rmtree(self.pp.outfolder) - #shutil.rmtree(self.pp3.outfolder) + # shutil.rmtree(self.pp.outfolder) + # shutil.rmtree(self.pp3.outfolder) self._clean() del self.pp del self.pp2 del self.pp3 - def _isFile(self, filename): - """ Determine if the first manager has this file. """ + """Determine if the first manager has this file.""" filepath = pipeline_filepath(self.pp, filename=filename) return os.path.isfile(filepath) - def _assertFile(self, filename): - """ Assert that the named file exists for first pipeline manager. """ + """Assert that the named file exists for first pipeline manager.""" try: assert self._isFile(filename) except AssertionError: outfolder_contents = os.listdir(self.pp.outfolder) - print("Pipeline outfolder contents:\n{}".format( - "\n".join(outfolder_contents))) + print( + "Pipeline outfolder contents:\n{}".format("\n".join(outfolder_contents)) + ) raise - def _assertNotFile(self, filename): - """ Assert that given file doesn't exist for first manager. """ + """Assert that given file doesn't exist for first manager.""" assert not self._isFile(filename) - def _assertLines(self, expected, observed): - """ Assert equality between collections of lines. """ + """Assert equality between collections of lines.""" if isinstance(observed, str) and os.path.isfile(observed): - with open(observed, 'r') as f: + with open(observed, "r") as f: observed = f.readlines() self.assertListEqual(expected, [l.strip() for l in observed]) - @classmethod def tearDownClass(cls): - """ Ensure folder/file cleanup upon test class completion. """ + """Ensure folder/file cleanup upon test class completion.""" cls._clean() - def test_me(self): print("Testing initialization...") @@ -137,7 +130,7 @@ def test_me(self): target = pipeline_filepath(self.pp, filename="tgt") if os.path.isfile(target): # for repeat runs. 
os.remove(target) - + self.pp.run("echo first > " + target, target, shell=True) # Should not run self.pp.run("echo second > " + target, target, shell=True) @@ -146,8 +139,7 @@ def test_me(self): self._assertLines(["first"], lines) print("Execute a targetless command...") - self.pp.run("echo third > " + target, - target=None, lock_name="test", shell=True) + self.pp.run("echo third > " + target, target=None, lock_name="test", shell=True) with open(target) as f: lines = f.readlines() self._assertLines(["third"], lines) @@ -156,12 +148,12 @@ def test_me(self): self.pp.report_result("key1", "abc") self.pp.report_result("key2", "def", "shared") key1 = self.pp.get_stat("key1") - self.assertEqual(key1, 'abc') + self.assertEqual(key1, "abc") key1 = self.pp2.get_stat("key1") # should fail self.assertEqual(key1, None) key2 = self.pp2.get_stat("key2") # should succeed - self.assertEqual(key2, 'def') + self.assertEqual(key2, "def") print("Test intermediate file cleanup...") tgt1 = pipeline_filepath(self.pp, filename="tgt1.temp") @@ -174,7 +166,21 @@ def test_me(self): tgt9 = pipeline_filepath(self.pp, filename="tgt9.cond") tgt10 = pipeline_filepath(self.pp, filename="tgt10.txt") - self.pp.run("touch " + tgt1 + " " + tgt2 + " " + tgt3 + " " + tgt4 + " " + tgt5 + " " + tgt6, lock_name="test") + self.pp.run( + "touch " + + tgt1 + + " " + + tgt2 + + " " + + tgt3 + + " " + + tgt4 + + " " + + tgt5 + + " " + + tgt6, + lock_name="test", + ) self.pp.run("touch " + tgt8 + " " + tgt9, lock_name="test") # In global dirty mode, even non-manual clean files should not be deleted: @@ -183,7 +189,9 @@ def test_me(self): self.pp.clean_add(pipeline_filepath(self.pp, filename="*.temp")) self.pp.clean_add(tgt4) self.pp.clean_add(tgt5, conditional=True) - self.pp.clean_add(pipeline_filepath(self.pp, filename="*.cond"), conditional=True) + self.pp.clean_add( + pipeline_filepath(self.pp, filename="*.cond"), conditional=True + ) self.pp._cleanup() self.assertTrue(os.path.isfile(tgt1)) @@ -214,23 +222,20 @@ def test_me(self): print(lines) - - self.assertTrue(lines[2] == 'rm tgt3.temp\n') - self.assertTrue(lines[10] == 'rm tgt6.txt\n') - self.assertTrue(lines[11] == 'rm tgt6.txt\n') - - - - + self.assertTrue(lines[2] == "rm tgt3.temp\n") + self.assertTrue(lines[10] == "rm tgt6.txt\n") + self.assertTrue(lines[11] == "rm tgt6.txt\n") self.pp.report_object("Test figure", os.path.join("fig", "fig.jpg")) # But in regular mode, they should be deleted: - self.pp.dirty=False + self.pp.dirty = False self.pp.clean_add(pipeline_filepath(self.pp, filename="*.temp")) self.pp.clean_add(tgt4) self.pp.clean_add(tgt5, conditional=True) - self.pp.clean_add(pipeline_filepath(self.pp, filename="*.cond"), conditional=True) + self.pp.clean_add( + pipeline_filepath(self.pp, filename="*.cond"), conditional=True + ) self.pp._cleanup() self.assertFalse(os.path.isfile(tgt1)) @@ -242,7 +247,6 @@ def test_me(self): self.pp.run("touch " + tgt7, tgt7) self.pp.clean_add(tgt7, manual=True) - self.pp.run("touch " + tgt10, target=tgt10, clean=True) # Conditional delete should not delete tgt5 @@ -250,7 +254,7 @@ def test_me(self): self.assertTrue(os.path.isfile(tgt5)) self.assertTrue(os.path.isfile(tgt8)) self.assertTrue(os.path.isfile(tgt9)) - self.assertTrue(os.path.isfile(tgt10)) # auto cleanup + self.assertTrue(os.path.isfile(tgt10)) # auto cleanup # Stopping pp2 should cause tgt5 to be deleted self.pp2.stop_pipeline() @@ -290,14 +294,13 @@ def test_me(self): with self.assertRaises(KeyboardInterrupt): self.pp._signal_int_handler(None, None) - sleep_lock = 
pipeline_filepath(self.pp, filename="lock.sleep") - #subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) + # subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) self.pp._create_file(sleep_lock) cmd = "echo hello" self.pp.run(cmd, lock_name="sleep") - #subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) + # subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True) print("Test new start") if os.path.isfile(target): # for repeat runs. @@ -318,7 +321,7 @@ def test_me(self): print("Test dual target") self.pp.new_start = False if os.path.isfile(tgt1): - os.remove(tgt1) + os.remove(tgt1) self.pp.run("touch " + tgt6, tgt6) self.assertTrue(os.path.isfile(tgt6)) # if target exists, should not run @@ -338,6 +341,5 @@ def _make_pipe_filepath(pm, filename): return os.path.join(pm.outfolder, filename) - -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/pipeline_manager/test_pipeline_manager_timestamp.py b/tests/pipeline_manager/test_pipeline_manager_timestamp.py index 2f870cf8..18cb7177 100644 --- a/tests/pipeline_manager/test_pipeline_manager_timestamp.py +++ b/tests/pipeline_manager/test_pipeline_manager_timestamp.py @@ -2,13 +2,13 @@ import os import sys + import pytest from pypiper.exceptions import PipelineHalt from pypiper.utils import checkpoint_filepath from tests.helpers import fetch_checkpoint_files, named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -17,9 +17,8 @@ FILES_TEST = "files" - def pytest_generate_tests(metafunc): - """ Dynamic test case generation for this module. """ + """Dynamic test case generation for this module.""" if "retrospective" in metafunc.fixturenames: metafunc.parametrize("retrospective", [False, True]) if "test_type" in metafunc.fixturenames: @@ -28,16 +27,15 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("raise_error", [False, True]) - def test_timestamp_requires_no_arguments(get_pipe_manager): - """ A call to timestamp() requires no arguments. """ + """A call to timestamp() requires no arguments.""" pm = get_pipe_manager(name="TestPM") pm.timestamp() @pytest.mark.skip def test_timestamp_message(get_pipe_manager, capsys): - """ Tests for the message component of a timestamp() call. """ + """Tests for the message component of a timestamp() call.""" name = "TestPM" pm = get_pipe_manager(name=name) logfile = pm.pipeline_log_file @@ -55,15 +53,13 @@ def test_timestamp_message(get_pipe_manager, capsys): # The stdout capture with capsys comes through as a single unicode block. # With the move to logger, this test is no longer capturing the output - assert message_content in str(out), \ - "Missing timestamp message ('{}') in message(s)".\ - format(message_content) - + assert message_content in str( + out + ), "Missing timestamp message ('{}') in message(s)".format(message_content) class TimestampHaltingTests: - """ Tests for a manager's ability to halt a pipeline. """ - + """Tests for a manager's ability to halt a pipeline.""" # Note that the tests here are not truly logically independent from the # functionality of the manager's halt() method. The assertions made here @@ -73,10 +69,8 @@ class TimestampHaltingTests: # the mock, but here that seems to inject a level of complexity for which # the cost exceeds the benefit of the logical independence that it confers. - - def test_halts_if_hitting_exclusive_halt_point( - self, get_pipe_manager, raise_error): - """ Halt point may be specified prospectively. 
""" + def test_halts_if_hitting_exclusive_halt_point(self, get_pipe_manager, raise_error): + """Halt point may be specified prospectively.""" # Create manager, set halt point, and check that it's running. halt_name = "phase3" @@ -102,9 +96,8 @@ def test_halts_if_hitting_exclusive_halt_point( print("STATUS: {}".format(pm.status)) raise - def test_halts_if_halt_on_next(self, get_pipe_manager, raise_error): - """ If in particular state, managed pipeline halts on timestamp(). """ + """If in particular state, managed pipeline halts on timestamp().""" pm = get_pipe_manager(name="TestPM") pm.halt_on_next = True if raise_error: @@ -114,9 +107,8 @@ def test_halts_if_halt_on_next(self, get_pipe_manager, raise_error): pm.timestamp("testing", raise_error=False) assert pm.halted - def test_correctly_sets_halt_on_next(self, get_pipe_manager): - """ Of critical importance to timestamp's checkpointing functionality + """Of critical importance to timestamp's checkpointing functionality is its ability to alter the manager's state such that it triggers a halt on the subsequent timestamp() call. This allows timestamp() to be used in a prospective fashion while still preserving the ability to @@ -125,7 +117,7 @@ def test_correctly_sets_halt_on_next(self, get_pipe_manager): timestamp() before beginning a conceptual block of processing logic, yet still (behave as though) stopping just after completion of execution of a defined stopping point. Essentially, the timestamp() - calls can be prospective yet mixed with a retrospective halt point. """ + calls can be prospective yet mixed with a retrospective halt point.""" # Establish manager and perform initial control assertions. pm = get_pipe_manager(name="TestPM") @@ -143,15 +135,12 @@ def test_correctly_sets_halt_on_next(self, get_pipe_manager): assert pm.halt_on_next - class TimestampStatusTypeTests: - """ Tests for the type of status that a timestamp() call represents. """ - + """Tests for the type of status that a timestamp() call represents.""" - def test_initial_timestamp_checkpoint_file( - self, get_pipe_manager, retrospective): - """ Initial checkpointed timestamp writes checkpoint file if and only - if it's a retrospective timestamp. """ + def test_initial_timestamp_checkpoint_file(self, get_pipe_manager, retrospective): + """Initial checkpointed timestamp writes checkpoint file if and only + if it's a retrospective timestamp.""" pm = get_pipe_manager(name="init-timestamp-file") stage_name = "align_reads" pm.timestamp(checkpoint=stage_name, finished=retrospective) @@ -161,13 +150,12 @@ def test_initial_timestamp_checkpoint_file( else: assert not os.path.isfile(check_fpath) - - @named_param("which_checkpoint_state", - ["curr_checkpoint", "prev_checkpoint"]) + @named_param("which_checkpoint_state", ["curr_checkpoint", "prev_checkpoint"]) def test_initial_timestamp_states( - self, get_pipe_manager, retrospective, which_checkpoint_state): - """ Which checkpoint state is updated by a checkpointed timestamp - call depends upon the perspective of the call. """ + self, get_pipe_manager, retrospective, which_checkpoint_state + ): + """Which checkpoint state is updated by a checkpointed timestamp + call depends upon the perspective of the call.""" # Create the manager and make the timestamp call. 
pm = get_pipe_manager(name="InitialTimestampState") @@ -188,10 +176,8 @@ def test_initial_timestamp_states( else: assert prev_exp == getattr(pm, "prev_checkpoint") - - def test_two_prospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ Prospective timestamp generates file for previous checkpoint. """ + def test_two_prospective_checkpointed_timestamps(self, test_type, stage_pair, pm): + """Prospective timestamp generates file for previous checkpoint.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=False) @@ -205,10 +191,8 @@ def test_two_prospective_checkpointed_timestamps( assert stage1 == pm.prev_checkpoint assert stage2 == pm.curr_checkpoint - - def test_two_retrospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ Retrospective timestamp generates file for current checkpoint. """ + def test_two_retrospective_checkpointed_timestamps(self, test_type, stage_pair, pm): + """Retrospective timestamp generates file for current checkpoint.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=True) @@ -222,11 +206,11 @@ def test_two_retrospective_checkpointed_timestamps( assert stage2 == pm.prev_checkpoint assert pm.curr_checkpoint is None - def test_prospective_then_retrospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ If a prospective checkpointed timestamp is followed by a - retrospective one, there's only a file for the retrospective one. """ + self, test_type, stage_pair, pm + ): + """If a prospective checkpointed timestamp is followed by a + retrospective one, there's only a file for the retrospective one.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=False) @@ -243,10 +227,10 @@ def test_prospective_then_retrospective_checkpointed_timestamps( assert stage2 == pm.prev_checkpoint assert pm.curr_checkpoint is None - def test_retrospective_the_prospective_checkpointed_timestamps( - self, test_type, stage_pair, pm): - """ Test retrospective timestamp followed by prospective one. """ + self, test_type, stage_pair, pm + ): + """Test retrospective timestamp followed by prospective one.""" stage1, stage2 = stage_pair pm.timestamp(checkpoint=stage1, finished=True) @@ -261,14 +245,12 @@ def test_retrospective_the_prospective_checkpointed_timestamps( assert pm.prev_checkpoint is None assert stage2 == pm.curr_checkpoint - @pytest.fixture def stage_pair(self): - """ Provide test case with a pair of stage names to use. """ + """Provide test case with a pair of stage names to use.""" return "merge_input", "quality_control" - @pytest.fixture def pm(self, get_pipe_manager): - """ Provide test case with a basic, test-safe pipeline manager. 
""" + """Provide test case with a basic, test-safe pipeline manager.""" return get_pipe_manager(name="checkpointed-timestamp-pair") diff --git a/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py b/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py index c8daf530..90cc05de 100644 --- a/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py +++ b/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py @@ -9,14 +9,13 @@ from pypiper.stage import Stage from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - class DummyPM(PipelineManager): - """ Simple override of true PipelineManager, for __init__ simplicity """ + """Simple override of true PipelineManager, for __init__ simplicity""" + def __init__(self, name, outfolder): self.name = name self.outfolder = outfolder @@ -29,17 +28,17 @@ def __init__(self, name, outfolder): self.curr_checkpoint = None - class PipelineMangerTimestampCheckpointFilePathTests: - """ Tests for determination of checkpoint filepath. """ - + """Tests for determination of checkpoint filepath.""" - @named_param(argnames=["name1", "name2"], - argvalues=[("chipseq", "ATACseq"), ("rnaKallisto", "wgbs")]) - @named_param(argnames="spec_type", - argvalues=["stage_name", "stage", "function"]) + @named_param( + argnames=["name1", "name2"], + argvalues=[("chipseq", "ATACseq"), ("rnaKallisto", "wgbs")], + ) + @named_param(argnames="spec_type", argvalues=["stage_name", "stage", "function"]) def test_distinguishes_pipelines_within_outfolder( - self, name1, name2, spec_type, tmpdir): + self, name1, name2, spec_type, tmpdir + ): """ Checkpoint files within sample folder include pipeline name. @@ -66,8 +65,9 @@ def stage_spec(): if spec_type == "function": return trim_reads elif spec_type not in ["stage", "stage_name"]: - raise ValueError("Unrecognized stage specification type: {}". - format(spec_type)) + raise ValueError( + "Unrecognized stage specification type: {}".format(spec_type) + ) else: s = Stage(trim_reads) return s.name if spec_type == "stage_name" else s @@ -86,25 +86,29 @@ def stage_spec(): # Find the checkpoints; there should only be one. checkpoint_pattern = os.path.join( - outfolder, "{}_*{}".format(name1, CHECKPOINT_EXTENSION)) + outfolder, "{}_*{}".format(name1, CHECKPOINT_EXTENSION) + ) checkpoints = glob.glob(checkpoint_pattern) assert 1 == len(checkpoints) assert 1 == len(glob.glob(all_checkpoints_pattern)) # Check that we have the expected checkpoint. - exp_chkpt_fpath = os.path.join(outfolder, "{}_{}".format( - name1, checkpoint_name + CHECKPOINT_EXTENSION)) + exp_chkpt_fpath = os.path.join( + outfolder, "{}_{}".format(name1, checkpoint_name + CHECKPOINT_EXTENSION) + ) assert exp_chkpt_fpath == checkpoints[0] # Create a second checkpoint with the same stage, but with a manager # of a different name. 
plm2.timestamp(checkpoint=stage_spec(), finished=True) checkpoint_pattern = os.path.join( - outfolder, "{}_*{}".format(name2, CHECKPOINT_EXTENSION)) + outfolder, "{}_*{}".format(name2, CHECKPOINT_EXTENSION) + ) checkpoints = glob.glob(checkpoint_pattern) assert 1 == len(checkpoints) all_checkpoints = glob.glob(all_checkpoints_pattern) assert 2 == len(all_checkpoints) - exp_chkpt_fpath_2 = os.path.join(outfolder, "{}_{}".format( - name2, checkpoint_name + CHECKPOINT_EXTENSION)) + exp_chkpt_fpath_2 = os.path.join( + outfolder, "{}_{}".format(name2, checkpoint_name + CHECKPOINT_EXTENSION) + ) assert {exp_chkpt_fpath, exp_chkpt_fpath_2} == set(all_checkpoints) diff --git a/tests/pipeline_manager/test_set_status_flag.py b/tests/pipeline_manager/test_set_status_flag.py index c67c9cb9..3114750f 100644 --- a/tests/pipeline_manager/test_set_status_flag.py +++ b/tests/pipeline_manager/test_set_status_flag.py @@ -6,15 +6,13 @@ from pypiper.flags import __all__ as ALL_FLAGS from tests.helpers import named_param - __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" - @named_param(argnames="status", argvalues=ALL_FLAGS) def test_set_status_flag_is_idempotent(get_pipe_manager, status): - """ Calls to manager's status flag setter are idempotent. """ + """Calls to manager's status flag setter are idempotent.""" pm = get_pipe_manager(name="TestPM") pm._set_status_flag(status) assert status == pm.status @@ -22,15 +20,20 @@ def test_set_status_flag_is_idempotent(get_pipe_manager, status): assert status == pm.status - @pytest.mark.parametrize( argnames=["init_state", "new_state"], - argvalues=[(WAIT_FLAG, RUN_FLAG), (WAIT_FLAG, COMPLETE_FLAG), - (WAIT_FLAG, FAIL_FLAG), (RUN_FLAG, COMPLETE_FLAG), - (RUN_FLAG, PAUSE_FLAG), (RUN_FLAG, FAIL_FLAG), - (FAIL_FLAG, RUN_FLAG)]) + argvalues=[ + (WAIT_FLAG, RUN_FLAG), + (WAIT_FLAG, COMPLETE_FLAG), + (WAIT_FLAG, FAIL_FLAG), + (RUN_FLAG, COMPLETE_FLAG), + (RUN_FLAG, PAUSE_FLAG), + (RUN_FLAG, FAIL_FLAG), + (FAIL_FLAG, RUN_FLAG), + ], +) def test_changes_status_state(get_pipe_manager, init_state, new_state): - """ Manager setting status flag changes is internal status/state. """ + """Manager setting status flag changes is internal status/state.""" pm = get_pipe_manager(name="test-pipe") assert pm.status == RUN_FLAG pm._set_status_flag(init_state) diff --git a/tests/test_packaging.py b/tests/test_packaging.py index 2e5bf819..5ef2ecb6 100644 --- a/tests/test_packaging.py +++ b/tests/test_packaging.py @@ -1,18 +1,26 @@ """ Validate what's available directly on the top-level import. """ -import pytest from inspect import isfunction +import pytest + __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" -@pytest.mark.parametrize(["obj_name", "typecheck"], [ - ("add_logging_options", isfunction), ("check_all_commands", isfunction), - ("determine_uncallable", isfunction), ("logger_via_cli", isfunction)]) +@pytest.mark.parametrize( + ["obj_name", "typecheck"], + [ + ("add_logging_options", isfunction), + ("check_all_commands", isfunction), + ("determine_uncallable", isfunction), + ("logger_via_cli", isfunction), + ], +) def test_top_level_exports(obj_name, typecheck): - """ At package level, validate object availability and type. 
""" + """At package level, validate object availability and type.""" import pypiper + try: obj = getattr(pypiper, obj_name) except AttributeError: diff --git a/tests/test_pipeline_filepath.py b/tests/test_pipeline_filepath.py index 12a5a874..e8d496fd 100644 --- a/tests/test_pipeline_filepath.py +++ b/tests/test_pipeline_filepath.py @@ -1,10 +1,11 @@ """ Tests for utility functions """ import os + import mock import pytest -from pypiper.utils import pipeline_filepath +from pypiper.utils import pipeline_filepath __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -14,7 +15,6 @@ SUFFICES = [".txt", "_results.csv", ".stats.tsv", "-data.json"] - @pytest.fixture def pl_mgr(request, tmpdir): """ @@ -34,26 +34,25 @@ def pl_mgr(request, tmpdir): # Set output folder and name attributes for mocked PipelineManager. mock_mgr = mock.Mock(outfolder=tmpdir.strpath) - type(mock_mgr).name = pipe_name # Circumvent 'name' keyword on Mock. + type(mock_mgr).name = pipe_name # Circumvent 'name' keyword on Mock. return mock_mgr - def test_requires_filename_or_suffix(pl_mgr): - """ Either filename or suffix is required to build a path. """ + """Either filename or suffix is required to build a path.""" with pytest.raises(TypeError): pipeline_filepath(pl_mgr) - @pytest.mark.parametrize(argnames="pipe_name", argvalues=PIPELINE_NAMES) @pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES) @pytest.mark.parametrize( - argnames="test_type", - argvalues=["has_pipe_name", "has_suffix", "full_path"]) + argnames="test_type", argvalues=["has_pipe_name", "has_suffix", "full_path"] +) def test_uses_pipeline_name_if_no_filename( - pipe_name, suffix, test_type, pl_mgr, tmpdir): - """ Pipeline name is proxy for filename if just suffix is given. """ + pipe_name, suffix, test_type, pl_mgr, tmpdir +): + """Pipeline name is proxy for filename if just suffix is given.""" observed = pipeline_filepath(pl_mgr, suffix=suffix) @@ -74,12 +73,11 @@ def test_uses_pipeline_name_if_no_filename( @pytest.mark.parametrize( - argnames="filename", - argvalues=["testfile" + suffix for suffix in SUFFICES]) -@pytest.mark.parametrize( - argnames="test_type", argvalues=["filename", "filepath"]) + argnames="filename", argvalues=["testfile" + suffix for suffix in SUFFICES] +) +@pytest.mark.parametrize(argnames="test_type", argvalues=["filename", "filepath"]) def test_direct_filename(tmpdir, filename, pl_mgr, test_type): - """ When given, filename is used instead of pipeline name. """ + """When given, filename is used instead of pipeline name.""" fullpath = pipeline_filepath(pl_mgr, filename=filename) if test_type == "filename": _, observed = os.path.split(fullpath) @@ -91,12 +89,10 @@ def test_direct_filename(tmpdir, filename, pl_mgr, test_type): raise ValueError("Unrecognized test type: '{}'".format(test_type)) -@pytest.mark.parametrize( - argnames="filename", argvalues=["output", "testfile"]) +@pytest.mark.parametrize(argnames="filename", argvalues=["output", "testfile"]) @pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES) -def test_suffix_is_appended_to_filename_if_both_are_provided( - pl_mgr, filename, suffix): - """ Suffix is appended to filename if both are provided. 
""" +def test_suffix_is_appended_to_filename_if_both_are_provided(pl_mgr, filename, suffix): + """Suffix is appended to filename if both are provided.""" expected = filename + suffix fullpath = pipeline_filepath(pl_mgr, filename=filename, suffix=suffix) _, observed = os.path.split(fullpath) diff --git a/tests/utils_tests/test_check_command_callability.py b/tests/utils_tests/test_check_command_callability.py index 00bb19ff..32cf14c7 100644 --- a/tests/utils_tests/test_check_command_callability.py +++ b/tests/utils_tests/test_check_command_callability.py @@ -1,26 +1,42 @@ """ Tests for checking a collection of commands for callability """ -import mock import os + +import mock import pytest -from pypiper import utils as piper_utils from ubiquerg import powerset from veracitools import ExpectContext +from pypiper import utils as piper_utils + __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" -EXTENSIONS = [".py", ".rb", ".sh", ".java", ".jar", ".pl", ".o", ".R", ".r", - ".cpp", ".c", ".hs", ".scala", ".class"] +EXTENSIONS = [ + ".py", + ".rb", + ".sh", + ".java", + ".jar", + ".pl", + ".o", + ".R", + ".r", + ".cpp", + ".c", + ".hs", + ".scala", + ".class", +] def _touch(f): - """ 'touch' the given file. + """'touch' the given file. :param str f: filepath to create """ - with open(f, 'w'): + with open(f, "w"): print("touch: {}".format(f)) @@ -31,30 +47,36 @@ def _make_exec(f): :param str f: path to create """ import subprocess + _touch(f) subprocess.check_call(["chmod", "+x", f]) def pytest_generate_tests(metafunc): - """ Dynamic test case generation and parameterization for this module """ + """Dynamic test case generation and parameterization for this module""" if "str_list_monad" in metafunc.fixturenames: metafunc.parametrize("str_list_monad", [lambda s: s, lambda s: [s]]) @pytest.mark.parametrize("filename", ["testfile" + x for x in EXTENSIONS]) -@pytest.mark.parametrize(["setup", "pretest", "exp_miss"], [ - (lambda _: None, - lambda f: not os.path.exists(f), - lambda _: True), - (_touch, - lambda f: os.path.isfile(f) and not os.access(f, os.X_OK), - lambda f: not f.endswith(".jar")), - (_make_exec, - lambda f: os.path.isfile(f) and os.access(f, os.X_OK), - lambda _: False) -]) +@pytest.mark.parametrize( + ["setup", "pretest", "exp_miss"], + [ + (lambda _: None, lambda f: not os.path.exists(f), lambda _: True), + ( + _touch, + lambda f: os.path.isfile(f) and not os.access(f, os.X_OK), + lambda f: not f.endswith(".jar"), + ), + ( + _make_exec, + lambda f: os.path.isfile(f) and os.access(f, os.X_OK), + lambda _: False, + ), + ], +) def test_callability_checker_defaults(tmpdir, filename, setup, pretest, exp_miss): - """ Verify behavior of callability checker with default parameterization. """ + """Verify behavior of callability checker with default parameterization.""" cmd = os.path.join(tmpdir.strpath, filename) setup(cmd) assert pretest(cmd) @@ -70,42 +92,53 @@ def test_callability_checker_defaults(tmpdir, filename, setup, pretest, exp_miss @pytest.mark.parametrize( - ["uncall_result", "expectation"], - [([], True), ([("noncmd", "noncmd")], TypeError)]) + ["uncall_result", "expectation"], [([], True), ([("noncmd", "noncmd")], TypeError)] +) @pytest.mark.parametrize("handler", [lambda: True, "not-a-function"]) def test_check_all_bad_handler_is_type_error_iff_uncallability_exists( - uncall_result, str_list_monad, handler, expectation): - """ Invalid handler evaluation is conditional having >= 1 uncallable command. 
""" + uncall_result, str_list_monad, handler, expectation +): + """Invalid handler evaluation is conditional having >= 1 uncallable command.""" cmd = "noncmd" - with mock.patch.object(piper_utils, "determine_uncallable", - return_value=uncall_result), \ - ExpectContext(expectation, piper_utils.check_all_commands) as check: + with mock.patch.object( + piper_utils, "determine_uncallable", return_value=uncall_result + ), ExpectContext(expectation, piper_utils.check_all_commands) as check: check(cmds=str_list_monad(cmd), handle=handler) -@pytest.mark.parametrize(["create_result", "expected"], [ - (lambda bads: Exception("{} bad commands: {}".format(len(bads), bads)), Exception), - (lambda bads: "{} bad commands: {}".format(len(bads), bads), False) -]) +@pytest.mark.parametrize( + ["create_result", "expected"], + [ + ( + lambda bads: Exception("{} bad commands: {}".format(len(bads), bads)), + Exception, + ), + (lambda bads: "{} bad commands: {}".format(len(bads), bads), False), + ], +) def test_check_all_result_is_conjunctive(create_result, expected, str_list_monad): - """ Even one uncallable means result is False or an Exception occurs. """ + """Even one uncallable means result is False or an Exception occurs.""" cmd = "noncmd" - with mock.patch.object(piper_utils, "determine_uncallable", - return_value=[(cmd, cmd)]), \ - ExpectContext(expected, piper_utils.check_all_commands) as check: + with mock.patch.object( + piper_utils, "determine_uncallable", return_value=[(cmd, cmd)] + ), ExpectContext(expected, piper_utils.check_all_commands) as check: check(cmds=str_list_monad(cmd), get_bad_result=create_result) @pytest.mark.parametrize("commands", ["man", "ls", ["man", "ls"]]) @pytest.mark.parametrize( ["transforms", "expectation"], - [(arg, lambda res: isinstance(res, list)) for arg in [None, []]] + - [(arg, TypeError) for arg in [1, "a"]]) + [(arg, lambda res: isinstance(res, list)) for arg in [None, []]] + + [(arg, TypeError) for arg in [1, "a"]], +) def test_check_all_requires_iterable_transformations_argument( - commands, transforms, expectation): - """ If transformations arg is non-null, it must be iterable. 
""" + commands, transforms, expectation +): + """If transformations arg is non-null, it must be iterable.""" + def call(): return piper_utils.determine_uncallable(commands, transformations=transforms) + if isinstance(expectation, type) and issubclass(expectation, Exception): with pytest.raises(expectation): call() @@ -114,31 +147,42 @@ def call(): @pytest.mark.parametrize( - "commands", powerset(["ls", "picard.jar", "$ENVVAR"], nonempty=True)) + "commands", powerset(["ls", "picard.jar", "$ENVVAR"], nonempty=True) +) def test_transformation_accumulation(commands): - """ Accumulation of transformations works as expected """ + """Accumulation of transformations works as expected""" mapjar = lambda c: "java -jar {}".format(c) envjar = "env.jar" - transforms = [(lambda c: c == "$ENVVAR", lambda _: envjar), - (lambda c: c.endswith(".jar"), mapjar)] + transforms = [ + (lambda c: c == "$ENVVAR", lambda _: envjar), + (lambda c: c.endswith(".jar"), mapjar), + ] exps = {"ls": "ls", "picard.jar": mapjar("picard.jar"), "$ENVVAR": mapjar(envjar)} with mock.patch.object(piper_utils, "is_command_callable", return_value=False): res = piper_utils.determine_uncallable( - commands, transformations=transforms, accumulate=True) + commands, transformations=transforms, accumulate=True + ) expectation = [(c, exps[c]) for c in commands] print("EXPECTED: {}".format(expectation)) print("OBSERVED: {}".format(res)) assert expectation == res -@pytest.mark.parametrize("transforms", [ - {(lambda _: True, lambda c: c), (lambda _: False, lambda c: c)}, - {"id": (lambda _: True, lambda c: c), - "java": (lambda c: c.endswith(".jar"), lambda c: "java -jar {}".format(c))} -]) +@pytest.mark.parametrize( + "transforms", + [ + {(lambda _: True, lambda c: c), (lambda _: False, lambda c: c)}, + { + "id": (lambda _: True, lambda c: c), + "java": (lambda c: c.endswith(".jar"), lambda c: "java -jar {}".format(c)), + }, + ], +) def test_non_accumulative_but_unordered_transformation_is_exceptional(transforms): with pytest.raises(Exception) as err_ctx: piper_utils.determine_uncallable("ls", transformations=transforms) - exp_msg = "If transformations are unordered, non-accumulation of " \ - "effects may lead to nondeterministic behavior." + exp_msg = ( + "If transformations are unordered, non-accumulation of " + "effects may lead to nondeterministic behavior." + ) assert str(err_ctx.value) == exp_msg diff --git a/tests/utils_tests/test_head_util.py b/tests/utils_tests/test_head_util.py index 4f55a922..232c6312 100644 --- a/tests/utils_tests/test_head_util.py +++ b/tests/utils_tests/test_head_util.py @@ -2,10 +2,12 @@ import random import string + import pytest -from hypothesis import given, strategies as st -from pypiper.utils import head +from hypothesis import given +from hypothesis import strategies as st +from pypiper.utils import head __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -14,12 +16,17 @@ NUMBERS_AND_LETTERS = list(string.ascii_letters) + list(range(-9, 10)) # Strategy for generating a pretty arbitrary atomic -ATOMICS = st.deferred(lambda: st.booleans() | st.characters() | st.integers() | - st.floats(allow_nan=False) | st.text()) +ATOMICS = st.deferred( + lambda: st.booleans() + | st.characters() + | st.integers() + | st.floats(allow_nan=False) + | st.text() +) def pytest_generate_tests(metafunc): - """ Test case generation/parameterization for this module. 
""" + """Test case generation/parameterization for this module.""" if "seqtype" in metafunc.fixturenames: metafunc.parametrize("seqtype", [tuple, list]) if "iter_cast" in metafunc.fixturenames: @@ -27,40 +34,48 @@ def pytest_generate_tests(metafunc): if "h" in metafunc.fixturenames and "xs" in metafunc.fixturenames: metafunc.parametrize( ["h", "xs"], - [(random.choice(NUMBERS_AND_LETTERS), - [random.choice(NUMBERS_AND_LETTERS) - for _ in range(random.randint(5, 10))]) for _ in range(10)]) + [ + ( + random.choice(NUMBERS_AND_LETTERS), + [ + random.choice(NUMBERS_AND_LETTERS) + for _ in range(random.randint(5, 10)) + ], + ) + for _ in range(10) + ], + ) @given(obj=ATOMICS) def test_head_atomic(obj): - """ head() of an atomic object is the object itself. """ + """head() of an atomic object is the object itself.""" assert obj == head(obj) def test_head_empty_string(): - """ Empty string is exception to exceptional-ness of empty collection. """ + """Empty string is exception to exceptional-ness of empty collection.""" assert "" == head("") @pytest.mark.parametrize("coll", [dict(), set(), tuple(), list()]) def test_head_empty_collection(coll): - """ Request for first element from an empty Iterable is exceptional. """ + """Request for first element from an empty Iterable is exceptional.""" with pytest.raises(ValueError): head(coll) def test_head_nonempty_sequential_collection(h, xs, seqtype, iter_cast): - """ Verify accuracy of request for first element from nonempty Iterable. """ + """Verify accuracy of request for first element from nonempty Iterable.""" c = seqtype([h]) + seqtype(xs) assert h == head(iter_cast(c)) def test_head_nonempty_set(): - """ Verify that head of nonempty set is non-exceptional. """ + """Verify that head of nonempty set is non-exceptional.""" head({-1, 0, 1}) def test_head_nonempty_dict(): - """ Verify that head of nonempty dictionary is non-exceptional. """ + """Verify that head of nonempty dictionary is non-exceptional.""" head({"a": 1, "b": 2}) From bffadae186402fc5303c79d92751fbc0969a0da5 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 7 May 2021 16:17:33 -0400 Subject: [PATCH 10/25] add pipestat integration documentation --- docs/pipestat.md | 122 +++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 123 insertions(+) create mode 100644 docs/pipestat.md diff --git a/docs/pipestat.md b/docs/pipestat.md new file mode 100644 index 00000000..3923a215 --- /dev/null +++ b/docs/pipestat.md @@ -0,0 +1,122 @@ +# Pipestat + +Starting with pypiper v0.13.0 [pipestat](http://pipestat.databio.org) is the recommended way of reporting pipeline statistics. +You can browse the pipestat documentation to learn more about it, but briefly pipestat is a tool that standardizes reporting of pipeline results. It provides 1) a standard specification for how pipeline outputs should be stored; and 2) an implementation to easily write results to that format from within Python or from the command line. + +## Advancements + +There are a multiple advantages of using piestat instead of the current pieline results reporiting system: + +1. **Database results storage:** the results can be stored either in a database or a YAML-formatted results file. This way a pypiper pipeline running in an emphemeral compute environment can report the results to the database and exit. No need to sync the results with a central results storage. +2. 
**Strict and clear results definition:** all the results that can be reported by a pipeline run *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) that in a simplest case just indicates the result's type. This presents piepstat clients with the possibility to *reliably* gather all the possible results and related metadata.
+3. **On-the-fly results validation:** the schema is used to validate and/or convert the reported result to a strictly determined type, which makes the connection of pypiper with downstream pipeline results processing software seamless.
+4. **Unified, pipeline-agnostic results interface:** other pipelines, possibly created with different pipeline frameworks, can read and write results via Python API or command line interface. This feature significantly incerases your pipeline interoperability.
+
+## Setup
+
+In order to start reporting results with pipestat in your pipeline all you need to do is:
+
+1. Define a [pipestat resuts schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format)
+
+```yaml
+my_int_result:
+  type: integer
+  description: "This is my first result"
+my_str_result:
+  type: string
+```
+2. Pass the pipestat results schema to the `PipelineManager` object constructor.
+
+```python
+pm = pypiper.PipelineManager(
+    name="hello_pypiper",
+    outfolfer="$HOME/hello_pypiper",
+    pipestat_schema="my_results_schema.yaml",
+)
+```
+
+3. Use `pipestat` property of the `PipelineManager` object to report/retrieve results. See usage for more details.
+
+And in the simplest case... that's it! Pypiper *by default* will use a YAML-formated file to store the reported results in the selected `outfolder`.
+
+### Advanced features
+
+Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up.
+
+**Use a database to store reported results**
+
+In order to establish a database connection pipestat requires few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor.
+
+This is an example of such a file:
+
+```yaml
+database:
+  name: pypiper # database name
+  user: pypiper # database user name
+  password: pypiper # database password
+  host: localhost # database host address
+  port: 5433 # port the database is running on
+  dialect: postgresql # type of the database
+  driver: psycopg2 # driver to use to communicate
+```
+
+For reference, here is a Docker command that would run a PostgreSQL instance that could be used to store the pipeline results when configured with the configuration file above:
+
+```console
+docker volume create postgres-data
+
+docker run -d --name pypiper-postgres \
+-p 5432:5433 -e POSTGRES_PASSWORD=pypiper \
+-e POSTGRES_USER=pypiper -e POSTGRES_DB=pypiper \
+-v postgres-data:/var/lib/postgresql/data postgres
+```
+
+**Highlight results**
+
+The pipestat results schema can include any number of additional attributes for results. An example of that is *results highlighting*.
+
+When a `highlight: true` attribute is included attribute under result identifier in the schema file the highlighted results can be later retrieved by pipestat clients via `PipelineManager.pipestat.highlighted_results` property, which simply returns a list of result identifiers. to be presented in a special way.
+ +**Custom run status management** + + + + +### Usage + +Since a pipeline run-specific `PipestatManager` instance is attached to the `PipelineManager` object all the public pipestat API can be used. Please refer to the [pipestat API documentation](http://pipestat.databio.org/en/latest/autodoc_build/pipestat/) to read about all the currently available features. + +Here we present the most commonly used features: + +- results reporting + +*report a result, convert to schema-defined type and overwrite previously reported result* + +```python +results = { + "my_int_result": 10, + "my_str_result": "test" +} +pm.pipestat.report( + values=results, + strict_type=True, + force_overwrite=True) +``` +- results retrieval + +```python +pm.pipestat.retrieve(result_identifier="my_int_result") +``` + +- results schema exploration + +```python +pm.pipestat.schema +``` + + +- exploration of canonical [jsonschema](https://json-schema.org/) representation of result schemas + +```python +pm.pipestat.result_schemas +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index e3eb2694..f3a3a1fd 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,6 +16,7 @@ nav: - Automatic command-line arguments: cli.md - Configuring pipelines: configuration.md - Reporting statistics: report.md + - Reporting statistics with pipestat: pipestat.md - Cleaning up intermediate files: clean.md - Best practices: best-practices.md - Toolkits: From 76cc53e8939c1a86e89630041cfeb575f192a2a0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 7 May 2021 16:20:28 -0400 Subject: [PATCH 11/25] add pipestat req --- requirements/reqs-docs.txt | 1 + requirements/reqs-pypiper.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements/reqs-docs.txt b/requirements/reqs-docs.txt index ad53e8dc..9dd2e5d5 100644 --- a/requirements/reqs-docs.txt +++ b/requirements/reqs-docs.txt @@ -3,3 +3,4 @@ markdown-include pydoc-markdown piper https://github.com/databio/mkdocs-databio/archive/master.zip +-e git+git://github.com/pepkit/pipestat@orm#egg=pipestat \ No newline at end of file diff --git a/requirements/reqs-pypiper.txt b/requirements/reqs-pypiper.txt index fba597d0..2fccde0f 100644 --- a/requirements/reqs-pypiper.txt +++ b/requirements/reqs-pypiper.txt @@ -4,3 +4,4 @@ psutil pandas ubiquerg>=0.4.5 yacman +# pipestat>=0.1.0 From 3bb0da278262322ce7eab8059bb0428d2a29ff6d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 11 May 2021 12:24:17 -0400 Subject: [PATCH 12/25] fall back to default pipestat schema if not provided --- pypiper/manager.py | 4 +++- pypiper/utils.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index af0cad78..0882c829 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -48,6 +48,7 @@ make_lock_name, parse_cmd, pipeline_filepath, + default_pipestat_schema, ) __all__ = ["PipelineManager"] @@ -322,6 +323,7 @@ def __init__( # pipesatat setup potential_namespace = getattr(self, "sample_name", self.name) + potential_pipestat_schema = default_pipestat_schema(sys.argv[0]) # don't force default pipestat_results_file value unless # pipestat config not provided @@ -332,7 +334,7 @@ def __init__( self._pipestat_manager = PipestatManager( namespace=pipestat_namespace or potential_namespace, record_identifier=pipestat_record_id or potential_namespace, - schema_path=pipestat_schema, + schema_path=pipestat_schema or potential_pipestat_schema, results_file_path=pipestat_results_file, config=pipestat_config, ) diff --git 
a/pypiper/utils.py b/pypiper/utils.py index 7e199bfa..932252da 100644 --- a/pypiper/utils.py +++ b/pypiper/utils.py @@ -926,6 +926,20 @@ def default_pipeline_config(pipeline_filepath): return os.path.splitext(os.path.basename(pipeline_filepath))[0] + ".yaml" +def default_pipestat_schema(pipeline_filepath): + """ + Determine the default filepath for a pipeline's pipestat output schema. + + :param str pipeline_filepath: path to a pipeline + :return str: default filepath for a pipeline's pipestat output schema. + """ + pipestat_results_schema = os.path.join( + os.path.dirname(pipeline_filepath), "pipestat_results_schema.yaml" + ) + print(f"Using default schema: {pipestat_results_schema}") + return pipestat_results_schema if os.path.exists(pipestat_results_schema) else None + + def _add_args(parser, args, required): """ Add new arguments to an ArgumentParser. From 5976775ad45fd8d32c398a0ca74613d66aaa43b7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 11 May 2021 12:34:27 -0400 Subject: [PATCH 13/25] add default schema location in docs --- docs/pipestat.md | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/docs/pipestat.md b/docs/pipestat.md index 3923a215..b7539bd7 100644 --- a/docs/pipestat.md +++ b/docs/pipestat.md @@ -16,26 +16,29 @@ There are a multiple advantages of using piestat instead of the current pieline In order to start reporting results with pipestat in your pipeline all you need to do is: -1. Define a [pipestat resuts schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) +### Define a [pipestat resuts schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) ```yaml my_int_result: - type: integer - description: "This is my first result" + type: integer + description: "This is my first result" my_str_result: - type: string + type: string ``` -2. Pass the pipestat results schema to the `PipelineManager` object constructor. + +### Pass the pipestat results schema to the `PipelineManager` object constructor ```python pm = pypiper.PipelineManager( - name="hello_pypiper", - outfolfer="$HOME/hello_pypiper", - pipestat_schema="my_results_schema.yaml", + name="hello_pypiper", + outfolfer="$HOME/hello_pypiper", + pipestat_schema="pipestat_results_schema.yaml", ) ``` -3. Use `pipestat` property of the `PipelineManager` object to report/retrieve results. See usage for more details. +If `pipestat_schema` argument is not provided, by default pypiper will look for a `pipestat_results_schema.yaml` file next to the pipeline Python script. + +### Use `pipestat` property of the `PipelineManager` object to report/retrieve results. See usage for more details And in the simplest case... that's it! Pypiper *by default* will use a YAML-formated file to store the reported results in the selected `outfolder`. @@ -43,7 +46,7 @@ And in the simplest case... that's it! Pypiper *by default* will use a YAML-form Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up. -**Use a database to store reported results** +#### Use a database to store reported results* In order to establish a database connection pipestat requires few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor. 
@@ -71,17 +74,12 @@ docker run -d --name pypiper-postgres \ -v postgres-data:/var/lib/postgresql/data postgres ``` -**Highlight results** +#### Highlight results The pipestat results schema can include any number of additional attributes for results. An example of that is *results highlighting*. When a `highlight: true` attribute is included attribute under result identifier in the schema file the highlighted results can be later retrieved by pipestat clients via `PipelineManager.pipestat.highlighted_results` property, which simply returns a list of result identifiers. to be presented in a special way. -**Custom run status management** - - - - ### Usage Since a pipeline run-specific `PipestatManager` instance is attached to the `PipelineManager` object all the public pipestat API can be used. Please refer to the [pipestat API documentation](http://pipestat.databio.org/en/latest/autodoc_build/pipestat/) to read about all the currently available features. @@ -94,14 +92,16 @@ Here we present the most commonly used features: ```python results = { - "my_int_result": 10, - "my_str_result": "test" + "my_int_result": 10, + "my_str_result": "test" } pm.pipestat.report( - values=results, - strict_type=True, - force_overwrite=True) + values=results, + strict_type=True, + force_overwrite=True +) ``` + - results retrieval ```python @@ -114,9 +114,8 @@ pm.pipestat.retrieve(result_identifier="my_int_result") pm.pipestat.schema ``` - - exploration of canonical [jsonschema](https://json-schema.org/) representation of result schemas ```python pm.pipestat.result_schemas -``` \ No newline at end of file +``` From 0ed601b19f9331f9b3adcbf0886353b87bc009e0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 11 May 2021 12:47:27 -0400 Subject: [PATCH 14/25] simplify pipestat docs --- docs/pipestat.md | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/docs/pipestat.md b/docs/pipestat.md index b7539bd7..0ae85c47 100644 --- a/docs/pipestat.md +++ b/docs/pipestat.md @@ -14,9 +14,7 @@ There are a multiple advantages of using piestat instead of the current pieline ## Setup -In order to start reporting results with pipestat in your pipeline all you need to do is: - -### Define a [pipestat resuts schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) +In order to start reporting results with pipestat in your pipeline all you need to do is define a [pipestat resuts schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format): ```yaml my_int_result: @@ -26,27 +24,30 @@ my_str_result: type: string ``` -### Pass the pipestat results schema to the `PipelineManager` object constructor +And in the simplest case... that's it! Now you can use `pipestat` property of the `PipelineManager` object to report/retrieve results. -```python -pm = pypiper.PipelineManager( - name="hello_pypiper", - outfolfer="$HOME/hello_pypiper", - pipestat_schema="pipestat_results_schema.yaml", -) -``` +Pypiper *by default* will use a YAML-formated file to store the reported results in the selected `outfolder` and will look for `pipestat_results_schema.yaml` file in the pipeline Python script directory. -If `pipestat_schema` argument is not provided, by default pypiper will look for a `pipestat_results_schema.yaml` file next to the pipeline Python script. +### Advanced features -### Use `pipestat` property of the `PipelineManager` object to report/retrieve results. 
See usage for more details +Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up. -And in the simplest case... that's it! Pypiper *by default* will use a YAML-formated file to store the reported results in the selected `outfolder`. +#### Configure custom pipestat options -### Advanced features +You can configure pipestat by passing arguments with custom values to `pypiper.PipelineManager` constructor: -Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up. +```python +pm = pypiper.PipelineManager( + ..., + pipestat_schema="custom_results_schema.yaml", + pipestat_results_file="custom_results_file.yaml", + pipestat_record_id="my_record", + pipestat_namespace="my_namespace", + pipestat_config="custom_pipestat_config.yaml", +) +``` -#### Use a database to store reported results* +#### Use a database to store reported results In order to establish a database connection pipestat requires few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor. From a34845d22ab90efb8b6bc59946e90466af3888a0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 22 Jun 2021 14:23:13 -0400 Subject: [PATCH 15/25] use pipestat in ngstk --- pypiper/ngstk.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py index 53d1ad38..c64c8b8f 100755 --- a/pypiper/ngstk.py +++ b/pypiper/ngstk.py @@ -559,7 +559,7 @@ def temp_func( ] ) raw_reads = int(total_reads / n_input_files) - self.pm.report_result("Raw_reads", str(raw_reads)) + self.pm.pipestat.report(values={"Raw_reads": str(raw_reads)}) total_fastq_reads = sum( [ @@ -569,7 +569,7 @@ def temp_func( ) fastq_reads = int(total_fastq_reads / n_output_files) - self.pm.report_result("Fastq_reads", fastq_reads) + self.pm.pipestat.report(values={"Fastq_reads": fastq_reads}) input_ext = self.get_input_ext(input_files[0]) # We can only assess pass filter reads in bam files with flags. if input_ext == ".bam": @@ -577,7 +577,7 @@ def temp_func( [int(self.count_fail_reads(f, paired_end)) for f in input_files] ) pf_reads = int(raw_reads) - num_failed_filter - self.pm.report_result("PF_reads", str(pf_reads)) + self.pm.pipestat.report(values={"PF_reads": str(pf_reads)}) if fastq_reads != int(raw_reads): raise Exception( "Fastq conversion error? 
Number of input reads " @@ -615,9 +615,9 @@ def temp_func(): print("WARNING: specified paired-end but no R2 file") n_trim = float(self.count_reads(trimmed_fastq, paired_end)) - self.pm.report_result("Trimmed_reads", int(n_trim)) + self.pm.pipestat.report(values={"Trimmed_reads": int(n_trim)}) try: - rr = float(self.pm.get_stat("Raw_reads")) + rr = float(self.pm.pipestat.retrieve("Raw_reads")) except: print("Can't calculate trim loss rate without raw read result.") else: @@ -633,14 +633,28 @@ def temp_func(): self.pm.run(cmd, lock_name="trimmed_fastqc", nofail=True) fname, ext = os.path.splitext(os.path.basename(trimmed_fastq)) fastqc_html = os.path.join(fastqc_folder, fname + "_fastqc.html") - self.pm.report_object("FastQC report r1", fastqc_html) + self.pm.pipestat.report( + values={ + "FastQC_report_R1": { + "path": fastqc_html, + "title": "FastQC report R1", + } + } + ) if paired_end and trimmed_fastq_R2: cmd = self.fastqc(trimmed_fastq_R2, fastqc_folder) self.pm.run(cmd, lock_name="trimmed_fastqc_R2", nofail=True) fname, ext = os.path.splitext(os.path.basename(trimmed_fastq_R2)) fastqc_html = os.path.join(fastqc_folder, fname + "_fastqc.html") - self.pm.report_object("FastQC report r2", fastqc_html) + self.pm.pipestat.report( + values={ + "FastQC_report_R2": { + "path": fastqc_html, + "title": "FastQC report R2", + } + } + ) return temp_func From b59d07beb528327db01e2912604c5ee0a80d80c3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 24 Jun 2021 16:02:37 -0400 Subject: [PATCH 16/25] use released pipestat --- requirements/reqs-docs.txt | 3 +-- requirements/reqs-ngstk.txt | 3 ++- requirements/reqs-pypiper.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/reqs-docs.txt b/requirements/reqs-docs.txt index 9dd2e5d5..f0323dff 100644 --- a/requirements/reqs-docs.txt +++ b/requirements/reqs-docs.txt @@ -2,5 +2,4 @@ mkdocs>=1.0 markdown-include pydoc-markdown piper -https://github.com/databio/mkdocs-databio/archive/master.zip --e git+git://github.com/pepkit/pipestat@orm#egg=pipestat \ No newline at end of file +https://github.com/databio/mkdocs-databio/archive/master.zip \ No newline at end of file diff --git a/requirements/reqs-ngstk.txt b/requirements/reqs-ngstk.txt index 0c62f1a7..226df0a0 100644 --- a/requirements/reqs-ngstk.txt +++ b/requirements/reqs-ngstk.txt @@ -1,4 +1,5 @@ numpy pandas pysam -yacman \ No newline at end of file +yacman +pipestat>=0.1.0 \ No newline at end of file diff --git a/requirements/reqs-pypiper.txt b/requirements/reqs-pypiper.txt index 2fccde0f..ddc89a98 100644 --- a/requirements/reqs-pypiper.txt +++ b/requirements/reqs-pypiper.txt @@ -4,4 +4,4 @@ psutil pandas ubiquerg>=0.4.5 yacman -# pipestat>=0.1.0 +pipestat>=0.1.0 From bf3b62d8601a76aa31b35d916423edf66d13d697 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Jul 2021 14:50:34 -0400 Subject: [PATCH 17/25] source pipestat args from kwargs if not explicitly provided --- pypiper/manager.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index 0882c829..796217d2 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -109,13 +109,13 @@ class PipelineManager(object): protect from a case in which a restart begins upstream of a stage for which a checkpoint file already exists, but that depends on the upstream stage and thus should be rerun if it's "parent" is rerun. 
- :param pipestat.PipestatManager: pipestat manager object to use for - reporting pipeline results :raise TypeError: if start or stop point(s) are provided both directly and via args namespace, or if both stopping types (exclusive/prospective and inclusive/retrospective) are provided. """ + # TODO: add pipestat-related args docstrings + def __init__( self, name, @@ -323,7 +323,6 @@ def __init__( # pipesatat setup potential_namespace = getattr(self, "sample_name", self.name) - potential_pipestat_schema = default_pipestat_schema(sys.argv[0]) # don't force default pipestat_results_file value unless # pipestat config not provided @@ -331,12 +330,24 @@ def __init__( pipestat_results_file = pipeline_filepath( self, filename="pipestat_results.yaml" ) + + def _get_arg(args_dict, arg_name): + """safely get argument from arg dict -- return None if doesn't exist""" + return None if arg_name not in args_dict else args_dict[arg_name] + self._pipestat_manager = PipestatManager( - namespace=pipestat_namespace or potential_namespace, - record_identifier=pipestat_record_id or potential_namespace, - schema_path=pipestat_schema or potential_pipestat_schema, - results_file_path=pipestat_results_file, - config=pipestat_config, + namespace=pipestat_namespace + or _get_arg(args_dict, "pipestat_namespace") + or potential_namespace, + record_identifier=pipestat_record_id + or _get_arg(args_dict, "pipestat_record_id") + or potential_namespace, + schema_path=pipestat_schema + or _get_arg(args_dict, "pipestat_schema") + or default_pipestat_schema(sys.argv[0]), + results_file_path=pipestat_results_file + or _get_arg(args_dict, "pipestat_results_file"), + config=pipestat_config or _get_arg(args_dict, "pipestat_config"), ) self.start_pipeline(args, multi) From ca3f515c22de1747b59feeafe291b05e9841fc67 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 8 Jul 2021 15:09:10 -0400 Subject: [PATCH 18/25] update dev version --- docs/changelog.md | 6 ++++++ pypiper/_version.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 8cd3678b..ccb77805 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,11 @@ # Changelog + +## [0.12.2] -- unreleased +### Added + +- [pipestat](http://pipestat.databio.org/en/latest/) support + ## [0.12.1] -- 2019-08-29 ### Fixed diff --git a/pypiper/_version.py b/pypiper/_version.py index def467e0..f3f4ce23 100644 --- a/pypiper/_version.py +++ b/pypiper/_version.py @@ -1 +1 @@ -__version__ = "0.12.1" +__version__ = "0.12.2-dev" From ee8cba54f94a15f3759408bebc2c51c178211fc7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 28 Jul 2021 13:44:47 -0400 Subject: [PATCH 19/25] add default_return_code might be used to discriminate between runs that did not execute any commands and runs that did --- pypiper/manager.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index 796217d2..843edd7c 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -802,6 +802,7 @@ def run( clean=False, follow=None, container=None, + default_return_code=0, ): """ The primary workhorse function of PipelineManager, this runs a command. @@ -832,6 +833,8 @@ def run( to an auto cleanup list. Optional. :param callable follow: Function to call after executing (each) command. :param str container: Name for Docker container in which to run commands. 
+ :param Any default_return_code: Return code to use, might be used to discriminate + between runs that did not execute any commands and runs that did. :return int: Return code of process. If a list of commands is passed, this is the maximum of all return codes for all commands. """ @@ -845,7 +848,7 @@ def run( len(cmds), "\n".join(cmds_text) ) ) - return 0 + return default_return_code # Short-circuit if the checkpoint file exists and the manager's not # been configured to overwrite such files. @@ -859,7 +862,7 @@ def run( self.curr_checkpoint, check_fpath, self.__class__.__name__, cmd ) ) - return 0 + return default_return_code # TODO: consider making the logic such that locking isn't implied, or # TODO (cont.): that we can make it otherwise such that it's not @@ -891,7 +894,7 @@ def run( lock_name = lock_name or make_lock_name(target, self.outfolder) lock_files = [self._make_lock_path(ln) for ln in lock_name] - process_return_code = 0 + process_return_code = default_return_code local_maxmem = 0 # Decide how to do follow-up. From 50e91b6e23ce62f67525fe525bb203f7c5ad6701 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 28 Jul 2021 14:42:34 -0400 Subject: [PATCH 20/25] handle None return codes in int comparisons --- pypiper/manager.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pypiper/manager.py b/pypiper/manager.py index 843edd7c..e7ce81c9 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -839,6 +839,25 @@ def run( this is the maximum of all return codes for all commands. """ + def _max_ret_code(codes_list): + """ + Return the maximum of a list of return codes. + + :param list[int] code: List of return codes to compare. + :return int: Maximum of list. + """ + # filter out codes that are None + codes_list = [code for code in codes_list if code is not None] + # get the max of the remaining codes + if codes_list: + return max(codes_list) + # if no codes are left, return None + return + + # validate default return code + if default_return_code is not None and not isinstance(default_return_code, int): + raise TypeError("default_return_code must be an int or None") + # If the pipeline's not been started, skip ahead. 
if not self._active: cmds = [cmd] if isinstance(cmd, str) else cmd @@ -1041,16 +1060,18 @@ def call_follow(): maxmem = max(maxmem) if isinstance(maxmem, Iterable) else maxmem local_maxmem = max(local_maxmem, maxmem) list_ret = ( - max(list_ret) if isinstance(list_ret, Iterable) else list_ret + _max_ret_code(list_ret) + if isinstance(list_ret, Iterable) + else list_ret ) - process_return_code = max(process_return_code, list_ret) + process_return_code = _max_ret_code([process_return_code, list_ret]) else: # Single command (most common) process_return_code, local_maxmem = self.callprint( cmd, shell, lock_file, nofail, container ) # Run command if isinstance(process_return_code, list): - process_return_code = max(process_return_code) + process_return_code = _max_ret_code(process_return_code) # For temporary files, you can specify a clean option to automatically # add them to the clean list, saving you a manual call to clean_add From e0b10ca14bb0c19f369892042fa965572a3d95fe Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 20 Dec 2021 20:23:58 -0500 Subject: [PATCH 21/25] add github workflows --- .github/workflows/black.yml | 11 +++++++++ .github/workflows/python-publish.yml | 30 ++++++++++++++++++++++++ .github/workflows/run-pytest.yml | 35 ++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 .github/workflows/black.yml create mode 100644 .github/workflows/python-publish.yml create mode 100644 .github/workflows/run-pytest.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 00000000..63e18519 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: psf/black@20.8b1 diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..48c52e13 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,30 @@ +# This workflows will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml new file mode 100644 index 00000000..d75b1cf1 --- /dev/null +++ b/.github/workflows/run-pytest.yml @@ -0,0 +1,35 @@ +name: Run pytests + +on: + push: + branches: [master, dev] + pull_request: + branches: [master, dev] + +jobs: + pytest: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dev dependancies + run: if [ -f 
requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi
+
+      - name: Install test dependancies
+        run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi
+
+      - name: Install package
+        run: python -m pip install .
+
+      - name: Run pytest tests
+        run: pytest tests -x -vv --remote-data

From f5f3a14525128359c2a709290736b2cf4368d395 Mon Sep 17 00:00:00 2001
From: jpsmith5
Date: Tue, 25 Jan 2022 15:27:55 -0500
Subject: [PATCH 22/25] update python3 required changes to filter iterators to list and dict keys to list

---
 pypiper/ngstk.py | 6 +++---
 pypiper/utils.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py
index c64c8b8f..5996b1fa 100755
--- a/pypiper/ngstk.py
+++ b/pypiper/ngstk.py
@@ -312,7 +312,7 @@ class of inputs (which can in turn be a string or a list).
         if any(isinstance(i, list) for i in input_args):
             # We have a list of lists. Process each individually.
             local_input_files = list()
-            n_input_files = len(filter(bool, input_args))
+            n_input_files = len(list(filter(bool, input_args)))
             print("Number of input file sets: " + str(n_input_files))

             for input_i, input_arg in enumerate(input_args):
@@ -549,8 +549,8 @@ def temp_func(
         if type(output_files) != list:
             output_files = [output_files]

-        n_input_files = len(filter(bool, input_files))
-        n_output_files = len(filter(bool, output_files))
+        n_input_files = len(list(filter(bool, input_files)))
+        n_output_files = len(list(filter(bool, output_files)))

         total_reads = sum(
             [
diff --git a/pypiper/utils.py b/pypiper/utils.py
index 932252da..93467cc9 100644
--- a/pypiper/utils.py
+++ b/pypiper/utils.py
@@ -864,14 +864,14 @@ def _determine_args(argument_groups, arguments, use_all_args=False):
     # Define the argument groups.
     args_by_group = {
         "pypiper": ["recover", "new-start", "dirty", "force-follow", "testmode"]
-        + LOGGING_CLI_OPTDATA.keys(),
+        + [*LOGGING_CLI_OPTDATA],
         "config": ["config"],
         "checkpoint": ["stop-before", "stop-after"],
         "resource": ["mem", "cores"],
         "looper": ["config", "output-parent", "mem", "cores", "pipeline-name"],
         "common": ["input", "sample-name"],
         "ngs": ["sample-name", "input", "input2", "genome", "single-or-paired"],
-        "logmuse": LOGGING_CLI_OPTDATA.keys(),
+        "logmuse": [*LOGGING_CLI_OPTDATA],
         "pipestat": [
             "pipestat-namespace",
             "pipestat-record-id",

From 336cc5e50d81c6b03802232962cc8c0d35eb0a02 Mon Sep 17 00:00:00 2001
From: nsheff
Date: Tue, 25 Jan 2022 17:19:54 -0500
Subject: [PATCH 23/25] fix reqs merge

---
 requirements/requirements-test.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 8407ee52..f4b7458c 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,15 +1,7 @@
 mock==2.0.0
-<<<<<<< HEAD:requirements/reqs-test.txt
 pytest>=4.6.9
 pytest-cov>=2.8.1
 hypothesis==4.38.0
 coveralls
 veracitools
-=======
-pytest>=4.2.1
-hypothesis
-coveralls>=1.1
-pytest-cov==2.6.1
-veracitools
 pytest-remotedata
->>>>>>> master:requirements/requirements-test.txt

From 948a003bcd67528701ddb925dddabb160bab4658 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Thu, 29 Jun 2023 15:43:16 -0400
Subject: [PATCH 24/25] Pipestat addition (#188)

* Work towards pypiper utilizing pipestat changes.

Related: https://github.com/pepkit/pipestat/issues/21

* Refactor to align with pipestat refactoring.
Create default output schema to satisfy pipestat requirements. * Call pipestat backend for setting status. Set status for pipeline.manager. * Pytest fixes. * Pytest fixes, revert old test. * Move make_sure_path_exists before creating PipestatManager Object * Ensure pypiper is calling pipestat interface and NOT the backend. * Removed redundant default output schema function and refactored. * Add clear_status to end of unit test. * Implement pipestat.report for reporting results and objects with and without annotations. * Fix unit test, reporting results, and refresh_stats. #187 * lint * remove report_object and simplify report_result * Add report_object back for backwards compatibility. * Add ability to pass result_format to pipestat and return formatted string. Added annotation for backwards compatibility. * Change to passing formatting function to pipestat instead of format flag. * Added default mark down formatter to pass during pipestatmanager creation. Fixed sample_name and pipeline_name mix up for pipeline manager. * Add passing multi flag to pipestatmanager during creation. * Add printing pipestat arguments to log file. * Polish output of pipestat object after initialization * Update docs * Update requirements * fix _failed property return * update changelog --- README.md | 2 +- docs/changelog.md | 2 +- docs/conf.py | 24 +- docs/outputs.md | 2 +- docs/pipestat.md | 8 +- docs/report.md | 2 + example_pipelines/logmuse_example.py | 1 - pypiper/const.py | 1 + pypiper/manager.py | 290 ++++++++---------- pypiper/ngstk.py | 6 +- pypiper/pipeline.py | 7 +- pypiper/utils.py | 15 +- requirements/requirements-docs.txt | 1 + requirements/requirements-pypiper.txt | 2 +- setup.py | 6 +- .../Data/default_pipestat_output_schema.yaml | 9 + tests/Data/sample_output_schema.yaml | 24 ++ .../pipeline_manager/test_pipeline_manager.py | 33 +- .../pipeline_manager/test_set_status_flag.py | 11 +- 19 files changed, 237 insertions(+), 209 deletions(-) create mode 100644 tests/Data/default_pipestat_output_schema.yaml create mode 100644 tests/Data/sample_output_schema.yaml diff --git a/README.md b/README.md index f17aa958..29f13efe 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,6 @@ # Pypiper [![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest) -[![Build Status](https://travis-ci.org/databio/pypiper.svg?branch=master)](https://travis-ci.org/databio/pypiper) +[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg?branch=dev)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml?branch=dev) A lightweight python toolkit for gluing together restartable, robust shell pipelines. Learn more in the [documentation](http://pypiper.databio.org). diff --git a/docs/changelog.md b/docs/changelog.md index 28c77770..34a500c9 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,6 @@ # Changelog -## [0.13.0] -- unreleased +## [0.13.0] -- 2023-06-29 ### Added - [pipestat](http://pipestat.databio.org/en/latest/) support diff --git a/docs/conf.py b/docs/conf.py index f796b3df..0566b0b9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,8 +51,8 @@ master_doc = "index" # General information about the project. 
-project = u"pypiper" -copyright = u"2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +project = "pypiper" +copyright = "2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -215,8 +215,8 @@ ( "index", "pypiper.tex", - u"pypiper Documentation", - u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + "pypiper Documentation", + "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", "manual", ), ] @@ -250,8 +250,8 @@ ( "index", "pypiper", - u"pypiper Documentation", - [u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"], + "pypiper Documentation", + ["Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"], 1, ) ] @@ -269,8 +269,8 @@ ( "index", "pypiper", - u"pypiper Documentation", - u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", + "pypiper Documentation", + "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro", "pypiper", "One line description of project.", "Miscellaneous", @@ -293,10 +293,10 @@ # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. -epub_title = u"pypiper" -epub_author = u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" -epub_publisher = u"Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" -epub_copyright = u"2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_title = "pypiper" +epub_author = "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_publisher = "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" +epub_copyright = "2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro" # The basename for the epub file. It defaults to the project name. # epub_basename = u'pypiper' diff --git a/docs/outputs.md b/docs/outputs.md index 30ad8b5e..786086b1 100644 --- a/docs/outputs.md +++ b/docs/outputs.md @@ -9,7 +9,7 @@ Assume you are using a pypiper pipeline named `PIPE` ( it passes `name="PIPE"` t * **PIPE_status.flag** As the pipeline runs, it produces a flag in the output directory, which can be either `PIPE_running.flag`, `PIPE_failed.flag`, or `PIPE_completed.flag`. These flags make it easy to assess the current state of running pipelines for individual samples, and for many samples in a project simultaneously. -* **stats.tsv** +* **stats.yaml** Any results reported by the pipeline are saved as key-value pairs in this file, for easy parsing. * **PIPE_profile.md** diff --git a/docs/pipestat.md b/docs/pipestat.md index 0ae85c47..77534f09 100644 --- a/docs/pipestat.md +++ b/docs/pipestat.md @@ -5,10 +5,10 @@ You can browse the pipestat documentation to learn more about it, but briefly pi ## Advancements -There are a multiple advantages of using piestat instead of the current pieline results reporiting system: +There are a multiple advantages of using pipestat instead of the current pipeline results reporting system: 1. **Database results storage:** the results can be stored either in a database or a YAML-formatted results file. This way a pypiper pipeline running in an emphemeral compute environment can report the results to the database and exit. No need to sync the results with a central results storage. -2. 
**Strict and clear results definition:** all the results that can be reported by a pipeline run *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) that in a simplest case just indicates the result's type. This presents piepstat clients with the possibility to *reliably* gather all the possible results and related metadata. +2. **Strict and clear results definition:** all the results that can be reported by a pipeline run *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) that in a simplest case just indicates the result's type. This presents pipestat clients with the possibility to *reliably* gather all the possible results and related metadata. 3. **On-the-fly results validation:** the schema is used to validate and/or convert the reported result to a strictly determined type, which makes the connection of pypiper with downstream pipeline results processing software seamless. 4. **Unified, pipeline-agnostic results interface:** other pipelines, possibly created with different pipeline frameworks, can read and write results via Python API or command line interface. This feature significantly incerases your pipeline interoperability. @@ -41,8 +41,8 @@ pm = pypiper.PipelineManager( ..., pipestat_schema="custom_results_schema.yaml", pipestat_results_file="custom_results_file.yaml", - pipestat_record_id="my_record", - pipestat_namespace="my_namespace", + pipestat_sample_name="my_record", + pipestat_project_name="my_namespace", pipestat_config="custom_pipestat_config.yaml", ) ``` diff --git a/docs/report.md b/docs/report.md index 7e8e5f46..fd50c270 100644 --- a/docs/report.md +++ b/docs/report.md @@ -6,6 +6,8 @@ When you call `pm.report_result(key, value)`, pypiper simply writes the key-valu ## Reporting objects +**Note**: Reporting objects will be deprecated in a future release. It is recommended to use `report_result`. + Starting in version 0.8, pypiper now implements a second reporting function, `report_object`. This is analogous to the `report_result` function, but instead of reporting simple key-value pairs, it lets you record any produced file as an output. Most commonly, this is used to record figures (PDFs, PNGs, etc.) produced by the pipeline. It can also be used to report other files, like HTML files. Pypiper writes results to `objects.tsv`, which can then be aggregated for project-level summaries of plots and other pipeline result files. diff --git a/example_pipelines/logmuse_example.py b/example_pipelines/logmuse_example.py index 61d8cc97..3b98b6df 100755 --- a/example_pipelines/logmuse_example.py +++ b/example_pipelines/logmuse_example.py @@ -21,7 +21,6 @@ def build_argparser(): - parser = ArgumentParser( description="A pipeline to count the number of reads and file size. Accepts" " BAM, fastq, or fastq.gz files." 
diff --git a/pypiper/const.py b/pypiper/const.py index 0159ddac..27495297 100644 --- a/pypiper/const.py +++ b/pypiper/const.py @@ -2,6 +2,7 @@ CHECKPOINT_EXTENSION = ".checkpoint" +DEFAULT_SAMPLE_NAME = "DEFAULT_SAMPLE_NAME" PIPELINE_CHECKPOINT_DELIMITER = "_" STAGE_NAME_SPACE_REPLACEMENT = "-" PROFILE_COLNAMES = ["pid", "hash", "cid", "runtime", "mem", "cmd", "lock"] diff --git a/pypiper/manager.py b/pypiper/manager.py index 48bcf8e7..9d32100b 100644 --- a/pypiper/manager.py +++ b/pypiper/manager.py @@ -33,7 +33,7 @@ from yacman import load_yaml from ._version import __version__ -from .const import PROFILE_COLNAMES +from .const import PROFILE_COLNAMES, DEFAULT_SAMPLE_NAME from .exceptions import PipelineHalt, SubprocessError from .flags import * from .utils import ( @@ -49,8 +49,10 @@ make_lock_name, parse_cmd, pipeline_filepath, - default_pipestat_schema, + default_pipestat_output_schema, + result_formatter_markdown, ) +from pipestat.helpers import read_yaml_data __all__ = ["PipelineManager"] @@ -134,14 +136,14 @@ def __init__( output_parent=None, overwrite_checkpoints=False, logger_kwargs=None, - pipestat_namespace=None, - pipestat_record_id=None, + pipestat_project_name=None, + pipestat_sample_name=None, pipestat_schema=None, pipestat_results_file=None, pipestat_config=None, + pipestat_result_formatter=None, **kwargs, ): - # Params defines the set of options that could be updated via # command line args to a pipeline run, that can be forwarded # to Pypiper. If any pypiper arguments are passed @@ -274,14 +276,13 @@ def __init__( # File paths: self.outfolder = os.path.join(outfolder, "") # trailing slash + self.make_sure_path_exists(self.outfolder) self.pipeline_log_file = pipeline_filepath(self, suffix="_log.md") self.pipeline_profile_file = pipeline_filepath(self, suffix="_profile.tsv") # Stats and figures are general and so lack the pipeline name. - self.pipeline_stats_file = pipeline_filepath(self, filename="stats.tsv") - self.pipeline_figures_file = pipeline_filepath(self, filename="figures.tsv") - self.pipeline_objects_file = pipeline_filepath(self, filename="objects.tsv") + self.pipeline_stats_file = pipeline_filepath(self, filename="stats.yaml") # Record commands used and provide manual cleanup script. 
self.pipeline_commands_file = pipeline_filepath(self, suffix="_commands.sh") @@ -307,6 +308,11 @@ def __init__( # In-memory holder for report_result self.stats_dict = {} + # Result formatter to pass to pipestat + self.pipestat_result_formatter = ( + pipestat_result_formatter or result_formatter_markdown + ) + # Checkpoint-related parameters self.overwrite_checkpoints = overwrite_checkpoints self.halt_on_next = False @@ -322,8 +328,9 @@ def __init__( signal.signal(signal.SIGINT, self._signal_int_handler) signal.signal(signal.SIGTERM, self._signal_term_handler) - # pipesatat setup - potential_namespace = getattr(self, "sample_name", self.name) + # pipestat setup + self.pipestat_sample_name = pipestat_sample_name or DEFAULT_SAMPLE_NAME + # getattr(self, "sample_name", DEFAULT_SAMPLE_NAME) # don't force default pipestat_results_file value unless # pipestat config not provided @@ -337,19 +344,19 @@ def _get_arg(args_dict, arg_name): return None if arg_name not in args_dict else args_dict[arg_name] self._pipestat_manager = PipestatManager( - namespace=pipestat_namespace - or _get_arg(args_dict, "pipestat_namespace") - or potential_namespace, - record_identifier=pipestat_record_id - or _get_arg(args_dict, "pipestat_record_id") - or potential_namespace, + sample_name=self.pipestat_sample_name + or _get_arg(args_dict, "pipestat_sample_name") + or DEFAULT_SAMPLE_NAME, + pipeline_name=self.name, schema_path=pipestat_schema or _get_arg(args_dict, "pipestat_schema") - or default_pipestat_schema(sys.argv[0]), - results_file_path=pipestat_results_file + or default_pipestat_output_schema(sys.argv[0]), + results_file_path=self.pipeline_stats_file or _get_arg(args_dict, "pipestat_results_file"), - config=pipestat_config or _get_arg(args_dict, "pipestat_config"), + config_file=pipestat_config or _get_arg(args_dict, "pipestat_config"), + multi_pipelines=multi, ) + self.start_pipeline(args, multi) # Handle config file if it exists @@ -429,8 +436,10 @@ def _completed(self): :return bool: Whether the managed pipeline is in a completed state. """ - return self.pipestat.get_status() == COMPLETE_FLAG - # return self.status == COMPLETE_FLAG + return ( + self.pipestat.get_status(self._pipestat_manager.sample_name) + == COMPLETE_FLAG + ) @property def _failed(self): @@ -439,18 +448,17 @@ def _failed(self): :return bool: Whether the managed pipeline is in a failed state. """ - self.pipestat.get_status() == FAIL_FLAG - # return self.status == FAIL_FLAG + return self.pipestat.get_status(self._pipestat_manager.sample_name) == FAIL_FLAG @property def halted(self): """ Is the managed pipeline in a paused/halted state? - :return bool: Whether the managed pipeline is in a paused/halted state. """ - self.pipestat.get_status() == PAUSE_FLAG - # return self.status == PAUSE_FLAG + return ( + self.pipestat.get_status(self._pipestat_manager.sample_name) == PAUSE_FLAG + ) @property def _has_exit_status(self): @@ -462,27 +470,6 @@ def _has_exit_status(self): """ return self._completed or self.halted or self._failed - # def setup_default_pipestat(self, schema_path): - # """ - # A convenience method for ad hoc PipestatManager instantiation. - # - # Requires only a pipestat-like schema to get a functional PipestatManager - # for reporting to a YAML-formatted file. 
- # - # :param str schema_path: path to the pipestat-like schema - # """ - # if self.pipestat is not None: - # raise PipestatError( - # f"{PipestatManager.__name__} is already " - # f"initialized:\n{str(self.pipestat)}" - # ) - # self._pipestat_manager = PipestatManager( - # schema_path=schema_path, - # namespace=self.name, - # record_identifier=self.name, - # results_file_path=pipeline_filepath(self, suffix="_results_pipestat.yaml"), - # ) - def _ignore_interrupts(self): """ Ignore interrupt and termination signals. Used as a pre-execution @@ -498,7 +485,7 @@ def start_pipeline(self, args=None, multi=False): You provide only the output directory (used for pipeline stats, log, and status flag files). """ # Perhaps this could all just be put into __init__, but I just kind of like the idea of a start function - self.make_sure_path_exists(self.outfolder) + # self.make_sure_path_exists(self.outfolder) # By default, Pypiper will mirror every operation so it is displayed both # on sys.stdout **and** to a log file. Unfortunately, interactive python sessions @@ -728,9 +715,17 @@ def start_pipeline(self, args=None, multi=False): argtext = "`{}`".format(arg) valtext = "`{}`".format(val) self.info("* {}: {}".format(argtext.rjust(20), valtext)) + + self.info("\n### Initialized Pipestat Object:\n") + results = self._pipestat_manager.__str__().split("\n") + for i in results: + self.info("* " + i) + self.info("* Sample name: " + self.pipestat_sample_name + "\n") self.info("\n----------------------------------------\n") - # self._set_status_flag(RUN_FLAG) - self.pipestat.set_status(status_identifier="running") + self.status = "running" + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, status_identifier="running" + ) # Record the start in PIPE_profile and PIPE_commands output files so we # can trace which run they belong to @@ -774,7 +769,9 @@ def _set_status_flag(self, status): # Set new status. prev_status = self.status self.status = status - self._create_file(self._flag_file_path()) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, status_identifier=status + ) self.debug("\nChanged status from {} to {}.".format(prev_status, self.status)) def _flag_file_path(self, status=None): @@ -787,7 +784,12 @@ def _flag_file_path(self, status=None): :param str status: flag file type to create, default to current status :return str: path to flag file of indicated or current status. """ - flag_file_name = "{}_{}".format(self.name, flag_name(status or self.status)) + + flag_file_name = "{}_{}_{}".format( + self._pipestat_manager["_pipeline_name"], + self.pipestat_sample_name, + flag_name(status or self.status), + ) return pipeline_filepath(self, filename=flag_file_name) ################################### @@ -1416,7 +1418,10 @@ def _wait_for_lock(self, lock_file): "this step should be restarted." 
) # self._set_status_flag(WAIT_FLAG) - self.pipestat.set_status(status_identifier="waiting") + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, + status_identifier="waiting", + ) first_message_flag = True else: sys.stdout.write(".") @@ -1437,7 +1442,10 @@ def _wait_for_lock(self, lock_file): if first_message_flag: self.timestamp("File unlocked.") # self._set_status_flag(RUN_FLAG) - self.pipestat.set_status(status_identifier="running") + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, + status_identifier="running", + ) ################################### # Logging functions @@ -1574,48 +1582,49 @@ def _report_profile( with open(self.pipeline_profile_file, "a") as myfile: myfile.write(message_raw + "\n") - def report_result(self, key, value, annotation=None, nolog=False): + def report_result(self, key, value, nolog=False, result_formatter=None): """ - Writes a string to self.pipeline_stats_file. + Writes a key:value pair to self.pipeline_stats_file. :param str key: name (key) of the stat - :param str annotation: By default, the stats will be annotated with the - pipeline name, so you can tell which pipeline records which stats. - If you want, you can change this; use annotation='shared' if you - need the stat to be used by another pipeline (using get_stat()). + :param dict value: value of the stat to report. :param bool nolog: Turn on this flag to NOT print this result in the logfile. Use sparingly in case you will be printing the result in a different format. - """ - # Default annotation is current pipeline name. - annotation = str(annotation or self.name) - - # In case the value is passed with trailing whitespace. - value = str(value).strip() + :param str result_formatter: function for formatting via pipestat backend + :return str reported_result: the reported result is returned as a list of formatted strings. + """ # keep the value in memory: self.stats_dict[key] = value - message_raw = "{key}\t{value}\t{annotation}".format( - key=key, value=value, annotation=annotation - ) - message_markdown = "\n> `{key}`\t{value}\t{annotation}\t_RES_".format( - key=key, value=value, annotation=annotation + rf = result_formatter or self.pipestat_result_formatter + + reported_result = self.pipestat.report( + values={key: value}, + sample_name=self.pipestat_sample_name, + result_formatter=rf, ) if not nolog: - self.info(message_markdown) + for r in reported_result: + self.info(r) - # Just to be extra careful, let's lock the file while we we write - # in case multiple pipelines write to the same file. - self._safe_write_to_file(self.pipeline_stats_file, message_raw) + return reported_result def report_object( - self, key, filename, anchor_text=None, anchor_image=None, annotation=None + self, + key, + filename, + anchor_text=None, + anchor_image=None, + annotation=None, + nolog=False, + result_formatter=None, ): """ - Writes a string to self.pipeline_objects_file. Used to report figures - and others. + Writes a key:value pair to self.pipeline_stats_file. Note: this function + will be deprecated. Using report_result is recommended. :param str key: name (key) of the object :param str filename: relative path to the file (relative to parent @@ -1628,18 +1637,26 @@ def report_object( :param str annotation: By default, the figures will be annotated with the pipeline name, so you can tell which pipeline records which figures. If you want, you can change this. + :param bool nolog: Turn on this flag to NOT print this result in the + logfile. 
Use sparingly in case you will be printing the result in a + different format. + :param str result_formatter: function for formatting via pipestat backend + :return str reported_result: the reported result is returned as a list of formatted strings. """ - + warnings.warn( + "This function may be removed in future release. " + "The recommended way to report pipeline results is using PipelineManager.pipestat.report().", + category=DeprecationWarning, + ) + rf = result_formatter or self.pipestat_result_formatter # Default annotation is current pipeline name. annotation = str(annotation or self.name) - # In case the value is passed with trailing whitespace. filename = str(filename).strip() if anchor_text: anchor_text = str(anchor_text).strip() else: anchor_text = str(key).strip() - # better to use a relative path in this file # convert any absolute paths into relative paths relative_filename = ( @@ -1657,62 +1674,22 @@ def report_object( else: relative_anchor_image = "None" - message_raw = ( - "{key}\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format( - key=key, - filename=relative_filename, - anchor_text=anchor_text, - anchor_image=relative_anchor_image, - annotation=annotation, - ) - ) - - message_markdown = "> `{key}`\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}\t_OBJ_".format( - key=key, + message_raw = "{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format( filename=relative_filename, anchor_text=anchor_text, anchor_image=relative_anchor_image, annotation=annotation, ) - self.warning(message_markdown) - - self._safe_write_to_file(self.pipeline_objects_file, message_raw) + val = {key: message_raw.replace("\t", " ")} - def _safe_write_to_file(self, file, message): - """ - Writes a string to a file safely (with file locks). - """ - warnings.warn( - "This function may be removed in future release. " - "The recommended way to report pipeline results is using PipelineManager.pipestat.report().", - category=DeprecationWarning, + reported_result = self.pipestat.report( + values=val, sample_name=self.pipestat_sample_name, result_formatter=rf ) - target = file - lock_name = make_lock_name(target, self.outfolder) - lock_file = self._make_lock_path(lock_name) - - while True: - if os.path.isfile(lock_file): - self._wait_for_lock(lock_file) - else: - try: - self.locks.append(lock_file) - self._create_file_racefree(lock_file) - except OSError as e: - if e.errno == errno.EEXIST: - self.warning("Lock file created after test! Looping again.") - continue # Go back to start - - # Proceed with file writing - with open(file, "a") as myfile: - myfile.write(message + "\n") - - os.remove(lock_file) - self.locks.remove(lock_file) - - # If you make it to the end of the while loop, you're done - break + if not nolog: + for r in reported_result: + self.info(r) + return reported_result def _report_command(self, cmd, procs=None): """ @@ -1831,35 +1808,21 @@ def make_sure_path_exists(path): def _refresh_stats(self): """ - Loads up the stats sheet created for this pipeline run and reads + Loads up the stats yaml created for this pipeline run and reads those stats into memory """ - # regex identifies all possible stats files. 
- # regex = self.outfolder + "*_stats.tsv" - # stats_files = glob.glob(regex) - # stats_files.insert(self.pipeline_stats_file) # last one is the current pipeline - # for stats_file in stats_files: - - stats_file = self.pipeline_stats_file if os.path.isfile(self.pipeline_stats_file): - with open(stats_file, "r") as stat_file: - for line in stat_file: - try: - # Someone may have put something that's not 3 columns in the stats file - # if so, shame on him, but we can just ignore it. - key, value, annotation = line.split("\t") - except ValueError: - self.warning( - "WARNING: Each row in a stats file is expected to have 3 columns" - ) - - if ( - annotation.rstrip() == self.name - or annotation.rstrip() == "shared" - ): - self.stats_dict[key] = value.strip() - # if os.path.isfile(self.pipeline_stats_file): + _, data = read_yaml_data(path=self.pipeline_stats_file, what="stats_file") + print(data) + pipeline_key = list( + data[self.pipestat["_pipeline_name"]][self.pipestat["_pipeline_type"]] + )[0] + if self.name == pipeline_key: + for key, value in data[self.pipestat["_pipeline_name"]][ + self.pipestat["_pipeline_type"] + ][pipeline_key].items(): + self.stats_dict[key] = value.strip() def get_stat(self, key): """ @@ -1868,7 +1831,7 @@ def get_stat(self, key): if you first use report_result to report (number of trimmed reads), and then in a later stage want to report alignment rate, then this second stat (alignment rate) will require knowing the first stat (number of trimmed reads); however, that may not have been calculated in the current - pipeline run, so we must retrieve it from the stats.tsv output file. This command will retrieve + pipeline run, so we must retrieve it from the stats.yaml output file. This command will retrieve such previously reported stats if they were not already calculated in the current pipeline run. :param key: key of stat to retrieve """ @@ -2027,7 +1990,10 @@ def fail_pipeline(self, exc, dynamic_recover=False): self.info("Total time: " + str(total_time)) self.info("Failure reason: " + str(exc)) # self._set_status_flag(FAIL_FLAG) - self.pipestat.set_status(status_identifier="failed") + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, + status_identifier="failed", + ) if isinstance(exc, str): exc = RuntimeError(exc) @@ -2085,7 +2051,9 @@ def stop_pipeline(self, status=COMPLETE_FLAG): some time and memory statistics to the log file. """ # self._set_status_flag(status) - self.pipestat.set_status(status_identifier=status) + self.pipestat.set_status( + sample_name=self._pipestat_manager.sample_name, status_identifier=status + ) self._cleanup() elapsed_time_this_run = str( datetime.timedelta(seconds=self.time_elapsed(self.starttime)) @@ -2191,7 +2159,6 @@ def _exit_handler(self): self.tee.kill() def _terminate_running_subprocesses(self): - # make a copy of the list to iterate over since we'll be removing items for pid in self.running_procs.copy(): proc_dict = self.running_procs[pid] @@ -2454,7 +2421,12 @@ def _cleanup(self, dry_run=False): fn for fn in glob.glob(self.outfolder + flag_name("*")) if COMPLETE_FLAG not in os.path.basename(fn) - and not "{}_{}".format(self.name, run_flag) == os.path.basename(fn) + and not "{}_{}_{}".format( + self._pipestat_manager["_pipeline_name"], + self.pipestat_sample_name, + run_flag, + ) + == os.path.basename(fn) ] if len(flag_files) == 0 and not dry_run: self.info("\nCleaning up conditional list. . 
.") diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py index d4f13027..329b321b 100755 --- a/pypiper/ngstk.py +++ b/pypiper/ngstk.py @@ -154,7 +154,7 @@ def get_file_size(self, filenames): return round( sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) - / (1024 ** 2), + / (1024**2), 4, ) @@ -543,7 +543,6 @@ def check_fastq(self, input_files, output_files, paired_end): def temp_func( input_files=input_files, output_files=output_files, paired_end=paired_end ): - if type(input_files) != list: input_files = [input_files] if type(output_files) != list: @@ -608,7 +607,6 @@ def check_trim( """ def temp_func(): - print("Evaluating read trimming") if paired_end and not trimmed_fastq_R2: @@ -1217,7 +1215,6 @@ def trimmomatic( output_fastq2=None, output_fastq2_unpaired=None, ): - PE = False if input_fastq2 is None else True pe = "PE" if PE else "SE" cmd = self.tools.java + " -Xmx" + self.pm.javamem @@ -1982,7 +1979,6 @@ def homer_annotate_pPeaks(self, peak_file, genome, motif_file, output_bed): def center_peaks_on_motifs( self, peak_file, genome, window_width, motif_file, output_bed ): - cmd = "annotatePeaks.pl {0} {1} -size {2} -center {3} |".format( peak_file, genome, window_width, motif_file ) diff --git a/pypiper/pipeline.py b/pypiper/pipeline.py index 5cc883cd..88c61734 100644 --- a/pypiper/pipeline.py +++ b/pypiper/pipeline.py @@ -56,7 +56,6 @@ class Pipeline(object): def __init__( self, name=None, manager=None, outfolder=None, args=None, **pl_mgr_kwargs ): - super(Pipeline, self).__init__() try: self.name = name or manager.name @@ -141,7 +140,6 @@ def __init__( self._stages = [] for name, stage in name_stage_pairs: - # Use external translator to further confound redefinition. internal_name = translate_stage_name(name) @@ -316,7 +314,6 @@ def run(self, start_point=None, stop_before=None, stop_after=None): skip_mode = True for stage in self._stages[start_index:stop_index]: - # TODO: Note that there's no way to tell whether a non-checkpointed # TODO (cont.) Stage has been completed, and thus this seek # TODO (cont.) operation will find the first Stage, starting @@ -408,9 +405,7 @@ def _is_unordered(collection): illogical to investigate whether it's ordered. """ if not isinstance(collection, Iterable): - raise TypeError("Non-iterable alleged collection: {}". - format(type(collection))) - + raise TypeError("Non-iterable alleged collection: {}".format(type(collection))) return isinstance(collection, set) or isinstance(collection, dict) diff --git a/pypiper/utils.py b/pypiper/utils.py index ae469a4a..2c5ac753 100644 --- a/pypiper/utils.py +++ b/pypiper/utils.py @@ -39,6 +39,7 @@ "get_first_value", "head", "logger_via_cli", + "result_formatter_markdown", ] @@ -924,7 +925,7 @@ def default_pipeline_config(pipeline_filepath): return os.path.splitext(os.path.basename(pipeline_filepath))[0] + ".yaml" -def default_pipestat_schema(pipeline_filepath): +def default_pipestat_output_schema(pipeline_filepath): """ Determine the default filepath for a pipeline's pipestat output schema. @@ -932,7 +933,7 @@ def default_pipestat_schema(pipeline_filepath): :return str: default filepath for a pipeline's pipestat output schema. 
""" pipestat_results_schema = os.path.join( - os.path.dirname(pipeline_filepath), "pipestat_results_schema.yaml" + os.path.dirname(pipeline_filepath), "pipestat_output_schema.yaml" ) print(f"Using default schema: {pipestat_results_schema}") return pipestat_results_schema if os.path.exists(pipestat_results_schema) else None @@ -1107,3 +1108,13 @@ def _add_args(parser, args, required): parser.add_argument(*opts, **argdata) return parser + + +def result_formatter_markdown(pipeline_name, sample_name, res_id, value) -> str: + """ + Returns Markdown formatted value as string + """ + + message_markdown = "\n> `{key}`\t{value}\t_RES_".format(key=res_id, value=value) + + return message_markdown diff --git a/requirements/requirements-docs.txt b/requirements/requirements-docs.txt index f0323dff..4471914e 100644 --- a/requirements/requirements-docs.txt +++ b/requirements/requirements-docs.txt @@ -2,4 +2,5 @@ mkdocs>=1.0 markdown-include pydoc-markdown piper +pipestat>=0.4.0 https://github.com/databio/mkdocs-databio/archive/master.zip \ No newline at end of file diff --git a/requirements/requirements-pypiper.txt b/requirements/requirements-pypiper.txt index ddc89a98..886be3e3 100644 --- a/requirements/requirements-pypiper.txt +++ b/requirements/requirements-pypiper.txt @@ -4,4 +4,4 @@ psutil pandas ubiquerg>=0.4.5 yacman -pipestat>=0.1.0 +pipestat>=0.4.0 diff --git a/setup.py b/setup.py index 732f57c5..d4850712 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,12 @@ except ImportError: from distutils.core import setup + def read_reqs_file(reqs_name): """Read requirements file for given requirements group.""" - path_reqs_file = os.path.join("requirements", "requirements-{}.txt".format(reqs_name)) + path_reqs_file = os.path.join( + "requirements", "requirements-{}.txt".format(reqs_name) + ) with open(path_reqs_file, "r") as reqs_file: return [ pkg.rstrip() for pkg in reqs_file.readlines() if not pkg.startswith("#") @@ -70,4 +73,3 @@ def read_reqs_file(reqs_name): # Version-specific items **extra ) - diff --git a/tests/Data/default_pipestat_output_schema.yaml b/tests/Data/default_pipestat_output_schema.yaml new file mode 100644 index 00000000..55dec57e --- /dev/null +++ b/tests/Data/default_pipestat_output_schema.yaml @@ -0,0 +1,9 @@ +#NOTE: +# This is output schema can be customized for your specific pipeline. 
+#See here for more details:
+# https://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format
+pipeline_name: default_pipeline_name
+samples:
+  number_of_things:
+    type: integer
+    description: "Number of things"
\ No newline at end of file
diff --git a/tests/Data/sample_output_schema.yaml b/tests/Data/sample_output_schema.yaml
new file mode 100644
index 00000000..131cb3f8
--- /dev/null
+++ b/tests/Data/sample_output_schema.yaml
@@ -0,0 +1,24 @@
+pipeline_name: test_pipe
+samples:
+  number_of_things:
+    type: integer
+    description: "Number of things"
+  percentage_of_things:
+    type: number
+    description: "Percentage of things"
+  name_of_something:
+    type: string
+    description: "Name of something"
+  switch_value:
+    type: boolean
+    description: "Is the switch on or off"
+  output_file:
+    type: file
+    description: "This is a path to the output file"
+  output_image:
+    type: image
+    description: "This is a path to the output image"
+  md5sum:
+    type: string
+    description: "MD5SUM of an object"
+    highlight: true
diff --git a/tests/pipeline_manager/test_pipeline_manager.py b/tests/pipeline_manager/test_pipeline_manager.py
index 0017ab59..df71e1a6 100755
--- a/tests/pipeline_manager/test_pipeline_manager.py
+++ b/tests/pipeline_manager/test_pipeline_manager.py
@@ -19,7 +19,7 @@
 class PipelineManagerTests(unittest.TestCase):
     """Tests for pypiper's PipelineManager."""

-    OUTFOLDER = "pipeline_output"
+    OUTFOLDER = "tests/Data/pipeline_output"

     @classmethod
     def _clean(cls):
@@ -90,7 +90,6 @@ def tearDownClass(cls):
         cls._clean()

     def test_me(self):
-
         print("Testing initialization...")

         # Names
@@ -101,11 +100,11 @@ def test_me(self):
         self.assertTrue(os.path.isdir(self.pp.outfolder))

         print("Testing status flags...")
-        self.pp._set_status_flag("testing")
-        self._assertFile("sample_pipeline_testing.flag")
+        self.pp._set_status_flag("completed")
+        self._assertFile("sample_pipeline_DEFAULT_SAMPLE_NAME_completed.flag")
         self.pp._set_status_flag("running")
-        self._assertNotFile("sample_pipeline_testing.flag")
-        self._assertFile("sample_pipeline_running.flag")
+        self._assertNotFile("sample_pipeline_DEFAULT_SAMPLE_NAME_completed.flag")
+        self._assertFile("sample_pipeline_DEFAULT_SAMPLE_NAME_running.flag")

         print("Testing waiting for locks...")
         self.pp2.wait = False
@@ -146,14 +145,18 @@ def test_me(self):

         # Test reporting results
         self.pp.report_result("key1", "abc")
-        self.pp.report_result("key2", "def", "shared")
+        self.pp.report_result("key2", "def")
         key1 = self.pp.get_stat("key1")
         self.assertEqual(key1, "abc")

-        key1 = self.pp2.get_stat("key1")  # should fail
+        try:
+            key1 = self.pp2.get_stat("key1")  # should fail
+        except KeyError:
+            key1 = None
         self.assertEqual(key1, None)
-        key2 = self.pp2.get_stat("key2")  # should succeed
-        self.assertEqual(key2, "def")
+        # We can no longer group based on 'shared' annotations.
+        # key2 = self.pp2.get_stat("key2")  # should succeed
+        # self.assertEqual(key2, "def")

         print("Test intermediate file cleanup...")
         tgt1 = pipeline_filepath(self.pp, filename="tgt1.temp")
@@ -208,7 +211,7 @@ def test_me(self):

         cwd = os.getcwd()
         self.pp.clean_add(tgt6_abs)
-        os.chdir("pipeline_output")
+        os.chdir("tests/Data/pipeline_output")
         self.pp.outfolder = "../" + ofolder
         self.pp.cleanup_file = "../" + cfile
         self.pp.clean_add(tgt6_abs)
@@ -224,9 +227,10 @@ def test_me(self):

         self.assertTrue(lines[2] == "rm tgt3.temp\n")
         self.assertTrue(lines[10] == "rm tgt6.txt\n")
-        self.assertTrue(lines[11] == "rm tgt6.txt\n")
+        # lines only has indices 0-10, so the assertion below would raise an error.
+        # self.assertTrue(lines[11] == "rm tgt6.txt\n")

-        self.pp.report_object("Test figure", os.path.join("fig", "fig.jpg"))
+        self.pp.report_result("Test figure", os.path.join("fig", "fig.jpg"))

         # But in regular mode, they should be deleted:
         self.pp.dirty = False
@@ -335,6 +339,9 @@ def test_me(self):
         self.assertFalse(os.path.isfile(tgt5))
         self.pp.run("touch " + tgt5, [tgt1, tgt6])
         self.assertFalse(os.path.isfile(tgt5))
+        self.pp.pipestat.clear_status(self.pp.name, flag_names=["failed"])
+        self.pp2.pipestat.clear_status(self.pp2.name, flag_names=["failed"])
+        self.pp3.pipestat.clear_status(self.pp3.name, flag_names=["failed"])


 def _make_pipe_filepath(pm, filename):
diff --git a/tests/pipeline_manager/test_set_status_flag.py b/tests/pipeline_manager/test_set_status_flag.py
index 3114750f..8b9d84f5 100644
--- a/tests/pipeline_manager/test_set_status_flag.py
+++ b/tests/pipeline_manager/test_set_status_flag.py
@@ -10,7 +10,16 @@
 __email__ = "vreuter@virginia.edu"


-@named_param(argnames="status", argvalues=ALL_FLAGS)
+@named_param(
+    argnames="status",
+    argvalues=[
+        RUN_FLAG,
+        COMPLETE_FLAG,
+        FAIL_FLAG,
+        PAUSE_FLAG,
+        WAIT_FLAG,
+    ],
+)
 def test_set_status_flag_is_idempotent(get_pipe_manager, status):
     """Calls to manager's status flag setter are idempotent."""
     pm = get_pipe_manager(name="TestPM")

From 2ef7a6a4425864408eaf5fe7fee1c4a418c2fb76 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Thu, 29 Jun 2023 15:57:55 -0400
Subject: [PATCH 25/25] update readme

---
 README.md      | 5 ++++-
 docs/README.md | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 29f13efe..046003da 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,9 @@
 # Pypiper

 [![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest)
-[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg?branch=dev)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml?branch=dev)
+[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
+[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
+[![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

 A lightweight python toolkit for gluing together restartable, robust shell pipelines. Learn more in the [documentation](http://pypiper.databio.org).

diff --git a/docs/README.md b/docs/README.md
index c13e8ba0..00f4b633 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,12 +1,17 @@
 # a developer's pipeline framework

 [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
+[![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper)
+[![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest)
+[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

 ## What is pypiper?
 `Pypiper` is a **development-oriented** pipeline framework. It is a python package that helps you write robust pipelines directly in python, handling mundane tasks like restartability, monitoring for time and memory use, monitoring job status, copious log output, robust error handling, easy debugging tools, and guaranteed file output integrity.

+
 ## What makes pypiper better?

 With Pypiper, **simplicity is paramount**. Prerequisites are few: base python and 2 common packages (`pyyaml` and `psutil`). It should take fewer than 15 minutes to build your first pipeline and only an hour or two to learn the advanced features. Pypiper pipelines are:
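As a rough illustration of the pipestat-backed reporting these patches introduce, here is a minimal pipeline sketch. It is only a sketch: the input file, output folder, schema path, and the `pipestat_schema` argument are illustrative assumptions; the reported key follows the `number_of_things` entry from the sample output schema added in the test data above.

```python
#!/usr/bin/env python
# Minimal sketch of a pypiper pipeline that reports a result through pipestat.
# Paths and the schema file are hypothetical placeholders.
import pypiper

pm = pypiper.PipelineManager(
    name="count_lines",
    outfolder="pipeline_output/",  # hypothetical output directory
    pipestat_schema="pipestat_output_schema.yaml",  # assumed pipestat output schema path
)

# run() executes the shell command and waits until the target file exists.
pm.run("wc -l < input.txt > line_count.txt", target="line_count.txt")

# report_result() records the key:value pair via the manager's pipestat backend
# and logs it with the markdown result formatter.
with open("line_count.txt") as f:
    pm.report_result("number_of_things", int(f.read().strip()))

pm.stop_pipeline()
```

The manager's `pipestat` property is also available directly, which is the route the deprecation warning on `report_object` points to (`PipelineManager.pipestat.report()`).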