diff --git a/.travis.yml b/.travis.yml
index f345282e..2308e86f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,6 @@
language: python
python:
- "2.7"
- - "3.4"
- "3.5"
- "3.6"
os:
diff --git a/README.md b/README.md
index f17aa958..046003da 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,9 @@
# Pypiper
[![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest)
-[![Build Status](https://travis-ci.org/databio/pypiper.svg?branch=master)](https://travis-ci.org/databio/pypiper)
+[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
+[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
+[![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
A lightweight python toolkit for gluing together restartable, robust shell pipelines. Learn more in the [documentation](http://pypiper.databio.org).
diff --git a/docs/README.md b/docs/README.md
index c13e8ba0..00f4b633 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,12 +1,17 @@
# a developer's pipeline framework
[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
+[![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper)
+[![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest)
+[![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
## What is pypiper?
`Pypiper` is a **development-oriented** pipeline framework. It is a python package that helps you write robust pipelines directly in python, handling mundane tasks like restartability, monitoring for time and memory use, monitoring job status, copious log output, robust error handling, easy debugging tools, and guaranteed file output integrity.
+
## What makes pypiper better?
With Pypiper, **simplicity is paramount**. Prerequisites are few: base python and 2 common packages (`pyyaml` and `psutil`). It should take fewer than 15 minutes to build your first pipeline and only an hour or two to learn the advanced features. Pypiper pipelines are:
diff --git a/docs/changelog.md b/docs/changelog.md
index ddecfdf5..34a500c9 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,9 @@
# Changelog
+## [0.13.0] -- 2023-06-29
+### Added
+
+- [pipestat](http://pipestat.databio.org/en/latest/) support
## [0.12.3] -- 2022-01-25
@@ -11,7 +15,6 @@
### Fixed
- Removed use2to3 for compatibility with setuptools 58
-
## [0.12.1] -- 2019-08-29
### Fixed
diff --git a/docs/conf.py b/docs/conf.py
index 27ec7815..0566b0b9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,66 +12,72 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys
import os
+import sys
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
-sys.path.insert(0, os.path.abspath('../../'))
+# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath("../../"))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.autosummary',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.todo',
- 'sphinx.ext.coverage',
- 'sphinx.ext.viewcode',
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.todo",
+ "sphinx.ext.coverage",
+ "sphinx.ext.viewcode",
]
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
# The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
# The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = u'pypiper'
-copyright = u'2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'
+project = "pypiper"
+copyright = "2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
-version = open(os.path.join("..", "..", "pypiper", "_version.py")).read().strip().split(" ")[-1].strip('"')
+version = (
+ open(os.path.join("..", "..", "pypiper", "_version.py"))
+ .read()
+ .strip()
+ .split(" ")[-1]
+ .strip('"')
+)
# The full version, including alpha/beta/rc tags.
release = version
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
-#language = None
+# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -79,27 +85,27 @@
# The reST default role (used for this markup: `text`) to use for all
# documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
# -- Options for HTML output ----------------------------------------------
@@ -115,122 +121,125 @@
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
# Output file base name for HTML help builder.
-htmlhelp_basename = 'pypiperdoc'
+htmlhelp_basename = "pypiperdoc"
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- ('index', 'pypiper.tex', u'pypiper Documentation',
- u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', 'manual'),
+ (
+ "index",
+ "pypiper.tex",
+ "pypiper Documentation",
+ "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro",
+ "manual",
+ ),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
# If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
@@ -238,12 +247,17 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- ('index', 'pypiper', u'pypiper Documentation',
- [u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'], 1)
+ (
+ "index",
+ "pypiper",
+ "pypiper Documentation",
+ ["Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"],
+ 1,
+ )
]
# If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
@@ -252,93 +266,99 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- ('index', 'pypiper', u'pypiper Documentation',
- u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro', 'pypiper', 'One line description of project.',
- 'Miscellaneous'),
+ (
+ "index",
+ "pypiper",
+ "pypiper Documentation",
+ "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro",
+ "pypiper",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
]
# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
# If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
-epub_title = u'pypiper'
-epub_author = u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'
-epub_publisher = u'Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'
-epub_copyright = u'2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro'
+epub_title = "pypiper"
+epub_author = "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"
+epub_publisher = "Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"
+epub_copyright = "2015, Nathan Sheffield, Johanna Klughammer, Andre Rendeiro"
# The basename for the epub file. It defaults to the project name.
-#epub_basename = u'pypiper'
+# epub_basename = u'pypiper'
# The HTML theme for the epub output. Since the default themes are not optimized
# for small screen space, using the same theme for HTML and epub output is
# usually not wise. This defaults to 'epub', a theme designed to save visual
# space.
-#epub_theme = 'epub'
+# epub_theme = 'epub'
# The language of the text. It defaults to the language option
# or en if the language is not set.
-#epub_language = ''
+# epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
-#epub_scheme = ''
+# epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
-#epub_identifier = ''
+# epub_identifier = ''
# A unique identification for the text.
-#epub_uid = ''
+# epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
-#epub_cover = ()
+# epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
-#epub_guide = ()
+# epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
-#epub_pre_files = []
+# epub_pre_files = []
# HTML files shat should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
-#epub_post_files = []
+# epub_post_files = []
# A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
+epub_exclude_files = ["search.html"]
# The depth of the table of contents in toc.ncx.
-#epub_tocdepth = 3
+# epub_tocdepth = 3
# Allow duplicate toc entries.
-#epub_tocdup = True
+# epub_tocdup = True
# Choose between 'default' and 'includehidden'.
-#epub_tocscope = 'default'
+# epub_tocscope = 'default'
# Fix unsupported image types using the PIL.
-#epub_fix_images = False
+# epub_fix_images = False
# Scale large images.
-#epub_max_image_width = 0
+# epub_max_image_width = 0
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#epub_show_urls = 'inline'
+# epub_show_urls = 'inline'
# If false, no index is generated.
-#epub_use_index = True
+# epub_use_index = True
# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {'http://docs.python.org/': None}
+intersphinx_mapping = {"http://docs.python.org/": None}
diff --git a/docs/outputs.md b/docs/outputs.md
index 30ad8b5e..786086b1 100644
--- a/docs/outputs.md
+++ b/docs/outputs.md
@@ -9,7 +9,7 @@ Assume you are using a pypiper pipeline named `PIPE` ( it passes `name="PIPE"` t
* **PIPE_status.flag**
As the pipeline runs, it produces a flag in the output directory, which can be either `PIPE_running.flag`, `PIPE_failed.flag`, or `PIPE_completed.flag`. These flags make it easy to assess the current state of running pipelines for individual samples, and for many samples in a project simultaneously.
-* **stats.tsv**
+* **stats.yaml**
Any results reported by the pipeline are saved as key-value pairs in this file, for easy parsing.
* **PIPE_profile.md**
diff --git a/docs/pipestat.md b/docs/pipestat.md
new file mode 100644
index 00000000..77534f09
--- /dev/null
+++ b/docs/pipestat.md
@@ -0,0 +1,122 @@
+# Pipestat
+
+Starting with pypiper v0.13.0, [pipestat](http://pipestat.databio.org) is the recommended way to report pipeline statistics.
+You can browse the pipestat documentation to learn more, but briefly, pipestat is a tool that standardizes reporting of pipeline results. It provides 1) a standard specification for how pipeline outputs should be stored; and 2) an implementation that makes it easy to write results in that format from within Python or from the command line.
+
+## Advantages
+
+There are multiple advantages to using pipestat instead of the previous pipeline results reporting system:
+
+1. **Database results storage:** results can be stored either in a database or in a YAML-formatted results file. This way, a pypiper pipeline running in an ephemeral compute environment can report its results to the database and exit, with no need to sync results to a central results store afterwards.
+2. **Strict and clear results definition:** every result a pipeline run can report *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format), which in the simplest case just indicates the result's type. This lets pipestat clients *reliably* discover all possible results and their related metadata.
+3. **On-the-fly results validation:** the schema is used to validate and/or convert each reported result to a strictly determined type, which makes connecting pypiper to downstream results-processing software seamless.
+4. **Unified, pipeline-agnostic results interface:** other pipelines, possibly built with different pipeline frameworks, can read and write results via the Python API or the command line interface. This significantly increases pipeline interoperability.
+
+## Setup
+
+In order to start reporting results with pipestat in your pipeline, all you need to do is define a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format):
+
+```yaml
+my_int_result:
+ type: integer
+ description: "This is my first result"
+my_str_result:
+ type: string
+```
+
+And in the simplest case... that's it! Now you can use the `pipestat` property of the `PipelineManager` object to report and retrieve results.
+
+Pypiper *by default* will use a YAML-formatted file to store the reported results in the selected `outfolder` and will look for a `pipestat_results_schema.yaml` file in the pipeline Python script's directory.
+
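+For example, a minimal pipeline script relying on these defaults might look like the sketch below. This is an illustration only: the pipeline name, output folder, and result identifier are placeholders, and a `pipestat_results_schema.yaml` defining `my_int_result` (like the schema above) is assumed to sit next to the script.
+
+```python
+#!/usr/bin/env python
+import pypiper
+
+# The results schema is discovered automatically from the script's directory.
+pm = pypiper.PipelineManager(name="demo_pipeline", outfolder="pipeline_output/")
+
+# Report a schema-defined result through the attached PipestatManager;
+# by default it is validated against the schema and written to a
+# YAML results file inside the output folder.
+pm.pipestat.report(values={"my_int_result": 42})
+
+pm.stop_pipeline()
+```
+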
+### Advanced features
+
+Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up.
+
+#### Configure custom pipestat options
+
+You can configure pipestat by passing arguments with custom values to the `pypiper.PipelineManager` constructor:
+
+```python
+pm = pypiper.PipelineManager(
+ ...,
+ pipestat_schema="custom_results_schema.yaml",
+ pipestat_results_file="custom_results_file.yaml",
+ pipestat_sample_name="my_record",
+ pipestat_project_name="my_namespace",
+ pipestat_config="custom_pipestat_config.yaml",
+)
+```
+
+#### Use a database to store reported results
+
+In order to establish a database connection, pipestat requires a few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor.
+
+This is an example of such a file:
+
+```yaml
+database:
+ name: pypiper # database name
+ user: pypiper # database user name
+ password: pypiper # database password
+ host: localhost # database host address
+ port: 5433 # port the database is running on
+  dialect: postgresql # type of the database
+ driver: psycopg2 # driver to use to communicate
+```
+
+For reference, here is a Docker command that would run a PostgreSQL instance suitable for storing pipeline results when used with the configuration file above:
+
+```console
+docker volume create postgres-data
+
+docker run -d --name pypiper-postgres \
+-p 5433:5432 -e POSTGRES_PASSWORD=pypiper \
+-e POSTGRES_USER=pypiper -e POSTGRES_DB=pypiper \
+-v postgres-data:/var/lib/postgresql/data postgres
+```
+
+#### Highlight results
+
+The pipestat results schema can include any number of additional attributes for results. An example of that is *results highlighting*.
+
+When a `highlight: true` attribute is included under a result identifier in the schema file, the highlighted results can later be retrieved by pipestat clients via the `PipelineManager.pipestat.highlighted_results` property, which simply returns a list of result identifiers to be presented in a special way.
+
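+For illustration, a schema entry with highlighting enabled might look like this (a sketch extending the example schema above; the result name is arbitrary):
+
+```yaml
+my_str_result:
+  type: string
+  description: "A result to feature prominently"
+  highlight: true
+```
+
+A client could then read `pm.pipestat.highlighted_results` to get back `["my_str_result"]` and present that result in a special way.
+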
+### Usage
+
+Since a pipeline-run-specific `PipestatManager` instance is attached to the `PipelineManager` object, the entire public pipestat API can be used. Please refer to the [pipestat API documentation](http://pipestat.databio.org/en/latest/autodoc_build/pipestat/) to read about all the currently available features.
+
+Here we present the most commonly used features:
+
+- results reporting
+
+*report a result, convert it to the schema-defined type, and overwrite any previously reported result*
+
+```python
+results = {
+ "my_int_result": 10,
+ "my_str_result": "test"
+}
+pm.pipestat.report(
+ values=results,
+ strict_type=True,
+ force_overwrite=True
+)
+```
+
+- results retrieval
+
+```python
+pm.pipestat.retrieve(result_identifier="my_int_result")
+```
+
+- results schema exploration
+
+```python
+pm.pipestat.schema
+```
+
+- exploration of canonical [jsonschema](https://json-schema.org/) representation of result schemas
+
+```python
+pm.pipestat.result_schemas
+```
diff --git a/docs/report.md b/docs/report.md
index 7e8e5f46..fd50c270 100644
--- a/docs/report.md
+++ b/docs/report.md
@@ -6,6 +6,8 @@ When you call `pm.report_result(key, value)`, pypiper simply writes the key-valu
## Reporting objects
+**Note**: Reporting objects will be deprecated in a future release. It is recommended to use `report_result`.
+
Starting in version 0.8, pypiper now implements a second reporting function, `report_object`. This is analogous to the `report_result` function, but instead of reporting simple key-value pairs, it lets you record any produced file as an output. Most commonly, this is used to record figures (PDFs, PNGs, etc.) produced by the pipeline. It can also be used to report other files, like HTML files.
Pypiper writes results to `objects.tsv`, which can then be aggregated for project-level summaries of plots and other pipeline result files.
diff --git a/example_pipelines/basic.py b/example_pipelines/basic.py
index d4c7bd55..34a0d377 100755
--- a/example_pipelines/basic.py
+++ b/example_pipelines/basic.py
@@ -8,13 +8,13 @@
# First, make sure you can import the pypiper package
import os
+
import pypiper
# Create a PipelineManager instance (don't forget to name it!)
# This starts the pipeline.
-pm = pypiper.PipelineManager(name="BASIC",
- outfolder="pipeline_output/")
+pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/")
# Now just build shell command strings, and use the run function
# to execute them in order. run needs 2 things: a command, and the
@@ -57,5 +57,5 @@
# Now, stop the pipeline to complete gracefully.
pm.stop_pipeline()
-# Observe your outputs in the pipeline_output folder
+# Observe your outputs in the pipeline_output folder
# to see what you've created.
diff --git a/example_pipelines/count_reads.py b/example_pipelines/count_reads.py
index c9703da9..f7648dec 100755
--- a/example_pipelines/count_reads.py
+++ b/example_pipelines/count_reads.py
@@ -9,25 +9,32 @@
__license__ = "GPL3"
__version__ = "0.1"
-from argparse import ArgumentParser
-import os, re
-import sys
+import os
+import re
import subprocess
+import sys
+from argparse import ArgumentParser
+
import yaml
+
import pypiper
parser = ArgumentParser(
description="A pipeline to count the number of reads and file size. Accepts"
- " BAM, fastq, or fastq.gz files.")
+ " BAM, fastq, or fastq.gz files."
+)
# First, add standard arguments from Pypiper.
# groups="pypiper" will add all the arguments that pypiper uses,
# and adding "common" adds arguments for --input and --sample--name
# and "output_parent". You can read more about your options for standard
# arguments in the pypiper docs (section "command-line arguments")
-parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs"],
- args=["output-parent", "config"],
- required=['sample-name', 'output-parent'])
+parser = pypiper.add_pypiper_args(
+ parser,
+ groups=["pypiper", "common", "ngs"],
+ args=["output-parent", "config"],
+ required=["sample-name", "output-parent"],
+)
# Add any pipeline-specific arguments if you like here.
@@ -42,16 +49,14 @@
else:
args.paired_end = False
-# args for `output_parent` and `sample_name` were added by the standard
-# `add_pypiper_args` function.
+# args for `output_parent` and `sample_name` were added by the standard
+# `add_pypiper_args` function.
# A good practice is to make an output folder for each sample, housed under
# the parent output folder, like this:
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
# Create a PipelineManager object and start the pipeline
-pm = pypiper.PipelineManager(name="count",
- outfolder=outfolder,
- args=args)
+pm = pypiper.PipelineManager(name="count", outfolder=outfolder, args=args)
# NGSTk is a "toolkit" that comes with pypiper, providing some functions
# for dealing with genome sequence data. You can read more about toolkits in the
@@ -75,15 +80,12 @@
# and convert these to fastq files.
local_input_files = ngstk.merge_or_link(
- [args.input, args.input2],
- raw_folder,
- args.sample_name)
+ [args.input, args.input2], raw_folder, args.sample_name
+)
cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
- local_input_files,
- args.sample_name,
- args.paired_end,
- fastq_folder)
+ local_input_files, args.sample_name, args.paired_end, fastq_folder
+)
# Now we'll use another NGSTk function to grab the file size from the input files
@@ -95,10 +97,17 @@
n_input_files = len(list(filter(bool, local_input_files)))
-raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end))
- for input_file in local_input_files]) / n_input_files
-
-# Finally, we use the report_result() function to print the output and
+raw_reads = (
+ sum(
+ [
+ int(ngstk.count_reads(input_file, args.paired_end))
+ for input_file in local_input_files
+ ]
+ )
+ / n_input_files
+)
+
+# Finally, we use the report_result() function to print the output and
# log the key-value pair in the standard stats.tsv file
pm.report_result("Raw_reads", str(raw_reads))
diff --git a/example_pipelines/hello_pypiper.py b/example_pipelines/hello_pypiper.py
index 2824a142..88abecfd 100755
--- a/example_pipelines/hello_pypiper.py
+++ b/example_pipelines/hello_pypiper.py
@@ -1,7 +1,8 @@
#!/usr/bin/env python
import pypiper
-outfolder = "hello_pypiper_results" # Choose a folder for your results
+
+outfolder = "hello_pypiper_results" # Choose a folder for your results
# Create a PipelineManager, the workhorse of pypiper
pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder)
diff --git a/example_pipelines/logmuse_example.py b/example_pipelines/logmuse_example.py
index 91fe73f2..3b98b6df 100755
--- a/example_pipelines/logmuse_example.py
+++ b/example_pipelines/logmuse_example.py
@@ -9,52 +9,56 @@
__license__ = "GPL3"
__version__ = "0.1"
-from argparse import ArgumentParser
-import os, re
-import sys
+import os
+import re
import subprocess
+import sys
+from argparse import ArgumentParser
+
import yaml
-import pypiper
+import pypiper
def build_argparser():
-
parser = ArgumentParser(
description="A pipeline to count the number of reads and file size. Accepts"
- " BAM, fastq, or fastq.gz files.")
+ " BAM, fastq, or fastq.gz files."
+ )
# First, add standard arguments from Pypiper.
# groups="pypiper" will add all the arguments that pypiper uses,
# and adding "common" adds arguments for --input and --sample--name
# and "output_parent". You can read more about your options for standard
# arguments in the pypiper docs (section "command-line arguments")
- parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "common", "ngs", "logmuse"],
- args=["output-parent", "config"],
- required=['sample-name', 'output-parent'])
+ parser = pypiper.add_pypiper_args(
+ parser,
+ groups=["pypiper", "common", "ngs", "logmuse"],
+ args=["output-parent", "config"],
+ required=["sample-name", "output-parent"],
+ )
# Add any pipeline-specific arguments if you like here.
- # args for `output_parent` and `sample_name` were added by the standard
- # `add_pypiper_args` function.
+ # args for `output_parent` and `sample_name` were added by the standard
+ # `add_pypiper_args` function.
return parser
+
def run_pipeline():
# A good practice is to make an output folder for each sample, housed under
# the parent output folder, like this:
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
# Create a PipelineManager object and start the pipeline
- pm = pypiper.PipelineManager(name="logmuse-test",
- outfolder=outfolder,
- args=args)
+ pm = pypiper.PipelineManager(name="logmuse-test", outfolder=outfolder, args=args)
pm.info("Getting started!")
# NGSTk is a "toolkit" that comes with pypiper, providing some functions
# for dealing with genome sequence data. You can read more about toolkits in the
# documentation
- files = [str(x) + ".tmp" for x in range(1,20)]
+ files = [str(x) + ".tmp" for x in range(1, 20)]
pm.run("touch " + " ".join(files), target=files, clean=True)
@@ -76,30 +80,32 @@ def run_pipeline():
# and convert these to fastq files.
local_input_files = ngstk.merge_or_link(
- [args.input, args.input2],
- raw_folder,
- args.sample_name)
+ [args.input, args.input2], raw_folder, args.sample_name
+ )
cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
- local_input_files,
- args.sample_name,
- args.paired_end,
- fastq_folder)
-
+ local_input_files, args.sample_name, args.paired_end, fastq_folder
+ )
# Now we'll use another NGSTk function to grab the file size from the input files
#
pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
-
# And then count the number of reads in the file
n_input_files = len(list(filter(bool, local_input_files)))
- raw_reads = sum([int(ngstk.count_reads(input_file, args.paired_end))
- for input_file in local_input_files]) / n_input_files
-
- # Finally, we use the report_result() function to print the output and
+ raw_reads = (
+ sum(
+ [
+ int(ngstk.count_reads(input_file, args.paired_end))
+ for input_file in local_input_files
+ ]
+ )
+ / n_input_files
+ )
+
+ # Finally, we use the report_result() function to print the output and
# log the key-value pair in the standard stats.tsv file
pm.report_result("Raw_reads", str(raw_reads))
@@ -107,7 +113,7 @@ def run_pipeline():
pm.stop_pipeline()
-if __name__ == '__main__':
+if __name__ == "__main__":
try:
parser = build_argparser()
args = parser.parse_args()
diff --git a/init_interactive.py b/init_interactive.py
index b63e4fb5..15dfab1f 100644
--- a/init_interactive.py
+++ b/init_interactive.py
@@ -1,14 +1,12 @@
""" Create dummy PipelineManager and NGSTk instance for interactive session. """
import os
-from pypiper import PipelineManager
-from pypiper import NGSTk
+from pypiper import NGSTk, PipelineManager
__author__ = "Vince Reuter"
__email__ = "vreuter@virginia.edu"
-
pm = PipelineManager(name="interactive", outfolder=os.path.expanduser("~"))
tk = NGSTk(pm=pm)
diff --git a/mkdocs.yml b/mkdocs.yml
index e3eb2694..f3a3a1fd 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -16,6 +16,7 @@ nav:
- Automatic command-line arguments: cli.md
- Configuring pipelines: configuration.md
- Reporting statistics: report.md
+ - Reporting statistics with pipestat: pipestat.md
- Cleaning up intermediate files: clean.md
- Best practices: best-practices.md
- Toolkits:
diff --git a/pypiper/__init__.py b/pypiper/__init__.py
index 6a1802d1..3076285e 100644
--- a/pypiper/__init__.py
+++ b/pypiper/__init__.py
@@ -1,10 +1,10 @@
+# Implicitly re-export so logmuse usage by pipeline author routes through here.
+from logmuse import add_logging_options
+
from ._version import __version__
+from .exceptions import *
from .manager import *
from .ngstk import *
-from .utils import *
from .pipeline import *
-from .exceptions import *
from .stage import *
-
-# Implicitly re-export so logmuse usage by pipeline author routes through here.
-from logmuse import add_logging_options
+from .utils import *
diff --git a/pypiper/_version.py b/pypiper/_version.py
index 8e1395bd..f23a6b39 100644
--- a/pypiper/_version.py
+++ b/pypiper/_version.py
@@ -1 +1 @@
-__version__ = "0.12.3"
+__version__ = "0.13.0"
diff --git a/pypiper/const.py b/pypiper/const.py
index 5f2d66e8..27495297 100644
--- a/pypiper/const.py
+++ b/pypiper/const.py
@@ -2,6 +2,7 @@
CHECKPOINT_EXTENSION = ".checkpoint"
+DEFAULT_SAMPLE_NAME = "DEFAULT_SAMPLE_NAME"
PIPELINE_CHECKPOINT_DELIMITER = "_"
STAGE_NAME_SPACE_REPLACEMENT = "-"
-PROFILE_COLNAMES = ['pid', 'hash', 'cid', 'runtime', 'mem', 'cmd', 'lock']
+PROFILE_COLNAMES = ["pid", "hash", "cid", "runtime", "mem", "cmd", "lock"]
diff --git a/pypiper/exceptions.py b/pypiper/exceptions.py
index 33e3a10c..063b3641 100644
--- a/pypiper/exceptions.py
+++ b/pypiper/exceptions.py
@@ -4,41 +4,46 @@
__email__ = "vreuter@virginia.edu"
-__all__ = ["PipelineError", "PipelineHalt", "IllegalPipelineDefinitionError",
- "IllegalPipelineExecutionError", "MissingCheckpointError",
- "UnknownPipelineStageError", "UnsupportedFiletypeException",
- "SubprocessError"]
-
-
+__all__ = [
+ "PipelineError",
+ "PipelineHalt",
+ "IllegalPipelineDefinitionError",
+ "IllegalPipelineExecutionError",
+ "MissingCheckpointError",
+ "UnknownPipelineStageError",
+ "UnsupportedFiletypeException",
+ "SubprocessError",
+]
class PipelineError(Exception):
- """ General pipeline error. """
+ """General pipeline error."""
+
pass
+
class SubprocessError(Exception):
pass
+
class IllegalPipelineDefinitionError(PipelineError):
pass
-
class IllegalPipelineExecutionError(PipelineError):
- """ Represent cases of illogical start/stop run() declarations. """
- pass
+ """Represent cases of illogical start/stop run() declarations."""
+ pass
class MissingCheckpointError(Exception):
- """ Represent case of expected but absent checkpoint file. """
+ """Represent case of expected but absent checkpoint file."""
def __init__(self, checkpoint, filepath):
msg = "{}: '{}'".format(checkpoint, filepath)
super(MissingCheckpointError, self).__init__(msg)
-
class UnknownPipelineStageError(Exception):
"""
Triggered by use of unknown/undefined name for a pipeline stage.
@@ -47,7 +52,6 @@ class UnknownPipelineStageError(Exception):
:param pypiper.Pipeline pipeline: Pipeline for which the stage is unknown/undefined.
"""
-
def __init__(self, stage_name, pipeline=None):
message = stage_name
if pipeline is not None:
@@ -57,12 +61,12 @@ def __init__(self, stage_name, pipeline=None):
# Just don't contextualize the error with known stages.
pass
else:
- message = "{}; defined stages: {}". \
- format(message, ", ".join(map(str, stages)))
+ message = "{}; defined stages: {}".format(
+ message, ", ".join(map(str, stages))
+ )
super(UnknownPipelineStageError, self).__init__(message)
-
class PipelineHalt(Exception):
"""
Execution-stopping exception for halting a pipeline.
@@ -74,6 +78,7 @@ class PipelineHalt(Exception):
PipelineManager's halt method raise this exception.
"""
+
def __init__(self, checkpoint=None, finished=None):
if checkpoint is None:
super(PipelineHalt, self).__init__()
@@ -81,8 +86,9 @@ def __init__(self, checkpoint=None, finished=None):
if isinstance(checkpoint, str):
last_stage_done = checkpoint
else:
- last_stage_done = getattr(checkpoint, "name", None) or \
- getattr(checkpoint, "__name__", None)
+ last_stage_done = getattr(checkpoint, "name", None) or getattr(
+ checkpoint, "__name__", None
+ )
if not last_stage_done:
super(PipelineHalt, self).__init__()
else:
@@ -95,9 +101,9 @@ def __init__(self, checkpoint=None, finished=None):
super(PipelineHalt, self).__init__(msg)
-
class UnsupportedFiletypeException(Exception):
- """ Restrict filetype domain. """
+ """Restrict filetype domain."""
+
# Use superclass ctor to allow file name/path or extension to pass
# through as the message for why this error is occurring.
pass
diff --git a/pypiper/flags.py b/pypiper/flags.py
index 09e3fb85..21e97d27 100644
--- a/pypiper/flags.py
+++ b/pypiper/flags.py
@@ -8,5 +8,4 @@
PAUSE_FLAG = "partial"
FLAGS = [RUN_FLAG, COMPLETE_FLAG, FAIL_FLAG, WAIT_FLAG, PAUSE_FLAG]
-__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS",
- "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"]
+__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS", "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"]
diff --git a/pypiper/folder_context.py b/pypiper/folder_context.py
index 360d6c0c..77828af5 100644
--- a/pypiper/folder_context.py
+++ b/pypiper/folder_context.py
@@ -2,14 +2,12 @@
import os
-
__author__ = "Vince Reuter"
__email__ = "vreuter@virginia.edu"
-
class FolderContext(object):
- """ Context manager for temporarily changing directory. """
+ """Context manager for temporarily changing directory."""
def __init__(self, folder):
"""
@@ -18,18 +16,18 @@ def __init__(self, folder):
:param str folder: Path to set as new working directory
"""
if not os.path.isdir(folder):
- raise ValueError(
- "Requested temp entry to non-folder: {}".format(folder))
+ raise ValueError("Requested temp entry to non-folder: {}".format(folder))
self._prevdir = os.getcwd()
self._currdir = folder
def __enter__(self):
- """ Make the working directory switch. """
+ """Make the working directory switch."""
os.chdir(self._currdir)
def __exit__(self, exc_type, exc_val, exc_tb):
- """ Switch back to the previous working directory. """
+ """Switch back to the previous working directory."""
if not os.path.isdir(self._prevdir):
- raise RuntimeError("Return path is no longer a directory: {}".
- format(self._prevdir))
+ raise RuntimeError(
+ "Return path is no longer a directory: {}".format(self._prevdir)
+ )
os.chdir(self._prevdir)
diff --git a/pypiper/manager.py b/pypiper/manager.py
index aa43389a..9d32100b 100644
--- a/pypiper/manager.py
+++ b/pypiper/manager.py
@@ -8,35 +8,51 @@
"""
import atexit
-from collections.abc import Iterable
import datetime
import errno
import glob
import os
import platform
-import psutil
import re
import shlex # for splitting commands like a shell does
import signal
import subprocess
import sys
import time
-import pandas as _pd
+import warnings
-from attmap import AttMapEcho
+from collections.abc import Iterable
from hashlib import md5
+
+import __main__
import logmuse
+import pandas as _pd
+import psutil
+from attmap import AttMapEcho
+from pipestat import PipestatError, PipestatManager
from yacman import load_yaml
+
+from ._version import __version__
+from .const import PROFILE_COLNAMES, DEFAULT_SAMPLE_NAME
from .exceptions import PipelineHalt, SubprocessError
from .flags import *
-from .utils import \
- check_shell, checkpoint_filepath, clear_flags, default_pipeline_config, \
- flag_name, get_proc_name, is_multi_target, logger_via_cli, make_lock_name, \
- parse_cmd, pipeline_filepath, CHECKPOINT_SPECIFICATIONS
-from .const import PROFILE_COLNAMES
-from ._version import __version__
-import __main__
-
+from .utils import (
+ CHECKPOINT_SPECIFICATIONS,
+ check_shell,
+ checkpoint_filepath,
+ clear_flags,
+ default_pipeline_config,
+ flag_name,
+ get_proc_name,
+ is_multi_target,
+ logger_via_cli,
+ make_lock_name,
+ parse_cmd,
+ pipeline_filepath,
+ default_pipestat_output_schema,
+ result_formatter_markdown,
+)
+from pipestat.helpers import read_yaml_data
__all__ = ["PipelineManager"]
@@ -84,7 +100,7 @@ class PipelineManager(object):
even if the preceding command is not run. By default,
following functions are only run if the preceding command is run.
:param int cores: number of processors to use, default 1
- :param str mem: amount of memory to use. Default units are megabytes unless
+ :param str mem: amount of memory to use. Default units are megabytes unless
specified using the suffix [K|M|G|T]."
:param str config_file: path to pipeline configuration file, optional
:param str output_parent: path to folder in which output folder will live
@@ -100,12 +116,34 @@ class PipelineManager(object):
via args namespace, or if both stopping types (exclusive/prospective
and inclusive/retrospective) are provided.
"""
- def __init__(
- self, name, outfolder, version=None, args=None, multi=False,
- dirty=False, recover=False, new_start=False, force_follow=False,
- cores=1, mem="1000M", config_file=None, output_parent=None,
- overwrite_checkpoints=False, logger_kwargs=None, **kwargs):
+ # TODO: add pipestat-related args docstrings
+
+ def __init__(
+ self,
+ name,
+ outfolder,
+ version=None,
+ args=None,
+ multi=False,
+ dirty=False,
+ recover=False,
+ new_start=False,
+ force_follow=False,
+ cores=1,
+ mem="1000M",
+ config_file=None,
+ output_parent=None,
+ overwrite_checkpoints=False,
+ logger_kwargs=None,
+ pipestat_project_name=None,
+ pipestat_sample_name=None,
+ pipestat_schema=None,
+ pipestat_results_file=None,
+ pipestat_config=None,
+ pipestat_result_formatter=None,
+ **kwargs,
+ ):
# Params defines the set of options that could be updated via
# command line args to a pipeline run, that can be forwarded
# to Pypiper. If any pypiper arguments are passed
@@ -114,15 +152,15 @@ def __init__(
# Establish default params
params = {
- 'dirty': dirty,
- 'recover': recover,
- 'new_start': new_start,
- 'force_follow': force_follow,
- 'config_file': config_file,
- 'output_parent': output_parent,
- 'cores': cores,
- 'mem': mem,
- 'testmode': False
+ "dirty": dirty,
+ "recover": recover,
+ "new_start": new_start,
+ "force_follow": force_follow,
+ "config_file": config_file,
+ "output_parent": output_parent,
+ "cores": cores,
+ "mem": mem,
+ "testmode": False,
}
# Transform the command-line namespace into a Mapping.
@@ -142,8 +180,10 @@ def __init__(
checkpoint = args_dict.pop(optname, None)
setattr(self, optname, checkpoint)
if self.stop_before and self.stop_after:
- raise TypeError("Cannot specify both pre-stop and post-stop; "
- "got '{}' and '{}'".format(self.stop_before, self.stop_after))
+ raise TypeError(
+ "Cannot specify both pre-stop and post-stop; "
+ "got '{}' and '{}'".format(self.stop_before, self.stop_after)
+ )
# Update this manager's parameters with non-checkpoint-related
# command-line parameterization.
@@ -161,14 +201,13 @@ def __init__(
# Pipeline settings
self.name = name
self.tee = None
- self.overwrite_locks = params['recover']
- self.new_start = params['new_start']
- self.force_follow = params['force_follow']
- self.dirty = params['dirty']
- self.cores = params['cores']
- self.output_parent = params['output_parent']
- self.testmode = params['testmode']
-
+ self.overwrite_locks = params["recover"]
+ self.new_start = params["new_start"]
+ self.force_follow = params["force_follow"]
+ self.dirty = params["dirty"]
+ self.cores = params["cores"]
+ self.output_parent = params["output_parent"]
+ self.testmode = params["testmode"]
# Set up logger
logger_kwargs = logger_kwargs or {}
@@ -203,11 +242,11 @@ def __init__(
# total memory limit provided.
# This will give a little breathing room for non-heap java memory use.
- if not params['mem'].endswith(('K','M','G','T')):
- self.mem = params['mem'] + "M"
+ if not params["mem"].endswith(("K", "M", "G", "T")):
+ self.mem = params["mem"] + "M"
else:
# Assume the memory is in megabytes.
- self.mem = params['mem']
+ self.mem = params["mem"]
self.javamem = str(int(int(self.mem[:-1]) * 0.95)) + self.mem[-1:]
@@ -232,27 +271,21 @@ def __init__(
self.pl_version = version
# Set relative output_parent directory to absolute
# not necessary after all. . .
- #if self.output_parent and not os.path.isabs(self.output_parent):
+ # if self.output_parent and not os.path.isabs(self.output_parent):
# self.output_parent = os.path.join(os.getcwd(), self.output_parent)
# File paths:
- self.outfolder = os.path.join(outfolder, '') # trailing slash
+ self.outfolder = os.path.join(outfolder, "") # trailing slash
+ self.make_sure_path_exists(self.outfolder)
self.pipeline_log_file = pipeline_filepath(self, suffix="_log.md")
- self.pipeline_profile_file = \
- pipeline_filepath(self, suffix="_profile.tsv")
+ self.pipeline_profile_file = pipeline_filepath(self, suffix="_profile.tsv")
# Stats and figures are general and so lack the pipeline name.
- self.pipeline_stats_file = \
- pipeline_filepath(self, filename="stats.tsv")
- self.pipeline_figures_file = \
- pipeline_filepath(self, filename="figures.tsv")
- self.pipeline_objects_file = \
- pipeline_filepath(self, filename="objects.tsv")
+ self.pipeline_stats_file = pipeline_filepath(self, filename="stats.yaml")
# Record commands used and provide manual cleanup script.
- self.pipeline_commands_file = \
- pipeline_filepath(self, suffix="_commands.sh")
+ self.pipeline_commands_file = pipeline_filepath(self, suffix="_commands.sh")
self.cleanup_file = pipeline_filepath(self, suffix="_cleanup.sh")
# Pipeline status variables
@@ -263,7 +296,7 @@ def __init__(
self.locks = []
self.running_procs = {}
self.completed_procs = {}
-
+
self.wait = True # turn off for debugging
# Initialize status and flags
@@ -275,6 +308,11 @@ def __init__(
# In-memory holder for report_result
self.stats_dict = {}
+ # Result formatter to pass to pipestat
+ self.pipestat_result_formatter = (
+ pipestat_result_formatter or result_formatter_markdown
+ )
+
# Checkpoint-related parameters
self.overwrite_checkpoints = overwrite_checkpoints
self.halt_on_next = False
@@ -290,6 +328,35 @@ def __init__(
signal.signal(signal.SIGINT, self._signal_int_handler)
signal.signal(signal.SIGTERM, self._signal_term_handler)
+ # pipestat setup
+ self.pipestat_sample_name = pipestat_sample_name or DEFAULT_SAMPLE_NAME
+ # getattr(self, "sample_name", DEFAULT_SAMPLE_NAME)
+
+ # don't force default pipestat_results_file value unless
+ # pipestat config not provided
+ if pipestat_config is None and pipestat_results_file is None:
+ pipestat_results_file = pipeline_filepath(
+ self, filename="pipestat_results.yaml"
+ )
+
+ def _get_arg(args_dict, arg_name):
+            """Safely get an argument from the args dict; return None if it doesn't exist."""
+ return None if arg_name not in args_dict else args_dict[arg_name]
+
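+        # Instantiate the PipestatManager that backs the `pipestat` property
+        # and handles result reporting and status management for this run.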
+ self._pipestat_manager = PipestatManager(
+ sample_name=self.pipestat_sample_name
+ or _get_arg(args_dict, "pipestat_sample_name")
+ or DEFAULT_SAMPLE_NAME,
+ pipeline_name=self.name,
+ schema_path=pipestat_schema
+ or _get_arg(args_dict, "pipestat_schema")
+ or default_pipestat_output_schema(sys.argv[0]),
+ results_file_path=self.pipeline_stats_file
+ or _get_arg(args_dict, "pipestat_results_file"),
+ config_file=pipestat_config or _get_arg(args_dict, "pipestat_config"),
+ multi_pipelines=multi,
+ )
+
self.start_pipeline(args, multi)
# Handle config file if it exists
@@ -330,8 +397,9 @@ def __init__(
default_config = default_pipeline_config(sys.argv[0])
if os.path.isfile(default_config):
config_to_load = default_config
- self.debug("Using default pipeline config file: {}".
- format(config_to_load))
+ self.debug(
+ "Using default pipeline config file: {}".format(config_to_load)
+ )
# Finally load the config we found.
if config_to_load is not None:
@@ -341,7 +409,25 @@ def __init__(
self.debug("No config file")
self.config = None
+ @property
+ def pipestat(self):
+ """
+ `pipestat.PipestatManager` object to use for pipeline results reporting and status management
+ Depending on the object configuration it can report to
+ a YAML-formatted file or PostgreSQL database. Please refer to pipestat
+ documentation for more details: http://pipestat.databio.org/
+
+ :return pipestat.PipestatManager: object to use for results reporting
+ """
+ try:
+ return getattr(self, "_pipestat_manager")
+ except AttributeError:
+ raise PipestatError(
+ f"{PipestatManager.__name__} has not been configured for this pipeline run. "
+ f"Provide an output schema to the {PipelineManager.__name__} object "
+ f"in order to initialize it."
+ )
@property
def _completed(self):
@@ -350,7 +436,10 @@ def _completed(self):
:return bool: Whether the managed pipeline is in a completed state.
"""
- return self.status == COMPLETE_FLAG
+ return (
+ self.pipestat.get_status(self._pipestat_manager.sample_name)
+ == COMPLETE_FLAG
+ )
@property
def _failed(self):
@@ -359,16 +448,17 @@ def _failed(self):
:return bool: Whether the managed pipeline is in a failed state.
"""
- return self.status == FAIL_FLAG
+ return self.pipestat.get_status(self._pipestat_manager.sample_name) == FAIL_FLAG
@property
def halted(self):
"""
Is the managed pipeline in a paused/halted state?
-
:return bool: Whether the managed pipeline is in a paused/halted state.
"""
- return self.status == PAUSE_FLAG
+ return (
+ self.pipestat.get_status(self._pipestat_manager.sample_name) == PAUSE_FLAG
+ )
@property
def _has_exit_status(self):
@@ -395,20 +485,22 @@ def start_pipeline(self, args=None, multi=False):
You provide only the output directory (used for pipeline stats, log, and status flag files).
"""
# Perhaps this could all just be put into __init__, but I just kind of like the idea of a start function
- self.make_sure_path_exists(self.outfolder)
+ # self.make_sure_path_exists(self.outfolder)
# By default, Pypiper will mirror every operation so it is displayed both
# on sys.stdout **and** to a log file. Unfortunately, interactive python sessions
- # ruin this by interfering with stdout. So, for interactive mode, we do not enable
+ # ruin this by interfering with stdout. So, for interactive mode, we do not enable
# the tee subprocess, sending all output to screen only.
# Starting multiple PipelineManagers in the same script has the same problem, and
# must therefore be run in interactive_mode.
interactive_mode = multi or not hasattr(__main__, "__file__")
if interactive_mode:
- self.warning("Warning: You're running an interactive python session. "
- "This works, but pypiper cannot tee the output, so results "
- "are only logged to screen.")
+ self.warning(
+ "Warning: You're running an interactive python session. "
+ "This works, but pypiper cannot tee the output, so results "
+ "are only logged to screen."
+ )
else:
sys.stdout = Unbuffered(sys.stdout)
# sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # Unbuffer output
@@ -423,10 +515,12 @@ def start_pipeline(self, args=None, multi=False):
# manually (in the exit handler).
# a for append to file
-
+
tee = subprocess.Popen(
- ["tee", "-a", self.pipeline_log_file], stdin=subprocess.PIPE,
- preexec_fn=self._ignore_interrupts)
+ ["tee", "-a", self.pipeline_log_file],
+ stdin=subprocess.PIPE,
+ preexec_fn=self._ignore_interrupts,
+ )
# If the pipeline is terminated with SIGTERM/SIGINT,
# make sure we kill this spawned tee subprocess as well.
@@ -456,29 +550,83 @@ def start_pipeline(self, args=None, multi=False):
try:
# pypiper dir
ppd = os.path.dirname(os.path.realpath(__file__))
- gitvars['pypiper_dir'] = ppd
- gitvars['pypiper_hash'] = subprocess.check_output("cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", shell=True).decode().strip()
- gitvars['pypiper_date'] = subprocess.check_output("cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True).decode().strip()
- gitvars['pypiper_diff'] = subprocess.check_output("cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True).decode().strip()
- gitvars['pypiper_branch'] = subprocess.check_output("cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True).decode().strip()
+ gitvars["pypiper_dir"] = ppd
+ gitvars["pypiper_hash"] = (
+ subprocess.check_output(
+ "cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null",
+ shell=True,
+ )
+ .decode()
+ .strip()
+ )
+ gitvars["pypiper_date"] = (
+ subprocess.check_output(
+ "cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null",
+ shell=True,
+ )
+ .decode()
+ .strip()
+ )
+ gitvars["pypiper_diff"] = (
+ subprocess.check_output(
+ "cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True
+ )
+ .decode()
+ .strip()
+ )
+ gitvars["pypiper_branch"] = (
+ subprocess.check_output(
+ "cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True
+ )
+ .decode()
+ .strip()
+ )
except Exception:
pass
try:
# pipeline dir
pld = os.path.dirname(os.path.realpath(sys.argv[0]))
- gitvars['pipe_dir'] = pld
- gitvars['pipe_hash'] = subprocess.check_output("cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", shell=True).decode().strip()
- gitvars['pipe_date'] = subprocess.check_output("cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True).decode().strip()
- gitvars['pipe_diff'] = subprocess.check_output("cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True).decode().strip()
- gitvars['pipe_branch'] = subprocess.check_output("cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True).decode().strip()
+ gitvars["pipe_dir"] = pld
+ gitvars["pipe_hash"] = (
+ subprocess.check_output(
+ "cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null",
+ shell=True,
+ )
+ .decode()
+ .strip()
+ )
+ gitvars["pipe_date"] = (
+ subprocess.check_output(
+ "cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null",
+ shell=True,
+ )
+ .decode()
+ .strip()
+ )
+ gitvars["pipe_diff"] = (
+ subprocess.check_output(
+ "cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True
+ )
+ .decode()
+ .strip()
+ )
+ gitvars["pipe_branch"] = (
+ subprocess.check_output(
+ "cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True
+ )
+ .decode()
+ .strip()
+ )
except Exception:
pass
-
+
# Print out a header section in the pipeline log:
# Wrap things in backticks to prevent markdown from interpreting underscores as emphasis.
# print("----------------------------------------")
self.info("### Pipeline run code and environment:\n")
- self.info("* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`")
+ self.info(
+ "* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`"
+ )
self.info("* " + "Compute host".rjust(20) + ": " + platform.node())
self.info("* " + "Working dir".rjust(20) + ": " + os.getcwd())
self.info("* " + "Outfolder".rjust(20) + ": " + self.outfolder)
@@ -488,25 +636,75 @@ def start_pipeline(self, args=None, multi=False):
self.info("\n### Version log:\n")
self.info("* " + "Python version".rjust(20) + ": " + platform.python_version())
try:
- self.info("* " + "Pypiper dir".rjust(20) + ": " + "`" + gitvars['pypiper_dir'].strip() + "`")
+ self.info(
+ "* "
+ + "Pypiper dir".rjust(20)
+ + ": "
+ + "`"
+ + gitvars["pypiper_dir"].strip()
+ + "`"
+ )
self.info("* " + "Pypiper version".rjust(20) + ": " + __version__)
- self.info("* " + "Pypiper hash".rjust(20) + ": " + str(gitvars['pypiper_hash']))
- self.info("* " + "Pypiper branch".rjust(20) + ": " + str(gitvars['pypiper_branch']))
- self.info("* " + "Pypiper date".rjust(20) + ": " + str(gitvars['pypiper_date']))
- if gitvars['pypiper_diff']:
- self.info("* " + "Pypiper diff".rjust(20) + ": " + str(gitvars['pypiper_diff']))
+ self.info(
+ "* " + "Pypiper hash".rjust(20) + ": " + str(gitvars["pypiper_hash"])
+ )
+ self.info(
+ "* "
+ + "Pypiper branch".rjust(20)
+ + ": "
+ + str(gitvars["pypiper_branch"])
+ )
+ self.info(
+ "* " + "Pypiper date".rjust(20) + ": " + str(gitvars["pypiper_date"])
+ )
+ if gitvars["pypiper_diff"]:
+ self.info(
+ "* "
+ + "Pypiper diff".rjust(20)
+ + ": "
+ + str(gitvars["pypiper_diff"])
+ )
except KeyError:
# It is ok if keys aren't set, it means pypiper isn't in a git repo.
pass
try:
- self.info("* " + "Pipeline dir".rjust(20) + ": " + "`" + gitvars['pipe_dir'].strip() + "`")
- self.info("* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version))
- self.info("* " + "Pipeline hash".rjust(20) + ": " + str(gitvars['pipe_hash']).strip())
- self.info("* " + "Pipeline branch".rjust(20) + ": " + str(gitvars['pipe_branch']).strip())
- self.info("* " + "Pipeline date".rjust(20) + ": " + str(gitvars['pipe_date']).strip())
- if (gitvars['pipe_diff'] != ""):
- self.info("* " + "Pipeline diff".rjust(20) + ": " + str(gitvars['pipe_diff']).strip())
+ self.info(
+ "* "
+ + "Pipeline dir".rjust(20)
+ + ": "
+ + "`"
+ + gitvars["pipe_dir"].strip()
+ + "`"
+ )
+ self.info(
+ "* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version)
+ )
+ self.info(
+ "* "
+ + "Pipeline hash".rjust(20)
+ + ": "
+ + str(gitvars["pipe_hash"]).strip()
+ )
+ self.info(
+ "* "
+ + "Pipeline branch".rjust(20)
+ + ": "
+ + str(gitvars["pipe_branch"]).strip()
+ )
+ self.info(
+ "* "
+ + "Pipeline date".rjust(20)
+ + ": "
+ + str(gitvars["pipe_date"]).strip()
+ )
+ if gitvars["pipe_diff"] != "":
+ self.info(
+ "* "
+ + "Pipeline diff".rjust(20)
+ + ": "
+ + str(gitvars["pipe_diff"]).strip()
+ )
except KeyError:
# It is ok if keys aren't set, it means the pipeline isn't a git repo.
pass
@@ -517,18 +715,37 @@ def start_pipeline(self, args=None, multi=False):
argtext = "`{}`".format(arg)
valtext = "`{}`".format(val)
self.info("* {}: {}".format(argtext.rjust(20), valtext))
+
+ self.info("\n### Initialized Pipestat Object:\n")
+ results = str(self._pipestat_manager).split("\n")
+ for i in results:
+ self.info("* " + i)
+ self.info("* Sample name: " + self.pipestat_sample_name + "\n")
self.info("\n----------------------------------------\n")
- self._set_status_flag(RUN_FLAG)
+ self.status = "running"
+ self.pipestat.set_status(
+ sample_name=self._pipestat_manager.sample_name, status_identifier="running"
+ )
# Record the start in PIPE_profile and PIPE_commands output files so we
# can trace which run they belong to
with open(self.pipeline_commands_file, "a") as myfile:
- myfile.write("# Pipeline started at " + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime)) + "\n\n")
+ myfile.write(
+ "# Pipeline started at "
+ + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime))
+ + "\n\n"
+ )
with open(self.pipeline_profile_file, "a") as myfile:
- myfile.write("# Pipeline started at " + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime))
- + "\n\n" + "# " + "\t".join(PROFILE_COLNAMES) + "\n")
+ myfile.write(
+ "# Pipeline started at "
+ + time.strftime("%m-%d %H:%M:%S", time.localtime(self.starttime))
+ + "\n\n"
+ + "# "
+ + "\t".join(PROFILE_COLNAMES)
+ + "\n"
+ )
def _set_status_flag(self, status):
"""
@@ -552,9 +769,10 @@ def _set_status_flag(self, status):
# Set new status.
prev_status = self.status
self.status = status
- self._create_file(self._flag_file_path())
- self.debug("\nChanged status from {} to {}.".format(
- prev_status, self.status))
+ self.pipestat.set_status(
+ sample_name=self._pipestat_manager.sample_name, status_identifier=status
+ )
+ self.debug("\nChanged status from {} to {}.".format(prev_status, self.status))
def _flag_file_path(self, status=None):
"""
@@ -566,14 +784,29 @@ def _flag_file_path(self, status=None):
:param str status: flag file type to create, default to current status
:return str: path to flag file of indicated or current status.
"""
- flag_file_name = "{}_{}".format(
- self.name, flag_name(status or self.status))
+
+ flag_file_name = "{}_{}_{}".format(
+ self._pipestat_manager["_pipeline_name"],
+ self.pipestat_sample_name,
+ flag_name(status or self.status),
+ )
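+ # e.g. this yields a flag file named roughly "<pipeline_name>_<sample_name>_running.flag"
+ # in the pipeline outfolder (illustrative; the exact suffix comes from flag_name()).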
return pipeline_filepath(self, filename=flag_file_name)
###################################
# Process calling functions
###################################
- def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=False, follow=None, container=None):
+ def run(
+ self,
+ cmd,
+ target=None,
+ lock_name=None,
+ shell=None,
+ nofail=False,
+ clean=False,
+ follow=None,
+ container=None,
+ default_return_code=0,
+ ):
"""
The primary workhorse function of PipelineManager, this runs a command.
@@ -603,29 +836,55 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=
to an auto cleanup list. Optional.
:param callable follow: Function to call after executing (each) command.
:param str container: Name for Docker container in which to run commands.
+ :param int | None default_return_code: Return code to use when no command is
+ executed; lets callers distinguish runs that executed commands from runs that did not.
:return int: Return code of process. If a list of commands is passed,
this is the maximum of all return codes for all commands.
"""
+ def _max_ret_code(codes_list):
+ """
+ Return the maximum of a list of return codes.
+
+ :param list[int] codes_list: List of return codes to compare.
+ :return int | None: Maximum of the list, or None if no non-None codes remain.
+ """
+ # filter out codes that are None
+ codes_list = [code for code in codes_list if code is not None]
+ # get the max of the remaining codes
+ if codes_list:
+ return max(codes_list)
+ # if no codes are left, return None
+ return None
+
+ # validate default return code
+ if default_return_code is not None and not isinstance(default_return_code, int):
+ raise TypeError("default_return_code must be an int or None")
+
# If the pipeline's not been started, skip ahead.
if not self._active:
cmds = [cmd] if isinstance(cmd, str) else cmd
cmds_text = [c if isinstance(c, str) else " ".join(c) for c in cmds]
- self.info("Pipeline is inactive; skipping {} command(s):\n{}".
- format(len(cmds), "\n".join(cmds_text)))
- return 0
+ self.info(
+ "Pipeline is inactive; skipping {} command(s):\n{}".format(
+ len(cmds), "\n".join(cmds_text)
+ )
+ )
+ return default_return_code
# Short-circuit if the checkpoint file exists and the manager's not
# been configured to overwrite such files.
if self.curr_checkpoint is not None:
check_fpath = checkpoint_filepath(self.curr_checkpoint, self)
if os.path.isfile(check_fpath) and not self.overwrite_checkpoints:
- self.info("Checkpoint file exists for '{}' ('{}'), and the {} has "
- "been configured to not overwrite checkpoints; "
- "skipping command '{}'".format(
- self.curr_checkpoint, check_fpath,
- self.__class__.__name__, cmd))
- return 0
+ self.info(
+ "Checkpoint file exists for '{}' ('{}'), and the {} has "
+ "been configured to not overwrite checkpoints; "
+ "skipping command '{}'".format(
+ self.curr_checkpoint, check_fpath, self.__class__.__name__, cmd
+ )
+ )
+ return default_return_code
# TODO: consider making the logic such that locking isn't implied, or
# TODO (cont.): that we can make it otherwise such that it's not
@@ -634,25 +893,30 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=
# Therefore, a targetless command that you want
# to lock must specify a lock_name manually.
if target is None and lock_name is None:
- self.fail_pipeline(Exception(
- "You must provide either a target or a lock_name."))
+ self.fail_pipeline(
+ Exception("You must provide either a target or a lock_name.")
+ )
# Downstream code requires target to be a list, so convert if only
# a single item was given
if not is_multi_target(target) and target is not None:
target = [target]
- # Downstream code requires a list of locks; convert
+ # Downstream code requires a list of locks; convert
if isinstance(lock_name, str):
lock_name = [lock_name]
-
+
# Default lock_name (if not provided) is based on the target file name,
# but placed in the parent pipeline outfolder
- self.debug("Lock_name {}; target '{}', outfolder '{}'".format(lock_name, target, self.outfolder))
+ self.debug(
+ "Lock_name {}; target '{}', outfolder '{}'".format(
+ lock_name, target, self.outfolder
+ )
+ )
lock_name = lock_name or make_lock_name(target, self.outfolder)
lock_files = [self._make_lock_path(ln) for ln in lock_name]
- process_return_code = 0
+ process_return_code = default_return_code
local_maxmem = 0
# Decide how to do follow-up.
@@ -660,8 +924,11 @@ def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=
call_follow = lambda: None
elif not hasattr(follow, "__call__"):
# Warn about non-callable argument to follow-up function.
- self.warning("Follow-up function is not callable and won't be used: {}".
- format(type(follow)))
+ self.warning(
+ "Follow-up function is not callable and won't be used: {}".format(
+ type(follow)
+ )
+ )
call_follow = lambda: None
else:
# Wrap the follow-up function so that the log shows what's going on.
@@ -672,7 +939,6 @@ def call_follow():
follow()
self.in_follow = False
-
# The while=True loop here is unlikely to be triggered, and is just a
# wrapper to prevent race conditions; the lock_file must be created by
# the current loop. If not, we loop again and then re-do the tests.
@@ -684,18 +950,22 @@ def call_follow():
# is found that needs to be recovered or overwritten. It instructs us to
# ignore lock files on the next iteration.
local_recover = False
- local_newstart = False
+ local_newstart = False
proceed_through_locks = False
while True:
##### Tests block
# Base case: All targets exists and not set to overwrite targets break loop, don't run process.
# os.path.exists returns True for either a file or directory; .isfile is file-only
- if target is not None and all([os.path.exists(t) for t in target]) \
- and not any([os.path.isfile(l) for l in lock_files]) \
- and not local_newstart:
+ if (
+ target is not None
+ and all([os.path.exists(t) for t in target])
+ and not any([os.path.isfile(l) for l in lock_files])
+ and not local_newstart
+ ):
for tgt in target:
- if os.path.exists(tgt): self.info("Target exists: `" + tgt + "` ")
+ if os.path.exists(tgt):
+ self.info("Target exists: `" + tgt + "` ")
if self.new_start:
self.info("New start mode; run anyway. ")
# Set the local_newstart flag so the command will run anyway.
@@ -712,11 +982,17 @@ def call_follow():
for c in cmd:
count = len(parse_cmd(c, shell))
self.proc_count += count
- self.debug(increment_info_pattern.format(str(c), count, self.proc_count))
+ self.debug(
+ increment_info_pattern.format(
+ str(c), count, self.proc_count
+ )
+ )
else:
count = len(parse_cmd(cmd, shell))
self.proc_count += count
- self.debug(increment_info_pattern.format(str(cmd), count, self.proc_count))
+ self.debug(
+ increment_info_pattern.format(str(cmd), count, self.proc_count)
+ )
break # Do not run command
# Scenario 1: Lock file exists, but we're supposed to overwrite target; Run process.
@@ -729,8 +1005,10 @@ def call_follow():
self.info("Overwriting target...")
proceed_through_locks = True
elif os.path.isfile(recover_file):
- self.info("Found dynamic recovery file ({}); "
- "overwriting target...".format(recover_file))
+ self.info(
+ "Found dynamic recovery file ({}); "
+ "overwriting target...".format(recover_file)
+ )
# remove the lock file which will then be promptly re-created for the current run.
local_recover = True
proceed_through_locks = True
@@ -742,8 +1020,7 @@ def call_follow():
# time (to see if the target exists now)
continue
-
- # If you get to this point, the target doesn't exist, and the lock_file doesn't exist
+ # If you get to this point, the target doesn't exist, and the lock_file doesn't exist
# (or we should overwrite). create the lock (if you can)
# Initialize lock in master lock list
for lock_file in lock_files:
@@ -755,10 +1032,13 @@ def call_follow():
self._create_file_racefree(lock_file) # Create lock
except OSError as e:
if e.errno == errno.EEXIST: # File already exists
- self.info("Lock file created after test! Looping again: {}".format(
- lock_file))
+ self.info(
+ "Lock file created after test! Looping again: {}".format(
+ lock_file
+ )
+ )
- # Since a lock file was created by a different source,
+ # Since a lock file was created by a different source,
# we need to reset this flag to re-check the locks.
proceed_through_locks = False
continue # Go back to start
@@ -767,24 +1047,34 @@ def call_follow():
# If you make it past these tests, we should proceed to run the process.
if target is not None:
- self.info("Target to produce: {} ".format(",".join(['`'+x+'`' for x in target])))
+ self.info(
+ "Target to produce: {} ".format(
+ ",".join(["`" + x + "`" for x in target])
+ )
+ )
else:
self.info("Targetless command, running... ")
if isinstance(cmd, list): # Handle command lists
for cmd_i in cmd:
- list_ret, maxmem = \
- self.callprint(cmd_i, shell, lock_file, nofail, container)
+ list_ret, maxmem = self.callprint(
+ cmd_i, shell, lock_file, nofail, container
+ )
maxmem = max(maxmem) if isinstance(maxmem, Iterable) else maxmem
- local_maxmem = max(local_maxmem, maxmem)
- list_ret = max(list_ret) if isinstance(list_ret, Iterable) else list_ret
- process_return_code = max(process_return_code, list_ret)
+ local_maxmem = max(local_maxmem, maxmem)
+ list_ret = (
+ _max_ret_code(list_ret)
+ if isinstance(list_ret, Iterable)
+ else list_ret
+ )
+ process_return_code = _max_ret_code([process_return_code, list_ret])
else: # Single command (most common)
- process_return_code, local_maxmem = \
- self.callprint(cmd, shell, lock_file, nofail, container) # Run command
+ process_return_code, local_maxmem = self.callprint(
+ cmd, shell, lock_file, nofail, container
+ ) # Run command
if isinstance(process_return_code, list):
- process_return_code = max(process_return_code)
+ process_return_code = _max_ret_code(process_return_code)
# For temporary files, you can specify a clean option to automatically
# add them to the clean list, saving you a manual call to clean_add
@@ -806,7 +1096,7 @@ def checkprint(self, cmd, shell=None, nofail=False):
"""
Just like callprint, but checks output -- so you can get a variable
in python corresponding to the return value of the command you call.
- This is equivalent to running subprocess.check_output()
+ This is equivalent to running subprocess.check_output()
instead of subprocess.call().
:param str | Iterable[str] cmd: Bash command(s) to be run.
:param bool | str shell: Whether the command should be run in its own shell. Optional.
@@ -830,9 +1120,11 @@ def checkprint(self, cmd, shell=None, nofail=False):
if not shell:
if likely_shell:
- self.debug("Should this command run in a shell instead of directly in a subprocess?")
+ self.debug(
+ "Should this command run in a shell instead of directly in a subprocess?"
+ )
cmd = shlex.split(cmd)
-
+
try:
return subprocess.check_output(cmd, shell=shell).decode().strip()
except Exception as e:
@@ -841,7 +1133,7 @@ def checkprint(self, cmd, shell=None, nofail=False):
def _attend_process(self, proc, sleeptime):
"""
Waits on a process for a given time to see if it finishes, returns True
- if it's still running after the given time or False as soon as it
+ if it's still running after the given time or False as soon as it
returns.
:param psutil.Popen proc: Process object opened by psutil.Popen()
@@ -892,10 +1184,12 @@ def get_mem_child_sum(proc):
if children:
mem_sum += sum([x.memory_info().rss for x in children])
# return in gigs
- return mem_sum/1e9
+ return mem_sum / 1e9
except (psutil.NoSuchProcess, psutil.ZombieProcess) as e:
self.warning(e)
- self.warning("Warning: couldn't add memory use for process: {}".format(proc.pid))
+ self.warning(
+ "Warning: couldn't add memory use for process: {}".format(proc.pid)
+ )
return 0
def display_memory(memval):
@@ -910,7 +1204,11 @@ def make_hash(o):
try:
hsh = md5(str(o).encode("utf-8")).hexdigest()[:10]
except Exception as e:
- self.debug("Could not create hash for '{}', caught exception: {}".format(str(o), e.__class__.__name__))
+ self.debug(
+ "Could not create hash for '{}', caught exception: {}".format(
+ str(o), e.__class__.__name__
+ )
+ )
hsh = None
return hsh
@@ -943,7 +1241,7 @@ def make_hash(o):
"container": container,
"p": processes[-1],
"args_hash": make_hash(conc_cmd),
- "local_proc_id": self.process_counter()
+ "local_proc_id": self.process_counter(),
}
self._report_command(cmd, [x.pid for x in processes])
@@ -969,16 +1267,22 @@ def proc_wrapup(i):
current_pid = processes[i].pid
info = "PID: {pid};\tCommand: {cmd};\tReturn code: {ret};\tMemory used: {mem}".format(
- pid=current_pid,
+ pid=current_pid,
cmd=self.running_procs[current_pid]["proc_name"],
ret=processes[i].returncode,
- mem=display_memory(local_maxmems[i]))
-
+ mem=display_memory(local_maxmems[i]),
+ )
+
# report process profile
- self._report_profile(self.running_procs[current_pid]["proc_name"], lock_file,
- time.time() - self.running_procs[current_pid]["start_time"], local_maxmems[i],
- current_pid, self.running_procs[current_pid]["args_hash"],
- self.running_procs[current_pid]["local_proc_id"])
+ self._report_profile(
+ self.running_procs[current_pid]["proc_name"],
+ lock_file,
+ time.time() - self.running_procs[current_pid]["start_time"],
+ local_maxmems[i],
+ current_pid,
+ self.running_procs[current_pid]["args_hash"],
+ self.running_procs[current_pid]["local_proc_id"],
+ )
# Remove this as a running subprocess
self.running_procs[current_pid]["info"] = info
@@ -991,29 +1295,37 @@ def proc_wrapup(i):
returncodes[i] = returncode
return info
- sleeptime = .0001
-
+ sleeptime = 0.0001
+
while running_processes:
self.debug("running")
for i in running_processes:
- local_maxmems[i] = max(local_maxmems[i], (get_mem_child_sum(processes[i])))
+ local_maxmems[i] = max(
+ local_maxmems[i], (get_mem_child_sum(processes[i]))
+ )
self.peak_memory = max(self.peak_memory, local_maxmems[i])
self.debug(processes[i])
if not self._attend_process(processes[i], sleeptime):
proc_wrapup_text[i] = proc_wrapup(i)
- # the sleeptime is extremely short at the beginning and gets longer exponentially
+ # the sleeptime is extremely short at the beginning and gets longer exponentially
# (+ constant to prevent copious checks at the very beginning)
# = more precise mem tracing for short processes
- sleeptime = min((sleeptime + 0.25) * 3, 60/len(processes))
+ sleeptime = min((sleeptime + 0.25) * 3, 60 / len(processes))
# All jobs are done, print a final closing and job info
stop_time = time.time()
proc_message = "Command completed. {info}"
- info = "Elapsed time: " + str(datetime.timedelta(seconds=self.time_elapsed(start_time))) + "."
- info += " Running peak memory: {pipe}.".format(pipe=display_memory(self.peak_memory))
+ info = (
+ "Elapsed time: "
+ + str(datetime.timedelta(seconds=self.time_elapsed(start_time)))
+ + "."
+ )
+ info += " Running peak memory: {pipe}.".format(
+ pipe=display_memory(self.peak_memory)
+ )
# if len(proc_wrapup_text) == 1:
- # info += " {}".format(proc_wrapup_text[0])
+ # info += " {}".format(proc_wrapup_text[0])
for i in completed_processes:
info += " \n {}".format(self.completed_procs[processes[i].pid]["info"])
@@ -1024,7 +1336,9 @@ def proc_wrapup(i):
for rc in returncodes:
if rc != 0:
- msg = "Subprocess returned nonzero result. Check above output for details"
+ msg = (
+ "Subprocess returned nonzero result. Check above output for details"
+ )
self._triage_error(SubprocessError(msg), nofail)
return [returncodes, local_maxmems]
@@ -1059,7 +1373,7 @@ def _wait_for_process(self, p, shell=False):
:param bool shell: Whether the command should be run in its own shell. Optional. Default: False.
"""
local_maxmem = -1
- sleeptime = .5
+ sleeptime = 0.5
while p.poll() is None:
if not shell:
local_maxmem = max(local_maxmem, self._memory_usage(p.pid) / 1e6)
@@ -1068,7 +1382,7 @@ def _wait_for_process(self, p, shell=False):
sleeptime = min(sleeptime + 5, 60)
self.peak_memory = max(self.peak_memory, local_maxmem)
-
+
del self.running_procs[p.pid]
info = "Process " + str(p.pid) + " returned: (" + str(p.returncode) + ")."
@@ -1087,7 +1401,7 @@ def _wait_for_lock(self, lock_file):
:param str lock_file: Lock file to wait upon.
"""
- sleeptime = .5
+ sleeptime = 0.5
first_message_flag = False
long_message_flag = False
dot_count = 0
@@ -1096,12 +1410,18 @@ def _wait_for_lock(self, lock_file):
while os.path.isfile(lock_file):
if first_message_flag is False:
self.timestamp("Waiting for file lock: " + lock_file)
- self.warning("This indicates that another process may be executing this "
+ self.warning(
+ "This indicates that another process may be executing this "
"command, or the pipeline was not properly shut down. If the "
"pipeline was not properly shut down last time, "
"you should restart it in 'recover' mode (-R) to indicate that "
- "this step should be restarted.")
- self._set_status_flag(WAIT_FLAG)
+ "this step should be restarted."
+ )
+ # self._set_status_flag(WAIT_FLAG)
+ self.pipestat.set_status(
+ sample_name=self._pipestat_manager.sample_name,
+ status_identifier="waiting",
+ )
first_message_flag = True
else:
sys.stdout.write(".")
@@ -1121,7 +1441,11 @@ def _wait_for_lock(self, lock_file):
if first_message_flag:
self.timestamp("File unlocked.")
- self._set_status_flag(RUN_FLAG)
+ # self._set_status_flag(RUN_FLAG)
+ self.pipestat.set_status(
+ sample_name=self._pipestat_manager.sample_name,
+ status_identifier="running",
+ )
###################################
# Logging functions
@@ -1145,8 +1469,7 @@ def critical(self, msg, *args, **kwargs):
def fatal(self, msg, *args, **kwargs):
self._logger.fatal(msg, *args, **kwargs)
- def timestamp(self, message="", checkpoint=None,
- finished=False, raise_error=True):
+ def timestamp(self, message="", checkpoint=None, finished=False, raise_error=True):
"""
Print message, time, and time elapsed, perhaps creating checkpoint.
@@ -1189,7 +1512,9 @@ def timestamp(self, message="", checkpoint=None,
self.curr_checkpoint = checkpoint
self._checkpoint(self.prev_checkpoint)
# Handle the two halting conditions.
- if (finished and checkpoint == self.stop_after) or (not finished and checkpoint == self.stop_before):
+ if (finished and checkpoint == self.stop_after) or (
+ not finished and checkpoint == self.stop_before
+ ):
self.halt(checkpoint, finished, raise_error=raise_error)
# Determine if we've started executing.
elif checkpoint == self.start_point:
@@ -1203,13 +1528,17 @@ def timestamp(self, message="", checkpoint=None,
elapsed = self.time_elapsed(self.last_timestamp)
t = time.strftime("%m-%d %H:%M:%S")
if checkpoint is None:
- msg = "{m} ({t}) elapsed: {delta_t} _TIME_".\
- format(m=message, t=t, delta_t=elapsed)
+ msg = "{m} ({t}) elapsed: {delta_t} _TIME_".format(
+ m=message, t=t, delta_t=elapsed
+ )
else:
- msg = "{m} ({t}) ({status} {stage}) elapsed: {delta_t} _TIME_".\
- format(m=message, t=t,
- status="finished" if finished else "starting",
- stage=checkpoint, delta_t=elapsed)
+ msg = "{m} ({t}) ({status} {stage}) elapsed: {delta_t} _TIME_".format(
+ m=message,
+ t=t,
+ status="finished" if finished else "starting",
+ stage=checkpoint,
+ delta_t=elapsed,
+ )
if re.match("^###", message):
msg = "\n{}\n".format(msg)
self.info(msg)
@@ -1224,59 +1553,78 @@ def time_elapsed(time_since):
"""
return round(time.time() - time_since, 0)
- def _report_profile(self, command, lock_name, elapsed_time, memory, pid, args_hash, proc_count):
+ def _report_profile(
+ self, command, lock_name, elapsed_time, memory, pid, args_hash, proc_count
+ ):
"""
Writes a string to self.pipeline_profile_file.
"""
- rel_lock_name = lock_name if lock_name is None else os.path.relpath(lock_name, self.outfolder)
- message_raw = str(pid) + "\t" + \
- str(args_hash) + "\t" + \
- str(proc_count) + "\t" + \
- str(datetime.timedelta(seconds=round(elapsed_time, 2))) + "\t " + \
- str(round(memory, 4)) + "\t" + \
- str(command) + "\t" + \
- str(rel_lock_name)
+ rel_lock_name = (
+ lock_name
+ if lock_name is None
+ else os.path.relpath(lock_name, self.outfolder)
+ )
+ message_raw = (
+ str(pid)
+ + "\t"
+ + str(args_hash)
+ + "\t"
+ + str(proc_count)
+ + "\t"
+ + str(datetime.timedelta(seconds=round(elapsed_time, 2)))
+ + "\t "
+ + str(round(memory, 4))
+ + "\t"
+ + str(command)
+ + "\t"
+ + str(rel_lock_name)
+ )
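+ # Row layout (tab-separated, matching the construction above): pid, args hash,
+ # process counter, elapsed time, peak memory, command, lock file (relative path).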
with open(self.pipeline_profile_file, "a") as myfile:
myfile.write(message_raw + "\n")
- def report_result(self, key, value, annotation=None, nolog=False):
+ def report_result(self, key, value, nolog=False, result_formatter=None):
"""
- Writes a string to self.pipeline_stats_file.
-
+ Writes a key:value pair to self.pipeline_stats_file.
+
:param str key: name (key) of the stat
- :param str annotation: By default, the stats will be annotated with the
- pipeline name, so you can tell which pipeline records which stats.
- If you want, you can change this; use annotation='shared' if you
- need the stat to be used by another pipeline (using get_stat()).
+ :param Any value: value of the stat to report.
:param bool nolog: Turn on this flag to NOT print this result in the
logfile. Use sparingly in case you will be printing the result in a
different format.
- """
- # Default annotation is current pipeline name.
- annotation = str(annotation or self.name)
-
- # In case the value is passed with trailing whitespace.
- value = str(value).strip()
+ :param callable result_formatter: function for formatting results via the pipestat backend
+ :return list[str] reported_result: the reported result as a list of formatted strings.
+ """
# keep the value in memory:
self.stats_dict[key] = value
- message_raw = "{key}\t{value}\t{annotation}".format(
- key=key, value=value, annotation=annotation)
- message_markdown = "\n> `{key}`\t{value}\t{annotation}\t_RES_".format(
- key=key, value=value, annotation=annotation)
+ rf = result_formatter or self.pipestat_result_formatter
+
+ reported_result = self.pipestat.report(
+ values={key: value},
+ sample_name=self.pipestat_sample_name,
+ result_formatter=rf,
+ )
if not nolog:
- self.info(message_markdown)
+ for r in reported_result:
+ self.info(r)
- # Just to be extra careful, let's lock the file while we we write
- # in case multiple pipelines write to the same file.
- self._safe_write_to_file(self.pipeline_stats_file, message_raw)
+ return reported_result
- def report_object(self, key, filename, anchor_text=None, anchor_image=None, annotation=None):
+ def report_object(
+ self,
+ key,
+ filename,
+ anchor_text=None,
+ anchor_image=None,
+ annotation=None,
+ nolog=False,
+ result_formatter=None,
+ ):
"""
- Writes a string to self.pipeline_objects_file. Used to report figures
- and others.
+ Writes a key:value pair to self.pipeline_stats_file. Note: this function
+ will be deprecated. Using report_result is recommended.
:param str key: name (key) of the object
:param str filename: relative path to the file (relative to parent
@@ -1289,74 +1637,63 @@ def report_object(self, key, filename, anchor_text=None, anchor_image=None, anno
:param str annotation: By default, the figures will be annotated with
the pipeline name, so you can tell which pipeline records which
figures. If you want, you can change this.
- """
-
+ :param bool nolog: Turn on this flag to NOT print this result in the
+ logfile. Use sparingly in case you will be printing the result in a
+ different format.
+ :param callable result_formatter: function for formatting results via the pipestat backend
+ :return list[str] reported_result: the reported result as a list of formatted strings.
+ """
+ warnings.warn(
+ "This function may be removed in future release. "
+ "The recommended way to report pipeline results is using PipelineManager.pipestat.report().",
+ category=DeprecationWarning,
+ )
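+ # Preferred replacement (hypothetical example, mirroring the pipestat call used below):
+ #   pm.pipestat.report(values={"tss_plot": "QC/tss.png"},
+ #                      sample_name=pm.pipestat_sample_name)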
+ rf = result_formatter or self.pipestat_result_formatter
# Default annotation is current pipeline name.
annotation = str(annotation or self.name)
-
# In case the value is passed with trailing whitespace.
filename = str(filename).strip()
if anchor_text:
anchor_text = str(anchor_text).strip()
else:
anchor_text = str(key).strip()
-
# better to use a relative path in this file
# convert any absolute paths into relative paths
- relative_filename = os.path.relpath(filename, self.outfolder) \
- if os.path.isabs(filename) else filename
+ relative_filename = (
+ os.path.relpath(filename, self.outfolder)
+ if os.path.isabs(filename)
+ else filename
+ )
if anchor_image:
- relative_anchor_image = os.path.relpath(anchor_image, self.outfolder) \
- if os.path.isabs(anchor_image) else anchor_image
+ relative_anchor_image = (
+ os.path.relpath(anchor_image, self.outfolder)
+ if os.path.isabs(anchor_image)
+ else anchor_image
+ )
else:
relative_anchor_image = "None"
- message_raw = "{key}\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format(
- key=key, filename=relative_filename, anchor_text=anchor_text,
- anchor_image=relative_anchor_image, annotation=annotation)
-
- message_markdown = "> `{key}`\t{filename}\t{anchor_text}\t{anchor_image}\t{annotation}\t_OBJ_".format(
- key=key, filename=relative_filename, anchor_text=anchor_text,
- anchor_image=relative_anchor_image, annotation=annotation)
-
- self.warning(message_markdown)
+ message_raw = "{filename}\t{anchor_text}\t{anchor_image}\t{annotation}".format(
+ filename=relative_filename,
+ anchor_text=anchor_text,
+ anchor_image=relative_anchor_image,
+ annotation=annotation,
+ )
- self._safe_write_to_file(self.pipeline_objects_file, message_raw)
-
- def _safe_write_to_file(self, file, message):
- """
- Writes a string to a file safely (with file locks).
- """
- target = file
- lock_name = make_lock_name(target, self.outfolder)
- lock_file = self._make_lock_path(lock_name)
+ val = {key: message_raw.replace("\t", " ")}
- while True:
- if os.path.isfile(lock_file):
- self._wait_for_lock(lock_file)
- else:
- try:
- self.locks.append(lock_file)
- self._create_file_racefree(lock_file)
- except OSError as e:
- if e.errno == errno.EEXIST:
- self.warning("Lock file created after test! Looping again.")
- continue # Go back to start
-
- # Proceed with file writing
- with open(file, "a") as myfile:
- myfile.write(message + "\n")
-
- os.remove(lock_file)
- self.locks.remove(lock_file)
-
- # If you make it to the end of the while loop, you're done
- break
+ reported_result = self.pipestat.report(
+ values=val, sample_name=self.pipestat_sample_name, result_formatter=rf
+ )
+ if not nolog:
+ for r in reported_result:
+ self.info(r)
+ return reported_result
def _report_command(self, cmd, procs=None):
"""
- Writes a command to both stdout and to the commands log file
+ Writes a command to both stdout and to the commands log file
(self.pipeline_commands_file).
:param str cmd: command to report
@@ -1385,22 +1722,22 @@ def _report_command(self, cmd, procs=None):
@staticmethod
def _create_file(file):
"""
- Creates a file, but will not fail if the file already exists.
- This is vulnerable to race conditions; use this for cases where it
+ Creates a file, but will not fail if the file already exists.
+ This is vulnerable to race conditions; use this for cases where it
doesn't matter if this process is the one that created the file.
:param str file: File to create.
"""
- with open(file, 'w') as fout:
- fout.write('')
+ with open(file, "w") as fout:
+ fout.write("")
@staticmethod
def _create_file_racefree(file):
"""
Creates a file, but fails if the file already exists.
-
- This function will thus only succeed if this process actually creates
- the file; if the file already exists, it will cause an OSError,
+
+ This function will thus only succeed if this process actually creates
+ the file; if the file already exists, it will cause an OSError,
solving race conditions.
:param str file: File to create.
@@ -1411,15 +1748,18 @@ def _create_file_racefree(file):
@staticmethod
def _ensure_lock_prefix(lock_name_base):
- """ Ensure that an alleged lock file is correctly prefixed. """
- return lock_name_base if lock_name_base.startswith(LOCK_PREFIX) \
- else LOCK_PREFIX + lock_name_base
+ """Ensure that an alleged lock file is correctly prefixed."""
+ return (
+ lock_name_base
+ if lock_name_base.startswith(LOCK_PREFIX)
+ else LOCK_PREFIX + lock_name_base
+ )
def _make_lock_path(self, lock_name_base):
"""
Create path to lock file with given name as base.
-
- :param str lock_name_base: Lock file name, designed to not be prefixed
+
+ :param str lock_name_base: Lock file name, designed to not be prefixed
with the lock file designation, but that's permitted.
:return str: Path to the lock file.
"""
@@ -1436,8 +1776,8 @@ def _make_lock_path(self, lock_name_base):
def _recoverfile_from_lockfile(self, lockfile):
"""
Create path to recovery file with given name as base.
-
- :param str lockfile: Name of file on which to base this path,
+
+ :param str lockfile: Name of file on which to base this path,
perhaps already prefixed with the designation of a lock file.
:return str: Path to recovery file.
"""
@@ -1453,7 +1793,7 @@ def make_sure_path_exists(path):
Creates all directories in a path if it does not exist.
:param str path: Path to create.
- :raises Exception: if the path creation attempt hits an error with
+ :raises Exception: if the path creation attempt hits an error with
a code indicating a cause other than pre-existence.
"""
try:
@@ -1468,41 +1808,32 @@ def make_sure_path_exists(path):
def _refresh_stats(self):
"""
- Loads up the stats sheet created for this pipeline run and reads
+ Loads up the stats yaml created for this pipeline run and reads
those stats into memory
"""
- # regex identifies all possible stats files.
- #regex = self.outfolder + "*_stats.tsv"
- #stats_files = glob.glob(regex)
- #stats_files.insert(self.pipeline_stats_file) # last one is the current pipeline
- #for stats_file in stats_files:
-
- stats_file = self.pipeline_stats_file
if os.path.isfile(self.pipeline_stats_file):
- with open(stats_file, 'r') as stat_file:
- for line in stat_file:
- try:
- # Someone may have put something that's not 3 columns in the stats file
- # if so, shame on him, but we can just ignore it.
- key, value, annotation = line.split('\t')
- except ValueError:
- self.warning("WARNING: Each row in a stats file is expected to have 3 columns")
-
- if annotation.rstrip() == self.name or annotation.rstrip() == "shared":
- self.stats_dict[key] = value.strip()
- #if os.path.isfile(self.pipeline_stats_file):
+ _, data = read_yaml_data(path=self.pipeline_stats_file, what="stats_file")
+ self.debug(data)
+ pipeline_key = list(
+ data[self.pipestat["_pipeline_name"]][self.pipestat["_pipeline_type"]]
+ )[0]
+ if self.name == pipeline_key:
+ for key, value in data[self.pipestat["_pipeline_name"]][
+ self.pipestat["_pipeline_type"]
+ ][pipeline_key].items():
+ self.stats_dict[key] = value.strip()
def get_stat(self, key):
"""
Returns a stat that was previously reported. This is necessary for reporting new stats that are
- derived from two stats, one of which may have been reported by an earlier run. For example,
+ derived from two stats, one of which may have been reported by an earlier run. For example,
if you first use report_result to report (number of trimmed reads), and then in a later stage
- want to report alignment rate, then this second stat (alignment rate) will require knowing the
+ want to report alignment rate, then this second stat (alignment rate) will require knowing the
first stat (number of trimmed reads); however, that may not have been calculated in the current
- pipeline run, so we must retrieve it from the stats.tsv output file. This command will retrieve
+ pipeline run, so we must retrieve it from the stats.yaml output file. This command will retrieve
such previously reported stats if they were not already calculated in the current pipeline run.
- :param key: key of stat to retrieve
+ :param key: key of stat to retrieve
"""
try:
@@ -1562,9 +1893,12 @@ def _checkpoint(self, stage):
# be expected to characterize the extension of a file name/path.
base, ext = os.path.splitext(stage)
if ext and "." not in base:
- self.warning("WARNING: '{}' looks like it may be the name or path of "
- "a file; for such a checkpoint, use touch_checkpoint.".
- format(stage))
+ self.warning(
+ "WARNING: '{}' looks like it may be the name or path of "
+ "a file; for such a checkpoint, use touch_checkpoint.".format(
+ stage
+ )
+ )
else:
if not is_checkpoint:
self.warning("Not a checkpoint: {}".format(stage))
@@ -1596,9 +1930,12 @@ def _touch_checkpoint(self, check_file):
other_folder = os.path.join(folder, "")
this_folder = os.path.join(self.outfolder, "")
if other_folder != this_folder:
- errmsg = "Path provided as checkpoint file isn't in pipeline " \
- "output folder. '{}' is not in '{}'".format(
- check_file, self.outfolder)
+ errmsg = (
+ "Path provided as checkpoint file isn't in pipeline "
+ "output folder. '{}' is not in '{}'".format(
+ check_file, self.outfolder
+ )
+ )
raise ValueError(errmsg)
fpath = check_file
else:
@@ -1607,14 +1944,14 @@ def _touch_checkpoint(self, check_file):
# Create/update timestamp for checkpoint, but base return value on
# whether the action was a simple update or a novel creation.
already_exists = os.path.isfile(fpath)
- open(fpath, 'w').close()
+ open(fpath, "w").close()
action = "Updated" if already_exists else "Created"
self.info("{} checkpoint file: '{}'".format(action, fpath))
return already_exists
def complete(self):
- """ Stop a completely finished pipeline. """
+ """Stop a completely finished pipeline."""
self.stop_pipeline(status=COMPLETE_FLAG)
def fail_pipeline(self, exc, dynamic_recover=False):
@@ -1652,7 +1989,11 @@ def fail_pipeline(self, exc, dynamic_recover=False):
total_time = datetime.timedelta(seconds=self.time_elapsed(self.starttime))
self.info("Total time: " + str(total_time))
self.info("Failure reason: " + str(exc))
- self._set_status_flag(FAIL_FLAG)
+ # self._set_status_flag(FAIL_FLAG)
+ self.pipestat.set_status(
+ sample_name=self._pipestat_manager.sample_name,
+ status_identifier="failed",
+ )
if isinstance(exc, str):
exc = RuntimeError(exc)
@@ -1683,16 +2024,21 @@ def get_elapsed_time(self):
:return int: sum of runtimes in seconds
"""
if os.path.isfile(self.pipeline_profile_file):
- df = _pd.read_csv(self.pipeline_profile_file, sep="\t", comment="#", names=PROFILE_COLNAMES)
+ df = _pd.read_csv(
+ self.pipeline_profile_file,
+ sep="\t",
+ comment="#",
+ names=PROFILE_COLNAMES,
+ )
try:
- df['runtime'] = _pd.to_timedelta(df['runtime'])
+ df["runtime"] = _pd.to_timedelta(df["runtime"])
except ValueError:
# return runtime estimate
# this happens if old profile style is mixed with the new one
# and the columns do not match
return self.time_elapsed(self.starttime)
- unique_df = df[~df.duplicated('cid', keep='last').values]
- return sum(unique_df['runtime'].apply(lambda x: x.total_seconds()))
+ unique_df = df[~df.duplicated("cid", keep="last").values]
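+ # e.g. if a command was profiled more than once under the same "cid" (a restarted
+ # run), only the last row is kept, so restarts do not double-count elapsed time.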
+ return sum(unique_df["runtime"].apply(lambda x: x.total_seconds()))
return self.time_elapsed(self.starttime)
def stop_pipeline(self, status=COMPLETE_FLAG):
@@ -1701,30 +2047,41 @@ def stop_pipeline(self, status=COMPLETE_FLAG):
This is the "healthy" pipeline completion function.
The normal pipeline completion function, to be run by the pipeline
- at the end of the script. It sets status flag to completed and records
+ at the end of the script. It sets status flag to completed and records
some time and memory statistics to the log file.
"""
- self._set_status_flag(status)
+ # self._set_status_flag(status)
+ self.pipestat.set_status(
+ sample_name=self._pipestat_manager.sample_name, status_identifier=status
+ )
self._cleanup()
- elapsed_time_this_run = str(datetime.timedelta(seconds=self.time_elapsed(self.starttime)))
- self.report_result("Time",
- elapsed_time_this_run,
- nolog=True)
- self.report_result("Success",
- time.strftime("%m-%d-%H:%M:%S"),
- nolog=True)
+ elapsed_time_this_run = str(
+ datetime.timedelta(seconds=self.time_elapsed(self.starttime))
+ )
+ self.report_result("Time", elapsed_time_this_run, nolog=True)
+ self.report_result("Success", time.strftime("%m-%d-%H:%M:%S"), nolog=True)
self.info("\n### Pipeline completed. Epilogue")
# print("* " + "Total elapsed time".rjust(20) + ": "
# + str(datetime.timedelta(seconds=self.time_elapsed(self.starttime))))
- self.info("* " + "Elapsed time (this run)".rjust(30) + ": " +
- elapsed_time_this_run)
- self.info("* " + "Total elapsed time (all runs)".rjust(30) + ": " +
- str(datetime.timedelta(seconds=round(self.get_elapsed_time()))))
- self.info("* " + "Peak memory (this run)".rjust(30) + ": " +
- str(round(self.peak_memory, 4)) + " GB")
- # self.info("* " + "Total peak memory (all runs)".rjust(30) + ": " +
- # str(round(self.peak_memory, 4)) + " GB")
+ self.info(
+ "* " + "Elapsed time (this run)".rjust(30) + ": " + elapsed_time_this_run
+ )
+ self.info(
+ "* "
+ + "Total elapsed time (all runs)".rjust(30)
+ + ": "
+ + str(datetime.timedelta(seconds=round(self.get_elapsed_time())))
+ )
+ self.info(
+ "* "
+ + "Peak memory (this run)".rjust(30)
+ + ": "
+ + str(round(self.peak_memory, 4))
+ + " GB"
+ )
+ # self.info("* " + "Total peak memory (all runs)".rjust(30) + ": " +
+ # str(round(self.peak_memory, 4)) + " GB")
if self.halted:
return
@@ -1745,7 +2102,7 @@ def _signal_term_handler(self, signal, frame):
"""
signal_type = "SIGTERM"
self._generic_signal_handler(signal_type)
-
+
def _generic_signal_handler(self, signal_type):
"""
Function for handling both SIGTERM and SIGINT
@@ -1764,7 +2121,7 @@ def _generic_signal_handler(self, signal_type):
# passed directly to the tee subprocess, so I could handle that on
# my own; hence, now I believe I no longer need to do this. I'm
# leaving this code here as a relic in case something comes up.
- #with open(self.pipeline_log_file, "a") as myfile:
+ # with open(self.pipeline_log_file, "a") as myfile:
# myfile.write(message + "\n")
def _signal_int_handler(self, signal, frame):
@@ -1799,10 +2156,9 @@ def _exit_handler(self):
self.fail_pipeline(Exception("Pipeline failure. See details above."))
if self.tee:
- self.tee.kill()
+ self.tee.kill()
def _terminate_running_subprocesses(self):
-
# make a copy of the list to iterate over since we'll be removing items
for pid in self.running_procs.copy():
proc_dict = self.running_procs[pid]
@@ -1810,9 +2166,18 @@ def _terminate_running_subprocesses(self):
# Close the preformat tag that we opened when the process was spawned.
# record profile of any running processes before killing
elapsed_time = time.time() - self.running_procs[pid]["start_time"]
- process_peak_mem = self._memory_usage(pid, container=proc_dict["container"])/1e6
- self._report_profile(self.running_procs[pid]["proc_name"], None, elapsed_time, process_peak_mem, pid,
- self.running_procs[pid]["args_hash"], self.running_procs[pid]["local_proc_id"])
+ process_peak_mem = (
+ self._memory_usage(pid, container=proc_dict["container"]) / 1e6
+ )
+ self._report_profile(
+ self.running_procs[pid]["proc_name"],
+ None,
+ elapsed_time,
+ process_peak_mem,
+ pid,
+ self.running_procs[pid]["args_hash"],
+ self.running_procs[pid]["local_proc_id"],
+ )
self._kill_child_process(pid, proc_dict["proc_name"])
del self.running_procs[pid]
@@ -1842,10 +2207,10 @@ def pskill(proc_pid, sig=signal.SIGINT):
if proc_name:
proc_string = " ({proc_name})".format(proc_name=proc_name)
- # First a gentle kill
+ # First a gentle kill
sys.stdout.flush()
still_running = self._attend_process(psutil.Process(child_pid), 0)
- sleeptime = .25
+ sleeptime = 0.25
time_waiting = 0
while still_running and time_waiting < 3:
@@ -1873,9 +2238,12 @@ def pskill(proc_pid, sig=signal.SIGINT):
if still_running:
# still running!?
- self.warning("Child process {child_pid}{proc_string} never responded"
- "I just can't take it anymore. I don't know what to do...".format(child_pid=child_pid,
- proc_string=proc_string))
+ self.warning(
+ "Child process {child_pid}{proc_string} never responded"
+ "I just can't take it anymore. I don't know what to do...".format(
+ child_pid=child_pid, proc_string=proc_string
+ )
+ )
else:
if time_waiting > 0:
note = "terminated after {time} sec".format(time=int(time_waiting))
@@ -1883,12 +2251,13 @@ def pskill(proc_pid, sig=signal.SIGINT):
note = "was already terminated"
msg = "Child process {child_pid}{proc_string} {note}.".format(
- child_pid=child_pid, proc_string=proc_string, note=note)
+ child_pid=child_pid, proc_string=proc_string, note=note
+ )
self.info(msg)
@staticmethod
def _atexit_register(*args):
- """ Convenience alias to register exit functions without having to import atexit in the pipeline. """
+ """Convenience alias to register exit functions without having to import atexit in the pipeline."""
atexit.register(*args)
def get_container(self, image, mounts):
@@ -1954,11 +2323,17 @@ def clean_add(self, regex, conditional=False, manual=False):
try:
with open(self.cleanup_file, "a") as myfile:
if os.path.isabs(filename):
- relative_filename = os.path.relpath(filename, self.outfolder)
+ relative_filename = os.path.relpath(
+ filename, self.outfolder
+ )
absolute_filename = filename
else:
- relative_filename = os.path.relpath(filename, self.outfolder)
- absolute_filename = os.path.abspath(os.path.join(self.outfolder, relative_filename))
+ relative_filename = os.path.relpath(
+ filename, self.outfolder
+ )
+ absolute_filename = os.path.abspath(
+ os.path.join(self.outfolder, relative_filename)
+ )
if os.path.isfile(absolute_filename):
# print("Adding file to cleanup: {}".format(filename))
myfile.write("rm " + relative_filename + "\n")
@@ -1969,9 +2344,15 @@ def clean_add(self, regex, conditional=False, manual=False):
# and the directory itself
myfile.write("rmdir " + relative_filename + "\n")
else:
- self.info("File not added to cleanup: {}".format(relative_filename))
+ self.info(
+ "File not added to cleanup: {}".format(
+ relative_filename
+ )
+ )
except Exception as e:
- self.error("Error in clean_add on path {}: {}".format(filename, str(e)))
+ self.error(
+ "Error in clean_add on path {}: {}".format(filename, str(e))
+ )
elif conditional:
self.cleanup_list_conditional.append(regex)
else:
@@ -1998,9 +2379,11 @@ def _cleanup(self, dry_run=False):
n_to_clean_cond = len(self.cleanup_list_conditional)
if n_to_clean + n_to_clean_cond > 0:
- self.info("Starting cleanup: {} files; {} conditional files for cleanup".format(
- n_to_clean,
- n_to_clean_cond))
+ self.info(
+ "Starting cleanup: {} files; {} conditional files for cleanup".format(
+ n_to_clean, n_to_clean_cond
+ )
+ )
else:
self.debug("No files to clean.")
@@ -2034,9 +2417,17 @@ def _cleanup(self, dry_run=False):
if n_to_clean_cond > 0:
run_flag = flag_name(RUN_FLAG)
- flag_files = [fn for fn in glob.glob(self.outfolder + flag_name("*"))
- if COMPLETE_FLAG not in os.path.basename(fn)
- and not "{}_{}".format(self.name, run_flag) == os.path.basename(fn)]
+ flag_files = [
+ fn
+ for fn in glob.glob(self.outfolder + flag_name("*"))
+ if COMPLETE_FLAG not in os.path.basename(fn)
+ and not "{}_{}_{}".format(
+ self._pipestat_manager["_pipeline_name"],
+ self.pipestat_sample_name,
+ run_flag,
+ )
+ == os.path.basename(fn)
+ ]
if len(flag_files) == 0 and not dry_run:
self.info("\nCleaning up conditional list. . .")
for expr in self.cleanup_list_conditional:
@@ -2055,9 +2446,14 @@ def _cleanup(self, dry_run=False):
except:
pass
else:
- self.info("\nConditional flag found: " + str([os.path.basename(i) for i in flag_files]))
- self.info("\nThese conditional files were left in place:\n\n- " +
- "\n- ".join(self.cleanup_list_conditional))
+ self.info(
+ "\nConditional flag found: "
+ + str([os.path.basename(i) for i in flag_files])
+ )
+ self.info(
+ "\nThese conditional files were left in place:\n\n- "
+ + "\n- ".join(self.cleanup_list_conditional)
+ )
# Produce a cleanup script.
no_cleanup_script = []
for cleandir in self.cleanup_list_conditional:
@@ -2071,10 +2467,13 @@ def _cleanup(self, dry_run=False):
clean_script.write("rmdir " + clean_item + "\n")
except Exception as e:
no_cleanup_script.append(cleandir)
- if no_cleanup_script:
- self.warning('\n\nCould not produce cleanup script for item(s):\n\n- ' + '\n- '.join(no_cleanup_script))
+ if no_cleanup_script:
+ self.warning(
+ "\n\nCould not produce cleanup script for item(s):\n\n- "
+ + "\n- ".join(no_cleanup_script)
+ )
- def _memory_usage(self, pid='self', category="hwm", container=None):
+ def _memory_usage(self, pid="self", category="hwm", container=None):
"""
Memory usage of the process in kilobytes.
@@ -2087,8 +2486,8 @@ def _memory_usage(self, pid='self', category="hwm", container=None):
cmd = "docker stats " + container + " --format '{{.MemUsage}}' --no-stream"
mem_use_str = subprocess.check_output(cmd, shell=True).decode()
- mem_num = re.findall('[\d\.]+', mem_use_str.split("/")[0])[0]
- mem_scale = re.findall('[A-Za-z]+', mem_use_str.split("/")[0])[0]
+ mem_num = re.findall(r"[\d\.]+", mem_use_str.split("/")[0])[0]
+ mem_scale = re.findall("[A-Za-z]+", mem_use_str.split("/")[0])[0]
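+ # e.g. a MemUsage string like "1.5GiB / 7.8GiB" (illustrative) yields
+ # mem_num "1.5" and mem_scale "GiB" after splitting on "/".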
mem_num = float(mem_num)
if mem_scale == "GiB":
@@ -2103,13 +2502,13 @@ def _memory_usage(self, pid='self', category="hwm", container=None):
# Thanks Martin Geisler:
status = None
- result = {'peak': 0, 'rss': 0, 'hwm': 0}
-
+ result = {"peak": 0, "rss": 0, "hwm": 0}
+
try:
# This will only work on systems with a /proc file system
# (like Linux).
# status = open('/proc/self/status')
- proc_spot = '/proc/%s/status' % pid
+ proc_spot = "/proc/%s/status" % pid
status = open(proc_spot)
for line in status:
parts = line.split()
@@ -2126,13 +2525,17 @@ def _memory_usage(self, pid='self', category="hwm", container=None):
return result[category]
def _triage_error(self, e, nofail):
- """ Print a message and decide what to do about an error. """
+ """Print a message and decide what to do about an error."""
if not nofail:
self.fail_pipeline(e)
elif self._failed:
- self.info("This is a nofail process, but the pipeline was terminated for other reasons, so we fail.")
+ self.info(
+ "This is a nofail process, but the pipeline was terminated for other reasons, so we fail."
+ )
raise e
else:
self.error(e)
- self.error("ERROR: Subprocess returned nonzero result, but pipeline is continuing because nofail=True")
+ self.error(
+ "ERROR: Subprocess returned nonzero result, but pipeline is continuing because nofail=True"
+ )
# TODO: return nonzero, or something. . .?
diff --git a/pypiper/ngstk.py b/pypiper/ngstk.py
index dcc57e8e..329b321b 100755
--- a/pypiper/ngstk.py
+++ b/pypiper/ngstk.py
@@ -1,11 +1,13 @@
""" Broadly applicable NGS processing/analysis functionality """
+import errno
import os
import re
import subprocess
-import errno
+
from attmap import AttMapEcho
from yacman import load_yaml
+
from .exceptions import UnsupportedFiletypeException
from .utils import is_fastq, is_gzipped_fastq, is_sam_or_bam
@@ -43,7 +45,8 @@ def __init__(self, config_file=None, pm=None):
# parse yaml into the project's attributes
# self.add_entries(**config)
super(NGSTk, self).__init__(
- None if config_file is None else load_yaml(config_file))
+ None if config_file is None else load_yaml(config_file)
+ )
# Keep a link to the pipeline manager, if one is provided.
# if None is provided, instantiate "tools" and "parameters" with empty AttMaps
@@ -63,12 +66,15 @@ def __init__(self, config_file=None, pm=None):
self.parameters = AttMapEcho()
# If pigz is available, use that. Otherwise, default to gzip.
- if hasattr(self.pm, "cores") and self.pm.cores > 1 and self.check_command("pigz"):
+ if (
+ hasattr(self.pm, "cores")
+ and self.pm.cores > 1
+ and self.check_command("pigz")
+ ):
self.ziptool_cmd = "pigz -f -p {}".format(self.pm.cores)
else:
self.ziptool_cmd = "gzip -f"
-
def _ensure_folders(self, *paths):
"""
Ensure that paths to folder(s) exist.
@@ -90,7 +96,6 @@ def _ensure_folders(self, *paths):
# Otherwise, just ensure that we have path to file's folder.
self.make_dir(fpath if ext else p)
-
@property
def ziptool(self):
"""
@@ -100,7 +105,6 @@ def ziptool(self):
"""
return self.ziptool_cmd
-
def make_dir(self, path):
"""
Forge path to directory, creating intermediates as needed.
@@ -113,12 +117,10 @@ def make_dir(self, path):
if exception.errno != errno.EEXIST:
raise
-
def make_sure_path_exists(self, path):
- """ Alias for make_dir """
+ """Alias for make_dir"""
self.make_dir(path)
-
# Borrowed from looper
def check_command(self, command):
"""
@@ -126,7 +128,9 @@ def check_command(self, command):
"""
# Use `command` to see if command is callable, store exit code
- code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command))
+ code = os.system(
+ "command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)
+ )
# If exit code is not 0, report which command failed and return False, else return True
if code != 0:
@@ -135,7 +139,6 @@ def check_command(self, command):
else:
return True
-
def get_file_size(self, filenames):
"""
Get size of all files in string (space-separated) in megabytes (Mb).
@@ -149,10 +152,15 @@ def get_file_size(self, filenames):
if type(filenames) is list:
return sum([self.get_file_size(filename) for filename in filenames])
- return round(sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) / (1024 ** 2), 4)
-
+ return round(
+ sum([float(os.stat(f).st_size) for f in filenames.split(" ")])
+ / (1024**2),
+ 4,
+ )
- def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicates="True"):
+ def mark_duplicates(
+ self, aligned_file, out_file, metrics_file, remove_duplicates="True"
+ ):
cmd = self.tools.java
if self.pm.javamem: # If a memory restriction exists.
cmd += " -Xmx" + self.pm.javamem
@@ -163,9 +171,9 @@ def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicate
cmd += " REMOVE_DUPLICATES=" + remove_duplicates
return cmd
-
- def bam2fastq(self, input_bam, output_fastq,
- output_fastq2=None, unpaired_fastq=None):
+ def bam2fastq(
+ self, input_bam, output_fastq, output_fastq2=None, unpaired_fastq=None
+ ):
"""
Create command to convert BAM(s) to FASTQ(s).
@@ -185,7 +193,6 @@ def bam2fastq(self, input_bam, output_fastq,
cmd += " UNPAIRED_FASTQ={0}".format(unpaired_fastq)
return cmd
-
def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end):
"""
Build command to convert BAM file to FASTQ file(s) (R1/R2).
@@ -209,11 +216,10 @@ def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end):
cmd += " VALIDATION_STRINGENCY=SILENT"
return cmd
-
def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False):
"""
- This converts bam file to fastq files, but using awk. As of 2016, this is much faster
- than the standard way of doing this using Picard, and also much faster than the
+ This converts a BAM file to FASTQ files using awk. As of 2016, this is much faster
+ than the standard way of doing this using Picard, and also much faster than the
bedtools implementation as well; however, it does no sanity checks and assumes the reads
(for paired data) are all paired (no singletons), in the correct order.
:param bool zipmode: Should the output be zipped?
@@ -222,29 +228,27 @@ def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False):
fq1 = out_fastq_pre + "_R1.fastq"
fq2 = out_fastq_pre + "_R2.fastq"
-
if zipmode:
fq1 = fq1 + ".gz"
fq2 = fq2 + ".gz"
- fq1_target = " | \"" + self.ziptool + " -c > " + fq1 + '"'
- fq2_target = " | \"" + self.ziptool + " -c > " + fq2 + '"'
+ fq1_target = ' | "' + self.ziptool + " -c > " + fq1 + '"'
+ fq2_target = ' | "' + self.ziptool + " -c > " + fq2 + '"'
else:
fq1_target = ' > "' + fq1 + '"'
fq2_target = ' > "' + fq2 + '"'
-
+
if paired_end:
cmd = self.tools.samtools + " view " + bam_file + " | awk '"
- cmd += r'{ if (NR%2==1) print "@"$1"/1\n"$10"\n+\n"$11' + fq1_target + ';'
- cmd += r' else print "@"$1"/2\n"$10"\n+\n"$11' + fq2_target + '; }'
+ cmd += r'{ if (NR%2==1) print "@"$1"/1\n"$10"\n+\n"$11' + fq1_target + ";"
+ cmd += r' else print "@"$1"/2\n"$10"\n+\n"$11' + fq2_target + "; }"
cmd += "'" # end the awk command
else:
fq2 = None
cmd = self.tools.samtools + " view " + bam_file + " | awk '"
- cmd += r'{ print "@"$1"\n"$10"\n+\n"$11' + fq1_target + '; }'
+ cmd += r'{ print "@"$1"\n"$10"\n+\n"$11' + fq1_target + "; }"
cmd += "'"
return cmd, fq1, fq2
-
def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end):
"""
Converts bam to fastq; A version using bedtools
@@ -252,14 +256,20 @@ def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end):
self.make_sure_path_exists(os.path.dirname(out_fastq_pre))
fq1 = out_fastq_pre + "_R1.fastq"
fq2 = None
- cmd = self.tools.bedtools + " bamtofastq -i " + bam_file + " -fq " + fq1 + ".fastq"
+ cmd = (
+ self.tools.bedtools
+ + " bamtofastq -i "
+ + bam_file
+ + " -fq "
+ + fq1
+ + ".fastq"
+ )
if paired_end:
fq2 = out_fastq_pre + "_R2.fastq"
cmd += " -fq2 " + fq2
return cmd, fq1, fq2
-
def get_input_ext(self, input_file):
"""
Get the extension of the input_file. Assumes you're using either
@@ -272,12 +282,13 @@ def get_input_ext(self, input_file):
elif input_file.endswith(".fastq") or input_file.endswith(".fq"):
input_ext = ".fastq"
else:
- errmsg = "'{}'; this pipeline can only deal with .bam, .fastq, " \
- "or .fastq.gz files".format(input_file)
+ errmsg = (
+ "'{}'; this pipeline can only deal with .bam, .fastq, "
+ "or .fastq.gz files".format(input_file)
+ )
raise UnsupportedFiletypeException(errmsg)
return input_ext
-
def merge_or_link(self, input_args, raw_folder, local_base="sample"):
"""
Standardizes various input possibilities by converting either .bam,
@@ -312,8 +323,7 @@ class of inputs (which can in turn be a string or a list).
else:
local_base_extended = local_base
if input_arg:
- out = self.merge_or_link(
- input_arg, raw_folder, local_base_extended)
+ out = self.merge_or_link(input_arg, raw_folder, local_base_extended)
print("Local input file: '{}'".format(out))
# Make sure file exists:
@@ -343,7 +353,8 @@ class of inputs (which can in turn be a string or a list).
self.pm.run(
"ln -sf " + input_arg + " " + local_input_abs,
target=local_input_abs,
- shell=True)
+ shell=True,
+ )
# return the local (linked) filename absolute path
return local_input_abs
@@ -365,11 +376,11 @@ class of inputs (which can in turn be a string or a list).
if all([self.get_input_ext(x) == ".fastq.gz" for x in input_args]):
sample_merged_gz = local_base + ".merged.fastq.gz"
output_merge_gz = os.path.join(raw_folder, sample_merged_gz)
- #cmd1 = self.ziptool + "-d -c " + " ".join(input_args) + " > " + output_merge
- #cmd2 = self.ziptool + " " + output_merge
- #self.pm.run([cmd1, cmd2], output_merge_gz)
+ # cmd1 = self.ziptool + "-d -c " + " ".join(input_args) + " > " + output_merge
+ # cmd2 = self.ziptool + " " + output_merge
+ # self.pm.run([cmd1, cmd2], output_merge_gz)
# you can save yourself the decompression/recompression:
- cmd = "cat " + " ".join(input_args) + " > " + output_merge_gz
+ cmd = "cat " + " ".join(input_args) + " > " + output_merge_gz
self.pm.run(cmd, output_merge_gz)
return output_merge_gz
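A side note on why the shortcut above works (an editorial aside, not part of the change): the gzip format permits concatenated members, so simply cat-ing several .fastq.gz files produces one valid gzip stream whose decompressed content is the inputs in order. A small self-contained illustration with hypothetical file names:

    import gzip

    # Two separately gzipped FASTQ fragments...
    with gzip.open("a.fastq.gz", "wt") as f:
        f.write("@r1\nACGT\n+\nIIII\n")
    with gzip.open("b.fastq.gz", "wt") as f:
        f.write("@r2\nTTTT\n+\nIIII\n")

    # ...concatenated byte-for-byte (what cat does) remain a valid gzip file.
    with open("merged.fastq.gz", "wb") as out:
        for path in ("a.fastq.gz", "b.fastq.gz"):
            with open(path, "rb") as src:
                out.write(src.read())

    # gzip.open reads across member boundaries, yielding both records in order.
    with gzip.open("merged.fastq.gz", "rt") as f:
        print(f.read())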
@@ -383,13 +394,20 @@ class of inputs (which can in turn be a string or a list).
# At this point, we don't recognize the input file types or they
# do not match.
raise NotImplementedError(
- "Input files must be of the same type, and can only "
- "merge bam or fastq.")
-
+ "Input files must be of the same type, and can only "
+ "merge bam or fastq."
+ )
def input_to_fastq(
- self, input_file, sample_name, paired_end, fastq_folder,
- output_file=None, multiclass=False, zipmode=False):
+ self,
+ input_file,
+ sample_name,
+ paired_end,
+ fastq_folder,
+ output_file=None,
+ multiclass=False,
+ zipmode=False,
+ ):
"""
Builds a command to convert input file to fastq, for various inputs.
@@ -424,10 +442,15 @@ def input_to_fastq(
output_file = []
for in_i, in_arg in enumerate(input_file):
output = fastq_prefix + "_R" + str(in_i + 1) + ".fastq"
- result_cmd, uf, result_file = \
- self.input_to_fastq(in_arg, sample_name, paired_end,
- fastq_folder, output, multiclass=True,
- zipmode=zipmode)
+ result_cmd, uf, result_file = self.input_to_fastq(
+ in_arg,
+ sample_name,
+ paired_end,
+ fastq_folder,
+ output,
+ multiclass=True,
+ zipmode=zipmode,
+ )
cmd.append(result_cmd)
output_file.append(result_file)
@@ -444,8 +467,10 @@ def input_to_fastq(
if input_ext == ".bam":
print("Found .bam file")
- #cmd = self.bam_to_fastq(input_file, fastq_prefix, paired_end)
- cmd, fq1, fq2 = self.bam_to_fastq_awk(input_file, fastq_prefix, paired_end, zipmode)
+ # cmd = self.bam_to_fastq(input_file, fastq_prefix, paired_end)
+ cmd, fq1, fq2 = self.bam_to_fastq_awk(
+ input_file, fastq_prefix, paired_end, zipmode
+ )
# pm.run(cmd, output_file, follow=check_fastq)
if fq2:
output_file = [fq1, fq2]
@@ -455,20 +480,24 @@ def input_to_fastq(
print("Found .fastq.gz file")
if paired_end and not multiclass:
if zipmode:
- raise NotImplementedError("Can't use zipmode on interleaved fastq data.")
+ raise NotImplementedError(
+ "Can't use zipmode on interleaved fastq data."
+ )
# For paired-end reads in one fastq file, we must split the
# file into 2. The pipeline author will need to include this
- # python script in the scripts directory.
+ # python script in the scripts directory.
# TODO: make this self-contained in pypiper. This is a rare
# use case these days, as fastq files are almost never
# interleaved anymore.
- script_path = os.path.join(
- self.tools.scripts_dir, "fastq_split.py")
+ script_path = os.path.join(self.tools.scripts_dir, "fastq_split.py")
cmd = self.tools.python + " -u " + script_path
cmd += " -i " + input_file
cmd += " -o " + fastq_prefix
# Must also return the set of output files
- output_file = [fastq_prefix + "_R1.fastq", fastq_prefix + "_R2.fastq"]
+ output_file = [
+ fastq_prefix + "_R1.fastq",
+ fastq_prefix + "_R2.fastq",
+ ]
else:
if zipmode:
# we do nothing!
@@ -477,7 +506,9 @@ def input_to_fastq(
else:
# For single-end reads, we just unzip the fastq.gz file.
# or, paired-end reads that were already split.
- cmd = self.ziptool + " -d -c " + input_file + " > " + output_file
+ cmd = (
+ self.ziptool + " -d -c " + input_file + " > " + output_file
+ )
# a non-shell version
# cmd1 = "gunzip --force " + input_file
# cmd2 = "mv " + os.path.splitext(input_file)[0] + " " + output_file
@@ -491,7 +522,6 @@ def input_to_fastq(
return [cmd, fastq_prefix, output_file]
-
def check_fastq(self, input_files, output_files, paired_end):
"""
Returns a follow sanity-check function to be run after a fastq conversion.
@@ -510,9 +540,9 @@ def check_fastq(self, input_files, output_files, paired_end):
# This is AFTER merge, so if there are multiple files it means the
# files were split into read1/read2; therefore I must divide by number
# of files for final reads.
- def temp_func(input_files=input_files, output_files=output_files,
- paired_end=paired_end):
-
+ def temp_func(
+ input_files=input_files, output_files=output_files, paired_end=paired_end
+ ):
if type(input_files) != list:
input_files = [input_files]
if type(output_files) != list:
@@ -521,35 +551,45 @@ def temp_func(input_files=input_files, output_files=output_files,
n_input_files = len(list(filter(bool, input_files)))
n_output_files = len(list(filter(bool, output_files)))
- total_reads = sum([int(self.count_reads(input_file, paired_end))
- for input_file in input_files])
+ total_reads = sum(
+ [
+ int(self.count_reads(input_file, paired_end))
+ for input_file in input_files
+ ]
+ )
raw_reads = int(total_reads / n_input_files)
- self.pm.report_result("Raw_reads", str(raw_reads))
+ self.pm.pipestat.report(values={"Raw_reads": str(raw_reads)})
total_fastq_reads = sum(
- [int(self.count_reads(output_file, paired_end))
- for output_file in output_files])
+ [
+ int(self.count_reads(output_file, paired_end))
+ for output_file in output_files
+ ]
+ )
fastq_reads = int(total_fastq_reads / n_output_files)
- self.pm.report_result("Fastq_reads", fastq_reads)
+ self.pm.pipestat.report(values={"Fastq_reads": fastq_reads})
input_ext = self.get_input_ext(input_files[0])
# We can only assess pass filter reads in bam files with flags.
if input_ext == ".bam":
num_failed_filter = sum(
- [int(self.count_fail_reads(f, paired_end))
- for f in input_files])
+ [int(self.count_fail_reads(f, paired_end)) for f in input_files]
+ )
pf_reads = int(raw_reads) - num_failed_filter
- self.pm.report_result("PF_reads", str(pf_reads))
+ self.pm.pipestat.report(values={"PF_reads": str(pf_reads)})
if fastq_reads != int(raw_reads):
- raise Exception("Fastq conversion error? Number of input reads "
- "doesn't number of output reads.")
+ raise Exception(
+ "Fastq conversion error? Number of input reads "
+ "doesn't number of output reads."
+ )
return fastq_reads
return temp_func
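The follow-function pattern here is worth a short sketch (hypothetical paths; pm and ngstk as assumed above): check_fastq is called up front only to build temp_func, and the pipeline manager invokes that closure solely when the associated command actually runs, so a restarted pipeline does not recount reads it has already reported.

    # Build the conversion command and its expected outputs, then attach the
    # deferred QC as a follow function.
    cmd, fastq_prefix, out_fastq = ngstk.input_to_fastq(
        "raw/sample1.bam", "sample1", paired_end=True, fastq_folder="fastq"
    )
    pm.run(
        cmd,
        target=out_fastq,  # a single path or a list of R1/R2 paths
        follow=ngstk.check_fastq("raw/sample1.bam", out_fastq, paired_end=True),
    )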
-
- def check_trim(self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None):
+ def check_trim(
+ self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None
+ ):
"""
Build function to evaluate read trimming, and optionally run fastqc.
@@ -567,21 +607,21 @@ def check_trim(self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_fo
"""
def temp_func():
-
print("Evaluating read trimming")
if paired_end and not trimmed_fastq_R2:
print("WARNING: specified paired-end but no R2 file")
n_trim = float(self.count_reads(trimmed_fastq, paired_end))
- self.pm.report_result("Trimmed_reads", int(n_trim))
+ self.pm.pipestat.report(values={"Trimmed_reads": int(n_trim)})
try:
- rr = float(self.pm.get_stat("Raw_reads"))
+ rr = float(self.pm.pipestat.retrieve("Raw_reads"))
except:
print("Can't calculate trim loss rate without raw read result.")
else:
self.pm.report_result(
- "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2))
+ "Trim_loss_rate", round((rr - n_trim) * 100 / rr, 2)
+ )
# Also run a fastqc (if installed/requested)
if fastqc_folder:
@@ -591,18 +631,31 @@ def temp_func():
self.pm.run(cmd, lock_name="trimmed_fastqc", nofail=True)
fname, ext = os.path.splitext(os.path.basename(trimmed_fastq))
fastqc_html = os.path.join(fastqc_folder, fname + "_fastqc.html")
- self.pm.report_object("FastQC report r1", fastqc_html)
+ self.pm.pipestat.report(
+ values={
+ "FastQC_report_R1": {
+ "path": fastqc_html,
+ "title": "FastQC report R1",
+ }
+ }
+ )
if paired_end and trimmed_fastq_R2:
cmd = self.fastqc(trimmed_fastq_R2, fastqc_folder)
self.pm.run(cmd, lock_name="trimmed_fastqc_R2", nofail=True)
fname, ext = os.path.splitext(os.path.basename(trimmed_fastq_R2))
fastqc_html = os.path.join(fastqc_folder, fname + "_fastqc.html")
- self.pm.report_object("FastQC report r2", fastqc_html)
+ self.pm.pipestat.report(
+ values={
+ "FastQC_report_R2": {
+ "path": fastqc_html,
+ "title": "FastQC report R2",
+ }
+ }
+ )
return temp_func
-
def validate_bam(self, input_bam):
"""
Wrapper for Picard's ValidateSamFile.
@@ -615,7 +668,6 @@ def validate_bam(self, input_bam):
cmd += " INPUT=" + input_bam
return cmd
-
def merge_bams(self, input_bams, merged_bam, in_sorted="TRUE", tmp_dir=None):
"""
Combine multiple files into one.
@@ -653,27 +705,25 @@ def merge_bams(self, input_bams, merged_bam, in_sorted="TRUE", tmp_dir=None):
cmd += " TMP_DIR=" + tmp_dir
return cmd
-
-
+
def merge_bams_samtools(self, input_bams, merged_bam):
- cmd = self.tools.samtools + " merge -f "
+ cmd = self.tools.samtools + " merge -f "
cmd += " -@ " + str(self.pm.cores)
- cmd += " " + merged_bam + " "
+ cmd += " " + merged_bam + " "
cmd += " ".join(input_bams)
return cmd
-
def merge_fastq(self, inputs, output, run=False, remove_inputs=False):
"""
Merge FASTQ files (zipped or not) into one.
-
+
:param Iterable[str] inputs: Collection of paths to files to merge.
:param str output: Path to single output file.
:param bool run: Whether to run the command.
:param bool remove_inputs: Whether to remove the original input files after merging.
- :return NoneType | str: Null if running the command, otherwise the
+ :return NoneType | str: Null if running the command, otherwise the
command itself
- :raise ValueError: Raise ValueError if the call is such that
+ :raise ValueError: Raise ValueError if the call is such that
inputs are to be deleted but command is not run.
"""
if remove_inputs and not run:
@@ -687,14 +737,16 @@ def merge_fastq(self, inputs, output, run=False, remove_inputs=False):
else:
return cmd
-
def count_lines(self, file_name):
"""
Uses the command-line utility wc to count the number of lines in a file. On MacOS, leading whitespace must be stripped from the wc output.
:param str file_name: name of file whose lines are to be counted
"""
- x = subprocess.check_output("wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", shell=True)
+ x = subprocess.check_output(
+ "wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '",
+ shell=True,
+ )
return x.decode().strip()
def count_lines_zip(self, file_name):
@@ -703,7 +755,13 @@ def count_lines_zip(self, file_name):
For compressed files.
:param str file_name: path to the compressed file whose lines are to be counted
"""
- x = subprocess.check_output(self.ziptool + " -d -c " + file_name + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '", shell=True)
+ x = subprocess.check_output(
+ self.ziptool
+ + " -d -c "
+ + file_name
+ + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '",
+ shell=True,
+ )
return x.decode().strip()
def get_chrs_from_bam(self, file_name):
@@ -711,7 +769,13 @@ def get_chrs_from_bam(self, file_name):
Uses samtools to grab the chromosomes from the header that are contained
in this bam file.
"""
- x = subprocess.check_output(self.tools.samtools + " view -H " + file_name + " | grep '^@SQ' | cut -f2| sed s'/SN://'", shell=True)
+ x = subprocess.check_output(
+ self.tools.samtools
+ + " view -H "
+ + file_name
+ + " | grep '^@SQ' | cut -f2| sed s'/SN://'",
+ shell=True,
+ )
# Chromosomes will be separated by newlines; split into list to return
return x.decode().split()
@@ -735,14 +799,25 @@ def count_unique_reads(self, file_name, paired_end):
if file_name.endswith("bam"):
param = ""
if paired_end:
- r1 = self.samtools_view(file_name, param=param + " -f64", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'")
- r2 = self.samtools_view(file_name, param=param + " -f128", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'")
+ r1 = self.samtools_view(
+ file_name,
+ param=param + " -f64",
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'",
+ )
+ r2 = self.samtools_view(
+ file_name,
+ param=param + " -f128",
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'",
+ )
else:
- r1 = self.samtools_view(file_name, param=param + "", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'")
+ r1 = self.samtools_view(
+ file_name,
+ param=param + "",
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'",
+ )
r2 = 0
return int(r1) + int(r2)
-
def count_unique_mapped_reads(self, file_name, paired_end):
"""
For a bam or sam file with paired or single-end reads, returns the
@@ -759,21 +834,32 @@ def count_unique_mapped_reads(self, file_name, paired_end):
if ext == ".sam":
param = "-S -F4"
- elif ext == "bam":
+ elif ext == ".bam":
param = "-F4"
else:
raise ValueError("Not a SAM or BAM: '{}'".format(file_name))
- if paired_end:
- r1 = self.samtools_view(file_name, param=param + " -f64", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'")
- r2 = self.samtools_view(file_name, param=param + " -f128", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'")
+ if paired_end:
+ r1 = self.samtools_view(
+ file_name,
+ param=param + " -f64",
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'",
+ )
+ r2 = self.samtools_view(
+ file_name,
+ param=param + " -f128",
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'",
+ )
else:
- r1 = self.samtools_view(file_name, param=param + "", postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'")
+ r1 = self.samtools_view(
+ file_name,
+ param=param + "",
+ postpend=" | cut -f1 | sort -k1,1 -u | wc -l | sed -E 's/^[[:space:]]+//'",
+ )
r2 = 0
return int(r1) + int(r2)
-
def count_flag_reads(self, file_name, flag, paired_end):
"""
Counts the number of reads with the specified flag.
@@ -791,7 +877,6 @@ def count_flag_reads(self, file_name, flag, paired_end):
param += " -S"
return self.samtools_view(file_name, param=param)
-
def count_multimapping_reads(self, file_name, paired_end):
"""
Counts the number of reads that mapped to multiple locations. Warning:
@@ -807,7 +892,6 @@ def count_multimapping_reads(self, file_name, paired_end):
"""
return int(self.count_flag_reads(file_name, 256, paired_end))
-
def count_uniquelymapping_reads(self, file_name, paired_end):
"""
Counts the number of reads that mapped to a unique position.
@@ -820,7 +904,6 @@ def count_uniquelymapping_reads(self, file_name, paired_end):
param += " -S"
return self.samtools_view(file_name, param=param)
-
def count_fail_reads(self, file_name, paired_end):
"""
Counts the number of reads that failed platform/vendor quality checks.
@@ -831,7 +914,6 @@ def count_fail_reads(self, file_name, paired_end):
"""
return int(self.count_flag_reads(file_name, 512, paired_end))
-
def samtools_view(self, file_name, param, postpend=""):
"""
Run samtools view, with flexible parameters and post-processing.
@@ -843,13 +925,11 @@ def samtools_view(self, file_name, param, postpend=""):
:param str postpend: String to append to the samtools command;
useful to add cut, sort, wc operations to the samtools view output.
"""
- cmd = "{} view {} {} {}".format(
- self.tools.samtools, param, file_name, postpend)
+ cmd = "{} view {} {} {}".format(self.tools.samtools, param, file_name, postpend)
# in python 3, check_output returns a byte string which causes issues.
# with python 3.6 we could use argument: "encoding='UTF-8'""
return subprocess.check_output(cmd, shell=True).decode().strip()
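For context, samtools_view is the low-level helper most of the counters above delegate to; a hedged sketch of calling it directly (hypothetical BAM path):

    # Count mapped reads: param is spliced in as "samtools view -c -F4 <file>".
    n_mapped = int(ngstk.samtools_view("aligned.bam", param="-c -F4"))

    # postpend is appended after the file name, so arbitrary shell post-processing
    # can be chained onto the view output.
    first_names = ngstk.samtools_view(
        "aligned.bam", param="-F4", postpend=" | cut -f1 | head -3"
    )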
-
def count_reads(self, file_name, paired_end):
"""
Count reads in a file.
@@ -874,13 +954,14 @@ def count_reads(self, file_name, paired_end):
param_text = "-c" if ext == ".bam" else "-c -S"
return self.samtools_view(file_name, param=param_text)
else:
- num_lines = self.count_lines_zip(file_name) \
- if is_gzipped_fastq(file_name) \
- else self.count_lines(file_name)
+ num_lines = (
+ self.count_lines_zip(file_name)
+ if is_gzipped_fastq(file_name)
+ else self.count_lines(file_name)
+ )
divisor = 2 if paired_end else 4
return int(num_lines) / divisor
-
def count_concordant(self, aligned_bam):
"""
Count only reads that "aligned concordantly exactly 1 time."
@@ -889,9 +970,8 @@ def count_concordant(self, aligned_bam):
"""
cmd = self.tools.samtools + " view " + aligned_bam + " | "
cmd += "grep 'YT:Z:CP'" + " | uniq -u | wc -l | sed -E 's/^[[:space:]]+//'"
-
- return subprocess.check_output(cmd, shell=True).decode().strip()
+ return subprocess.check_output(cmd, shell=True).decode().strip()
def count_mapped_reads(self, file_name, paired_end):
"""
@@ -912,35 +992,84 @@ def count_mapped_reads(self, file_name, paired_end):
return self.samtools_view(file_name, param="-c -F4 -S")
return -1
-
def sam_conversions(self, sam_file, depth=True):
"""
Convert sam files to bam files, then sort and index them for later use.
:param bool depth: also calculate coverage over each position
"""
- cmd = self.tools.samtools + " view -bS " + sam_file + " > " + sam_file.replace(".sam", ".bam") + "\n"
- cmd += self.tools.samtools + " sort " + sam_file.replace(".sam", ".bam") + " -o " + sam_file.replace(".sam", "_sorted.bam") + "\n"
- cmd += self.tools.samtools + " index " + sam_file.replace(".sam", "_sorted.bam") + "\n"
+ cmd = (
+ self.tools.samtools
+ + " view -bS "
+ + sam_file
+ + " > "
+ + sam_file.replace(".sam", ".bam")
+ + "\n"
+ )
+ cmd += (
+ self.tools.samtools
+ + " sort "
+ + sam_file.replace(".sam", ".bam")
+ + " -o "
+ + sam_file.replace(".sam", "_sorted.bam")
+ + "\n"
+ )
+ cmd += (
+ self.tools.samtools
+ + " index "
+ + sam_file.replace(".sam", "_sorted.bam")
+ + "\n"
+ )
if depth:
- cmd += self.tools.samtools + " depth " + sam_file.replace(".sam", "_sorted.bam") + " > " + sam_file.replace(".sam", "_sorted.depth") + "\n"
+ cmd += (
+ self.tools.samtools
+ + " depth "
+ + sam_file.replace(".sam", "_sorted.bam")
+ + " > "
+ + sam_file.replace(".sam", "_sorted.depth")
+ + "\n"
+ )
return cmd
-
def bam_conversions(self, bam_file, depth=True):
"""
Sort and index bam files for later use.
:param bool depth: also calculate coverage over each position
"""
- cmd = self.tools.samtools + " view -h " + bam_file + " > " + bam_file.replace(".bam", ".sam") + "\n"
- cmd += self.tools.samtools + " sort " + bam_file + " -o " + bam_file.replace(".bam", "_sorted.bam") + "\n"
- cmd += self.tools.samtools + " index " + bam_file.replace(".bam", "_sorted.bam") + "\n"
+ cmd = (
+ self.tools.samtools
+ + " view -h "
+ + bam_file
+ + " > "
+ + bam_file.replace(".bam", ".sam")
+ + "\n"
+ )
+ cmd += (
+ self.tools.samtools
+ + " sort "
+ + bam_file
+ + " -o "
+ + bam_file.replace(".bam", "_sorted.bam")
+ + "\n"
+ )
+ cmd += (
+ self.tools.samtools
+ + " index "
+ + bam_file.replace(".bam", "_sorted.bam")
+ + "\n"
+ )
if depth:
- cmd += self.tools.samtools + " depth " + bam_file.replace(".bam", "_sorted.bam") + " > " + bam_file.replace(".bam", "_sorted.depth") + "\n"
+ cmd += (
+ self.tools.samtools
+ + " depth "
+ + bam_file.replace(".bam", "_sorted.bam")
+ + " > "
+ + bam_file.replace(".bam", "_sorted.depth")
+ + "\n"
+ )
return cmd
-
def fastqc(self, file, output_dir):
"""
Create command to run fastqc on a FASTQ file
@@ -959,9 +1088,9 @@ def fastqc(self, file, output_dir):
if not os.path.isabs(output_dir) and pm is not None:
output_dir = os.path.join(pm.outfolder, output_dir)
self.make_sure_path_exists(output_dir)
- return "{} --noextract --outdir {} {}".\
- format(self.tools.fastqc, output_dir, file)
-
+ return "{} --noextract --outdir {} {}".format(
+ self.tools.fastqc, output_dir, file
+ )
def fastqc_rename(self, input_bam, output_dir, sample_name):
"""
@@ -984,20 +1113,29 @@ def fastqc_rename(self, input_bam, output_dir, sample_name):
cmd1 = self.fastqc(input_bam, output_dir)
cmds.append(cmd1)
cmd2 = "if [[ ! -s {1}_fastqc.html ]]; then mv {0}_fastqc.html {1}_fastqc.html; mv {0}_fastqc.zip {1}_fastqc.zip; fi".format(
- os.path.join(output_dir, initial), os.path.join(output_dir, sample_name))
+ os.path.join(output_dir, initial), os.path.join(output_dir, sample_name)
+ )
cmds.append(cmd2)
return cmds
-
def samtools_index(self, bam_file):
"""Index a bam file."""
cmd = self.tools.samtools + " index {0}".format(bam_file)
return cmd
-
def slurm_header(
- self, job_name, output, queue="shortq", n_tasks=1, time="10:00:00",
- cpus_per_task=8, mem_per_cpu=2000, nodes=1, user_mail="", mail_type="end"):
+ self,
+ job_name,
+ output,
+ queue="shortq",
+ n_tasks=1,
+ time="10:00:00",
+ cpus_per_task=8,
+ mem_per_cpu=2000,
+ nodes=1,
+ user_mail="",
+ mail_type="end",
+ ):
cmd = """ #!/bin/bash
#SBATCH --partition={0}
#SBATCH --ntasks={1}
@@ -1018,51 +1156,65 @@ def slurm_header(
date
""".format(
- queue, n_tasks, time, cpus_per_task, mem_per_cpu,
- nodes, job_name, output, mail_type, user_mail)
+ queue,
+ n_tasks,
+ time,
+ cpus_per_task,
+ mem_per_cpu,
+ nodes,
+ job_name,
+ output,
+ mail_type,
+ user_mail,
+ )
return cmd
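A minimal sketch of how the header builder might be used (hypothetical paths; the slurm/ directory is assumed to exist, and slurm_submit_job simply shells out to sbatch):

    header = ngstk.slurm_header(
        job_name="align_sample1",
        output="slurm/align_sample1.log",
        queue="shortq",
        cpus_per_task=8,
        mem_per_cpu=2000,
    )
    with open("slurm/align_sample1.sh", "w") as job:
        job.write(header)
        job.write("echo 'payload command goes here'\n")  # placeholder
        job.write(ngstk.slurm_footer() + "\n")
    ngstk.slurm_submit_job("slurm/align_sample1.sh")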
-
def slurm_footer(self):
return " date"
-
def slurm_submit_job(self, job_file):
return os.system("sbatch %s" % job_file)
-
def remove_file(self, file_name):
return "rm {0}".format(file_name)
-
def move_file(self, old, new):
return "mv {0} {1}".format(old, new)
-
def preseq_curve(self, bam_file, output_prefix):
return """
preseq c_curve -B -P -o {0}.yield.txt {1}
- """.format(output_prefix, bam_file)
-
+ """.format(
+ output_prefix, bam_file
+ )
def preseq_extrapolate(self, bam_file, output_prefix):
return """
preseq lc_extrap -v -B -P -e 1e+9 -o {0}.future_yield.txt {1}
- """.format(output_prefix, bam_file)
-
+ """.format(
+ output_prefix, bam_file
+ )
def preseq_coverage(self, bam_file, output_prefix):
return """
preseq gc_extrap -o {0}.future_coverage.txt {1}
- """.format(output_prefix, bam_file)
-
+ """.format(
+ output_prefix, bam_file
+ )
def trimmomatic(
- self, input_fastq1, output_fastq1, cpus, adapters, log,
- input_fastq2=None, output_fastq1_unpaired=None,
- output_fastq2=None, output_fastq2_unpaired=None):
-
+ self,
+ input_fastq1,
+ output_fastq1,
+ cpus,
+ adapters,
+ log,
+ input_fastq2=None,
+ output_fastq1_unpaired=None,
+ output_fastq2=None,
+ output_fastq2_unpaired=None,
+ ):
PE = False if input_fastq2 is None else True
pe = "PE" if PE else "SE"
cmd = self.tools.java + " -Xmx" + self.pm.javamem
@@ -1072,17 +1224,26 @@ def trimmomatic(
cmd += " {0}".format(input_fastq2)
cmd += " {0}".format(output_fastq1)
if PE:
- cmd += " {0} {1} {2}".format(output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired)
+ cmd += " {0} {1} {2}".format(
+ output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired
+ )
cmd += " ILLUMINACLIP:{0}:1:40:15:8:true".format(adapters)
cmd += " LEADING:3 TRAILING:3"
cmd += " SLIDINGWINDOW:4:10"
cmd += " MINLEN:36"
return cmd
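A hedged end-to-end sketch tying this builder to the check_trim follow function defined earlier (hypothetical paths; the adapters file and javamem come from the pipeline's own configuration):

    trimmed_r1 = "fastq/sample1_R1.trimmed.fastq"
    trimmed_r2 = "fastq/sample1_R2.trimmed.fastq"
    cmd = ngstk.trimmomatic(
        input_fastq1="fastq/sample1_R1.fastq",
        output_fastq1=trimmed_r1,
        cpus=4,
        adapters="resources/adapters.fa",
        log="fastq/sample1.trimlog",
        input_fastq2="fastq/sample1_R2.fastq",
        output_fastq1_unpaired="fastq/sample1_R1.unpaired.fastq",
        output_fastq2=trimmed_r2,
        output_fastq2_unpaired="fastq/sample1_R2.unpaired.fastq",
    )
    # Trimmed-read counting (and optional fastqc) runs only if the command runs.
    pm.run(cmd, target=trimmed_r1, follow=ngstk.check_trim(trimmed_r1, True, trimmed_r2))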
-
def skewer(
- self, input_fastq1, output_prefix, output_fastq1,
- log, cpus, adapters, input_fastq2=None, output_fastq2=None):
+ self,
+ input_fastq1,
+ output_prefix,
+ output_fastq1,
+ log,
+ cpus,
+ adapters,
+ input_fastq2=None,
+ output_fastq2=None,
+ ):
"""
Create commands with which to run skewer.
@@ -1117,17 +1278,33 @@ def skewer(
cmd2 = "mv {0} {1}".format(output_prefix + "-trimmed.fastq", output_fastq1)
cmds.append(cmd2)
else:
- cmd2 = "mv {0} {1}".format(output_prefix + "-trimmed-pair1.fastq", output_fastq1)
+ cmd2 = "mv {0} {1}".format(
+ output_prefix + "-trimmed-pair1.fastq", output_fastq1
+ )
cmds.append(cmd2)
- cmd3 = "mv {0} {1}".format(output_prefix + "-trimmed-pair2.fastq", output_fastq2)
+ cmd3 = "mv {0} {1}".format(
+ output_prefix + "-trimmed-pair2.fastq", output_fastq2
+ )
cmds.append(cmd3)
cmd4 = "mv {0} {1}".format(output_prefix + "-trimmed.log", log)
cmds.append(cmd4)
return cmds
- def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_insert, cpus, input_fastq2=None):
+ def bowtie2_map(
+ self,
+ input_fastq1,
+ output_bam,
+ log,
+ metrics,
+ genome_index,
+ max_insert,
+ cpus,
+ input_fastq2=None,
+ ):
# Admits 2000bp-long fragments (--maxins option)
- cmd = self.tools.bowtie2 + " --very-sensitive --no-discordant -p {0}".format(cpus)
+ cmd = self.tools.bowtie2 + " --very-sensitive --no-discordant -p {0}".format(
+ cpus
+ )
cmd += " -x {0}".format(genome_index)
cmd += " --met-file {0}".format(metrics)
if input_fastq2 is None:
@@ -1136,15 +1313,24 @@ def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_
cmd += " --maxins {0}".format(max_insert)
cmd += " -1 {0}".format(input_fastq1)
cmd += " -2 {0}".format(input_fastq2)
- cmd += " 2> {0} | samtools view -S -b - | samtools sort -o {1} -".format(log, output_bam)
+ cmd += " 2> {0} | samtools view -S -b - | samtools sort -o {1} -".format(
+ log, output_bam
+ )
return cmd
def topHat_map(self, input_fastq, output_dir, genome, transcriptome, cpus):
# TODO:
# Allow paired input
- cmd = self.tools.tophat + " --GTF {0} --b2-L 15 --library-type fr-unstranded --mate-inner-dist 120".format(transcriptome)
+ cmd = (
+ self.tools.tophat
+ + " --GTF {0} --b2-L 15 --library-type fr-unstranded --mate-inner-dist 120".format(
+ transcriptome
+ )
+ )
cmd += " --max-multihits 100 --no-coverage-search"
- cmd += " --num-threads {0} --output-dir {1} {2} {3}".format(cpus, output_dir, genome, input_fastq)
+ cmd += " --num-threads {0} --output-dir {1} {2} {3}".format(
+ cpus, output_dir, genome, input_fastq
+ )
return cmd
def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir="."):
@@ -1164,33 +1350,50 @@ def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir="
return [cmd1, cmd2, cmd3]
def sambamba_remove_duplicates(self, input_bam, output_bam, cpus=16):
- cmd = self.tools.sambamba + " markdup -t {0} -r {1} {2}".format(cpus, input_bam, output_bam)
+ cmd = self.tools.sambamba + " markdup -t {0} -r {1} {2}".format(
+ cpus, input_bam, output_bam
+ )
return cmd
def get_mitochondrial_reads(self, bam_file, output, cpus=4):
- """
- """
+ """ """
tmp_bam = bam_file + "tmp_rmMe"
cmd1 = self.tools.sambamba + " index -t {0} {1}".format(cpus, bam_file)
- cmd2 = self.tools.sambamba + " slice {0} chrM | {1} markdup -t 4 /dev/stdin {2} 2> {3}".format(bam_file, self.tools.sambamba, tmp_bam, output)
+ cmd2 = (
+ self.tools.sambamba
+ + " slice {0} chrM | {1} markdup -t 4 /dev/stdin {2} 2> {3}".format(
+ bam_file, self.tools.sambamba, tmp_bam, output
+ )
+ )
cmd3 = "rm {}".format(tmp_bam)
return [cmd1, cmd2, cmd3]
- def filter_reads(self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30):
+ def filter_reads(
+ self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30
+ ):
"""
Remove duplicates, filter for >Q, remove multiple mapping reads.
For paired-end reads, keep only proper pairs.
"""
nodups = re.sub("\.bam$", "", output_bam) + ".nodups.nofilter.bam"
- cmd1 = self.tools.sambamba + " markdup -t {0} -r --compression-level=0 {1} {2} 2> {3}".format(cpus, input_bam, nodups, metrics_file)
- cmd2 = self.tools.sambamba + ' view -t {0} -f bam --valid'.format(cpus)
+ cmd1 = (
+ self.tools.sambamba
+ + " markdup -t {0} -r --compression-level=0 {1} {2} 2> {3}".format(
+ cpus, input_bam, nodups, metrics_file
+ )
+ )
+ cmd2 = self.tools.sambamba + " view -t {0} -f bam --valid".format(cpus)
if paired:
cmd2 += ' -F "not (unmapped or mate_is_unmapped) and proper_pair'
else:
cmd2 += ' -F "not unmapped'
- cmd2 += ' and not (secondary_alignment or supplementary) and mapping_quality >= {0}"'.format(Q)
- cmd2 += ' {0} |'.format(nodups)
- cmd2 += self.tools.sambamba + " sort -t {0} /dev/stdin -o {1}".format(cpus, output_bam)
+ cmd2 += ' and not (secondary_alignment or supplementary) and mapping_quality >= {0}"'.format(
+ Q
+ )
+ cmd2 += " {0} |".format(nodups)
+ cmd2 += self.tools.sambamba + " sort -t {0} /dev/stdin -o {1}".format(
+ cpus, output_bam
+ )
cmd3 = "if [[ -s {0} ]]; then rm {0}; fi".format(nodups)
cmd4 = "if [[ -s {0} ]]; then rm {0}; fi".format(nodups + ".bai")
return [cmd1, cmd2, cmd3, cmd4]
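Because filter_reads returns a list of commands, it pairs naturally with the list form of pm.run (also used in commented-out code earlier in this module), where the whole group shares a single completion target (hypothetical paths; pm and ngstk as assumed above):

    cmds = ngstk.filter_reads(
        "aligned/sample1.bam",
        "aligned/sample1.filtered.bam",
        "aligned/sample1.dedup_metrics.txt",
        paired=True,
        cpus=8,
        Q=30,
    )
    # Dedup, filter, sort, and cleanup run as one step; the filtered BAM is the
    # flag that lets a restarted pipeline skip the whole block.
    pm.run(cmds, target="aligned/sample1.filtered.bam")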
@@ -1203,7 +1406,6 @@ def shift_reads(self, input_bam, genome, output_bam):
cmd += " " + self.tools.samtools + " sort -o {0} -".format(output_bam)
return cmd
-
def sort_index_bam(self, input_bam, output_bam):
tmp_bam = re.sub("\.bam", ".sorted", input_bam)
cmd1 = self.tools.samtools + " sort {0} {1}".format(input_bam, tmp_bam)
@@ -1211,12 +1413,10 @@ def sort_index_bam(self, input_bam, output_bam):
cmd3 = self.tools.samtools + " index {0}".format(output_bam)
return [cmd1, cmd2, cmd3]
-
def index_bam(self, input_bam):
cmd = self.tools.samtools + " index {0}".format(input_bam)
return cmd
-
def run_spp(self, input_bam, output, plot, cpus):
"""
Run the SPP read peak analysis tool.
@@ -1229,38 +1429,40 @@ def run_spp(self, input_bam, output, plot, cpus):
"""
base = "{} {} -rf -savp".format(self.tools.Rscript, self.tools.spp)
cmd = base + " -savp={} -s=0:5:500 -c={} -out={} -p={}".format(
- plot, input_bam, output, cpus)
+ plot, input_bam, output, cpus
+ )
return cmd
-
def get_fragment_sizes(self, bam_file):
try:
- import pysam
import numpy as np
+ import pysam
except:
return
frag_sizes = list()
- bam = pysam.Samfile(bam_file, 'rb')
+ bam = pysam.Samfile(bam_file, "rb")
for read in bam:
if bam.getrname(read.tid) != "chrM" and read.tlen < 1500:
frag_sizes.append(read.tlen)
bam.close()
return np.array(frag_sizes)
-
- def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smallest_insert=30):
+ def plot_atacseq_insert_sizes(
+ self, bam, plot, output_csv, max_insert=1500, smallest_insert=30
+ ):
"""
Heavy inspiration from here:
https://github.com/dbrg77/ATAC/blob/master/ATAC_seq_read_length_curve_fitting.ipynb
"""
try:
- import pysam
- import numpy as np
+ import matplotlib
import matplotlib.mlab as mlab
- from scipy.optimize import curve_fit
+ import numpy as np
+ import pysam
from scipy.integrate import simps
- import matplotlib
- matplotlib.use('Agg')
+ from scipy.optimize import curve_fit
+
+ matplotlib.use("Agg")
import matplotlib.pyplot as plt
except:
print("Necessary Python modules couldn't be loaded.")
@@ -1268,6 +1470,7 @@ def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smal
try:
import seaborn as sns
+
sns.set_style("whitegrid")
except:
pass
@@ -1275,7 +1478,7 @@ def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smal
def get_fragment_sizes(bam, max_insert=1500):
frag_sizes = list()
- bam = pysam.Samfile(bam, 'rb')
+ bam = pysam.Samfile(bam, "rb")
for i, read in enumerate(bam):
if read.tlen < max_insert:
@@ -1293,11 +1496,13 @@ def mixture_function(x, *p):
nfr = expo(x, 2.9e-02, 2.8e-02)
nfr[:smallest_insert] = 0
- return (mlab.normpdf(x, m1, s1) * w1 +
- mlab.normpdf(x, m2, s2) * w2 +
- mlab.normpdf(x, m3, s3) * w3 +
- mlab.normpdf(x, m4, s4) * w4 +
- nfr)
+ return (
+ mlab.normpdf(x, m1, s1) * w1
+ + mlab.normpdf(x, m2, s2) * w2
+ + mlab.normpdf(x, m3, s3) * w3
+ + mlab.normpdf(x, m4, s4) * w4
+ + nfr
+ )
def expo(x, q, r):
"""
@@ -1316,17 +1521,30 @@ def expo(x, q, r):
# Parameters are empirical, need to check
paramGuess = [
- 200, 50, 0.7, # gaussians
- 400, 50, 0.15,
- 600, 50, 0.1,
- 800, 55, 0.045,
- 2.9e-02, 2.8e-02 # exponential
+ 200,
+ 50,
+ 0.7, # gaussians
+ 400,
+ 50,
+ 0.15,
+ 600,
+ 50,
+ 0.1,
+ 800,
+ 55,
+ 0.045,
+ 2.9e-02,
+ 2.8e-02, # exponential
]
try:
popt3, pcov3 = curve_fit(
- mixture_function, x[smallest_insert:], y[smallest_insert:],
- p0=paramGuess, maxfev=100000)
+ mixture_function,
+ x[smallest_insert:],
+ y[smallest_insert:],
+ p0=paramGuess,
+ maxfev=100000,
+ )
except:
print("Nucleosomal fit could not be found.")
return
@@ -1340,19 +1558,19 @@ def expo(x, q, r):
plt.hist(frag_sizes, numBins, histtype="step", ec="k", normed=1, alpha=0.5)
# Plot nucleosomal fits
- plt.plot(x, mlab.normpdf(x, m1, s1) * w1, 'r-', lw=1.5, label="1st nucleosome")
- plt.plot(x, mlab.normpdf(x, m2, s2) * w2, 'g-', lw=1.5, label="2nd nucleosome")
- plt.plot(x, mlab.normpdf(x, m3, s3) * w3, 'b-', lw=1.5, label="3rd nucleosome")
- plt.plot(x, mlab.normpdf(x, m4, s4) * w4, 'c-', lw=1.5, label="4th nucleosome")
+ plt.plot(x, mlab.normpdf(x, m1, s1) * w1, "r-", lw=1.5, label="1st nucleosome")
+ plt.plot(x, mlab.normpdf(x, m2, s2) * w2, "g-", lw=1.5, label="2nd nucleosome")
+ plt.plot(x, mlab.normpdf(x, m3, s3) * w3, "b-", lw=1.5, label="3rd nucleosome")
+ plt.plot(x, mlab.normpdf(x, m4, s4) * w4, "c-", lw=1.5, label="4th nucleosome")
# Plot nucleosome-free fit
nfr = expo(x, 2.9e-02, 2.8e-02)
nfr[:smallest_insert] = 0
- plt.plot(x, nfr, 'k-', lw=1.5, label="nucleosome-free")
+ plt.plot(x, nfr, "k-", lw=1.5, label="nucleosome-free")
# Plot sum of fits
ys = mixture_function(x, *popt3)
- plt.plot(x, ys, 'k--', lw=3.5, label="fit sum")
+ plt.plot(x, ys, "k--", lw=3.5, label="fit sum")
plt.legend()
plt.xlabel("Fragment size (bp)")
@@ -1363,10 +1581,26 @@ def expo(x, q, r):
areas = [
["fraction", "area under curve", "max density"],
["Nucleosome-free fragments", simps(nfr), max(nfr)],
- ["1st nucleosome", simps(mlab.normpdf(x, m1, s1) * w1), max(mlab.normpdf(x, m1, s1) * w1)],
- ["2nd nucleosome", simps(mlab.normpdf(x, m2, s2) * w1), max(mlab.normpdf(x, m2, s2) * w2)],
- ["3rd nucleosome", simps(mlab.normpdf(x, m3, s3) * w1), max(mlab.normpdf(x, m3, s3) * w3)],
- ["4th nucleosome", simps(mlab.normpdf(x, m4, s4) * w1), max(mlab.normpdf(x, m4, s4) * w4)]
+ [
+ "1st nucleosome",
+ simps(mlab.normpdf(x, m1, s1) * w1),
+ max(mlab.normpdf(x, m1, s1) * w1),
+ ],
+ [
+ "2nd nucleosome",
+ simps(mlab.normpdf(x, m2, s2) * w2),
+ max(mlab.normpdf(x, m2, s2) * w2),
+ ],
+ [
+ "3rd nucleosome",
+ simps(mlab.normpdf(x, m3, s3) * w3),
+ max(mlab.normpdf(x, m3, s3) * w3),
+ ],
+ [
+ "4th nucleosome",
+ simps(mlab.normpdf(x, m4, s4) * w4),
+ max(mlab.normpdf(x, m4, s4) * w4),
+ ],
]
try:
@@ -1380,8 +1614,15 @@ def expo(x, q, r):
# TODO: parameterize in terms of normalization factor.
def bam_to_bigwig(
- self, input_bam, output_bigwig, genome_sizes, genome,
- tagmented=False, normalize=False, norm_factor=1000):
+ self,
+ input_bam,
+ output_bigwig,
+ genome_sizes,
+ genome,
+ tagmented=False,
+ normalize=False,
+ norm_factor=1000,
+ ):
"""
Convert a BAM file to a bigWig file.
@@ -1401,34 +1642,63 @@ def bam_to_bigwig(
transient_file = os.path.abspath(re.sub("\.bigWig", "", output_bigwig))
cmd1 = self.tools.bedtools + " bamtobed -i {0} |".format(input_bam)
if not tagmented:
- cmd1 += " " + self.tools.bedtools + " slop -i stdin -g {0} -s -l 0 -r 130 |".format(genome_sizes)
+ cmd1 += (
+ " "
+ + self.tools.bedtools
+ + " slop -i stdin -g {0} -s -l 0 -r 130 |".format(genome_sizes)
+ )
cmd1 += " fix_bedfile_genome_boundaries.py {0} |".format(genome)
- cmd1 += " " + self.tools.genomeCoverageBed + " {0}-bg -g {1} -i stdin > {2}.cov".format(
- "-5 " if tagmented else "",
- genome_sizes,
- transient_file
+ cmd1 += (
+ " "
+ + self.tools.genomeCoverageBed
+ + " {0}-bg -g {1} -i stdin > {2}.cov".format(
+ "-5 " if tagmented else "", genome_sizes, transient_file
+ )
)
cmds.append(cmd1)
if normalize:
- cmds.append("""awk 'NR==FNR{{sum+= $4; next}}{{ $4 = ($4 / sum) * {1}; print}}' {0}.cov {0}.cov | sort -k1,1 -k2,2n > {0}.normalized.cov""".format(transient_file, norm_factor))
- cmds.append(self.tools.bedGraphToBigWig + " {0}{1}.cov {2} {3}".format(transient_file, ".normalized" if normalize else "", genome_sizes, output_bigwig))
+ cmds.append(
+ """awk 'NR==FNR{{sum+= $4; next}}{{ $4 = ($4 / sum) * {1}; print}}' {0}.cov {0}.cov | sort -k1,1 -k2,2n > {0}.normalized.cov""".format(
+ transient_file, norm_factor
+ )
+ )
+ cmds.append(
+ self.tools.bedGraphToBigWig
+ + " {0}{1}.cov {2} {3}".format(
+ transient_file,
+ ".normalized" if normalize else "",
+ genome_sizes,
+ output_bigwig,
+ )
+ )
# remove tmp files
cmds.append("if [[ -s {0}.cov ]]; then rm {0}.cov; fi".format(transient_file))
if normalize:
- cmds.append("if [[ -s {0}.normalized.cov ]]; then rm {0}.normalized.cov; fi".format(transient_file))
+ cmds.append(
+ "if [[ -s {0}.normalized.cov ]]; then rm {0}.normalized.cov; fi".format(
+ transient_file
+ )
+ )
cmds.append("chmod 755 {0}".format(output_bigwig))
return cmds
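One piece of the command chain above worth unpacking is the normalization one-liner: awk 'NR==FNR{sum+=$4; next}{$4=($4/sum)*N; print}' file.cov file.cov names the same coverage file twice, so the first pass sums column 4 and the second pass rescales every value by that total. A rough pure-Python equivalent of just that step (hypothetical file names; the subsequent sort -k1,1 -k2,2n is omitted):

    def normalize_coverage(cov_path, out_path, norm_factor=1000):
        """Two-pass rescale of column 4 of a bedGraph-style coverage file."""
        with open(cov_path) as f:
            total = sum(float(line.split()[3]) for line in f if line.strip())
        with open(cov_path) as f, open(out_path, "w") as out:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                fields[3] = str(float(fields[3]) / total * norm_factor)
                out.write("\t".join(fields) + "\n")

    normalize_coverage("sample1.cov", "sample1.normalized.cov")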
-
- def add_track_to_hub(self, sample_name, track_url, track_hub, colour, five_prime=""):
- cmd1 = """echo "track type=bigWig name='{0} {1}' description='{0} {1}'""".format(sample_name, five_prime)
- cmd1 += """ height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> {2}""".format(track_url, colour, track_hub)
+ def add_track_to_hub(
+ self, sample_name, track_url, track_hub, colour, five_prime=""
+ ):
+ cmd1 = (
+ """echo "track type=bigWig name='{0} {1}' description='{0} {1}'""".format(
+ sample_name, five_prime
+ )
+ )
+ cmd1 += """ height=32 visibility=full maxHeightPixels=32:32:25 bigDataUrl={0} color={1}" >> {2}""".format(
+ track_url, colour, track_hub
+ )
cmd2 = "chmod 755 {0}".format(track_hub)
return [cmd1, cmd2]
-
def link_to_track_hub(self, track_hub_url, file_name, genome):
import textwrap
+
db = "org" if genome == "hg19" else "db" # different database call for human
genome = "human" if genome == "hg19" else genome # change hg19 to human
html = """
@@ -1438,35 +1708,56 @@ def link_to_track_hub(self, track_hub_url, file_name, genome):
html += """{db}={genome}&hgt.customText={track_hub_url}" />