From 4d39d6c78d62e67942ecd976a67e791d443a6b4f Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 19 Oct 2022 15:16:00 -0400 Subject: [PATCH 01/43] update pepatac bulker crate. See #227; see #226 --- checkinstall | 2 +- pepatac_input_schema.yaml | 1 + sample_pipeline_interface.yaml | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/checkinstall b/checkinstall index 736534a..b571502 100755 --- a/checkinstall +++ b/checkinstall @@ -417,7 +417,7 @@ else fi if [ -f "$CWD/pipelines/pepatac.py" ]; then - PIPELINE="$CWD/pipelines/pepatac.py" + PIPELINE=$(cat "$CWD/pipelines/pepatac.py") else PIPELINE=$(curl https://raw.githubusercontent.com/databio/pepatac/master/pipelines/pepatac.py) fi diff --git a/pepatac_input_schema.yaml b/pepatac_input_schema.yaml index 7472b58..3e4dee4 100644 --- a/pepatac_input_schema.yaml +++ b/pepatac_input_schema.yaml @@ -98,6 +98,7 @@ properties: required: - sample_name - protocol + - read_type - read1 - genome required_files: diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 531885e..d800307 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -43,8 +43,8 @@ command_template: > compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac - docker_image: databio/pepatac - bulker_crate: databio/pepatac:1.0.9 + docker_image: databio/pepatac + bulker_crate: databio/pepatac:1.0.10 size_dependent_variables: resources-sample.tsv bioconductor: readFunName: runCOCOA From 44cb10472ea5e86d8e372638ac2d16fb0d37bbbc Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 19 Oct 2022 15:16:13 -0400 Subject: [PATCH 02/43] add microtest --- tests/microtest.csv | 2 ++ tests/microtest.yaml | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tests/microtest.csv create mode 100644 tests/microtest.yaml diff --git a/tests/microtest.csv b/tests/microtest.csv new file mode 100644 index 0000000..1690644 --- /dev/null +++ b/tests/microtest.csv @@ -0,0 +1,2 @@ +sample_name,protocol,read1,read2 +test_sample,ATAC-seq, diff --git a/tests/microtest.yaml b/tests/microtest.yaml new file mode 100644 index 0000000..607b8ed --- /dev/null +++ b/tests/microtest.yaml @@ -0,0 +1,18 @@ +pep_version: "2.1.0" +sample_table: microtest.csv + +looper: + output_dir: microtest + +sample_modifiers: + append: + genome: hg38_chr22 + pipeline_interfaces: ../sample_pipeline_interface.yaml + read1: R1 + read2: R2 + read_type: PAIRED + derive: + attributes: ["read1", "read2"] + sources: + R1: "${CODE}/microtest/data/atac-seq_PE_R1.fastq.gz" + R2: "${CODE}/microtest/data/atac-seq_PE_R2.fastq.gz" From 4913c81bc6e4ca2258e3ed9df3b98cafe83cf67a Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Wed, 19 Oct 2022 16:51:45 -0400 Subject: [PATCH 03/43] Update project_pipeline_interface.yaml --- project_pipeline_interface.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index 22e15c3..42f0e55 100644 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -13,7 +13,7 @@ command_template: > compute: singularity_image: ${SIMAGES}pepatac docker_image: databio/pepatac - bulker_crate: databio/pepatac:1.0.9 + bulker_crate: databio/pepatac:1.0.10 size_dependent_variables: resources-project.tsv bioconductor: From 7614c771c901815288b6ca0d486019e7dd34edac Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 31 Oct 2022 09:03:56 -0400 Subject: [PATCH 04/43] updates --- PEPATACr/R/PEPATACr.R | 0 containers/pepatac.Dockerfile | 0 docs/assets.md | 0 docs/changelog.md | 0 docs/detailed-install.md | 0 docs/reference_peaks.md | 0 docs/run-conda.md | 0 docs/run-directly.md | 0 docs/tutorial.md | 0 docs/usage.md | 0 pepatac_input_schema.yaml | 0 pipelines/pepatac.yaml | 0 project_pipeline_interface.yaml | 0 requirements-conda.yml | 0 requirements.txt | 0 sample_pipeline_interface.yaml | 0 usage.txt | 0 17 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 PEPATACr/R/PEPATACr.R mode change 100644 => 100755 containers/pepatac.Dockerfile mode change 100644 => 100755 docs/assets.md mode change 100644 => 100755 docs/changelog.md mode change 100644 => 100755 docs/detailed-install.md mode change 100644 => 100755 docs/reference_peaks.md mode change 100644 => 100755 docs/run-conda.md mode change 100644 => 100755 docs/run-directly.md mode change 100644 => 100755 docs/tutorial.md mode change 100644 => 100755 docs/usage.md mode change 100644 => 100755 pepatac_input_schema.yaml mode change 100644 => 100755 pipelines/pepatac.yaml mode change 100644 => 100755 project_pipeline_interface.yaml mode change 100644 => 100755 requirements-conda.yml mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 sample_pipeline_interface.yaml mode change 100644 => 100755 usage.txt diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R old mode 100644 new mode 100755 diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile old mode 100644 new mode 100755 diff --git a/docs/assets.md b/docs/assets.md old mode 100644 new mode 100755 diff --git a/docs/changelog.md b/docs/changelog.md old mode 100644 new mode 100755 diff --git a/docs/detailed-install.md b/docs/detailed-install.md old mode 100644 new mode 100755 diff --git a/docs/reference_peaks.md b/docs/reference_peaks.md old mode 100644 new mode 100755 diff --git a/docs/run-conda.md b/docs/run-conda.md old mode 100644 new mode 100755 diff --git a/docs/run-directly.md b/docs/run-directly.md old mode 100644 new mode 100755 diff --git a/docs/tutorial.md b/docs/tutorial.md old mode 100644 new mode 100755 diff --git a/docs/usage.md b/docs/usage.md old mode 100644 new mode 100755 diff --git a/pepatac_input_schema.yaml b/pepatac_input_schema.yaml old mode 100644 new mode 100755 diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml old mode 100644 new mode 100755 diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml old mode 100644 new mode 100755 diff --git a/requirements-conda.yml b/requirements-conda.yml old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml old mode 100644 new mode 100755 diff --git a/usage.txt b/usage.txt old mode 100644 new mode 100755 From f7911ff0d794468e428648f6a634d6878b4fdcd7 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 14 Nov 2023 16:17:04 -0500 Subject: [PATCH 05/43] update installation and use placeholder dev branches until releases are ready --- docs/detailed-install.md | 24 +++++++++++++++++------- requirements.txt | 10 ++++++---- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/detailed-install.md b/docs/detailed-install.md index 29cf6af..4638b8d 100644 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -29,8 +29,8 @@ We'll install each of these pieces of software before moving forward. Let's sta ```console cd tools/ wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools-2.29.2.tar.gz -tar -zxvf bedtools-2.29.0.tar.gz -rm bedtools-2.29.0.tar.gz +tar -zxvf bedtools-2.29.2.tar.gz +rm bedtools-2.29.2.tar.gz cd bedtools2 make ``` @@ -50,6 +50,7 @@ rm bowtie2-2.4.1-source.zip cd bowtie2-2.4.1 make ``` +Note: you may need to install `libtbb-dev` if `make` fails, e.g. using `apt install libtbb-dev` Again, let's add `bowtie2` to our `PATH` environment variable: @@ -59,6 +60,14 @@ export PATH="$PATH:/path/to/pepatac_tutorial/tools/bowtie2-2.4.1/" #### preseq The pipeline uses `preseq` to calculate library complexity. Check out the author's [page for more instruction](https://github.com/smithlabcode/preseq). +Note: If receiving the following error later in the tutorial: `preseq: error while loading shared libraries: libgsl.so.0: cannot open shared object file: No such file or directory` +you may need to install `libgsl-dev` using: `apt install libgsl-dev` and either: +1. `export LD_LIBRARY_PATH=/usr/local/lib` +2. link `libgsl.so.0` to an existing `libgsl`, e.g. `libgsl.so.27` + +More info can be found here: +https://www.gnu.org/software/gsl/doc/html/usage.html#shared-libraries + ```console wget http://smithlabresearch.org/downloads/preseq_linux_v2.0.tar.bz2 tar xvfj preseq_linux_v2.0.tar.bz2 @@ -84,7 +93,7 @@ wget https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.t tar xvfj samtools-1.10.tar.bz2 rm samtools-1.10.tar.bz2 cd samtools-1.10 -/configure +./configure ``` Alternatively, if you do not have the ability to install `samtools` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. [Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). ```console @@ -126,6 +135,7 @@ That should do it! Now we'll [install some **optional** packages](tutorial.md#1 `PEPATAC` uses `R` to generate quality control and read/peak annotation plots, so you'll need to have R functional if you want these outputs. We have packaged all the `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/master/PEPATACr). The `PEPATAC` package relies on a few additional packages which can be installed at the command line as follows: +Note: if given error regarding `devtools` try: `apt install r-cran-devtools` before proceeding with installation. ``` Rscript -e 'install.packages("devtools")' Rscript -e 'devtools::install_github("pepkit/pepr")' @@ -187,10 +197,10 @@ export PICARD="/path/to/pepatac_tutorial/tools/picard.jar" To extract files quicker, `PEPATAC` can also utilize `pigz` in place of `gzip` if you have it installed. Let's go ahead and do that now. It's not required, but it can help speed everything up when you have many samples to process. ```console cd /path/to/pepatac_tutorial/tools/ -wget https://zlib.net/pigz/pigz-2.4.tar.gz -tar xvfz pigz-2.4.tar.gz -rm pigz-2.4.tar.gz -cd pigz-2.4/ +wget https://zlib.net/pigz/pigz-2.8.tar.gz +tar xvfz pigz-2.8.tar.gz +rm pigz-2.8.tar.gz +cd pigz-2.8/ make ``` Don't forget to add this to your `PATH` too! diff --git a/requirements.txt b/requirements.txt index c6debba..7b2db87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,20 +5,22 @@ colorama>=0.3.9 Cython>=0.29 cykhash>=1.0.2 divvy>=0.6.0 -eido>=0.2.0 +eido>=0.2.1 #fseq2>=2.0.2 hypothesis==4.38.0 jinja2 jsonschema>=3.0.1 logmuse>=0.2.6 -looper>=1.4.2 +# looper>=1.4.2 +git+https://github.com/pepkit/looper.git@dev MACS2>=2.2.7.1 numpy>=1.21 oyaml pararead>=0.7.0 pandas>=1.4 peppy>=0.31.1 -piper>=0.12.3 +# piper>=0.12.3 +git+https://github.com/databio/pypiper.git@dev psutil>=5.8 pysam>=0.16 python-Levenshtein>=0.12 @@ -26,4 +28,4 @@ pyyaml>=3.13 refgenconf>=0.12.2 #refgenie ubiquerg>=0.6.1 -yacman>=0.8.4 +yacman>=0.9.2 From 6a2fdd092691104c38dfa4f73014572de0af602d Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 15 Nov 2023 17:09:15 -0500 Subject: [PATCH 06/43] update looper configs and tutorial for Looper >v1.5.0 --- docs/tutorial.md | 12 +++++++++- .../tutorial/.looper_tutorial_refgenie.yaml | 7 ++++++ .../tutorial_refgenie_project_config.yaml | 22 +++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 examples/tutorial/.looper_tutorial_refgenie.yaml create mode 100644 examples/tutorial/tutorial_refgenie_project_config.yaml diff --git a/docs/tutorial.md b/docs/tutorial.md index 2ced155..aef2076 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -376,8 +376,14 @@ cd ../tools/pepatac/ ``` Now, we'll use `looper` to run the sample locally. ```console -looper run examples/tutorial/tutorial_refgenie.yaml +looper run --looper-config examples/tutorial/.looper_tutorial_refgenie.yaml ``` +Note: if using Looper<1.5.0, the run method is via: +```console +looper run examples/tutorial/tutorial_refgenie.yaml +``` + + Congratulations! Your first samples should be running through the pipeline now. For both samples to run locally should take 30-50 minutes in total depending on your system. After the pipeline is finished, we can look through the output directory together. We've provided an example breakdown of just such a directory in the [browse output page](browse_output.md). @@ -385,6 +391,10 @@ After the pipeline is finished, we can look through the output directory togethe ## 6: Use `looper` to run the project level pipeline The pipeline also includes project level analyses that work on all samples concurrently. This allows for analyses that require output produced by individual sample analysis. We'll run the project analysis much like we run the sample analysis: ```console +looper runp --looper-config examples/tutorial/.looper_tutorial_refgenie.yaml +``` +Note: if using Looper<1.5.0, the run method is via: +```console looper runp examples/tutorial/tutorial_refgenie.yaml ``` This should take about a minute on the tutorial samples and will generate a `summary/` directory containing project level output in the parent project directory. You can [browse the tutorial data](browse_output.md) to see the example output. diff --git a/examples/tutorial/.looper_tutorial_refgenie.yaml b/examples/tutorial/.looper_tutorial_refgenie.yaml new file mode 100644 index 0000000..50936eb --- /dev/null +++ b/examples/tutorial/.looper_tutorial_refgenie.yaml @@ -0,0 +1,7 @@ +name: PEPATAC_tutorial +pep_config: tutorial_refgenie_project_config.yaml + +output_dir: "${TUTORIAL}/processed/" +pipeline_interfaces: + sample: ["${TUTORIAL}/tools/pepatac/sample_pipeline_interface.yaml"] + project: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] diff --git a/examples/tutorial/tutorial_refgenie_project_config.yaml b/examples/tutorial/tutorial_refgenie_project_config.yaml new file mode 100644 index 0000000..e39b447 --- /dev/null +++ b/examples/tutorial/tutorial_refgenie_project_config.yaml @@ -0,0 +1,22 @@ +pep_version: 2.0.0 +sample_table: tutorial.csv + +sample_modifiers: + derive: + attributes: [read1, read2] + sources: + # Obtain tutorial data from http://big.databio.org/pepatac/ then set + # path to your local saved files + R1: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r1.fastq.gz" + R2: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignment_names: ["rCRSd"] + deduplicator: samblaster # Default. [options: picard] + trimmer: skewer # Default. [options: pyadapt, trimmomatic] + peak_type: fixed # Default. [options: variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run From ef08ee97a7cb8fbccd8e58b2af3c724e4e4faa6f Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:04:31 -0500 Subject: [PATCH 07/43] add clarification for looper report if using looper>= 1.5.0 --- docs/tutorial.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorial.md b/docs/tutorial.md index aef2076..3f0c08b 100755 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -199,6 +199,7 @@ looper runp examples/tutorial/tutorial_refgenie.yaml This should take about a minute on the tutorial samples and will generate a `summary/` directory containing project level output in the parent project directory. You can [browse the tutorial data](browse_output.md) to see the example output. ## 7: Generate an `HTML` report using `looper` +Note: beginning with Looper 1.5.0, pipestat configuration is required to use `looper report`, please see here [Configuring Looper to use pipestat](https://looper.databio.org/en/dev/pipestat/) Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](files/examples/tutorial/PEPATAC_tutorial_summary.html). Otherwise, let's generate a report ourselves. From dc78eac826d36957d0cc84092e3e4d505ece4f47 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 20 Nov 2023 13:56:14 -0500 Subject: [PATCH 08/43] update output_schema to align with pipestat output schemas. Add pipestat config to tutorial example. --- .../tutorial/.looper_tutorial_refgenie.yaml | 3 + pepatac_output_schema.yaml | 184 ++++++++++++------ 2 files changed, 123 insertions(+), 64 deletions(-) diff --git a/examples/tutorial/.looper_tutorial_refgenie.yaml b/examples/tutorial/.looper_tutorial_refgenie.yaml index 50936eb..d1a0406 100644 --- a/examples/tutorial/.looper_tutorial_refgenie.yaml +++ b/examples/tutorial/.looper_tutorial_refgenie.yaml @@ -5,3 +5,6 @@ output_dir: "${TUTORIAL}/processed/" pipeline_interfaces: sample: ["${TUTORIAL}/tools/pepatac/sample_pipeline_interface.yaml"] project: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] + +pipestat: + results_file_path: "${TUTORIAL}/processed/results_pipeline/{sample_name}/stats.yaml" diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index 19840b0..f295ad8 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -1,67 +1,123 @@ +title: An example Pipestat output schema description: objects produced by PEPATAC pipeline. +type: object properties: + pipeline_name: PEPATAC samples: - type: array - items: - type: object - properties: - smooth_bw: - path: "aligned_{genome}/{sample_name}_smooth.bw" - type: string - description: "Smoothed signal track" - exact_bw: - path: "aligned_{genome}_exact/{sample_name}_exact.bw" - type: string - description: "Nucleotide-resolution signal track" - aligned_bam: - path: "aligned_{genome}/{sample_name}_sort_dedup.bam" - type: string - description: "Coordinate sorted deduplicated, aligned BAM file" - peak_file: - path: "peak_calling_{genome}/{sample_name}_peaks_normalized.narrowPeak" - type: string - description: "Sample peak file" - coverage_file: - path: "peak_calling_{genome}/{sample_name}_peaks_coverage.bed.gz" - type: string - description: "Sample peak coverage table" - summits_bed: - path: "peak_calling_{genome}/{sample_name}_summits.bed" - type: string - description: "Peak summit file" - alignment_percent_file: - title: "Alignment percent file" - description: "Plots percent of total alignment to all pre-alignments and primary genome." - thumbnail_path: "summary/{name}_alignmentPercent.png" - path: "summary/{name}_alignmentPercent.pdf" - type: image - alignment_raw_file: - title: "Alignment raw file" - description: "Plots raw alignment rates to all pre-alignments and primary genome." - thumbnail_path: "summary/{name}_alignmentRaw.png" - path: "summary/{name}_alignmentRaw.pdf" - type: image - tss_file: - title: "TSS enrichment file" - description: "Plots TSS scores for each sample." - thumbnail_path: "summary/{name}_TSSEnrichment.png" - path: "summary/{name}_TSSEnrichment.pdf" - type: image - library_complexity_file: - title: "Library complexity file" - description: "Plots each sample's library complexity on a single plot." - thumbnail_path: "summary/{name}_libComplexity.png" - path: "summary/{name}_libComplexity.pdf" - type: image - consensus_peaks_file: - title: "Consensus peaks file" - description: "A set of consensus peaks across samples." - thumbnail_path: "summary/{name}_*_consensusPeaks.png" - path: "summary/{name}_*_consensusPeaks.narrowPeak" - type: string - counts_table: - title: "Project peak coverage file" - description: "Project peak coverages: chr_start_end X sample" - thumbnail_path: "summary/{name}_*_peaks_coverage.png" - path: "summary/{name}_*_peaks_coverage.tsv" - type: string + type: object + properties: + smooth_bw: + type: string + description: "Smoothed signal track" + exact_bw: + type: string + description: "Nucleotide-resolution signal track" + aligned_bam: + type: string + description: "Coordinate sorted deduplicated, aligned BAM file" + peak_file: + type: string + description: "Sample peak file" + coverage_file: + type: string + description: "Sample peak coverage table" + summits_bed: + type: string + description: "Peak summit file" + project: + type: object + properties: + alignment_percent_file: + title: "Alignment percent file" + description: "Plots percent of total alignment to all pre-alignments and primary genome." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + alignment_raw_file: + title: "Alignment raw file" + description: "Plots raw alignment rates to all pre-alignments and primary genome." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + tss_file: + title: "TSS enrichment file" + description: "Plots TSS scores for each sample." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + library_complexity_file: + title: "Library complexity file" + description: "Plots each sample's library complexity on a single plot." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + consensus_peaks_file: + title: "consesus peak file" + description: "A set of consensus peaks across samples." + type: object + object_type: file + properties: + path: + type: string + title: + type: string + thumbnail_path: + type: string + required: + - path + - title + counts_table: + title: "Project peak coverage file" + description: "Project peak coverages: chr_start_end X sample" + type: object + object_type: file + properties: + path: + type: string + title: + type: string + thumbnail_path: + type: string + required: + - path + - title \ No newline at end of file From bab6d62bb7d310f5b199308b6475aff1bc0a569e Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:06:26 -0500 Subject: [PATCH 09/43] add pipestat_results_file during pypiper creation to force unified results file #256 --- examples/tutorial/.looper_tutorial_refgenie.yaml | 2 +- pipelines/pepatac.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tutorial/.looper_tutorial_refgenie.yaml b/examples/tutorial/.looper_tutorial_refgenie.yaml index d1a0406..35e4ec4 100644 --- a/examples/tutorial/.looper_tutorial_refgenie.yaml +++ b/examples/tutorial/.looper_tutorial_refgenie.yaml @@ -7,4 +7,4 @@ pipeline_interfaces: project: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] pipestat: - results_file_path: "${TUTORIAL}/processed/results_pipeline/{sample_name}/stats.yaml" + results_file_path: "${TUTORIAL}/processed/results_pipeline/pepatac_results.yaml" diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 3161542..02c2643 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -509,9 +509,10 @@ def main(): # access in ancillary functions outside of main(). outfolder = os.path.abspath( os.path.join(args.output_parent, args.sample_name)) + results_file = os.path.abspath(os.path.join(args.output_parent, "pepatac_results.yaml")) global pm pm = pypiper.PipelineManager( - name="PEPATAC", outfolder=outfolder, args=args, version=__version__) + name="PEPATAC", outfolder=outfolder, pipestat_results_file=results_file,args=args, version=__version__) global ngstk ngstk = pypiper.NGSTk(pm=pm) From b49e6962d82c62a2cc8cf42099c82b0ff19becee Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 20 Nov 2023 17:24:34 -0500 Subject: [PATCH 10/43] add pipestat_sample_name during pypiper creation related to #256 and #257 --- pipelines/pepatac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 02c2643..323a523 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -512,7 +512,7 @@ def main(): results_file = os.path.abspath(os.path.join(args.output_parent, "pepatac_results.yaml")) global pm pm = pypiper.PipelineManager( - name="PEPATAC", outfolder=outfolder, pipestat_results_file=results_file,args=args, version=__version__) + name="PEPATAC", outfolder=outfolder,pipestat_sample_name=args.sample_name, pipestat_results_file=results_file,args=args, version=__version__) global ngstk ngstk = pypiper.NGSTk(pm=pm) From f254c0b44bf81e9526ed95e6190a53389db77f5d Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 27 Nov 2023 14:13:14 -0500 Subject: [PATCH 11/43] ensure project level pipeline_name is the same as sample_level name, so they can use the same pipestat_results_file backend --- project_pipeline_interface.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index 42f0e55..9cbfb89 100755 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -1,4 +1,4 @@ -pipeline_name: PEPATAC_summarizer +pipeline_name: PEPATAC pipeline_type: project path: pipelines/pepatac_collator.py output_schema: pepatac_output_schema.yaml From 1d5a214fef88774e6bfe4a40867f7a21ca4edfd5 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 27 Nov 2023 18:15:53 -0500 Subject: [PATCH 12/43] adjust output_schema for #256 --- pepatac_output_schema.yaml | 240 ++++++++++++++++++++++++++++++++++++- 1 file changed, 238 insertions(+), 2 deletions(-) diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index f295ad8..3b198c0 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -24,6 +24,242 @@ properties: summits_bed: type: string description: "Peak summit file" + File_mb: + type: number + description: "size of file" + Read_type: + type: string + description: "read_type" + Genome: + type: string + description: "e.g. hg38" + Raw_reads: + type: string + description: "raw reads" + Fastq_reads: + type: number + description: "fastq_reads" + Trimmed_reads: + type: number + description: "trimmed_reads" + Trimmed_loss_rate: + type: number + description: "trimmed loss rate" + FastQC report r1: + title: "FastQC report r1" + description: "FastQC report r1" + type: object + object_type: file + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + FastQC report r2: + title: "FastQC report r2" + description: "FastQC report r2" + type: object + object_type: file + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Aligned_reads_rCRSd: + type: number + description: "Aligned_reads_rCRSd" + Alignment_rate_rCRSd: + type: number + description: "Alignment_rate_rCRSd" + Mapped_reads: + type: string + description: "mapped_reads" + QC_filtered_reads: + type: number + description: "QC_filtered_reads" + Aligned_reads: + type: string + description: "Aligned_reads" + Alignment_rate: + type: number + description: "Alignment_rate" + Total_efficiency: + type: number + description: "Total_efficiency" + Mitochondrial_reads: + type: number + description: "Mitochondrial_reads" + NRF: + type: number + description: "NRF" + PBC1: + type: number + description: "PBC1" + PBC2: + type: number + description: "PBC1" + Unmapped_reads: + type: number + description: "Unmapped_reads" + Duplicate_reads: + type: string + description: "Duplicate_reads" + Dedup_aligned_reads: + type: number + description: "Dedup_aligned_reads" + Dedup_alignment_rate: + type: number + description: "Dedup_alignment_rate" + Dedup_total_efficiency: + type: number + description: "Dedup_total_efficiency" + NFR_frac: + type: number + description: "NFR_frac" + mono_frac: + type: number + description: "mono_frac" + di_frac: + type: number + description: "di_frac" + tri_frac: + type: number + description: "tri_frac" + poly_frac: + type: number + description: "poly_frac" + Read_length: + type: number + description: "Read_length" + Genome_size: + type: number + description: "Read_length" + Frac_exp_unique_at_10M: + type: number + description: "Frac_exp_unique_at_10M" + TSS_score: + type: number + description: "TSS_score" + Fragment distribution: + title: "Fragment distribution" + description: "Fragment distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Peak_count: + type: number + description: "Peak_count" + FRiP: + type: number + description: "FRiP" + Peak chromosome distribution: + title: "Peak chromosome distribution" + description: "Peak chromosome distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + TSS distance distribution: + title: "TSS distance distribution" + description: "TSS distance distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Peak partition distribution: + title: "Peak partition distribution" + description: "Peak partition distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + cFRiF: + title: "cFRiF" + description: "cFRiF" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + FRiF: + title: "FRiF" + description: "FRiF" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Time: + type: string + description: "time" + Success: + type: string + description: "success" project: type: object properties: @@ -59,7 +295,7 @@ properties: - path - thumbnail_path - title - tss_file: + TSS enrichment: title: "TSS enrichment file" description: "Plots TSS scores for each sample." type: object @@ -75,7 +311,7 @@ properties: - path - thumbnail_path - title - library_complexity_file: + Library complexity: title: "Library complexity file" description: "Plots each sample's library complexity on a single plot." type: object From 3f73ec788bb8c5de046653ff592251a11a780507 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:48:43 -0500 Subject: [PATCH 13/43] revert to not pass pipestat_results_file to pypiper, add {record_identifier} to looper config --- examples/tutorial/.looper_tutorial_refgenie.yaml | 2 +- pipelines/pepatac.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/tutorial/.looper_tutorial_refgenie.yaml b/examples/tutorial/.looper_tutorial_refgenie.yaml index 35e4ec4..e3448e2 100644 --- a/examples/tutorial/.looper_tutorial_refgenie.yaml +++ b/examples/tutorial/.looper_tutorial_refgenie.yaml @@ -7,4 +7,4 @@ pipeline_interfaces: project: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] pipestat: - results_file_path: "${TUTORIAL}/processed/results_pipeline/pepatac_results.yaml" + results_file_path: "${TUTORIAL}/processed/results_pipeline/{record_identifier}/stats.yaml" diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 323a523..f7b7ed7 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -509,10 +509,9 @@ def main(): # access in ancillary functions outside of main(). outfolder = os.path.abspath( os.path.join(args.output_parent, args.sample_name)) - results_file = os.path.abspath(os.path.join(args.output_parent, "pepatac_results.yaml")) global pm pm = pypiper.PipelineManager( - name="PEPATAC", outfolder=outfolder,pipestat_sample_name=args.sample_name, pipestat_results_file=results_file,args=args, version=__version__) + name="PEPATAC", outfolder=outfolder,pipestat_sample_name=args.sample_name, args=args, version=__version__) global ngstk ngstk = pypiper.NGSTk(pm=pm) From c93db83f1636ed3e57959998d2eb566f461373f6 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:20:46 -0500 Subject: [PATCH 14/43] add project name to project config to prevent looper render error #259 --- examples/tutorial/tutorial_refgenie_project_config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tutorial/tutorial_refgenie_project_config.yaml b/examples/tutorial/tutorial_refgenie_project_config.yaml index e39b447..a45528d 100644 --- a/examples/tutorial/tutorial_refgenie_project_config.yaml +++ b/examples/tutorial/tutorial_refgenie_project_config.yaml @@ -1,3 +1,4 @@ +name: PEPATAC_tutorial pep_version: 2.0.0 sample_table: tutorial.csv From 2ff63c2087935ed3315c017d297a80879241cad4 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 30 Nov 2023 07:24:41 -0500 Subject: [PATCH 15/43] enable less than or combinations of less than and greater than package requirements; skip conda check if the environment isnt present --- checkinstall | 732 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 582 insertions(+), 150 deletions(-) diff --git a/checkinstall b/checkinstall index b571502..9776640 100755 --- a/checkinstall +++ b/checkinstall @@ -119,61 +119,282 @@ fi while IFS= read -r line; do [ "${line:0:1}" = "#" ] && continue - IFS='>=' read -r -a array <<< "$line" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! pip_show "${package}"; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') - - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 + + if [[ $line == *","* ]]; then + IFS=',' read -r -a array <<< "$line" + first_half=${array[0]} + # Check the maximum requirement + IFS='<' read -r -a array <<< "$first_half" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, < $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -gt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -gt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -gt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + + + second_half=${array[2]} + # Check the minimum requirement + IFS='>=' read -r -a array <<< "$second_half" + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *">="* ]]; then + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *"=="* ]]; then + IFS='==' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") fi + fi + else + IFS='<' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi fi fi done < $REQS # Check tool installation -declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") +declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "macs3" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") for cmd in ${requiredCommands[@]}; do if ! is_executable $cmd; then @@ -224,135 +445,346 @@ if ! is_executable "conda"; then CONDA_INSTALL=1 else eval "$(conda shell.bash hook)" - conda activate pepatac - - unset PYTHONPATH - unset R_LIBS - - # Check Python - if ! is_executable "python"; then - echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Install python and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - else - #echo "which python: $(which python)" - ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') - if [ "$ver" -lt "30" ]; then - echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Update python and checkinstall again.") + + # Check if a pepatac conda environment exists + CONDA_ENVS=$(conda env list | grep -c '^pepatac\s') + + if [[ "$CONDA_ENVS" == 1 ]]; then + conda activate pepatac + + unset PYTHONPATH + unset R_LIBS + + # Check Python + if ! is_executable "python"; then + echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Install python and checkinstall again.") printf "\n" CONDA_INSTALL=1 + else + #echo "which python: $(which python)" + ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') + if [ "$ver" -lt "30" ]; then + echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Update python and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi fi - fi - - # Check Python packages - if ! is_executable "pip"; then - echo $(warn "WARNING: PEPATAC requires pip. Please install pip and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - fi - - if [ -f "requirements.txt" ]; then - REQS="requirements.txt" - else - REQS=$(curl https://raw.githubusercontent.com/databio/pepatac/master/requirements.txt) - fi - while IFS= read -r line; do - [ "${line:0:1}" = "#" ] && continue - IFS='>=' read -r -a array <<< "$line" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! pip_show "${package}"; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + # Check Python packages + if ! is_executable "pip"; then + echo $(warn "WARNING: PEPATAC requires pip. Please install pip and checkinstall again.") printf "\n" CONDA_INSTALL=1 + fi + + if [ -f "requirements.txt" ]; then + REQS="requirements.txt" else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + REQS=$(curl https://raw.githubusercontent.com/databio/pepatac/master/requirements.txt) + fi - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - #echo -e "Installed major: ${imajor}\tRequired major: ${rmajor}" - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + while IFS= read -r line; do + [ "${line:0:1}" = "#" ] && continue + + if [[ $line == *","* ]]; then + IFS=',' read -r -a array <<< "$line" + first_half=${array[0]} + # Check the maximum requirement + IFS='<' read -r -a array <<< "$first_half" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, < $required. Try pip install $package and checkinstall again.") printf "\n" CONDA_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - #echo -e "Installed minor: ${iminor}\tRequired minor: ${rminor}" - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -gt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -gt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -gt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + + + second_half=${array[2]} + # Check the minimum requirement + IFS='>=' read -r -a array <<< "$second_half" + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") printf "\n" CONDA_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - #echo -e "Installed patch: ${ipatch}\tRequired patch: ${rpatch}" - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *">="* ]]; then + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") printf "\n" CONDA_INSTALL=1 else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *"=="* ]]; then + IFS='==' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi fi else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + IFS='<' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi fi - fi - done < $REQS + done < $REQS - # Check tool installation - declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") + # Check tool installation + declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "macs3" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") - for cmd in ${requiredCommands[@]}; do - if ! is_executable $cmd; then - echo $(warn "WARNING: Please install $cmd and checkinstall again.") + for cmd in ${requiredCommands[@]}; do + if ! is_executable $cmd; then + echo $(warn "WARNING: Please install $cmd and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: ${cmd}") + fi + done + + ## Check R packages + if ! is_executable "R"; then + echo $(warn "WARNING: PEPATAC requires R 3.5 or greater.\n Please install R>=3.5 and checkinstall again.") printf "\n" - CONDA_INSTALL=1 + exit 1 else - echo -e $(success "SUCCESS: ${cmd}") + rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') + rVer=$(echo "${rVer//.}") + if [ "$rVer" -lt "350" ]; then + echo $(warn "WARNING: PEPATAC requires R 3.5 or greater. Update R and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi fi - done - ## Check R packages - if ! is_executable "R"; then - echo $(warn "WARNING: PEPATAC requires R 3.5 or greater.\n Please install R>=3.5 and checkinstall again.") - printf "\n" - exit 1 + declare -a requiredRPackages=("optigrab" "devtools" "GenomicDistributions" "PEPATACr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") + for package in ${requiredRPackages[@]}; do + cmd=$(echo "Rscript -e 'library(\"$package\")'") + packageInstalled=$(eval $cmd 2>&1) + if [[ "$packageInstalled" == *Error* ]]; then + echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: R package: ${package}") + fi + done + + conda deactivate else - rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') - rVer=$(echo "${rVer//.}") - if [ "$rVer" -lt "350" ]; then - echo $(warn "WARNING: PEPATAC requires R 3.5 or greater. Update R and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - fi + echo -e "The pepatac conda environment is not present." + CONDA_INSTALL=1 fi - - declare -a requiredRPackages=("optigrab" "devtools" "GenomicDistributions" "PEPATACr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") - for package in ${requiredRPackages[@]}; do - cmd=$(echo "Rscript -e 'library(\"$package\")'") - packageInstalled=$(eval $cmd 2>&1) - if [[ "$packageInstalled" == *Error* ]]; then - echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - else - echo -e $(success "SUCCESS: R package: ${package}") - fi - done - - conda deactivate fi ################################################################################ From 2e985b6f4b4e4a9dd042e86b867f51cbf6b0df64 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 30 Nov 2023 09:14:19 -0500 Subject: [PATCH 16/43] fix merge conflicts requirements.txt; fix if statement of GenomicDistributions output --- PEPATACr/R/PEPATACr.R | 2 +- requirements.txt | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 5b9085d..bc8f9cd 100755 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1446,7 +1446,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), message(x$msg) } - if (x$value == "" || is.na(x$value) || is.null(x$value)) { + if (is.null(nrow(x$value))) { return(ggplot()) } # Don't plot lowest 10% represented chromosomes diff --git a/requirements.txt b/requirements.txt index 7b2db87..9f805e2 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,38 @@ -attmap>=0.13.0 +attmap>=0.13.2 bio>=0.2.4 +blosc2>=2.0.0 +clyent==1.2.1 codecov>=2.0 colorama>=0.3.9 Cython>=0.29 cykhash>=1.0.2 divvy>=0.6.0 -eido>=0.2.1 +eido>=0.2.2 +FuzzyTM>=0.4.0 #fseq2>=2.0.2 hypothesis==4.38.0 jinja2 jsonschema>=3.0.1 -logmuse>=0.2.6 -# looper>=1.4.2 git+https://github.com/pepkit/looper.git@dev -MACS2>=2.2.7.1 -numpy>=1.21 -oyaml +git+https://github.com/databio/pypiper.git@dev +logmuse>=0.2.7 +#looper>=1.4.2 +MACS2>=2.2.9.1 +MACS3>=3.0.0b1 +numpy<1.25,>=1.21 +oyaml>=1.0 pararead>=0.7.0 pandas>=1.4 -peppy>=0.31.1 -# piper>=0.12.3 -git+https://github.com/databio/pypiper.git@dev +#peppy>=0.35.7 +piper>=0.13.2 +pipestat>=0.5.1 psutil>=5.8 +pyqt5<5.16 +pyqtwebengine<5.16 pysam>=0.16 python-Levenshtein>=0.12 pyyaml>=3.13 refgenconf>=0.12.2 -#refgenie -ubiquerg>=0.6.1 +requests_mock +ubiquerg>=0.6.3 yacman>=0.9.2 From 61bc578224de67d0ad91e27f3d3bd6733a853265 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 30 Nov 2023 09:44:39 -0500 Subject: [PATCH 17/43] update ggplot size to linewidth --- PEPATACr/R/PEPATACr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index bc8f9cd..a9c6dd0 100755 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -835,7 +835,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, group=feature, color=feature)) + #geom_line(aes(linetype=feature), size=2, alpha=0.5) + - geom_line(size=2, alpha=0.5) + + geom_line(linewidth=2, alpha=0.5) + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + @@ -884,7 +884,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, # group=feature, color=feature)) + p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, group=feature, color=feature)) + - geom_line(size=2, alpha=0.5) + + geom_line(linewidth=2, alpha=0.5) + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + theme_PEPATAC() From 4c5e1da061f4609d6dfc2c780c336611f6146f1b Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 30 Nov 2023 10:23:54 -0500 Subject: [PATCH 18/43] manually address --bed argument in frif plotting --- tools/PEPATAC.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/PEPATAC.R b/tools/PEPATAC.R index 256444d..b043ded 100755 --- a/tools/PEPATAC.R +++ b/tools/PEPATAC.R @@ -322,9 +322,15 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { description="Output file name.") numArgs <- length(opt_get_args()) argGap <- ifelse(reads, 13, 12) - bed <- opt_get(name = c("bed", "b"), required=TRUE, - n=(numArgs - argGap), - description="Coverage file(s).") + + # Manually parse --bed argument + opt_args_dt <- as.data.table(opt_get_args()) + bed_start <- grep("--bed", opt_args_dt$V1) + 1 + bed_end <- nrow(opt_args_dt) + # bed <- opt_get(name = c("bed", "b"), required=TRUE, + # n=(numArgs - argGap), + # description="Coverage file(s).") + bed <- opt_args_dt$V1[bed_start:bed_end] p <- plotFRiF(sample_name = sample_name, num_reads = as.numeric(num_reads), From 8d5160c4debfccf69730064a46368b73199f472b Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 30 Nov 2023 13:12:47 -0500 Subject: [PATCH 19/43] newest versions of UCSC-tools drop the ability to use PIPEs; split wig and wigToBigWig cmds --- tools/bamSitesToWig.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/tools/bamSitesToWig.py b/tools/bamSitesToWig.py index bb365d5..f47e471 100755 --- a/tools/bamSitesToWig.py +++ b/tools/bamSitesToWig.py @@ -99,27 +99,27 @@ def __call__(self, chrom): reads = self.fetch_chunk(chrom) chromOutFile = self._tempf(chrom.replace("|","_")) + chromOutFileWig = chromOutFile + ".wig" chromOutFileBw = chromOutFile + ".bw" + chromOutFileWigSm = chromOutFile + "_smooth.wig" chromOutFileBwSm = chromOutFile + "_smooth.bw" tmpFile = chromOutFile + "_cuts.txt" cutsToWig = os.path.join(os.path.dirname(__file__), "cutsToWig.pl") cmd1 = ("sort -n | perl " + cutsToWig + " " + str(chrom_size) + - " " + str(self.variable_step) + " " + str(self.scale)) - cmd2 = ("wigToBigWig -clip -fixedSummaries -keepAllChromosomes stdin " + + " " + str(self.variable_step) + " " + str(self.scale) + + " > " + chromOutFileWig) + cmd2 = ("wigToBigWig -clip -fixedSummaries -keepAllChromosomes " + + chromOutFileWig + " " + self.chrom_sizes_file + " " + chromOutFileBw) _LOGGER.debug(" cutsToWigProcess: " + cmd1) _LOGGER.debug(" wigToBigWigProcess: " + cmd2) if self.exactbw: cutsToWigProcess = subprocess.Popen(cmd1, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - wigToBigWigProcess = subprocess.Popen( - ['wigToBigWig', '-clip', '-fixedSummaries', - '-keepAllChromosomes', 'stdin', - self.chrom_sizes_file, chromOutFileBw], - stdin=cutsToWigProcess.stdout) + stdin=subprocess.PIPE) + if self.smoothbw: cutsToWigSm = os.path.join(os.path.dirname(__file__), @@ -127,17 +127,13 @@ def __call__(self, chrom): cmd1 = ("sort -n | tee " + tmpFile + " | perl " + cutsToWigSm + " " + str(chrom_size) + " " + str(self.smooth_length) + " " + str(self.step_size) + " " + str(self.variable_step) + - " " + str(self.scale)) + " " + str(self.scale) + " > " + chromOutFileWigSm) cmd2 = ("wigToBigWig -clip -fixedSummaries " + - "-keepAllChromosomes stdin " + self.chrom_sizes_file + - " " + chromOutFileBwSm) + "-keepAllChromosomes " + chromOutFileWigSm + " " + + self.chrom_sizes_file + " " + chromOutFileBwSm) cutsToWigProcessSm = subprocess.Popen(cmd1, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - wigToBigWigProcessSm = subprocess.Popen( - ['wigToBigWig', '-clip', '-fixedSummaries', - '-keepAllChromosomes', 'stdin', - self.chrom_sizes_file, chromOutFileBwSm], - stdin=cutsToWigProcessSm.stdout) + stdin=subprocess.PIPE) + if self.bedout: chromOutFileBed = chromOutFile + ".bed" @@ -241,8 +237,13 @@ def get_shifted_pos(read, shift_factor): # Clean up processes if self.exactbw: cutsToWigProcess.stdin.close() + cutsToWigProcess.communicate() _LOGGER.debug("Encoding exact bigwig for " + chrom + " (last read position:" + str(read.pos) + ")...") + wigToBigWigProcess = subprocess.Popen( + ['wigToBigWig', '-clip', '-fixedSummaries', + '-keepAllChromosomes', chromOutFileWig, + self.chrom_sizes_file, chromOutFileBw]) wigToBigWigProcess.communicate() if self.bedout: @@ -250,8 +251,13 @@ def get_shifted_pos(read, shift_factor): if self.smoothbw: cutsToWigProcessSm.stdin.close() + cutsToWigProcessSm.communicate() _LOGGER.debug("Encoding smooth bigwig for " + chrom + " (last read position:" + str(read.pos) + ")...") + wigToBigWigProcessSm = subprocess.Popen( + ['wigToBigWig', '-clip', '-fixedSummaries', + '-keepAllChromosomes', chromOutFileWigSm, + self.chrom_sizes_file, chromOutFileBwSm]) wigToBigWigProcessSm.communicate() except StopIteration as e: From 781ac6da4ea656aff0d8ba3f88b3a46fbbe5809b Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 1 Dec 2023 08:35:33 -0500 Subject: [PATCH 20/43] Ensure PEPATAC is using most recent dev branches of looper, pypiper, and pipestat --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9f805e2..6534a7f 100755 --- a/requirements.txt +++ b/requirements.txt @@ -24,8 +24,9 @@ oyaml>=1.0 pararead>=0.7.0 pandas>=1.4 #peppy>=0.35.7 -piper>=0.13.2 -pipestat>=0.5.1 +#piper>=0.13.2 +#pipestat>=0.5.1 +git+https://github.com/pepkit/pipestat.git@dev psutil>=5.8 pyqt5<5.16 pyqtwebengine<5.16 From 84a42e023571b41a5f71196fd423bdeb9c2577ff Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 4 Dec 2023 15:07:46 -0500 Subject: [PATCH 21/43] handle YAML files; clean up Imports v Depends --- PEPATACr/DESCRIPTION | 8 +- PEPATACr/NAMESPACE | 2 + PEPATACr/NAMESPACE.bak | 25 +++ PEPATACr/PEPATACr.Rproj | 42 ++--- PEPATACr/R/PEPATACr.R | 305 +++++++++++++++++++-------------- PEPATACr/man/consensusPeaks.Rd | 6 +- PEPATACr/man/reducePeaks.Rd | 4 +- PEPATACr/man/yamlToDT.Rd | 18 ++ 8 files changed, 253 insertions(+), 157 deletions(-) create mode 100644 PEPATACr/NAMESPACE.bak create mode 100644 PEPATACr/man/yamlToDT.Rd diff --git a/PEPATACr/DESCRIPTION b/PEPATACr/DESCRIPTION index cfa97fc..44939cd 100644 --- a/PEPATACr/DESCRIPTION +++ b/PEPATACr/DESCRIPTION @@ -1,11 +1,13 @@ Package: PEPATACr Title: Functions and libraries to analyze ATAC-seq data -Version: 0.0.1.0000 +Version: 0.0.2.0000 Authors@R: person("Jason", "Smith", email = "jasonsmith@virginia.edu", role = c("aut", "cre")) Maintainer: Jason Smith Description: Installs required libraries to calculate the fraction of reads in features, TSS enrichments, and fragment length distributions. -Depends: R (>= 3.5.1), data.table, pepr, gplots, grid, ggplot2, optigrab, scales, GenomicDistributions +BiocViews: +Depends: R (>= 3.5.1) +Imports: data.table, pepr, gplots, grid, ggplot2, optigrab, scales, yaml, GenomicDistributions, GenomicDistributionsData License: BSD 2-Clause "Simplified" License Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.3 diff --git a/PEPATACr/NAMESPACE b/PEPATACr/NAMESPACE index 8582fa6..9a4abc2 100644 --- a/PEPATACr/NAMESPACE +++ b/PEPATACr/NAMESPACE @@ -23,3 +23,5 @@ export(sampleName) export(setPanelSize) export(splitDataTable) export(summarizer) +import(data.table) +import(ggplot2) diff --git a/PEPATACr/NAMESPACE.bak b/PEPATACr/NAMESPACE.bak new file mode 100644 index 0000000..8582fa6 --- /dev/null +++ b/PEPATACr/NAMESPACE.bak @@ -0,0 +1,25 @@ +# Generated by roxygen2: do not edit by hand + +export(consensusPeaks) +export(createAssetsSummary) +export(createStatsSummary) +export(fileIcon) +export(getPrealignments) +export(narrowPeakToBigBed) +export(peakCounts) +export(plotAlignedPct) +export(plotAlignedRaw) +export(plotAnno) +export(plotComplexityCurves) +export(plotFLD) +export(plotFRiF) +export(plotLibSizes) +export(plotTSS) +export(plotTSSscores) +export(readPepatacPeakCounts) +export(reducePeaks) +export(roundUpNice) +export(sampleName) +export(setPanelSize) +export(splitDataTable) +export(summarizer) diff --git a/PEPATACr/PEPATACr.Rproj b/PEPATACr/PEPATACr.Rproj index bfa3107..f54a4b6 100644 --- a/PEPATACr/PEPATACr.Rproj +++ b/PEPATACr/PEPATACr.Rproj @@ -1,21 +1,21 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 4 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index a9c6dd0..8ce5d8a 100755 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -10,6 +10,8 @@ #' @author Jason Smith #' #' @references \url{http://github.com/databio/pepatac} +#' @import data.table +#' @import ggplot2 NULL ################################################################################ @@ -20,8 +22,8 @@ NULL #' @examples #' theme_PEPATAC() theme_PEPATAC <- function(base_family = "sans", ...){ - theme_classic(base_family = base_family, base_size = 14, ...) + - theme( + ggplot2::theme_classic(base_family = base_family, base_size = 14, ...) + + ggplot2::theme( axis.line = element_line(size = 0.5), axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), panel.grid.major = element_blank(), @@ -88,7 +90,7 @@ fileIcon <- function(x=seq(0:275), y=seq(0:275)) { y=c(157.5/275*maxY,157.5/275*maxY)), aes(x=x,y=y), color="gray", size=4, lineend="round") + theme_PEPATAC() + - theme(panel.border = element_blank(), + ggplot2::theme(panel.border = element_blank(), axis.line = element_blank(), axis.title.x=element_blank(), axis.text.x=element_blank(), @@ -191,12 +193,13 @@ plotComplexityCurves <- function(ccurves, } # Get the real counts if we have them - rcDT <- data.table(name = character(length(real_counts_path)), - total = integer(length(real_counts_path)), - unique = integer(length(real_counts_path))) + rcDT <- data.table::data.table( + name = character(length(real_counts_path)), + total = integer(length(real_counts_path)), + unique = integer(length(real_counts_path))) if (exists(real_counts_path[1])) { - rc_file <- data.table(get(real_counts_path[1])) + rc_file <- data.table::data.table(get(real_counts_path[1])) rcDT$name <- basename(rc_file$V1) rcDT$total <- as.integer(rc_file$V2) if (ncol(rc_file) == 3 && !ignore_unique) { @@ -221,7 +224,7 @@ plotComplexityCurves <- function(ccurves, for (rc in 1:length(real_counts_path)) { info <- file.info(file.path(real_counts_path[rc])) if (file.exists(real_counts_path[rc]) && info$size != 0) { - rc_file <- fread(real_counts_path[rc]) + rc_file <- data.table::fread(real_counts_path[rc]) rcDT$name[rc] <- basename(rc_file$V1) rcDT$total[rc] <- as.integer(rc_file$V2) if (ncol(rc_file) == 3 && !ignore_unique) { @@ -259,7 +262,8 @@ plotComplexityCurves <- function(ccurves, "#372B4C", "#E3DAC7", "#27CAE6", "#B361BC", "#897779", "#6114F8", "#19C42B", "#56B4E9")) - clist <- data.table(TOTAL_READS = list(), EXPECTED_DISTINCT = list()) + clist <- data.table::data.table(TOTAL_READS = list(), + EXPECTED_DISTINCT = list()) ccurves <- as.list(ccurves) colormap <- palette(length(ccurves)) for (c in 1:length(ccurves)) { @@ -270,9 +274,9 @@ plotComplexityCurves <- function(ccurves, #message(paste0("Processing ", sample_name)) if (exists(ccurves[[c]])) { - ctable <- data.table(get(ccurves[[c]])) + ctable <- data.table::data.table(get(ccurves[[c]])) } else if (file.exists(ccurves[[c]])) { - ctable <- fread(ccurves[[c]]) + ctable <- data.table::fread(ccurves[[c]]) } else { stop(paste0("FileExistsError: ", ccurves[[c]], " could not be found.")) @@ -313,7 +317,7 @@ plotComplexityCurves <- function(ccurves, } } if (c == 1) { - clist <- data.table( + clist <- data.table::data.table( SAMPLE_NAME = list(rep(sample_name, length(ccurve_TOTAL_READS))), TOTAL_READS = list(ccurve_TOTAL_READS), @@ -322,7 +326,7 @@ plotComplexityCurves <- function(ccurves, ) } else { clist <- rbindlist(list(clist, - data.table(SAMPLE_NAME = list(rep(sample_name, + data.table::data.table(SAMPLE_NAME = list(rep(sample_name, length(ccurve_TOTAL_READS))), TOTAL_READS = list(ccurve_TOTAL_READS), EXPECTED_DISTINCT = list(ccurve_EXPECTED_DISTINCT), @@ -499,11 +503,11 @@ plotComplexityCurves <- function(ccurves, labs(col = "") + scale_color_discrete(labels=c(clist$SAMPLE_NAME)) + theme_PEPATAC() + - theme(legend.position = "right", + ggplot2::theme(legend.position = "right", plot.caption = element_text(size = 8, face = "italic")) # inset zoom plot - zoom_theme <- theme(legend.position = "none", + zoom_theme <- ggplot2::theme(legend.position = "none", axis.line = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank(), @@ -563,7 +567,7 @@ plotComplexityCurves <- function(ccurves, # Don't include legend for single sample plots if (length(ccurves) == 1) { - fig <- fig + theme(legend.position = "none") + fig <- fig + ggplot2::theme(legend.position = "none") } return(fig) @@ -846,7 +850,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, labels$val), values=labels$color) + labs(color="FRiF") + - theme(legend.position="right", + ggplot2::theme(legend.position="right", legend.justification=c(0.1,0.9), legend.background=element_blank(), legend.text = element_text(size = rel(0.65)), @@ -862,11 +866,11 @@ plotFRiF <- function(sample_name, num_reads, genome_size, coord_flip() + scale_x_discrete(position="top") + theme_PEPATAC() + - theme(plot.background = element_rect(fill = "transparent", - color = NA,), - panel.background = element_rect(fill = "transparent"), - rect = element_rect(fill = "transparent"), - plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) + ggplot2::theme(plot.background = element_rect( + fill = "transparent", color = NA,), + panel.background = element_rect(fill = "transparent"), + rect = element_rect(fill = "transparent"), + plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) g <- ggplotGrob(p2) min_x <- min(layer_scales(p)$x$range$range) @@ -894,7 +898,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, labels$val), values=labels$color) + labs(color="FRiF") + - theme(legend.position=c(0.075,0.975), + ggplot2::theme(legend.position=c(0.075,0.975), legend.justification=c(0.1,0.9), legend.title = element_blank(), legend.text = element_text(size = rel(0.65)), @@ -929,7 +933,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, labels$val), values=labels$color) + labs(color="FRiF") + - theme(legend.position="right", + ggplot2::theme(legend.position="right", legend.justification=c(0.1,0.9), legend.background=element_blank(), legend.key = element_blank(), @@ -944,11 +948,11 @@ plotFRiF <- function(sample_name, num_reads, genome_size, coord_flip() + scale_x_discrete(position="top") + theme_PEPATAC() + - theme(plot.background = element_rect(fill = "transparent", - color = NA,), - panel.background = element_rect(fill = "transparent"), - rect = element_rect(fill = "transparent"), - plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) + ggplot2::theme(plot.background = element_rect( + fill = "transparent", color = NA,), + panel.background = element_rect(fill = "transparent"), + rect = element_rect(fill = "transparent"), + plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) g <- ggplotGrob(p2) min_x <- min(layer_scales(p)$x$range$range) @@ -1004,8 +1008,8 @@ plotTSS <- function(TSSfile, cutoff=6) { } } - t1 <- theme_classic(base_size=14) + - theme(plot.background = element_blank(), + t1 <- ggplot2::theme_classic(base_size=14) + + ggplot2::theme(plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_rect(colour = "black", @@ -1016,26 +1020,26 @@ plotTSS <- function(TSSfile, cutoff=6) { aspect.ratio = 1, axis.ticks.length = unit(2, "mm")) - iMat <- data.table(V1 = numeric()) + iMat <- data.table::data.table(V1 = numeric()) if (length(TSSfile) == 1) { if (exists(TSSfile)) { - iMat <- data.table(get(TSSfile)) + iMat <- data.table::data.table(get(TSSfile)) } else { - iMat <- fread(TSSfile) + iMat <- data.table::fread(TSSfile) } } else if (length(TSSfile) == 2) { for (i in 1:length(TSSfile)) { if (exists(TSSfile[i])) { if (i == 1) { - iMat <- data.table(get(TSSfile[i])) + iMat <- data.table::data.table(get(TSSfile[i])) } else { - iMat <- list(iMat, data.table(get(TSSfile[i]))) + iMat <- list(iMat, data.table::data.table(get(TSSfile[i]))) } } else { if (i == 1) { - iMat <- fread(TSSfile[i]) + iMat <- data.table::fread(TSSfile[i]) } else { - iMat <- list(iMat, fread(TSSfile[i])) + iMat <- list(iMat, data.table::fread(TSSfile[i])) } } } @@ -1192,11 +1196,11 @@ plotFLD <- function(fragL, max_fragment = 200) { if (exists(fragL_count)) { - dat <- data.table(get(fragL_count)) + dat <- data.table::data.table(get(fragL_count)) } else if (file.exists(fragL_count)) { info <- file.info(file.path(fragL_count)) if (info$size != 0) { - dat <- fread(fragL_count) + dat <- data.table::fread(fragL_count) } else { warning(paste0(fragL_count, " is an empty file.")) return(ggplot()) @@ -1207,9 +1211,9 @@ plotFLD <- function(fragL, } if (exists(fragL)) { - summary_table <- data.table(get(fragL)) + summary_table <- data.table::data.table(get(fragL)) } else if (file.exists(fragL)) { - summary_table <- fread(fragL) + summary_table <- data.table::fread(fragL) } else { stop(paste0("FileExistsError: ", fragL, " could not be found.")) quit(save = "no", status = 1, runLast = FALSE) @@ -1217,7 +1221,7 @@ plotFLD <- function(fragL, dat1 <- dat[dat$V2<=max_fragment,] tmp <- seq(1:as.numeric(dat1[1,2]-1)) - dat0 <- data.table(V1=rep(0,length(tmp)),V2=tmp) + dat0 <- data.table::data.table(V1=rep(0,length(tmp)),V2=tmp) dat2 <- rbind(dat0, dat1) x_min <- which.min(dat1$V1[1:which.max(dat1$V1)]) @@ -1237,15 +1241,16 @@ plotFLD <- function(fragL, geom_line(alpha=0.5) + labs(x="Fragment length", y=ylabel) + theme_PEPATAC() + - theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) - - summ <- data.table(Min=min(summary_table$V1), - Max=max(summary_table$V1), - Median=median(summary_table$V1), - Mean=mean(summary_table$V1), - Stdev=sd(summary_table$V1)) + ggplot2::theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) + + summ <- data.table::data.table( + Min=min(summary_table$V1), + Max=max(summary_table$V1), + Median=median(summary_table$V1), + Mean=mean(summary_table$V1), + Stdev=sd(summary_table$V1)) # Write summary table to stats file - fwrite(summ, file=fragL_txt, row.names=F, quote=F, sep="\t") + data.table::fwrite(summ, file=fragL_txt, row.names=F, quote=F, sep="\t") return(p) } @@ -1411,11 +1416,11 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # load input file and convert to/ensure it is in BED6 format if (exists(input)) { - in_file <- data.table(get(input)) + in_file <- data.table::data.table(get(input)) } else { info <- file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { - in_file <- fread(file.path(input)) + in_file <- data.table::fread(file.path(input)) } else { out_file <- file.path(output, paste(basename(sample_path), output_type, @@ -1492,13 +1497,14 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # Partition distribution plots knownGenomes <- c('hg19', 'hg38', 'mm9', 'mm10') if (exists(feat)) { - anno_file <- data.table(get(feat)) + anno_file <- data.table::data.table(get(feat)) } else { if (filetype(paste0(feat)) == "gzfile") { - anno_file <- fread(cmd=(sprintf('gzip -d -c %s', shQuote(file.path(feat))))) + anno_file <- data.table::fread( + cmd=(sprintf('gzip -d -c %s', shQuote(file.path(feat))))) suppressWarnings(closeAllConnections()) } else { - anno_file <- fread(file.path(feat)) + anno_file <- data.table::fread(file.path(feat)) } } @@ -1565,7 +1571,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, sample_path <- sampleName(input, 1) if (file.exists(file.path(chr_sizes))) { - chrom_sizes <- fread(file.path(chr_sizes)) + chrom_sizes <- data.table::fread(file.path(chr_sizes)) colnames(chrom_sizes) <- c("chr", "size") } else { err_msg <- paste0("Could not find: ", file.path(chr_sizes)) @@ -1574,7 +1580,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, info = file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { - np <- fread(file.path(input)) + np <- data.table::fread(file.path(input)) colnames(np) <- c("chr", "chromStart", "chromEnd", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak") } else { @@ -1610,8 +1616,8 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, as_file <- file.path(paste0(dirname(sample_path), "bigNarrowPeak.as")) out_file <- file.path(paste0(sample_path, "_peaks.bigBed")) - fwrite(np, file=tmp_file, col.names=FALSE, row.names=FALSE, - quote=FALSE, sep='\t') + data.table::fwrite(np, file=tmp_file, col.names=FALSE, row.names=FALSE, + quote=FALSE, sep='\t') cat("table bigNarrowPeak\n", "\"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data.\"\n", @@ -1650,7 +1656,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALSE) { info <- file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { - peaks <- fread(file.path(input)) + peaks <- data.table::fread(file.path(input)) if (ncol(peaks) == 6) { colnames(peaks) <- c("chr", "start", "end", "name", "score", "strand") @@ -1677,7 +1683,7 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS info <- file.info(file.path(chr_sizes)) if (file.exists(file.path(chr_sizes)) && info$size != 0) { - c_size <- fread(file.path(chr_sizes)) + c_size <- data.table::fread(file.path(chr_sizes)) colnames(c_size) <- c("chr", "size") } else { if (info$size == 0) { @@ -1694,9 +1700,11 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS type="any", which=TRUE, nomatch=0) if (bedOnly) { # Only have the "score" to rank peaks - qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$score) + qVals <- data.table::data.table(index=rep(1:nrow(peaks)), + qValue=peaks$score) } else { - qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$qValue) + qVals <- data.table::data.table(index=rep(1:nrow(peaks)), + qValue=peaks$qValue) } setkey(hits, xid) setkey(qVals, index) @@ -1722,10 +1730,11 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS # save final peak set if (is.na(output)) { file_path <- file.path(dirname(input), sample_name) - fwrite(final, paste0(file_path, "_peaks_normalized.narrowPeak"), - sep="\t", col.names=FALSE) + data.table::fwrite(final, + paste0(file_path, "_peaks_normalized.narrowPeak"), + sep="\t", col.names=FALSE) } else { - fwrite(final, output, sep="\t", col.names=FALSE) + data.table::fwrite(final, output, sep="\t", col.names=FALSE) } } else { @@ -1785,9 +1794,9 @@ setPanelSize <- function(p=NULL, g=ggplotGrob(p), file=NULL, if(!is.null(file)) ggsave(file, g, limitsize = FALSE, - width=convertWidth(sum(g$widths) + margin, + width=grid::convertWidth(sum(g$widths) + margin, unitTo="in", valueOnly=TRUE), - height=convertHeight(sum(g$heights) + margin, + height=grid::convertHeight(sum(g$heights) + margin, unitTo="in", valueOnly=TRUE)) invisible(g) } @@ -1818,9 +1827,9 @@ getPrealignments <- function(stats_file) { #' @export plotAlignedRaw <- function(prj, summary_dir, stats) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), @@ -1847,7 +1856,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { # Get prealignments if they exist prealignments <- getPrealignments(stats) - unaligned <- stats$Fastq_reads - stats$Aligned_reads + unaligned <- as.numeric(stats$Fastq_reads) - as.numeric(stats$Aligned_reads) # If prealignments exist...include in unaligned reads count if (!is.null(prealignments)) { for (i in 1:length(unlist(prealignments))) { @@ -1859,7 +1868,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { align_raw <- tryCatch( { - data.table(sample = stats$sample_name, + data.table::data.table(sample = stats$sample_name, unaligned = as.integer(unaligned), duplicates = as.integer(stats$Duplicate_reads)) }, @@ -1919,10 +1928,10 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { max_reads <- max(rowSums(align_raw[,2:ncol(align_raw)])) upper_limit <- roundUpNice(max_reads/1000000) chart_height <- (length(unique(align_raw$sample))) * 0.75 - plot_colors <- data.table(unaligned="gray15") + plot_colors <- data.table::data.table(unaligned="gray15") if (!is.null(prealignments)) { - more_colors <- colorpanel(length(unlist(prealignments)), + more_colors <- gplots::colorpanel(length(unlist(prealignments)), low="#FFE595", mid="#F6CAA6", high="#F6F2A6") for (i in 1:length(unlist(prealignments))) { plot_colors[, unlist(prealignments)[i] := more_colors[i]] @@ -1930,7 +1939,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { } plot_colors[, duplicates := "#FC1E25"] - more_colors <- colorpanel(length(genome_names), + more_colors <- gplots::colorpanel(length(genome_names), low="#4876FF", mid="#94D9CE", high="#7648FF") for (i in 1:length(genome_names)) { plot_colors[, (genome_names[i]) := more_colors[i]] @@ -2004,9 +2013,9 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { #' @export plotAlignedPct <- function(prj, summary_dir, stats) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), @@ -2030,7 +2039,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { legend.title = element_blank() ) - unaligned <- 100 - stats$Alignment_rate + unaligned <- 100 - as.numeric(stats$Alignment_rate) # Get prealignments if they exist prealignments <- getPrealignments(stats) @@ -2054,7 +2063,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { } } - align_percent <- data.table(sample=stats$sample_name, + align_percent <- data.table::data.table(sample=stats$sample_name, unaligned=unaligned, duplicates=as.numeric(duplicates)) @@ -2121,10 +2130,10 @@ plotAlignedPct <- function(prj, summary_dir, stats) { upper_limit <- 103 chart_height <- (length(unique(align_percent$sample))) * 0.75 - plot_colors <- data.table(unaligned="gray15") + plot_colors <- data.table::data.table(unaligned="gray15") if (!is.null(prealignments)) { - more_colors <- colorpanel(length(unlist(prealignments)), + more_colors <- gplots::colorpanel(length(unlist(prealignments)), low="#FFE595", mid="#F6CAA6", high="#F6F2A6") for (i in 1:length(unlist(prealignments))) { plot_colors[, unlist(prealignments)[i] := more_colors[i]] @@ -2132,7 +2141,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { } plot_colors[, duplicates := "#FC1E25"] - more_colors <- colorpanel(length(genome_names), + more_colors <- gplots::colorpanel(length(genome_names), low="#4876FF", mid="#94D9CE", high="#7648FF") for (i in 1:length(genome_names)) { plot_colors[, (genome_names[i]) := more_colors[i]] @@ -2210,9 +2219,9 @@ plotAlignedPct <- function(prj, summary_dir, stats) { #' @export plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), @@ -2241,12 +2250,12 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { red_min <- 0 red_max <- cutoff-0.01 red_breaks <- seq(red_min,red_max,0.01) - red_colors <- colorpanel(length(red_breaks), + red_colors <- gplots::colorpanel(length(red_breaks), "#AF0000","#E40E00","#FF7A6A") green_min <- cutoff green_max <- 30 green_breaks <- seq(green_min,green_max,0.01) - green_colors <- colorpanel(length(green_breaks)-1, + green_colors <- gplots::colorpanel(length(green_breaks)-1, "#B4E896","#009405","#003B00") TSS_colors <- c(red_colors, green_colors) @@ -2346,9 +2355,9 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { #' @export plotLibSizes <- function(prj, summary_dir, stats) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), @@ -2411,6 +2420,34 @@ plotLibSizes <- function(prj, summary_dir, stats) { ) } +#' Internal helper function for \code{summarizer} +#' Convert the new pipestat yaml outputs into R readable data.table file +#' +#' @param sample_name A list project sample names +#' @param yaml_file A project level stats summary yaml file +yamlToDT <- function(sample_name, yaml_file) { + sample_DT <- data.table::as.data.table(yaml_file$PEPATAC$sample[[sample_name]]) + remove_cols <- c("Library complexity", + "TSS enrichment", + "TSS distance distribution", + "Fragment distribution", + "Peak chromosome distribution", + "Peak partition distribution", + "cFRiF", + "FRiF", + "FastQC report r1", + "FastQC report r2") + cols_present <- character() + for (column_name in remove_cols) { + if (any(grepl(column_name, names(sample_DT)))) { + cols_present <- c(cols_present, column_name) + } + } + sample_DT[, (cols_present) := NULL] + sample_DT <- unique(sample_DT) + sample_DT$sample_name <- sample_name + return(sample_DT) +} #' This function is meant to plot multiple summary graphs from the summary table #' made by the Looper summarize command @@ -2420,25 +2457,27 @@ plotLibSizes <- function(prj, summary_dir, stats) { #' @keywords summarize PEPATAC #' @export summarizer <- function(project, output_dir) { + options(scipen = 999) # Build the stats summary file path summary_file <- file.path(output_dir, - paste0(pepr::config(project)$name, "_stats_summary.tsv")) - + paste0(pepr::config(project)$name, "_stats_summary.yaml")) + if (file.exists(summary_file)) { + summary_file <- yaml::read_yaml(summary_file, + handlers=list(int=function(x) { as.numeric(x) })) + } else { + warning("PEPATAC.R summarizer was unable to locate the summary file.") + return(FALSE) + } + #message(paste0("summary file : ", summary_file)) # DEBUG write(paste0("Creating summary plots..."), stdout()) # Set the output directory summary_dir <- suppressMessages(file.path(output_dir, "summary")) # Produce output directory (if needed) dir.create(summary_dir, showWarnings = FALSE) - # read in stats summary file - if (file.exists(summary_file)) { - stats <- suppressWarnings(fread(summary_file, - header=TRUE, - check.names=FALSE)) - } else { - warning("PEPATAC.R summarizer was unable to locate the summary file.") - return(FALSE) - } + # convert yaml to data.table object + stats <- rbindlist(sapply(project_samples, FUN=yamlToDT, + yaml_file=summary_file), fill=TRUE) # Set absent values in table to zero stats[is.na(stats)] <- 0 @@ -2500,7 +2539,7 @@ countReproduciblePeaks <- function(peak_list, peak_DT) { collapsePeaks <- function(sample_table, chr_sizes, min_samples=2, min_score=5, min_olap=1) { # create combined peaks - peaks <- rbindlist(lapply(sample_table$peak_files, fread), idcol="file") + peaks <- rbindlist(lapply(sample_table$peak_files, data.table::fread), idcol="file") if (ncol(peaks) == 7) { colnames(peaks) <- c("file", "chr", "start", "end", "name", "score", "strand") @@ -2512,7 +2551,7 @@ collapsePeaks <- function(sample_table, chr_sizes, warning(paste0("Peak files did not contain a recognizable number", " of columns (", ncol(peaks), ")")) rm(peaks) - final <- data.table(chr=character(), + final <- data.table::data.table(chr=character(), start=integer(), end=integer(), name=character(), @@ -2537,7 +2576,7 @@ collapsePeaks <- function(sample_table, chr_sizes, hits <- data.table::data.table(xid=queryHits(hitsGR), yid=subjectHits(hitsGR)) setkey(hits, xid) - scores <- data.table(index=rep(1:nrow(x)), score=x$score) + scores <- data.table::data.table(index=rep(1:nrow(x)), score=x$score) setkey(scores, index) out <- hits[scores, nomatch=0] keep <- out[out[,.I[which.max(score)],by=yid]$V1] @@ -2610,7 +2649,7 @@ consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, file_exists <- append(file_exists, file.path(file_list[i])) } } - files <- data.table(peak_files=file_exists) + files <- data.table::data.table(peak_files=file_exists) consensus_peak_files = list() if (nrow(files) == 0) { return(consensus_peak_files) @@ -2636,7 +2675,7 @@ consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, } c_path <- unique(sample_table[genome == g, c_path]) if (file.exists(c_path)) { - c_size <- fread(c_path) + c_size <- data.table::fread(c_path) colnames(c_size) <- c("chr", "size") } else { warning("Unable to load the chromosome sizes file.") @@ -2654,7 +2693,7 @@ consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, file_name <- paste0("_", g,"_consensusPeaks.narrowPeak") output_file <- file.path(summary_dir, paste0(project_name, file_name)) - fwrite(final, output_file, sep="\t", col.names=FALSE) + data.table::fwrite(final, output_file, sep="\t", col.names=FALSE) consensus_peak_files <- c(consensus_peak_files, output_file) rm(final) invisible(gc()) @@ -2704,7 +2743,7 @@ readPepatacPeakCounts = function(prj, results_subdir) { result <- lapply(paths, function(x){ info <- file.info(file.path(x)) if (file.exists(x) && info$size != 0) { - df <- fread(x) + df <- data.table::fread(x) colnames(df) <- c("chr", "start", "end", "read_count", "base_count", "width", "frac", "norm") gr <- GenomicRanges::GRanges(df) @@ -2784,7 +2823,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, file_exists <- append(file_exists, file.path(file_list[i])) } } - files <- data.table(peak_files=file_exists) + files <- data.table::data.table(peak_files=file_exists) consensus_peak_files = list() if (nrow(files) == 0) { return(consensus_peak_files) @@ -2811,7 +2850,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, } c_path <- unique(sample_table[genome == g, c_path]) if (file.exists(c_path)) { - c_size <- fread(c_path) + c_size <- data.table::fread(c_path) colnames(c_size) <- c("chr", "size") } else { warning("Unable to load the chromosome sizes file.") @@ -2823,7 +2862,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, nrow(st_list[[g]]), " samples...")) if (reference) { read_peaks <- function(x, y) { - fread(y) + data.table::fread(y) } # Load each peak file as list of named data.tables peaks <- mapply(FUN=read_peaks, @@ -2856,7 +2895,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, return(NULL) } } else { - peaks_dt <- data.table(chr=as.character(), + peaks_dt <- data.table::data.table(chr=as.character(), start=as.numeric(), end=as.numeric(), read_count=as.numeric(), @@ -2868,7 +2907,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, peaks <- tryCatch( { suppressMessages( - rbindlist(lapply(st_list[[g]]$peak_files, fread))) + rbindlist(lapply(st_list[[g]]$peak_files, data.table::fread))) }, error=function(e) { message("peakCounts() peak coverage file fread(): ", e) @@ -2898,9 +2937,10 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, # instead, different column for each sample is the counts columns, # plural - reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)), - start=start(reduceGR), - end=end(reduceGR)) + reduce_dt <- data.table::data.table( + chr=as.character(seqnames(reduceGR)), + start=start(reduceGR), + end=end(reduceGR)) f <- function(x) {list(0)} # Need to make syntactically valid names valid_names <- make.unique(make.names(st_list[[g]]$sample_name)) @@ -2926,7 +2966,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, for (file in st_list[[g]]$peak_files) { info <- file.info(file.path(file)) if (file.exists(file.path(file)) && info$size != 0) { - p <- fread(file) + p <- data.table::fread(file) #name <- gsub("_peaks_coverage.bed","", basename(file)) name <- make.unique(make.names(st_list[[g]][i]$sample_name)) i <- i + 1 @@ -2945,21 +2985,22 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, polap <- width(olap) / width(peaksGR[subjectHits(hitsGR)]) if (poverlap & norm) { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$norm*polap) } else if (!poverlap & norm) { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$norm) } else if (poverlap & !norm) { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$read_count*polap) } else { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$read_count) } - hits <- data.table(xid=queryHits(hitsGR), - yid=subjectHits(hitsGR)) + hits <- data.table::data.table( + xid=queryHits(hitsGR), + yid=subjectHits(hitsGR)) setkey(hits, yid) setkey(counts, index) out <- hits[counts, nomatch=0] @@ -3004,7 +3045,8 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, output_file <- file.path(summary_dir, paste0(project_name, "_", g, "_peaks_coverage.tsv")) # save consensus peak set - fwrite(reduce_dt, output_file, sep="\t", col.names=TRUE) + data.table::fwrite(reduce_dt, output_file, + sep="\t", col.names=TRUE) counts_path <- c(counts_path, output_file) } else { warning(strwrap(prefix = " ", initial = "", @@ -3036,7 +3078,7 @@ createStatsSummary <- function(samples, results_subdir) { next } - t <- fread(sample_assets_file, header=FALSE, + t <- data.table::fread(sample_assets_file, header=FALSE, col.names=c('stat', 'val', 'annotation')) # Remove complete duplicates t <- t[!duplicated(t[, c('stat', 'val', 'annotation')], @@ -3047,9 +3089,9 @@ createStatsSummary <- function(samples, results_subdir) { fromLast=TRUE),] t[stat=="Time",]$val <- max_time - t2 <- data.table(t(t$val)) + t2 <- data.table::data.table(t(t$val)) colnames(t2) <- t$stat - t2 <- cbind(data.table(sample_name=sample), t2) + t2 <- cbind(data.table::data.table(sample_name=sample), t2) if (exists("stats")) { stats <- rbind(stats, t2, fill=TRUE) } else { @@ -3073,10 +3115,11 @@ createStatsSummary <- function(samples, results_subdir) { createAssetsSummary <- function(samples, results_subdir) { # Create assets_summary file missing_files <- 0 - assets <- data.table(sample_name=character(), - asset=character(), - path=character(), - annotation=character()) + assets <- data.table::data.table( + sample_name=character(), + asset=character(), + path=character(), + annotation=character()) write(paste0("Creating assets summary..."), stdout()) for (sample in samples) { @@ -3088,7 +3131,7 @@ createAssetsSummary <- function(samples, results_subdir) { next } - t <- fread(sample_assets_file, header=FALSE, + t <- data.table::fread(sample_assets_file, header=FALSE, col.names=c('asset', 'path', 'annotation')) t <- t[!duplicated(t[, c('asset', 'path', 'annotation')], fromLast=TRUE),] diff --git a/PEPATACr/man/consensusPeaks.Rd b/PEPATACr/man/consensusPeaks.Rd index 0e2d64b..64248c4 100644 --- a/PEPATACr/man/consensusPeaks.Rd +++ b/PEPATACr/man/consensusPeaks.Rd @@ -10,7 +10,8 @@ consensusPeaks( results_subdir, assets, min_samples = 2, - min_score = 5 + min_score = 5, + min_olap = 1 ) } \arguments{ @@ -27,6 +28,9 @@ genomes.} in to keep.} \item{min_score}{A minimum peak score to keep an individual peak.} + +\item{min_olap}{A minimum number of bases between peaks to be +considered overlapping.} } \description{ This function is meant to identify a project level set of consensus peaks. diff --git a/PEPATACr/man/reducePeaks.Rd b/PEPATACr/man/reducePeaks.Rd index 25eb7b7..bc37d2d 100644 --- a/PEPATACr/man/reducePeaks.Rd +++ b/PEPATACr/man/reducePeaks.Rd @@ -5,11 +5,13 @@ \title{This function is meant to keep only the most significant non-overlapping peaks. It also trims peaks extending beyond the bounds of the chromosome.} \usage{ -reducePeaks(input, chr_sizes, output = NA, normalize = FALSE) +reducePeaks(input, sample_name, chr_sizes, output = NA, normalize = FALSE) } \arguments{ \item{input}{Path to narrowPeak file} +\item{sample_name}{Sample name character string} + \item{chr_sizes}{Genome chromosome sizes file. } \item{output}{Output file name.} diff --git a/PEPATACr/man/yamlToDT.Rd b/PEPATACr/man/yamlToDT.Rd new file mode 100644 index 0000000..272027f --- /dev/null +++ b/PEPATACr/man/yamlToDT.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PEPATACr.R +\name{yamlToDT} +\alias{yamlToDT} +\title{Internal helper function for \code{summarizer} +Convert the new pipestat yaml outputs into R readable data.table file} +\usage{ +yamlToDT(sample_name, yaml_file) +} +\arguments{ +\item{sample_name}{A list project sample names} + +\item{yaml_file}{A project level stats summary yaml file} +} +\description{ +Internal helper function for \code{summarizer} +Convert the new pipestat yaml outputs into R readable data.table file +} From 1129e8ef10ad80aeb1e22bf238cc2a398810a7cf Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 4 Dec 2023 15:08:02 -0500 Subject: [PATCH 22/43] adjust stats summary to be performed in python --- pipelines/pepatac_collator.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index 2c085f6..7d994ad 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -5,12 +5,15 @@ __author__ = ["Jason Smith", "Michal Stolarczyk"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.0.4" +__version__ = "0.0.5" from argparse import ArgumentParser import os import sys import pypiper +import peppy +from peppy.utils import load_yaml +import yaml from ubiquerg import VersionInHelpParser def tool_path(tool_name): @@ -33,7 +36,7 @@ def parse_arguments(): """ parser = VersionInHelpParser(prog="PEPATAC_collator", description='PEPATAC collator' , version=__version__) - parser = pypiper.add_pypiper_args(parser, groups=['pypiper', 'looper']) + parser = pypiper.add_pypiper_args(parser, groups=['pypiper', 'looper', 'common']) parser.add_argument("-n", "--name", help="Name of the project to use.", type=str) parser.add_argument("-r", "--results", @@ -71,6 +74,33 @@ def main(): pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder, args=args, version=__version__) + pm.debug(f"\nargs: {args}\n") + + project = peppy.Project(args.config_file) + #results_subdir = "/home/jps3dp/processed/pepatac_tutorial//processed/results_pipeline" + #stats_yaml_files.append(os.path.join(args.results, sample, "stats.yaml")) + # project._project_data['_config']['looper']['output_dir'] + project_stats_file = os.path.join(args.output_parent, f"{project.name}_stats_summary.yaml") + stats_yaml_files = [] + sample_names = [] + for sample in project.sample_table.sample_name: + pm.debug(f"sample name: {sample}") + sample_names.append(sample) + stats_yaml_files.append(os.path.join(args.results, sample, "stats.yaml")) + num_samples = len(sample_names) + yaml_dict = load_yaml(stats_yaml_files[0]) + if num_samples > 1: + sample_names = sample_names[1:] + stats_yaml_files = stats_yaml_files[1:] + for i in range(len(stats_yaml_files)): + pm.debug(f"i: {i}") + sample_name = sample_names[i] + yaml_tmp = load_yaml(stats_yaml_files[i]) + yaml_dict['PEPATAC']['sample'][sample_name] = yaml_tmp['PEPATAC']['sample'][sample_name] + with open(project_stats_file, 'w') as file: + yaml.dump(yaml_dict, file) + print(f"Summary (n={num_samples}: {project_stats_file})") + cmd = (f"Rscript {tool_path('PEPATAC_summarizer.R')} " f"{args.config_file} {args.output_parent} " f"{args.results} {args.cutoff} {args.min_score} {args.min_olap}") From 8633c18a11bae0835abce7aa1fe99a299bd158f6 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 4 Dec 2023 15:08:14 -0500 Subject: [PATCH 23/43] create initial stats_summary file in python --- tools/PEPATAC_summarizer.R | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R index aea77d7..3d1095e 100755 --- a/tools/PEPATAC_summarizer.R +++ b/tools/PEPATAC_summarizer.R @@ -108,10 +108,11 @@ pep <- argv$config # Load the project prj <- invisible(suppressWarnings(pepr::Project(pep))) # Convenience -project_name <- config(prj)$name +project_name <- pepr::config(prj)$name project_samples <- pepr::sampleTable(prj)$sample_name -sample_table <- data.table(sample_name=pepr::sampleTable(prj)$sample_name, - genome=pepr::sampleTable(prj)$genome) +sample_table <- data.table::data.table( + sample_name=pepr::sampleTable(prj)$sample_name, + genome=pepr::sampleTable(prj)$genome) # Set the output directory summary_dir <- suppressMessages(file.path(argv$output, "summary")) @@ -132,15 +133,15 @@ if (dir.exists(argv$results)) { # Generate stats summary -stats <- PEPATACr::createStatsSummary(project_samples, results_subdir) -if (nrow(stats) == 0) { - quit() -} -project_stats_file <- file.path(argv$output, - paste0(project_name, '_stats_summary.tsv')) -message(sprintf("Summary (n=%s): %s", - length(unique(stats$sample_name)), project_stats_file)) -fwrite(stats, project_stats_file, sep="\t", col.names=TRUE) +# stats <- PEPATACr::createStatsSummary(project_samples, results_subdir) +# if (nrow(stats) == 0) { + # quit() +# } +# project_stats_file <- file.path(argv$output, + # paste0(project_name, '_stats_summary.tsv')) +# message(sprintf("Summary (n=%s): %s", + # length(unique(stats$sample_name)), project_stats_file)) +# data.table::fwrite(stats, project_stats_file, sep="\t", col.names=TRUE) # Generate assets @@ -152,7 +153,7 @@ project_assets_file <- file.path(argv$output, paste0(project_name, '_assets_summary.tsv')) message(sprintf("Summary (n=%s): %s", length(unique(assets$sample_name)), project_assets_file)) -fwrite(assets, project_assets_file, sep="\t", col.names=FALSE) +data.table::fwrite(assets, project_assets_file, sep="\t", col.names=FALSE) # Produce project summary plots From da7a28ef9a5564c681e371da151c1ae3c6b9303a Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 7 Dec 2023 12:10:45 -0500 Subject: [PATCH 24/43] update R package imports --- PEPATACr/DESCRIPTION | 2 +- PEPATACr/NAMESPACE | 1 + PEPATACr/R/PEPATACr.R | 35 ++++++++++++++++++----------------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/PEPATACr/DESCRIPTION b/PEPATACr/DESCRIPTION index 44939cd..8fcee6e 100644 --- a/PEPATACr/DESCRIPTION +++ b/PEPATACr/DESCRIPTION @@ -5,7 +5,7 @@ Authors@R: person("Jason", "Smith", email = "jasonsmith@virginia.edu", role = c( Maintainer: Jason Smith Description: Installs required libraries to calculate the fraction of reads in features, TSS enrichments, and fragment length distributions. BiocViews: -Depends: R (>= 3.5.1) +Depends: R (>= 4.0), GenomicRanges Imports: data.table, pepr, gplots, grid, ggplot2, optigrab, scales, yaml, GenomicDistributions, GenomicDistributionsData License: BSD 2-Clause "Simplified" License Encoding: UTF-8 diff --git a/PEPATACr/NAMESPACE b/PEPATACr/NAMESPACE index 9a4abc2..660437e 100644 --- a/PEPATACr/NAMESPACE +++ b/PEPATACr/NAMESPACE @@ -25,3 +25,4 @@ export(splitDataTable) export(summarizer) import(data.table) import(ggplot2) +import(optigrab) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 8ce5d8a..cb8a043 100755 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -12,6 +12,7 @@ #' @references \url{http://github.com/databio/pepatac} #' @import data.table #' @import ggplot2 +#' @import optigrab NULL ################################################################################ @@ -621,7 +622,7 @@ plotComplexityCurves <- function(ccurves, calcFRiF <- function(bedFile, total, reads) { colnames(bedFile) <- c("chromosome", "start", "end", "count", "bases", "width", "fraction") - grObj <- makeGRangesFromDataFrame(bedFile) + grObj <- GenomicRanges::makeGRangesFromDataFrame(bedFile) grObj <- reduce(grObj) redBed <- data.frame(chromosome=seqnames(grObj), start=start(grObj), end=end(grObj)) @@ -1366,7 +1367,7 @@ tryCatchChromBins <- function(x, y) { withCallingHandlers( { msg <- "" - list(value = calcChromBinsRef(x, y), msg = msg) + list(value = GenomicDistributions::calcChromBinsRef(x, y), msg = msg) }, warning = function(e) { msg <<- trimws(paste0("WARNING: ", e)) @@ -1441,7 +1442,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), } # Convert to GRanges Object - query <- makeGRangesFromDataFrame(in_bed, keep.extra.columns=TRUE) + query <- GenomicRanges::makeGRangesFromDataFrame(in_bed, keep.extra.columns=TRUE) if (tolower(plot) == "chromosome") { # Chromosome distribution plot @@ -1460,7 +1461,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), keep <- tbl[tbl$Freq > cutoff, 1] x <- x$value[x$value$chr %in% keep,] if (nrow(x) > 0) { - ga_plot <- plotChromBins(x) + ga_plot <- GenomicDistributions::plotChromBins(x) return(ga_plot) } else { message("Too few peaks to plot. Check the genome alignment rates.") @@ -1470,15 +1471,15 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # Feature distance distribution plots x <- tryCatch( { - suppressMessages(calcFeatureDistRefTSS(query, genome)) + suppressMessages(GenomicDistributions::calcFeatureDistRefTSS(query, genome)) }, error=function(e) { - message("calcFeatureDistRefTSS(): ", e) + message("GenomicDistributions::calcFeatureDistRefTSS(): ", e) return(NULL) }, warning=function(e) { - message("calcFeatureDistRefTSS(): ", e) - return(NULL) + message("GenomicDistributions::calcFeatureDistRefTSS(): ", e) + suppressMessages(GenomicDistributions::calcFeatureDistRefTSS(query, genome)) } ) @@ -1487,7 +1488,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), } if (!is.na(x[1])) { - TSS_plot <- plotFeatureDist(x, featureName="TSS") + TSS_plot <- GenomicDistributions::plotFeatureDist(x, featureName="TSS") return(TSS_plot) } else { message("Unable to produce TSS distribution plot.") @@ -1532,7 +1533,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # Default to chromosome distribution plot # Chromosome distribution plot # Chromosome distribution plot - x <- chromBins(query, genome) + x <- GenomicDistributions::chromBins(query, genome) if (x$msg != "") { message(x$msg) @@ -1547,7 +1548,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), keep <- tbl[tbl$Freq > cutoff, 1] x <- x$value[x$value$chr %in% keep,] if (nrow(x) > 0) { - ga_plot <- plotChromBins(x) + ga_plot <- GenomicDistributions::plotChromBins(x) return(ga_plot) } else { message("Too few peaks to plot. Check the genome alignment rates.") @@ -1695,7 +1696,7 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS } if (exists("peaks") & exists("c_size")) { - hits <- foverlaps(peaks, peaks, + hits <- data.table::foverlaps(peaks, peaks, by.x=c("chr", "start", "end"), type="any", which=TRUE, nomatch=0) if (bedOnly) { @@ -2511,7 +2512,7 @@ summarizer <- function(project, output_dir) { countReproduciblePeaks <- function(peak_list, peak_DT) { setkey(peak_DT, chr, start, end) setkey(peak_list, chr, start, end) - hits <- foverlaps(peak_list, peak_DT, + hits <- data.table::foverlaps(peak_list, peak_DT, by.x=c("chr", "start", "end"), type="any", which=TRUE, nomatch=0) # track the number of overlaps of final peak set peaks @@ -2569,7 +2570,7 @@ collapsePeaks <- function(sample_table, chr_sizes, peaks_by_chr <- split(peaks, peaks$chr) hit_aggregator <- function(x) { #message(paste0("x: ", unique(x$chr))) # DEBUG - peaksGR <- makeGRangesFromDataFrame(x, keep.extra.columns=FALSE) + peaksGR <- GenomicRanges::makeGRangesFromDataFrame(x, keep.extra.columns=FALSE) hitsGR <- suppressWarnings( findOverlaps(peaksGR, peaksGR, ignore.strand=TRUE, minoverlap=min_olap)) @@ -2931,7 +2932,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, setkey(peaks, chr, start, end) peaks_dt <- rbind(peaks_dt, peaks) # Convert to GRanges for more efficient findOverlaps vs data.table - peaksGR <- makeGRangesFromDataFrame(peaks_dt, + peaksGR <- GenomicRanges::makeGRangesFromDataFrame(peaks_dt, keep.extra.columns=TRUE) reduceGR <- reduce(peaksGR) @@ -2974,8 +2975,8 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, "base_count", "width", "frac", "norm") setkey(p, chr, start, end) - reduceGR <- makeGRangesFromDataFrame(reduce_dt) - peaksGR <- makeGRangesFromDataFrame(p) + reduceGR <- GenomicRanges::makeGRangesFromDataFrame(reduce_dt) + peaksGR <- GenomicRanges::makeGRangesFromDataFrame(p) hitsGR <- findOverlaps(query=reduceGR, subject=peaksGR, minoverlap=min_olap) From 9ca799fec2c56f3b06d3efe5ec18c86be1e3c553 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 7 Dec 2023 12:11:40 -0500 Subject: [PATCH 25/43] bump to macs3 usage --- pepatac_input_schema.yaml | 2 +- pipelines/pepatac.py | 40 +++++++++++++++++----------------- pipelines/pepatac.yaml | 6 ++--- sample_pipeline_interface.yaml | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/pepatac_input_schema.yaml b/pepatac_input_schema.yaml index 3e4dee4..5a40779 100755 --- a/pepatac_input_schema.yaml +++ b/pepatac_input_schema.yaml @@ -46,7 +46,7 @@ properties: peak_caller: type: string description: "Specify the peak calling tool" - enum: ["fseq", "genrich", "hmmratac", "homer", "macs2"] + enum: ["fseq", "genrich", "hmmratac", "homer", "macs3"] genome_size: type: string description: "MACS2 effective genome size" diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index f7b7ed7..b97fd44 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.10.4" +__version__ = "0.10.5" from argparse import ArgumentParser @@ -24,7 +24,7 @@ TOOLS_FOLDER = "tools" ANNO_FOLDER = "anno" ALIGNERS = ["bowtie2", "bwa"] -PEAK_CALLERS = ["fseq", "fseq2", "genrich", "hmmratac", "homer", "macs2"] +PEAK_CALLERS = ["fseq", "fseq2", "genrich", "hmmratac", "homer", "macs3"] PEAK_TYPES = [ "fixed", "variable"] DEDUPLICATORS = ["picard", "samblaster", "samtools"] TRIMMERS = ["trimmomatic", "pyadapt", "skewer"] @@ -57,7 +57,7 @@ def parse_arguments(): help="Name of deduplicator program.") parser.add_argument("--peak-caller", dest="peak_caller", type=str.lower, - default="macs2", choices=PEAK_CALLERS, + default="macs3", choices=PEAK_CALLERS, help="Name of peak caller.") parser.add_argument("-gs", "--genome-size", default="2.7e9", type=str.lower, @@ -68,7 +68,7 @@ def parse_arguments(): parser.add_argument("--peak-type", default="fixed", dest="peak_type", choices=PEAK_TYPES, type=str.lower, help="Call variable or fixed width peaks.\n" - "Fixed width requires MACS2.") + "Fixed width requires MACS3.") parser.add_argument("--extend", default=250, dest="extend", type=int, @@ -530,8 +530,8 @@ def main(): "pigz", "bwa"] # Confirm compatible peak calling settings - if args.peak_type == "fixed" and not args.peak_caller == "macs2": - err_msg = ("Must use MACS2 with `--peak-type fixed` width peaks. " + + if args.peak_type == "fixed" and not args.peak_caller == "macs3": + err_msg = ("Must use MACS3 with `--peak-type fixed` width peaks. " + "Either change the " + "`--peak-caller {}` or ".format(PEAK_CALLERS) + "use `--peak-type variable`.") @@ -1929,7 +1929,7 @@ def report_peak_count(): else: if args.peak_caller == "fseq": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: fseq_cmd_chunks = [ @@ -1953,7 +1953,7 @@ def report_peak_count(): cmd = [fseq_cmd, merge_chrom_peaks_files] elif args.peak_caller == "fseq2": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: fseq_cmd_chunks = [ @@ -1972,7 +1972,7 @@ def report_peak_count(): cmd = fseq_cmd elif args.peak_caller == "hmmratac" and args.paired_end: if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: gapped_peak_file = os.path.join(peak_folder, args.sample_name + @@ -2010,20 +2010,20 @@ def report_peak_count(): cmd3 += os.path.join(peak_folder, args.sample_name) cmd = cmd3 elif args.peak_caller == "hmmratac" and not args.paired_end: - pm.info("HMMRATAC requires paired-end data. Defaulting to MACS2") + pm.info("HMMRATAC requires paired-end data. Defaulting to MACS3") cmd_base = [ - "{} callpeak".format(tools.macs2), + "{} callpeak".format(tools.macs3), ("-t", peak_input_file), ("-f", "BED"), ("--outdir", peak_folder), ("-n", args.sample_name), ("-g", args.genome_size) ] - cmd_base.extend(param.macs2.params.split()) + cmd_base.extend(param.macs3.params.split()) cmd = build_command(cmd_base) elif args.peak_caller == "genrich": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: cmd1 = (tools.samtools + " sort -n " + rmdup_bam + " > " + @@ -2037,7 +2037,7 @@ def report_peak_count(): cmd = [cmd1, cmd2] elif args.peak_caller == "homer": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: tag_directory = os.path.join(peak_folder, "HOMER_tags") @@ -2059,10 +2059,10 @@ def report_peak_count(): pm.clean_add(tag_directory) cmd = [cmd1, cmd2, cmd3] else: - # MACS2 + # MACS3 # Note: required input file is non-positional ("treatment" file -t) cmd_base = [ - "{} callpeak".format(tools.macs2), + "{} callpeak".format(tools.macs3), ("-t", peak_input_file), ("-f", "BED"), ("--outdir", peak_folder), @@ -2070,11 +2070,11 @@ def report_peak_count(): ("-g", args.genome_size) ] if args.peak_type == "fixed": - cmd_base.extend(param.macs2.params.split()) + cmd_base.extend(param.macs3.params.split()) elif args.peak_type == "variable": - cmd_base.extend(param.macs2_variable.params.split()) + cmd_base.extend(param.macs3_variable.params.split()) else: - cmd_base.extend(param.macs2.params.split()) + cmd_base.extend(param.macs3.params.split()) cmd = build_command(cmd_base) # Call peaks and report peak count. @@ -2138,7 +2138,7 @@ def report_peak_count(): fixed_peak_file = os.path.join(peak_folder, args.sample_name + "_peaks_fixedWidth.narrowPeak") # If using fixed peaks, extend from summit - if args.peak_type == "fixed" and args.peak_caller == "macs2": + if args.peak_type == "fixed" and args.peak_caller == "macs3": temp = tempfile.NamedTemporaryFile(dir=peak_folder, delete=False) # extend peaks from summit by 'extend' # start extend from center of peak diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml index b825074..9719cc3 100755 --- a/pipelines/pepatac.yaml +++ b/pipelines/pepatac.yaml @@ -9,7 +9,7 @@ tools: # absolute paths to required tools bedtools: bedtools bowtie2: bowtie2 fastqc: fastqc - macs2: macs2 + macs3: macs3 preseq: preseq samblaster: samblaster skewer: skewer @@ -68,13 +68,13 @@ parameters: # parameters passed to bioinformatic tools, subclassed by tool samtools: params: '-q 10' # -q: skip alignments with MAPQ < 10. - macs2: + macs3: params: '--shift -75 --extsize 150 --nomodel --call-summits --nolambda --keep-dup all -p 0.01' # -f: Format of tag file. # -q: The qvalue (minimum FDR) cutoff to call significant regions. # --shift: Assign an arbitrary shift in bp. See MACS documentation. # --nomodel: Will bybass building the shifting model. - macs2_variable: + macs3_variable: params: '-f BED -q 0.01 --shift 0 --nomodel' # -f: Format of tag file. # -q: The qvalue (minimum FDR) cutoff to call significant regions. diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 17de28a..9b9f33a 100755 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -23,7 +23,7 @@ command_template: > {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% endif %} {% if sample.prealignment_names is defined %} {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% else %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endfor %} {% endif %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} - {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs2" {% endif %} + {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs3" {% endif %} {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% else %} --peak-type "fixed" {% endif %} {% if sample.extend is defined %} --extend { sample.extend } {% else %} --extend 250 {% endif %} {% if sample.genome_size is defined %} --genome-size { sample.genome_size } {% else %} --genome-size "2.7e9" {% endif %} From ed195339af69cf1fc4e968fa4630f09d5556f361 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 7 Dec 2023 12:12:00 -0500 Subject: [PATCH 26/43] update R usage --- tools/PEPATAC.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/PEPATAC.R b/tools/PEPATAC.R index b043ded..23eb738 100755 --- a/tools/PEPATAC.R +++ b/tools/PEPATAC.R @@ -6,7 +6,7 @@ version <- 0.6 ##### Load dependencies ##### -required_libraries <- c("PEPATACr") +required_libraries <- c("PEPATACr", "optigrab") for (i in required_libraries) { loadLibrary <- tryCatch ( { @@ -16,8 +16,6 @@ for (i in required_libraries) { error=function(e) { message("Error: Install the \"", i, "\" R package before proceeding.") - message('i.e. devtools::install_github("databio/pepatac",', - ' subdir="PEPATACr")') return(NULL) }, warning=function(e) { @@ -324,7 +322,7 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { argGap <- ifelse(reads, 13, 12) # Manually parse --bed argument - opt_args_dt <- as.data.table(opt_get_args()) + opt_args_dt <- data.table::as.data.table(opt_get_args()) bed_start <- grep("--bed", opt_args_dt$V1) + 1 bed_end <- nrow(opt_args_dt) # bed <- opt_get(name = c("bed", "b"), required=TRUE, From 7d3ae51a586fbcf4745348171405a2fd0103ab37 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 8 Dec 2023 18:51:27 -0500 Subject: [PATCH 27/43] PEPATAC collator now reports as project-level pipeline-type and places results in the same subdir as the sample-level results, addresses https://github.com/pepkit/pipestat/issues/130 --- .gitignore | 3 ++- pipelines/pepatac_collator.py | 8 +++++++- project_pipeline_interface.yaml | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 1f3396a..1804b5b 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,5 @@ examples/test_project/test_fseq.yaml examples/test_project/test_genrich.yaml examples/test_project/test_hmmratac.yaml examples/test_project/test_homer.yaml -examples/test_project/test_macs.yaml \ No newline at end of file +examples/test_project/test_macs.yaml +.Rproj.user diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index 7d994ad..4baa313 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -69,9 +69,10 @@ def parse_arguments(): def main(): args = parse_arguments() + outfolder = os.path.abspath(os.path.join(args.output_parent, "summary")) - pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder, + pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder, pipestat_sample_name="PEPATAC_Collator", pipestat_pipeline_type="project", args=args, version=__version__) pm.debug(f"\nargs: {args}\n") @@ -123,6 +124,11 @@ def main(): outfolder, "{name}_peaks_coverage.tsv".format(name=args.name)) pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file]) + + pm.report_object("Library complexity", complexity_file) + pm.report_object("consensus_peaks_file", consensus_peaks_file) + pm.report_object("counts_table", peak_coverage_file) + pm.stop_pipeline() diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index 9cbfb89..febeda8 100755 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -5,7 +5,7 @@ output_schema: pepatac_output_schema.yaml command_template: > {pipeline.path} --config {looper.pep_config} - -O {looper.output_dir} + -O {looper.results_subdir} -P {compute.cores} -M {compute.mem} -n {project.name} From b55fea76787f4457811fdcf8c242257c6c68432c Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 11 Dec 2023 08:57:30 -0500 Subject: [PATCH 28/43] PEPATAC collator pipeline name changed to PEPATAC --- pipelines/pepatac_collator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index 4baa313..c2755b1 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -72,7 +72,7 @@ def main(): outfolder = os.path.abspath(os.path.join(args.output_parent, "summary")) - pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder, pipestat_sample_name="PEPATAC_Collator", pipestat_pipeline_type="project", + pm = pypiper.PipelineManager(name="PEPATAC", outfolder=outfolder, pipestat_sample_name="PEPATAC_Collator", pipestat_pipeline_type="project", args=args, version=__version__) pm.debug(f"\nargs: {args}\n") From 924aefac73561103d69c6d43814fe9533a0cc12c Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 11 Dec 2023 13:11:04 -0500 Subject: [PATCH 29/43] minor doc clarification for compute options --- docs/run-cluster.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/run-cluster.md b/docs/run-cluster.md index 76fb98c..9b5bb8d 100644 --- a/docs/run-cluster.md +++ b/docs/run-cluster.md @@ -47,3 +47,10 @@ divvy init $DIVCFG ``` Next, you edit that config file to add in any compute packages you need. `PEPATAC` will then give you access to any of your custom packages with `looper --package `. For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). + +Alternatively, you can specify compute parameters via the CLI: + +```console +looper run examples/test_project/test_config_refgenie.yaml -d \ + --package slurm --compute PARTITION="your_cluster_partition_name" +``` \ No newline at end of file From fd7112348fe4ab0f95a78ad08e7e876e16d7f9d7 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 15 Dec 2023 17:53:45 -0500 Subject: [PATCH 30/43] add reporting thumbnail path for project-level items --- pipelines/pepatac_collator.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index c2755b1..d293252 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -118,16 +118,23 @@ def main(): complexity_file = os.path.join( outfolder, "{name}_libComplexity.pdf".format(name=args.name)) + complexity_thumbnail = os.path.join( + outfolder, "{name}_libComplexity.png".format(name=args.name)) consensus_peaks_file = os.path.join( outfolder, "{name}_*_consensusPeaks.narrowPeak".format(name=args.name)) + consensus_peaks_thumbnail = os.path.join( + outfolder, "{name}_*_consensusPeaks.png".format(name=args.name)) peak_coverage_file = os.path.join( outfolder, "{name}_peaks_coverage.tsv".format(name=args.name)) + peak_coverage_thumbnail = os.path.join( + outfolder, "{name}_peaks_coverage.png".format(name=args.name)) pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file]) - pm.report_object("Library complexity", complexity_file) - pm.report_object("consensus_peaks_file", consensus_peaks_file) - pm.report_object("counts_table", peak_coverage_file) + # For pypiper's report object, anchor image = thumbnail path + pm.report_object("Library complexity", complexity_file, anchor_image=complexity_thumbnail) + pm.report_object("consensus_peaks_file", consensus_peaks_file, anchor_image=consensus_peaks_thumbnail) + pm.report_object("counts_table", peak_coverage_file, anchor_image=peak_coverage_thumbnail) pm.stop_pipeline() From 0dcfa8aa50bbda6e5d9eceb8c17b196b75add731 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 18 Dec 2023 07:18:15 -0500 Subject: [PATCH 31/43] remove old comments; fix file name construction of project output --- pipelines/pepatac_collator.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index d293252..e50ac57 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -78,9 +78,6 @@ def main(): pm.debug(f"\nargs: {args}\n") project = peppy.Project(args.config_file) - #results_subdir = "/home/jps3dp/processed/pepatac_tutorial//processed/results_pipeline" - #stats_yaml_files.append(os.path.join(args.results, sample, "stats.yaml")) - # project._project_data['_config']['looper']['output_dir'] project_stats_file = os.path.join(args.output_parent, f"{project.name}_stats_summary.yaml") stats_yaml_files = [] sample_names = [] @@ -120,14 +117,20 @@ def main(): outfolder, "{name}_libComplexity.pdf".format(name=args.name)) complexity_thumbnail = os.path.join( outfolder, "{name}_libComplexity.png".format(name=args.name)) - consensus_peaks_file = os.path.join( - outfolder, "{name}_*_consensusPeaks.narrowPeak".format(name=args.name)) - consensus_peaks_thumbnail = os.path.join( - outfolder, "{name}_*_consensusPeaks.png".format(name=args.name)) - peak_coverage_file = os.path.join( - outfolder, "{name}_peaks_coverage.tsv".format(name=args.name)) - peak_coverage_thumbnail = os.path.join( - outfolder, "{name}_peaks_coverage.png".format(name=args.name)) + + # TODO: add genome to file name + for genome in project.sample_table.genome.unique(): + pm.debug(f"genome: {genome}") + consensus_peaks_file = os.path.join( + outfolder, f"{args.name}_{genome}_consensusPeaks.narrowPeak") + consensus_peaks_thumbnail = os.path.join( + outfolder, f"{args.name}_{genome}_consensusPeaks.png") + pm.debug(f"consensus_peaks_file: {consensus_peaks_file}") + peak_coverage_file = os.path.join( + outfolder, f"{args.name}_{genome}_peaks_coverage.tsv") + peak_coverage_thumbnail = os.path.join( + outfolder, f"{args.name}_{genome}_peaks_coverage.png") + pm.debug(f"peak_coverage_file(s): {peak_coverage_file}") pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file]) From 208be4c35427211361f4bd72946436ba9bcf2ecc Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 18 Dec 2023 07:19:19 -0500 Subject: [PATCH 32/43] clean up code and update comments --- tools/PEPATAC_summarizer.R | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R index 3d1095e..07d17cb 100755 --- a/tools/PEPATAC_summarizer.R +++ b/tools/PEPATAC_summarizer.R @@ -11,11 +11,11 @@ # )) # # Created: 5/18/17 -# Last updated: 05/10/2021 +# Last updated: 12/18/2023 # # usage: Rscript /path/to/Rscript/PEPATAC_summarizer.R # /path/to/project_config.yaml -# Depends: R (>= 3.5.1) +# Depends: R (>= 4.0) # Imports: PEPATACr, argparser ############################################################################### ##### LOAD ARGUMENTPARSER ##### @@ -131,19 +131,6 @@ if (dir.exists(argv$results)) { quit() } - -# Generate stats summary -# stats <- PEPATACr::createStatsSummary(project_samples, results_subdir) -# if (nrow(stats) == 0) { - # quit() -# } -# project_stats_file <- file.path(argv$output, - # paste0(project_name, '_stats_summary.tsv')) -# message(sprintf("Summary (n=%s): %s", - # length(unique(stats$sample_name)), project_stats_file)) -# data.table::fwrite(stats, project_stats_file, sep="\t", col.names=TRUE) - - # Generate assets assets <- PEPATACr::createAssetsSummary(project_samples, results_subdir) if (nrow(assets) == 0) { @@ -190,15 +177,15 @@ if (!file.exists(complexity_path) || argv$new_start) { read_length = 0, real_counts_path = rcSub, ignore_unique = FALSE) - output_file <- file.path(summary_dir, + output_pdf <- file.path(summary_dir, paste0(project_name, "_libComplexity.pdf")) - pdf(file = output_file, width= 10, height = 7, useDingbats=F) + pdf(file = output_pdf, width= 10, height = 7, useDingbats=F) suppressWarnings(print(p)) invisible(dev.off()) - output_file <- file.path(summary_dir, + output_png <- file.path(summary_dir, paste0(project_name, "_libComplexity.png")) - png(filename = output_file, width = 686, height = 480) + png(filename = output_png, width = 686, height = 480) suppressWarnings(print(p)) invisible(dev.off()) } else { From 5b61347ba7fec6fb42dd17bd9289813a8bada076 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:33:46 -0500 Subject: [PATCH 33/43] change collator's output folder and sample name to match so that pipestat summarize can find it for status and log --- pipelines/pepatac_collator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index e50ac57..0fd9f0b 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -72,7 +72,7 @@ def main(): outfolder = os.path.abspath(os.path.join(args.output_parent, "summary")) - pm = pypiper.PipelineManager(name="PEPATAC", outfolder=outfolder, pipestat_sample_name="PEPATAC_Collator", pipestat_pipeline_type="project", + pm = pypiper.PipelineManager(name="PEPATAC", outfolder=outfolder, pipestat_sample_name="summary", pipestat_pipeline_type="project", args=args, version=__version__) pm.debug(f"\nargs: {args}\n") From 4733fc46fbe48a4b0d6a2b3d2e35da44b8c331eb Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 18 Dec 2023 15:01:54 -0500 Subject: [PATCH 34/43] update nomenclature; add alignment and TSS plots to collator output reports --- pepatac_output_schema.yaml | 4 ++-- pipelines/pepatac_collator.py | 40 +++++++++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index 3b198c0..3c3626c 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -295,7 +295,7 @@ properties: - path - thumbnail_path - title - TSS enrichment: + TSS_enrichment: title: "TSS enrichment file" description: "Plots TSS scores for each sample." type: object @@ -311,7 +311,7 @@ properties: - path - thumbnail_path - title - Library complexity: + Library_complexity: title: "Library complexity file" description: "Plots each sample's library complexity on a single plot." type: object diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index 0fd9f0b..a2756e7 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -5,7 +5,7 @@ __author__ = ["Jason Smith", "Michal Stolarczyk"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.0.5" +__version__ = "0.0.6" from argparse import ArgumentParser import os @@ -72,7 +72,9 @@ def main(): outfolder = os.path.abspath(os.path.join(args.output_parent, "summary")) - pm = pypiper.PipelineManager(name="PEPATAC", outfolder=outfolder, pipestat_sample_name="summary", pipestat_pipeline_type="project", + pm = pypiper.PipelineManager(name="PEPATAC", outfolder=outfolder, + pipestat_sample_name="summary", + pipestat_pipeline_type="project", args=args, version=__version__) pm.debug(f"\nargs: {args}\n") @@ -114,9 +116,24 @@ def main(): cmd += " --normalized" complexity_file = os.path.join( - outfolder, "{name}_libComplexity.pdf".format(name=args.name)) + outfolder, f"{args.name}_libComplexity.pdf") complexity_thumbnail = os.path.join( - outfolder, "{name}_libComplexity.png".format(name=args.name)) + outfolder, f"{args.name}_libComplexity.png") + + alignment_percent_file = os.path.join( + outfolder, f"{args.name}_alignmentPercent.pdf") + alignment_percent_thumbnail = os.path.join( + outfolder, f"{args.name}_alignmentPercent.png") + + alignment_raw_file = os.path.join( + outfolder, f"{args.name}_alignmentRaw.pdf") + alignment_raw_thumbnail = os.path.join( + outfolder, f"{args.name}_alignmentRaw.png") + + TSS_enrichment_file = os.path.join( + outfolder, f"{args.name}_TSSEnrichment.pdf") + TSS_enrichment_thumbnail = os.path.join( + outfolder, f"{args.name}_TSSEnrichment.png") # TODO: add genome to file name for genome in project.sample_table.genome.unique(): @@ -135,9 +152,18 @@ def main(): pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file]) # For pypiper's report object, anchor image = thumbnail path - pm.report_object("Library complexity", complexity_file, anchor_image=complexity_thumbnail) - pm.report_object("consensus_peaks_file", consensus_peaks_file, anchor_image=consensus_peaks_thumbnail) - pm.report_object("counts_table", peak_coverage_file, anchor_image=peak_coverage_thumbnail) + pm.report_object("Library_complexity", complexity_file, + anchor_image=complexity_thumbnail) + pm.report_object("alignment_percent_file", alignment_percent_file, + anchor_image=alignment_percent_thumbnail) + pm.report_object("alignment_raw_file", alignment_raw_file, + anchor_image=alignment_raw_thumbnail) + pm.report_object("TSS_enrichment", TSS_enrichment_file, + anchor_image=TSS_enrichment_thumbnail) + pm.report_object("consensus_peaks_file", consensus_peaks_file, + anchor_image=consensus_peaks_thumbnail) + pm.report_object("counts_table", peak_coverage_file, + anchor_image=peak_coverage_thumbnail) pm.stop_pipeline() From bc3ca28413dba0dc6cb83c381d7e7c48ea0e7ba3 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Tue, 19 Dec 2023 12:33:43 -0500 Subject: [PATCH 35/43] remove debug comment --- pipelines/pepatac_collator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index a2756e7..e4ad73a 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -135,7 +135,6 @@ def main(): TSS_enrichment_thumbnail = os.path.join( outfolder, f"{args.name}_TSSEnrichment.png") - # TODO: add genome to file name for genome in project.sample_table.genome.unique(): pm.debug(f"genome: {genome}") consensus_peaks_file = os.path.join( From e698b55e93aa739cbe74d58b6c5a7183902e640e Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Tue, 19 Dec 2023 13:22:41 -0500 Subject: [PATCH 36/43] update size vs linewidth ggplot arguments; adjust aspect ratio of alignment plots --- PEPATACr/R/PEPATACr.R | 83 ++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index cb8a043..b349848 100755 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1834,11 +1834,13 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill = NA, size = 0.5), + panel.border = element_rect(colour = "black", fill = NA, + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), + size = 20, angle = 90, hjust = 1, + vjust=0.5), axis.text.y = element_text(face = "plain", color = "black", size = 20, hjust = 1), axis.title.x = element_text(face = "plain", color = "black", @@ -1847,11 +1849,12 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) # Get prealignments if they exist @@ -1914,21 +1917,21 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { stats[, (paste("Aligned_reads", unlist(prealignments)[i], sep="_")), with=FALSE][[1]])] } - setcolorder(align_raw, c("sample", "unaligned", + data.table::setcolorder(align_raw, c("sample", "unaligned", paste(unlist(prealignments)), "duplicates", paste(unique(stats$Genome)))) } else { - setcolorder(align_raw, c("sample", "unaligned", "duplicates", + data.table::setcolorder(align_raw, c("sample", "unaligned", "duplicates", paste(unique(stats$Genome)))) } align_raw$sample <- factor(align_raw$sample, levels = unique(align_raw$sample)) - melt_align_raw <- melt(align_raw, id.vars = "sample") + melt_align_raw <- reshape2::melt(align_raw, id.vars = "sample") max_reads <- max(rowSums(align_raw[,2:ncol(align_raw)])) upper_limit <- roundUpNice(max_reads/1000000) - chart_height <- (length(unique(align_raw$sample))) * 0.75 + chart_height <- (length(unique(align_raw$sample))) #* 0.75 plot_colors <- data.table::data.table(unaligned="gray15") if (!is.null(prealignments)) { @@ -1948,7 +1951,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { align_raw_plot <- ( ggplot(melt_align_raw, aes(x=sample, y=as.numeric(value)/1000000)) + - geom_col(aes(fill=variable), colour="black", size=0.25, width=0.8) + + geom_col(aes(fill=variable), colour="black", `linewidth`=0.25, width=0.8) + guides(fill=guide_legend(reverse=TRUE)) + labs(x="", y="Number of reads (M)") + scale_x_discrete(limits=rev(levels(melt_align_raw$sample))) + @@ -1964,7 +1967,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { setPanelSize( align_raw_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -1977,8 +1980,8 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { colnames(more_to_see) <- colnames(align_raw_thumb) align_raw_thumb <- rbind(align_raw_thumb, more_to_see) align_raw_thumb$sample <- droplevels(align_raw_thumb)$sample - melt_align_raw_thumb <- melt(align_raw_thumb, id.vars="sample") - chart_height <- (length(unique(align_raw_thumb$sample))) * 0.75 + melt_align_raw_thumb <- reshape2::melt(align_raw_thumb, id.vars="sample") + chart_height <- (length(unique(align_raw_thumb$sample))) #* 0.75 } else {melt_align_raw_thumb <- melt_align_raw} thumb_raw_plot <- ( @@ -1998,7 +2001,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { setPanelSize( thumb_raw_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -2020,11 +2023,13 @@ plotAlignedPct <- function(prj, summary_dir, stats) { plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill = NA, size = 0.5), + panel.border = element_rect(colour = "black", fill = NA, + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), + size = 20, angle = 90, hjust = 1, + vjust=0.5), axis.text.y = element_text(face = "plain", color = "black", size = 20, hjust = 1), axis.title.x = element_text(face = "plain", color = "black", @@ -2033,11 +2038,12 @@ plotAlignedPct <- function(prj, summary_dir, stats) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) unaligned <- 100 - as.numeric(stats$Alignment_rate) @@ -2093,11 +2099,11 @@ plotAlignedPct <- function(prj, summary_dir, stats) { unlist(prealignments)[i], sep="_")), with=FALSE][[1]])] } - setcolorder(align_percent, c("sample", "unaligned", + data.table::setcolorder(align_percent, c("sample", "unaligned", paste(unlist(prealignments)), "duplicates", paste(unique(stats$Genome)))) } else { - setcolorder(align_percent, c("sample", "unaligned", "duplicates", + data.table::setcolorder(align_percent, c("sample", "unaligned", "duplicates", paste(unique(stats$Genome)))) } @@ -2127,9 +2133,9 @@ plotAlignedPct <- function(prj, summary_dir, stats) { print(aberrant_samples, row.names=FALSE) } - melt_align_percent <- melt(align_percent, id.vars="sample") + melt_align_percent <- reshape2::melt(align_percent, id.vars="sample") upper_limit <- 103 - chart_height <- (length(unique(align_percent$sample))) * 0.75 + chart_height <- (length(unique(align_percent$sample))) #* 0.75 plot_colors <- data.table::data.table(unaligned="gray15") @@ -2166,7 +2172,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { setPanelSize( align_percent_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -2180,10 +2186,9 @@ plotAlignedPct <- function(prj, summary_dir, stats) { colnames(more_to_see) <- colnames(align_percent_thumb) align_percent_thumb <- rbind(align_percent_thumb, more_to_see) align_percent_thumb$sample <- droplevels(align_percent_thumb)$sample - melt_align_percent_thumb <- melt(align_percent_thumb, + melt_align_percent_thumb <- reshape2::melt(align_percent_thumb, id.vars="sample") - chart_height <- ((length(unique(align_percent_thumb$sample))) * - 0.75) + chart_height <- ((length(unique(align_percent_thumb$sample)))) } else {melt_align_percent_thumb <- melt_align_percent} thumb_percent_plot <- ( @@ -2203,7 +2208,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { setPanelSize( thumb_percent_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -2227,11 +2232,12 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_rect(colour = "black", fill = NA, - size = 0.5), + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), + size = 20, angle = 90, hjust = 1, + vjust=0.5), axis.text.y = element_text(face = "plain", color = "black", size = 20, hjust = 1), axis.title.x = element_text(face = "plain", color = "black", @@ -2240,11 +2246,12 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) # Establish red/green color scheme @@ -2288,7 +2295,7 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { max_TSS <- max(stats$TSS_score, na.rm=TRUE) upper_limit <- roundUpNice(max_TSS) - chart_height <- (length(unique(TSS_score$sample))) * 0.75 + chart_height <- (length(unique(TSS_score$sample))) #* 0.75 TSS_score$sample <- factor(TSS_score$sample, levels=unique(TSS_score$sample)) @@ -2318,7 +2325,7 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { # Limit to 25 samples max if (length(TSS_score$sample) > 25) { TSS_score_thumb <- TSS_score[1:25, ] - chart_height <- (length(unique(TSS_score_thumb$sample))) * 0.75 + chart_height <- (length(unique(TSS_score_thumb$sample))) #* 0.75 more_to_see <- data.frame(t(c("...", "0", "#AF0000"))) colnames(more_to_see) <- colnames(TSS_score_thumb) TSS_score_thumb <- rbind(TSS_score_thumb, more_to_see) @@ -2362,7 +2369,8 @@ plotLibSizes <- function(prj, summary_dir, stats) { plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill = NA, size = 0.5), + panel.border = element_rect(colour = "black", fill = NA, + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", @@ -2375,11 +2383,12 @@ plotLibSizes <- function(prj, summary_dir, stats) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) picard_lib_size <- cbind.data.frame( @@ -2477,8 +2486,8 @@ summarizer <- function(project, output_dir) { dir.create(summary_dir, showWarnings = FALSE) # convert yaml to data.table object - stats <- rbindlist(sapply(project_samples, FUN=yamlToDT, - yaml_file=summary_file), fill=TRUE) + stats <- data.table::rbindlist(sapply(project_samples, FUN=yamlToDT, + yaml_file=summary_file), fill=TRUE) # Set absent values in table to zero stats[is.na(stats)] <- 0 From 1c3163070e589efcfacdb694dbbdba8761a7aa15 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 20 Dec 2023 09:19:50 -0500 Subject: [PATCH 37/43] update piper,looper, pipestat to latest pre-release versions --- requirements.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6534a7f..ebba500 100755 --- a/requirements.txt +++ b/requirements.txt @@ -13,10 +13,8 @@ FuzzyTM>=0.4.0 hypothesis==4.38.0 jinja2 jsonschema>=3.0.1 -git+https://github.com/pepkit/looper.git@dev -git+https://github.com/databio/pypiper.git@dev logmuse>=0.2.7 -#looper>=1.4.2 +looper>=1.6.0a3 MACS2>=2.2.9.1 MACS3>=3.0.0b1 numpy<1.25,>=1.21 @@ -24,9 +22,8 @@ oyaml>=1.0 pararead>=0.7.0 pandas>=1.4 #peppy>=0.35.7 -#piper>=0.13.2 -#pipestat>=0.5.1 -git+https://github.com/pepkit/pipestat.git@dev +piper>=0.14.0a2 +pipestat>=0.6.0a11 psutil>=5.8 pyqt5<5.16 pyqtwebengine<5.16 From 03337963550a21b6538017bb9d958d4846ce402a Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 21 Dec 2023 10:48:44 -0500 Subject: [PATCH 38/43] minor docs updates --- docs/detailed-install.md | 1 + docs/run-conda.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/docs/detailed-install.md b/docs/detailed-install.md index 4638b8d..f11b754 100755 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -137,6 +137,7 @@ That should do it! Now we'll [install some **optional** packages](tutorial.md#1 Note: if given error regarding `devtools` try: `apt install r-cran-devtools` before proceeding with installation. ``` +Rscript -e 'install.packages('argparser')' Rscript -e 'install.packages("devtools")' Rscript -e 'devtools::install_github("pepkit/pepr")' Rscript -e 'install.packages("BiocManager")' diff --git a/docs/run-conda.md b/docs/run-conda.md index 16eedd4..8debd97 100755 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -42,10 +42,12 @@ To ensure these packages are installed to the `pepatac` `conda` environment, mak conda activate pepatac unset R_LIBS export R_LIBS="$CONDA_PREFIX/lib/R/library" +export R_LIBS_USER="$CONDA_PREFIX/lib/R/library" ``` From the `pepatac/` directory, open `R` and install the following packages: ```{R} +install.packages('argparser') install.packages("optigrab") devtools::install_github("databio/GenomicDistributions") install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL) From 4868fd2428caed0875125a58b6b74361b59528de Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 21 Dec 2023 11:47:36 -0500 Subject: [PATCH 39/43] add note about genomic distributions data --- docs/detailed-install.md | 1 + docs/run-conda.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/detailed-install.md b/docs/detailed-install.md index f11b754..e841c3d 100755 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -136,6 +136,7 @@ That should do it! Now we'll [install some **optional** packages](tutorial.md#1 `PEPATAC` uses `R` to generate quality control and read/peak annotation plots, so you'll need to have R functional if you want these outputs. We have packaged all the `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/master/PEPATACr). The `PEPATAC` package relies on a few additional packages which can be installed at the command line as follows: Note: if given error regarding `devtools` try: `apt install r-cran-devtools` before proceeding with installation. +Note: if receiving an error for GenomicDistributionsData_0.0.2.tar.gz, download the file manually and install directly using `install.packages("local/path/to/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)` ``` Rscript -e 'install.packages('argparser')' Rscript -e 'install.packages("devtools")' diff --git a/docs/run-conda.md b/docs/run-conda.md index 8debd97..649b4a1 100755 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -46,6 +46,7 @@ export R_LIBS_USER="$CONDA_PREFIX/lib/R/library" ``` From the `pepatac/` directory, open `R` and install the following packages: +Note: if receiving an error for GenomicDistributionsData_0.0.2.tar.gz, download the file manually and install directly using `install.packages("local/path/to/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)` ```{R} install.packages('argparser') install.packages("optigrab") From ca48837fcbd2efd39e499d4ddfdd97673cb93b4d Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 21 Dec 2023 14:42:10 -0500 Subject: [PATCH 40/43] changelog update --- docs/changelog.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index f029725..4b774ea 100755 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,11 @@ # Change log All notable changes to this project will be documented in this file. +## [0.10.6] -- 2023-XX-XX + +### Fixed + - adjusted requirements and docs for Looper 1.6.0, Pipestat v0.6.0, and Pypiper 0.14.0 + ## [0.10.5] -- 2023-07-31 ### Fixed From 212590d08422d011090e5134193393bee08bd59b Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 22 Dec 2023 11:41:55 -0500 Subject: [PATCH 41/43] update reqs --- requirements.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index ebba500..0d49c1f 100755 --- a/requirements.txt +++ b/requirements.txt @@ -14,16 +14,15 @@ hypothesis==4.38.0 jinja2 jsonschema>=3.0.1 logmuse>=0.2.7 -looper>=1.6.0a3 +looper>=1.6.0 MACS2>=2.2.9.1 MACS3>=3.0.0b1 numpy<1.25,>=1.21 oyaml>=1.0 pararead>=0.7.0 pandas>=1.4 -#peppy>=0.35.7 -piper>=0.14.0a2 -pipestat>=0.6.0a11 +piper>=0.14.0 +pipestat>=0.6.0 psutil>=5.8 pyqt5<5.16 pyqtwebengine<5.16 From dad0bf62d9c9d39c938acaa2c08c5d19d53a7f2a Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 22 Dec 2023 13:41:59 -0500 Subject: [PATCH 42/43] update changelog, version number --- docs/changelog.md | 2 +- pipelines/pepatac.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 4b774ea..ba7d65e 100755 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,7 +1,7 @@ # Change log All notable changes to this project will be documented in this file. -## [0.10.6] -- 2023-XX-XX +## [0.10.6] -- 2023-12-22 ### Fixed - adjusted requirements and docs for Looper 1.6.0, Pipestat v0.6.0, and Pypiper 0.14.0 diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index b97fd44..9ee47b0 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.10.5" +__version__ = "0.10.6" from argparse import ArgumentParser From 49b9a1a5d8dfc5a0dcbfdc78dd52fe7d2a73c803 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 22 Dec 2023 13:50:42 -0500 Subject: [PATCH 43/43] v0.11.0 release prep --- docs/changelog.md | 7 ++++++- pipelines/pepatac.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index ba7d65e..5aef8b7 100755 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,11 +1,16 @@ # Change log All notable changes to this project will be documented in this file. -## [0.10.6] -- 2023-12-22 +## [0.11.0] -- 2023-12-22 ### Fixed - adjusted requirements and docs for Looper 1.6.0, Pipestat v0.6.0, and Pypiper 0.14.0 +### Changed +- pipeline uses MACS3 instead of MACS2 +- output schema is now a pipestat compatible JSON Schema +- collator now outputs with record_identifier name `summary` + ## [0.10.5] -- 2023-07-31 ### Fixed diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 9ee47b0..2e0ac28 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.10.6" +__version__ = "0.11.0" from argparse import ArgumentParser