Merge pull request #48 from databio/dev

Dev
databio · Jun 25, 2018 · 5e1e8c3 · 5e1e8c3
2 parents d895ca0 + 8f1cbac
commit 5e1e8c3
Show file tree

Hide file tree

Showing 31 changed files with 2,168 additions and 1,288 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,18 @@
 # Change log
 All notable changes to this project will be documented in this file.
 
-## [0.7.0] -- Unreleased
+## [0.7.0] -- 2018-06-25
+
+### Added
+- Added containerization feature
+    - Run with either [docker](https://www.docker.com/) or [singularity](https://singularity.lbl.gov/)
+- Added early bowtie2 index check
+
+### Changed
+- Renamed pipeline
+- Improved summary figure reporting
+- Integrated summary results into pipeline interface
+
 
 ## [0.6.1] -- 2017-12-15
 

diff --git a/Makefile b/Makefile
@@ -1,6 +1,13 @@
 microtest:
-	python $$CODEBASE/ATACseq/pipelines/ATACseq.py -I $$MICROTEST/data/atacR1.fq.gz -I2 $$MICROTEST/data/atacR2.fq.gz -G hg19 -O $$HOME/scratch -S atac_test --single-or-paired paired -R
+	python $$CODEBASE/pepatac/pipelines/pepatac.py -I $$MICROTEST/data/atacR1.fq.gz -I2 $$MICROTEST/data/atacR2.fq.gz -G hg19 -O $$HOME/scratch -S atac_test --single-or-paired paired -R
 test:
-	python pipelines/ATACseq.py  -P 3 -M 100 -O test_out -R -S liver -G hg19  -Q paired  -C ATACseq.yaml  --genome-size hs --prealignments rCRSd human_repeats -I examples/test_data/liver-CD31_test_R1.fastq.gz -I2 examples/test_data/liver-CD31_test_R2.fastq.gz  
+	python pipelines/pepatac.py  -P 3 -M 100 -O test_out -R -S liver -G hg19  -Q paired  -C pepatac.yaml  --genome-size hs --prealignments rCRSd human_repeats -I examples/test_data/liver-CD31_test_R1.fastq.gz -I2 examples/test_data/liver-CD31_test_R2.fastq.gz  
 changtest:
-	python pipelines/ATACseq.py  -P 3 -M 100 -O test_out -R -S liver -G hg19  -Q paired  -C $HOME/code/ATACseq/examples/chang_project/ATACseq.yaml  -gs mm -I examples/test_data/liver-CD31_test_R1.fastq.gz -I2 examples/test_data/liver-CD31_test_R2.fastq.gz 
+	python pipelines/pepatac.py  -P 3 -M 100 -O test_out -R -S liver -G hg19  -Q paired  -C $HOME/code/pepatac/examples/chang_project/pepatac.yaml  -gs mm -I examples/test_data/liver-CD31_test_R1.fastq.gz -I2 examples/test_data/liver-CD31_test_R2.fastq.gz 
+
+
+docker:
+	docker build -t databio/pepatac -f containers/pepatac.Dockerfile .
+
+singularity:
+	singularity build $${SIMAGES}pepatac docker://databio/pepatac
diff --git a/README.md b/README.md
diff --git a/config/pipeline_interface.yaml b/config/pipeline_interface.yaml
@@ -1,5 +1,5 @@
-ATACseq.py:
-  name: ATACseq
+pepatac.py:
+  name: PEPATAC
   looper_args: True
   required_input_files: [read1, read2]
   all_input_files: [read1, read2]

diff --git a/config/protocol_mappings.yaml b/config/protocol_mappings.yaml
@@ -1,2 +1,2 @@
-ATAC: ATACseq.py
-ATAC-SEQ: ATACseq.py
+ATAC: pepatac.py
+ATAC-SEQ: pepatac.py
diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile
@@ -0,0 +1,157 @@
+# Pull base image
+FROM phusion/baseimage:0.10.1
+
+# Who maintains this image
+LABEL maintainer Jason Smith "[email protected]"
+
+# Version info
+LABEL version 0.8.1
+
+# Use baseimage-docker's init system.
+# NOTE(review): only the last CMD in a Dockerfile takes effect, so this one is
+# superseded by the CMD ["/bin/bash"] near the end of this file -- confirm
+# which default command is actually intended.
+CMD ["/sbin/my_init"]
+
+# Install dependencies
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \
+    curl \
+    default-jre \
+    default-jdk \
+    git \
+    libcommons-math3-java \
+    libcurl4-gnutls-dev \
+    libjbzip2-java \
+    libpng-dev \
+    libssl-dev \
+    libtbb2 \
+    libtbb-dev \
+    openssl \
+    pigz \
+    python \
+    python-pip python-dev build-essential \
+    wget
+
+# Install MySQL server
+# apt-get update is repeated in every install step so that a cached earlier
+# layer holding stale package lists cannot break this layer on rebuild.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes mysql-server \
+    mysql-client \
+    libmysqlclient-dev
+
+# Install python tools
+RUN pip install --upgrade pip
+RUN pip install virtualenv && \
+    pip install numpy && \
+    pip install MACS2 && \
+    pip install pararead && \
+    pip install piper
+
+# Install R
+# The CRAN mirror is written to ~/.Rprofile first so the install.packages()
+# calls below run without prompting for a repository.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install r-base r-base-dev && \
+    echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile && \
+    Rscript -e "install.packages('devtools')" && \
+    Rscript -e "devtools::install_github('pepkit/pepr')" && \
+    Rscript -e "install.packages('gtable')" && \
+    Rscript -e "install.packages('argparser')" && \
+    Rscript -e "install.packages('ggplot2')" && \
+    Rscript -e "install.packages('gplots')" && \
+    Rscript -e "install.packages('grid')" && \
+    Rscript -e "install.packages('scales')" && \
+    Rscript -e "install.packages('data.table')" && \
+    Rscript -e "install.packages('stringr')"
+
+
+# Install bedtools
+# (ant is installed here as well; it is used by the F-seq build further down)
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \
+    ant \
+    bedtools
+
+# Install fastqc
+WORKDIR /home/tools/
+RUN wget http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.7.zip && \
+    unzip fastqc_v0.11.7.zip && \
+    cd /home/tools/FastQC && \
+    chmod 755 fastqc && \
+    ln -s /home/tools/FastQC/fastqc /usr/bin/
+
+# Install htslib
+WORKDIR /home/src/
+RUN wget https://github.com/samtools/htslib/releases/download/1.7/htslib-1.7.tar.bz2 && \
+    tar xf htslib-1.7.tar.bz2 && \
+    cd /home/src/htslib-1.7 && \
+    ./configure --prefix /home/tools/ && \
+    make && \
+    make install
+
+# Install samtools
+WORKDIR /home/src/
+RUN wget https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2 && \
+    tar xf samtools-1.7.tar.bz2 && \
+    cd /home/src/samtools-1.7 && \
+    ./configure && \
+    make && \
+    make install && \
+    ln -s /home/src/samtools-1.7/samtools /usr/bin/
+
+# Install bowtie2
+WORKDIR /home/src/
+RUN wget https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.3.4.1/bowtie2-2.3.4.1-source.zip && \
+    unzip bowtie2-2.3.4.1-source.zip && \
+    cd /home/src/bowtie2-2.3.4.1 && \
+    make && \
+    make install && \
+    ln -s /home/src/bowtie2-2.3.4.1/bowtie2 /usr/bin/
+
+# Install picard
+WORKDIR /home/tools/bin
+RUN wget https://github.com/broadinstitute/picard/releases/download/2.18.0/picard.jar && \
+    chmod +x picard.jar
+
+# Install UCSC tools
+WORKDIR /home/tools/
+RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig && \
+    wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig && \
+    wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigCat && \
+    wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedSort && \
+    chmod +x /home/tools/bedGraphToBigWig && \
+    chmod +x /home/tools/wigToBigWig && \
+    chmod +x /home/tools/bigWigCat && \
+    chmod +x /home/tools/bedSort && \
+    ln -s /home/tools/bedGraphToBigWig /usr/bin/ && \
+    ln -s /home/tools/wigToBigWig /usr/bin/ && \
+    ln -s /home/tools/bigWigCat /usr/bin/ && \
+    ln -s /home/tools/bedSort /usr/bin/
+
+# Install Skewer
+# Cloned over https: the unauthenticated git:// protocol is no longer served
+# by GitHub, so a git:// URL makes this step fail.
+WORKDIR /home/src/
+RUN git clone https://github.com/relipmoc/skewer.git && \
+    cd /home/src/skewer && \
+    make && \
+    make install
+
+# OPTIONAL REQUIREMENTS
+# Install F-seq
+WORKDIR /home/src/
+RUN wget https://github.com/aboyle/F-seq/archive/master.zip && \
+    unzip master.zip && \
+    cd /home/src/F-seq-master && \
+    ant && \
+    cd dist~/ && \
+    tar xf fseq.tgz && \
+    ln -s /home/src/F-seq-master/dist~/fseq/bin/fseq /usr/bin/
+
+# Install Trimmomatic
+WORKDIR /home/src/
+RUN wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.36.zip && \
+    unzip Trimmomatic-0.36.zip && \
+    chmod +x Trimmomatic-0.36/trimmomatic-0.36.jar
+
+# Set environment variables
+# PATH gains the tool directories created above; TRIMMOMATIC and PICARD point
+# at the jar files downloaded above.
+ENV PATH=/home/tools/bin:/home/tools/:/home/tools/bin/kentUtils/:/home/src/F-seq-master/dist~/fseq/bin:/home/src/bowtie2-2.3.4.1:/home/src/skewer:/home/src/samtools-1.7:/home/src/Trimmomatic-0.36/:/home/src/htslib-1.7:$PATH \
+    TRIMMOMATIC=/home/src/Trimmomatic-0.36/trimmomatic-0.36.jar \
+    PICARD=/home/tools/bin/picard.jar \
+    R_LIBS_USER=/usr/local/lib/R/site-library/
+
+# Define default command
+WORKDIR /home/
+CMD ["/bin/bash"]
+
+# Clean up APT when done.
+# NOTE(review): this runs as its own final layer, so it tidies the final
+# filesystem but does not shrink the layers created by earlier RUN steps.
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/example_cmd.txt b/example_cmd.txt
@@ -1,90 +1,33 @@
-# Example commands of using pepATAC through pypiper. 
-# For the example commands of using pepATAC with looper, please see the xxx Users Guide.
+# Example commands of using PEPATAC through pypiper. 
+# For the example commands of using PEPATAC with looper, please see the xxx Users Guide.
 
 INPUT=/path/to/sequencing_results/fastq_files
 
-# run pepATAC on a human paired-end reads dataset using 5 threads:
-python pipelines/ATACseq.py -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $INPUT/ATACseq_results_PE_R1.fastq.gz -I2 $INPUT/ATACseq_results_PE_R2.fastq.gz  
+# run PEPATAC on a human paired-end reads dataset using 5 threads:
+python pipelines/pepatac.py -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $INPUT/pepatac_results_PE_R1.fastq.gz -I2 $INPUT/pepatac_results_PE_R2.fastq.gz  
 
-# run pepATAC on multiple datasets at the same time:  <- this could be wrong as I don't see an explaination of how to use -I and -I2 with multiple samples
-python pipelines/ATACseq.py -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $INPUT/ATACseq_results1_PE_R1.fastq.gz $INPUT/ATACseq_results2_PE_R1.fastq.gz $INPUT/ATACseq_results3_PE_R1.fastq.gz -I2 $INPUT/ATACseq_results1_PE_R2.fastq.gz $INPUT/ATACseq_results2_PE_R2.fastq.gz $INPUT/ATACseq_results3_PE_R2.fastq.gz 
+# run PEPATAC on multiple datasets at the same time:  <- this could be wrong as I don't see an explanation of how to use -I and -I2 with multiple samples
+python pipelines/pepatac.py -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $INPUT/pepatac_results1_PE_R1.fastq.gz $INPUT/pepatac_results2_PE_R1.fastq.gz $INPUT/pepatac_results3_PE_R1.fastq.gz -I2 $INPUT/pepatac_results1_PE_R2.fastq.gz $INPUT/pepatac_results2_PE_R2.fastq.gz $INPUT/pepatac_results3_PE_R2.fastq.gz 
 
 # run multiple samples with a for loop:
 # (bash array elements are separated by whitespace, not commas; with commas
 # the array would hold the single element "sample1,sample2,sample3")
 declare -a sample_name_arr=("sample1" "sample2" "sample3")
 for sample_name in "${sample_name_arr[@]}"
 do
 # build both read files from the current sample name; R2 is constructed
 # directly instead of by replacing the first "R1" found anywhere in the path
 file1=$INPUT/${sample_name}_PE_R1.fastq.gz
 file2=$INPUT/${sample_name}_PE_R2.fastq.gz
-python pipelines/ATACseq.py -P 5 -O output_folder -S $sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $file1 -I2 $file2  
+python pipelines/pepatac.py -P 5 -O output_folder -S $sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $file1 -I2 $file2
 done
 
-# run pepATAC on a mouse single-end reads dataset using 8 threads:
-python pipelines/ATACseq.py -P 8 -O output_folder -S output_sample_name -G mm10  -Q single  -C ATACseq.yaml  -gs mm -I $INPUT/ATACseq_results_PE_R1.fastq.gz  
+# run PEPATAC on a mouse single-end reads dataset using 8 threads:
+python pipelines/pepatac.py -P 8 -O output_folder -S output_sample_name -G mm10  -Q single  -C pepatac.yaml  -gs mm -I $INPUT/pepatac_results_PE_R1.fastq.gz  
 
-# run pepATAC with different trimming tools then default trimmomatic, currectly supports skewer and pyadapt: 
-python pipelines/ATACseq.py --skewer TRUE -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $INPUT/ATACseq_results_PE_R1.fastq.gz -I2 $INPUT/ATACseq_results_PE_R2.fastq.gz  
-python pipelines/ATACseq.py --pyadapt TRUE -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $INPUT/ATACseq_results_PE_R1.fastq.gz -I2 $INPUT/ATACseq_results_PE_R2.fastq.gz  
-
-# re-run pepATAC and over-write the previous output: 
-python pipelines/ATACseq.py -N -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $INPUT/ATACseq_results_PE_R1.fastq.gz -I2 $INPUT/ATACseq_results_PE_R2.fastq.gz  
-
-# continue to run pepATAC since a locked step (usually locked due to failure): 
-python pipelines/ATACseq.py -R -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C ATACseq.yaml  -gs hs -I $INPUT/ATACseq_results_PE_R1.fastq.gz -I2 $INPUT/ATACseq_results_PE_R2.fastq.gz  
-
-
-
-
-# check xxxx for full list of parameter usage
-
-# full list of parameters are listed below: 
-python ATACseq.py 
-usage: ATACseq.py [-h] [-N] [-I2 INPUT_FILES2 [INPUT_FILES2 ...]]
-[-M MEMORY_LIMIT] [-Q SINGLE_OR_PAIRED] [-S SAMPLE_NAME]
-[-P NUMBER_OF_CORES] [-D] [-I INPUT_FILES [INPUT_FILES ...]]
-[-F] [-R] [-C CONFIG_FILE] [-O PARENT_OUTPUT_FOLDER]
-[-G GENOME_ASSEMBLY] [-gs GENOME_SIZE]
-[--frip-ref-peaks FRIP_REF_PEAKS] [--pyadapt] [--skewer]
-[--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] [-V]
-
-Pipeline
-optional arguments:
--C CONFIG_FILE, --config CONFIG_FILE
-pipeline config file in YAML format; relative paths
-are considered relative to the pipeline script.
-defaults to ATACseq.yaml
--D, --dirty           Make all cleanups manual
--F, --follow          Run all follow commands, even if command is not run
---frip-ref-peaks FRIP_REF_PEAKS
-Reference peak set for calculating FRIP
--G GENOME_ASSEMBLY, --genome GENOME_ASSEMBLY
-identifier for genome assempbly (required)
--gs GENOME_SIZE, --genome-size GENOME_SIZE
-genome size for MACS2
--h, --help            show this help message and exit
--I INPUT_FILES [INPUT_FILES ...], --input INPUT_FILES [INPUT_FILES ...]
-One or more primary input files (required)
--I2 INPUT_FILES2 [INPUT_FILES2 ...], --input2 INPUT_FILES2 [INPUT_FILES2 ...]
-One or more secondary input files (if they exists);
-for example, second read in pair.
--M MEMORY_LIMIT, --mem MEMORY_LIMIT
-Memory string for processes that accept memory limits
-(like java)
--N, --new-start       Fresh start mode, overwrite all
--O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER
-parent output directory of the project (required).
--P NUMBER_OF_CORES, --cores NUMBER_OF_CORES
-number of cores to use for parallel processes
--Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED
-single or paired end? default: single
--R, --recover         Recover mode, overwrite locks
--S SAMPLE_NAME, --sample-name SAMPLE_NAME
-unique name for output subfolder and files (required)
---pyadapt             Use pyadapter_trim for trimming? [Default: False]
---skewer              Use skewer for trimming? [Default: False]
---prealignments PREALIGNMENTS [PREALIGNMENTS ...]
-List of reference genomes to align to before primary
-alignment.
--V, --version         show program's version number and exit'
+# run PEPATAC with a trimming tool other than the default trimmomatic; currently supports skewer and pyadapt:
+python pipelines/pepatac.py --skewer TRUE -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $INPUT/pepatac_results_PE_R1.fastq.gz -I2 $INPUT/pepatac_results_PE_R2.fastq.gz  
+python pipelines/pepatac.py --pyadapt TRUE -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $INPUT/pepatac_results_PE_R1.fastq.gz -I2 $INPUT/pepatac_results_PE_R2.fastq.gz  
 
+# re-run PEPATAC and over-write the previous output: 
+python pipelines/pepatac.py -N -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $INPUT/pepatac_results_PE_R1.fastq.gz -I2 $INPUT/pepatac_results_PE_R2.fastq.gz  
 
+# continue to run PEPATAC since a locked step (usually locked due to failure): 
+python pipelines/pepatac.py -R -P 5 -O output_folder -S output_sample_name -G hg38  -Q paired  -C pepatac.yaml  -gs hs -I $INPUT/pepatac_results_PE_R1.fastq.gz -I2 $INPUT/pepatac_results_PE_R2.fastq.gz  
 
diff --git a/examples/chang_project/README.md b/examples/chang_project/README.md
@@ -2,14 +2,14 @@
 
 This folder contains a skeleton template with configuration options already set for the Chang lab compute environment. To set up a new project in the Chang lab compute environment, follow these instructions:
 
-1. Follow the **installing** instructions in the main README to get prerequisites (install looper, pypiper, and clone the ATACseq repository).
+1. Follow the **installing** instructions in the main README to get prerequisites (install looper, pypiper, and clone the PEPATAC repository).
 2. Copy this folder ([examples/chang_project](examples/chang_project/)) and name the new folder for your project.
-3. In your new folder, edit `project_config.yaml` to set the `metadata.pipelines_dir` option to the location of your cloned ATACseq repository.
+3. In your new folder, edit `project_config.yaml` to set the `metadata.pipelines_dir` option to the location of your cloned PEPATAC repository.
 4. Edit `project_config.yaml` to set the `data_sources.R1` and `data_sources.R2`  to point to where you store fastq files. Your files must be named in some systematic pattern that can be created by populating sample variables, like `{sample_name}`. Detailed instructions are available here: [using looper derived columns](http://looper.readthedocs.io/en/latest/advanced.html#pointing-to-flexible-data-with-derived-columns).
 5. Make any other (optional) changes you want to `project_config.yaml`.
 6. Modify `project_annotation.csv` to include your sample list.
 7. Run the project with `looper run path/to/project_config.yaml`.
 
-Essentially, all this does differently from the default is that we have provided a configuration file. See the `pipeline_config` section in the [project config file](examples/chang_project/project_config.yaml) -- we simply set this to `ATACseq_chang.yaml` for your project, and then include [ATACseq_chang.yaml](examples/chang_project/ATACseq_chang.yaml) parallel to the project config file. 
+Essentially, all this does differently from the default is that we have provided a configuration file. See the `pipeline_config` section in the [project config file](examples/chang_project/project_config.yaml) -- we simply set this to `pepatac_chang.yaml` for your project, and then include [pepatac_chang.yaml](examples/chang_project/pepatac_chang.yaml) parallel to the project config file. 
 
 Once you have it set up, you have all the power of looper for your project. It's simple to submit to a cluster, summarize your results, clean, and monitor your project. You can find additional details on what you can do with this in the [looper docs](http://looper.readthedocs.io/).
diff --git a/examples/chang_project/ATACseq_chang.yaml → examples/chang_project/pepatac_chang.yaml b/examples/chang_project/ATACseq_chang.yaml → examples/chang_project/pepatac_chang.yaml
@@ -1,4 +1,4 @@
-# Configuration file for ATACseq pipeline based on pypiper
+# PEPATAC configuration file for an ATACseq pipeline based on pypiper
 
 # basic tools 
 # public tools

diff --git a/examples/chang_project/project_config.yaml b/examples/chang_project/project_config.yaml
@@ -33,6 +33,6 @@ implied_columns:
       prealignments: null
 
 pipeline_config:
-  ATACseq.py: ATACseq_chang.yaml  # Use this to load Chang Lab settings
-  #ATACseq.py: null  # Use this to load default environment settings
+  pepatac.py: pepatac_chang.yaml  # Use this to load Chang Lab settings
+  #pepatac.py: null  # Use this to load default environment settings
 
diff --git a/examples/gold_atac/README.md b/examples/gold_atac/README.md
@@ -8,7 +8,7 @@ Testing ATAC-seq pipeline on gold standard public ATAC-seq data.
 Download raw `fastq.gz` files (use `fastq-dump` from SRA). You may also use `get_geo.py` to download raw ATAC-seq reads from SRA and metadata from GEO:
 
 ```
-python get_geo.py -i ~/code/ATACseq/examples/gold_atac/metadata/gold_atac_gse.csv -r --fastq
+python get_geo.py -i ~/code/pepatac/examples/gold_atac/metadata/gold_atac_gse.csv -r --fastq
 ```
 
 I used the resulting file [metadata/annocomb_gold_atac_gse.csv](metadata/annocomb_gold_atac_gse.csv) to create the looper metadata sheet, [metadata/gold_atac_annotation.csv](metadata/gold_atac_annotation.csv).
@@ -18,5 +18,10 @@ I create project config file and sampled test data. The SRA fastq files should b
 ## Run pipeline
 
 ```
-looper run ${CODE}ATACseq/examples/gold_atac/metadata/project_config.yaml -d
+looper run ${CODE}pepatac/examples/gold_atac/metadata/project_config.yaml -d
 ```
+
+
+## Test data
+
+There's a small test sample stored right here as `test1`