From 4fd047b02d30cc5fcc198a77ce3048b53535b14e Mon Sep 17 00:00:00 2001 From: ekiernan <55763654+ekiernan@users.noreply.github.com> Date: Wed, 7 Oct 2020 09:01:33 -0400 Subject: [PATCH 1/3] Lk methods updates (#96) Updated Methods docs --- .../broad/dna_seq/germline/single_sample/exome/README.md | 2 +- .../broad/dna_seq/germline/single_sample/wgs/README.md | 2 +- .../illumina/IlluminaGenotypingArray.documentation.md | 4 ++-- pipelines/cemba/cemba_methylcseq/CEMBA.methods.md | 4 ++-- pipelines/cemba/cemba_methylcseq/README.md | 2 +- pipelines/skylab/optimus/README.md | 2 +- pipelines/skylab/optimus/optimus.methods.md | 8 ++++---- pipelines/skylab/scATAC/README.md | 2 +- pipelines/skylab/smartseq2_multisample/README.md | 2 +- .../skylab/smartseq2_multisample/smart-seq2.methods.md | 6 +++--- pipelines/skylab/smartseq2_single_sample/README.md | 2 +- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/README.md b/pipelines/broad/dna_seq/germline/single_sample/exome/README.md index e3677b581a..efce25150c 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/README.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [ExomeGermlineSingleSample_v2.0](ExomeGermlineSingleSample.wdl) | June 10, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in dsde-pipelines or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [ExomeGermlineSingleSample_v2.0](https://github.com/broadinstitute/warp/releases) | June 10, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in dsde-pipelines or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | # Table of Contents - [Introduction to the Exome Germline Single Sample 
Pipeline](#introduction-to-the-exome-germline-single-sample-pipeline) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/README.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/README.md index 23916d1945..9fdeac2352 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/README.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [WholeGenomeGermlineSingleSample_v2.0](WholeGenomeGermlineSingleSample.wdl) | June 22, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [WholeGenomeGermlineSingleSample_v2.0](https://github.com/broadinstitute/warp/releases) | June 22, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | # Introduction to the Whole Genome Germline Single Sample Pipeline diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md index 7eef9042fa..c999e65754 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md @@ -1,10 +1,10 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Version 1.11.0](IlluminaGenotypingArray.wdl) | Oct 1, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [Version 1.11.0](https://github.com/broadinstitute/warp/releases) | Oct 1, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | 
Please file GitHub issues in warp or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | # Table of Contents - [Illumina Genotyping Array Pipeline Overview](#illumina-genotyping-array-pipeline-overview) -- [Introduction to the Illumina Genotyping Array] Pipeline](#introduction-to-the-illumina-genotyping-array-pipeline) +- [Introduction to the Illumina Genotyping Array Pipeline](#introduction-to-the-illumina-genotyping-array-pipeline) - [Set-up](#set-up) * [Workflow Installation and Requirements](#workflow-installation-and-requirements) * [Inputs](#inputs) diff --git a/pipelines/cemba/cemba_methylcseq/CEMBA.methods.md b/pipelines/cemba/cemba_methylcseq/CEMBA.methods.md index 59404ca072..a5d421d33a 100644 --- a/pipelines/cemba/cemba_methylcseq/CEMBA.methods.md +++ b/pipelines/cemba/cemba_methylcseq/CEMBA.methods.md @@ -1,9 +1,9 @@ -# CEMBA_v1.0 Publication Methods +# CEMBA_v1.0.0 Publication Methods Below we provide a sample methods section for a publication. For the complete pipeline documentation, see the [CEMBA README](README.md). ### Methods -Data processing was performed with the CEMBA v1.0 Pipeline. Sequencing reads were first trimmed to remove adaptors using Cutadapt 1.18 with the following parameters in paired-end mode: -f fastq -quality-cutoff 20 -minimum-length 62 -a AGATCGGAAGAGCACACGTCTGAAC -A AGATCGGAAGAGCGTCGTGTAGGGA. After trimming the adapters, an unaligned BAM (uBAM) for the trimmed R1 FASTQ was created using Picard v2.18.23. Cell barcodes were then extracted from the trimmed R1 FASTQ and tagged to the R1 uBAM with Single Cell Tools (sctools) v0.3.4a using a barcode whitelist as well as configurable barcode start positions and lengths. Next, for multiplexed samples, the random primer index sequence and Adaptase C/T tail were further removed from the adaptor-trimmed R1 and R2 FASTQs using Cutadapt with the following parameters: -f fastq -quality-cutoff 16 -quality-cutoff -16 -minimum-length 30. 
The trimmed R1 and R2 reads were then aligned to mouse (mm10) or human (hg19) genomes separately as single-end reads using Bismark v0.21.0 with the parameters --bowtie2 --icpc --X 2000 (paired-end mode) and --pbat (activated for mapping R1 reads). After alignment, the output R1 and R2 BAMs were sorted in coordinate order and duplicates removed using the Picard MarkDuplicates REMOVE_DUPLICATE option. Samtools 1.9 was used to further filter BAMs with a minimum map quality of 30 using the parameter -bhq 30. Methylation reports were produced for the filtered BAMs using Bismark. The barcodes from the R1 uBAM were then attached to the aligned, filtered R1 BAM with Picard. The R1 and R2 BAMs were merged with Samtools. Readnames were added to the merged BAM and a methylated VCF created using MethylationTypeCaller in GATK 4.1.2.0. Samtools was then used to calculate coverage depth for sites with coverage greater than 1 and to create BAM index files. The final outputs included the barcoded aligned BAM, BAM index, a VCF with locus-specific methylation information, VCF index, and methylation reports. +Data processing was performed with the CEMBA v1.0.0 Pipeline. Sequencing reads were first trimmed to remove adaptors using Cutadapt 1.18 with the following parameters in paired-end mode: -f fastq -quality-cutoff 20 -minimum-length 62 -a AGATCGGAAGAGCACACGTCTGAAC -A AGATCGGAAGAGCGTCGTGTAGGGA. After trimming the adapters, an unaligned BAM (uBAM) for the trimmed R1 FASTQ was created using Picard v2.18.23. Cell barcodes were then extracted from the trimmed R1 FASTQ and tagged to the R1 uBAM with Single Cell Tools (sctools) v0.3.4a using a barcode whitelist as well as configurable barcode start positions and lengths. Next, for multiplexed samples, the random primer index sequence and Adaptase C/T tail were further removed from the adaptor-trimmed R1 and R2 FASTQs using Cutadapt with the following parameters: -f fastq -quality-cutoff 16 -quality-cutoff -16 -minimum-length 30. 
The trimmed R1 and R2 reads were then aligned to mouse (mm10) or human (hg19) genomes separately as single-end reads using Bismark v0.21.0 with the parameters --bowtie2 --icpc --X 2000 (paired-end mode) and --pbat (activated for mapping R1 reads). After alignment, the output R1 and R2 BAMs were sorted in coordinate order and duplicates removed using the Picard MarkDuplicates REMOVE_DUPLICATE option. Samtools 1.9 was used to further filter BAMs with a minimum map quality of 30 using the parameter -bhq 30. Methylation reports were produced for the filtered BAMs using Bismark. The barcodes from the R1 uBAM were then attached to the aligned, filtered R1 BAM with Picard. The R1 and R2 BAMs were merged with Samtools. Readnames were added to the merged BAM and a methylated VCF created using MethylationTypeCaller in GATK 4.1.2.0. Samtools was then used to calculate coverage depth for sites with coverage greater than 1 and to create BAM index files. The final outputs included the barcoded aligned BAM, BAM index, a VCF with locus-specific methylation information, VCF index, and methylation reports. An example of the pipeline and its outputs is available on Terra (https://app.terra.bio/#workspaces/brain-initiative-bcdc/Methyl-c-seq_Pipeline). Examples of genomic reference files and other inputs can be found in the pipeline’s [example JSON](example_inputs/CEMBA.inputs.json). 
diff --git a/pipelines/cemba/cemba_methylcseq/README.md b/pipelines/cemba/cemba_methylcseq/README.md index 585fa6281a..5424fabeaa 100644 --- a/pipelines/cemba/cemba_methylcseq/README.md +++ b/pipelines/cemba/cemba_methylcseq/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [CEMBA_v1.0](CEMBA.wdl) | July 28, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [CEMBA_v1.0.0](https://github.com/broadinstitute/warp/releases) | July 28, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | # Table of Contents diff --git a/pipelines/skylab/optimus/README.md b/pipelines/skylab/optimus/README.md index 70d18d1035..bd8a7eb089 100644 --- a/pipelines/skylab/optimus/README.md +++ b/pipelines/skylab/optimus/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v4.0.1](https://github.com/broadinstitute/warp/releases/tag/Optimus_v4.0.1) | September 14, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [optimus_v4.0.2](https://github.com/broadinstitute/warp/releases) | September 14, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | # Table of Contents diff --git a/pipelines/skylab/optimus/optimus.methods.md b/pipelines/skylab/optimus/optimus.methods.md index ef99d9f05c..2ac0867e77 100644 --- a/pipelines/skylab/optimus/optimus.methods.md +++ b/pipelines/skylab/optimus/optimus.methods.md @@ -1,14 +1,14 @@ -# 
Optimus v3.0.1 Methods -Below we provide a sample methods sections for a publication, separated into single-cell or single-nuclei use cases. For the complete pipeline documentation, see the [Optimus](README.md). +# Optimus v4.0.2 Methods +Below we provide a sample methods sections for a publication, separated into single-cell or single-nuclei use cases. For the complete pipeline documentation, see the [Optimus README](README.md). # Methods ## Single-cell (sc_rna mode) -Data preprocessing and count matrix construction were performed using the Optimus v3.0.1 Pipeline. Briefly, FASTQ files were converted to unaligned BAM (uBMA) using Picard v2.10.10 and reads were appended with raw UMI and corrected cell barcode sequences using Single Cell Tools (sctools) v0.3.4 and the 10x Genomics barcodes whitelist, allowing for up to one edit distance (Levenshtein distance). uBAMs were then aligned to GENCODE mouse (M21) or human (V27) references using STAR v2.5.3a with default parameters in addition to --BAM unsorted --outSAMattributes all --outSAMunmapped --readFilestype SAM SE. Genes were annotated and reads were tagged with Drop-seq Tools v1.12 using the TagReadwithGeneExon function. UMIs were then corrected and duplicate reads marked using UMI-tools v0.0.1 with default parameters in addition to --extract-umi-method=tag --umi-tag UR --cell-tag CB --gene-tag GE --umi-group-tag UB --per-gene --per-cell --no-sort-output. All reads (UMI-corrected, duplicate, and untagged) were merged into a single BAM file and tagged. Gene and cell-specific metrics were calculated using the sctools v.0.3.7 functions CalculateGeneMetrics and CalculateCellMetrics. Empty droplets were identified, but not removed to enable downstream filtering, using the DropletUtils v.1.2.1 with --fdr-cutoff 0.01 --emptydrops-niters 10000 --min-molecules 100 --emptydrops-lower 100. UMI-aware count matrices for exon-only alignments were produced using the sctools v0.3.7. 
All cell and gene metrics (alignment, mitochondrial, and other QC metrics), count matrices and DropletUtils results were then aggregated into a final Loom file for downstream processing. The final outputs included the unfiltered Loom and unfiltered (but tagged) BAM files. +Data preprocessing and count matrix construction were performed using the Optimus v4.0.2 Pipeline. Briefly, FASTQ files were converted to unaligned BAM (uBAM) using Picard v2.10.10 and reads were appended with raw UMI and corrected cell barcode sequences using Single Cell Tools (sctools) v0.3.10 and the 10x Genomics barcodes whitelist, allowing for up to one edit distance (Levenshtein distance). uBAMs were then aligned to GENCODE mouse (M21) or human (V27) references using STAR v2.5.3a with default parameters in addition to --BAM unsorted --outSAMattributes all --outSAMunmapped --readFilestype SAM SE. Genes were annotated and reads were tagged with Drop-seq Tools v1.12 using the TagReadwithGeneExon function. UMIs were then corrected and duplicate reads marked using UMI-tools v0.0.1 with default parameters in addition to --extract-umi-method=tag --umi-tag UR --cell-tag CB --gene-tag GE --umi-group-tag UB --per-gene --per-cell --no-sort-output. All reads (UMI-corrected, duplicate, and untagged) were merged into a single BAM file and tagged. Gene and cell-specific metrics were calculated using the sctools v.0.3.7 functions CalculateGeneMetrics and CalculateCellMetrics. Empty droplets were identified, but not removed to enable downstream filtering, using the DropletUtils v.1.2.1 with --fdr-cutoff 0.01 --emptydrops-niters 10000 --min-molecules 100 --emptydrops-lower 100. UMI-aware count matrices for exon-only alignments were produced using the sctools v0.3.7. All cell and gene metrics (alignment, mitochondrial, and other QC metrics), count matrices and DropletUtils results were then aggregated into a final Loom file for downstream processing. 
The final outputs included the unfiltered Loom and unfiltered (but tagged) BAM files. An example of the pipeline and outputs is available on the Terra HCA Optimus Pipeline Featured Workspace (https://app.terra.bio/#workspaces/featured-workspaces-hca/HCA_Optimus_Pipeline), and additional documentation is available on GitHub (https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/README.md). Examples of genomic references, whitelists, and other inputs are available in the Skylab repository (see example JSONs). ## Single-nuclei (sn_rna mode) -Data preprocessing and count matrix construction were performed using the Optimus v3.0.1 Pipeline. Briefly, FASTQ files were converted to unaligned BAM (uBMA) using Picard v2.10.10 and reads were appended with raw UMI and corrected cell barcode sequences using Single Cell Tools (sctools) v0.3.4 and the 10x Genomics barcodes whitelist, allowing for up to one edit distance (Levenshtein distance). uBAMs were aligned to GENCODE mouse (M21) or human (V27) references using STAR v2.5.3a with default parameters in addition to --BAM unsorted --outSAMattributes all --outSAMunmapped --readFilestype SAM SE. Genes were annotated and reads were tagged with Drop-seq Tools v2.3.0 using TagReadWithGeneFunction. UMIs were then corrected using UMI-tools v0.0.1 with default parameters in addition to --extract-umi-method=tag --umi-tag UR --cell-tag CB --gene-tag GE --umi-group-tag UB --per-gene --per-cell --no-sort-output. All reads (UMI-corrected, duplicate, and untagged) were merged into a single BAM file. Gene and cell-specific metrics were calculated using the sctools v.0.3.7 functions CalculateGeneMetrics and CalculateCellMetrics. UMI-aware count matrices for all alignments (introns, exons, UTRs) were produced using the sctools v0.3.7. All cell and gene metrics (alignment, mitochondrial, and other QC metrics), annotations, and count matrices were aggregated into a final Loom file for downstream processing. 
The final outputs included the unfiltered Loom and unfiltered (but tagged) BAM files. +Data preprocessing and count matrix construction were performed using the Optimus v4.0.2 Pipeline. Briefly, FASTQ files were converted to unaligned BAM (uBAM) using Picard v2.10.10 and reads were appended with raw UMI and corrected cell barcode sequences using Single Cell Tools (sctools) v0.3.10 and the 10x Genomics barcodes whitelist, allowing for up to one edit distance (Levenshtein distance). uBAMs were aligned to GENCODE mouse (M21) or human (V27) references using STAR v2.5.3a with default parameters in addition to --BAM unsorted --outSAMattributes all --outSAMunmapped --readFilestype SAM SE. Genes were annotated and reads were tagged with Drop-seq Tools v2.3.0 using TagReadWithGeneFunction. UMIs were then corrected using UMI-tools v0.0.1 with default parameters in addition to --extract-umi-method=tag --umi-tag UR --cell-tag CB --gene-tag GE --umi-group-tag UB --per-gene --per-cell --no-sort-output. All reads (UMI-corrected, duplicate, and untagged) were merged into a single BAM file. Gene and cell-specific metrics were calculated using the sctools v.0.3.7 functions CalculateGeneMetrics and CalculateCellMetrics. UMI-aware count matrices for all alignments (introns, exons, UTRs) were produced using the sctools v0.3.7. All cell and gene metrics (alignment, mitochondrial, and other QC metrics), annotations, and count matrices were aggregated into a final Loom file for downstream processing. The final outputs included the unfiltered Loom and unfiltered (but tagged) BAM files. An example of the pipeline and outputs is available on the Terra HCA Optimus Pipeline Featured Workspace (https://app.terra.bio/#workspaces/featured-workspaces-hca/HCA_Optimus_Pipeline), and additional documentation is available on GitHub (https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/README.md). 
Examples of genomic references, whitelists, and other inputs are available in the Skylab repository (see the *_example.json files at https://github.com/HumanCellAtlas/skylab/tree/master/pipelines/optimus). diff --git a/pipelines/skylab/scATAC/README.md b/pipelines/skylab/scATAC/README.md index 72962d7f19..5e1c92c915 100644 --- a/pipelines/skylab/scATAC/README.md +++ b/pipelines/skylab/scATAC/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [scATAC 1.1.0 ](scATAC.wdl) | August 24th 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in skylab or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [scATAC 1.1.0 ](https://github.com/broadinstitute/warp/releases) | August 24th 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in skylab or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | - [Overview](#overview) - [Introduction](#introduction) diff --git a/pipelines/skylab/smartseq2_multisample/README.md b/pipelines/skylab/smartseq2_multisample/README.md index c63368ab56..2383080c2e 100644 --- a/pipelines/skylab/smartseq2_multisample/README.md +++ b/pipelines/skylab/smartseq2_multisample/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [MultiSampleSmartSeq2_v2.1.0](https://github.com/broadinstitute/warp/releases/tag/MultiSampleSmartSeq2_v2.1.0) | August, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in skylab or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [MultiSampleSmartSeq2_v2.1.0](https://github.com/broadinstitute/warp/releases) | August, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in skylab or contact [Kylee 
Degatano](mailto:kdegatano@broadinstitute.org) | # Smart-seq2 Multi Sample Pipeline ## Introduction diff --git a/pipelines/skylab/smartseq2_multisample/smart-seq2.methods.md b/pipelines/skylab/smartseq2_multisample/smart-seq2.methods.md index 7cd67080ef..5c8a850cf3 100644 --- a/pipelines/skylab/smartseq2_multisample/smart-seq2.methods.md +++ b/pipelines/skylab/smartseq2_multisample/smart-seq2.methods.md @@ -1,7 +1,7 @@ -# Smart-seq2 v2.0.1 Publication Methods -Below we provide a sample methods sections for a publication. For the complete pipeline documentation, see the [Smart-seq2 Multi Sample Readme](README.md) +# Smart-seq2 v2.1.0 Publication Methods +Below we provide a sample methods sections for a publication. For the complete pipeline documentation, see the [Smart-seq2 Multi Sample README](README.md). # Methods -Data preprocessing and count matrix construction for a sample batch (or plate) were performed using the Smart-seq2 Multi Sample v.3.0.0 Pipeline. For each cell in the batch, paired- or single-end FASTQ files were first processed with the Smart-seq2 Single Sample v4.0.0 Pipeline. Reads were aligned to the GENCODE mouse (M21) or human (V27) reference genome using HISAT2 v2.1.0 with default parameters in addition to --k 10 options. Metrics were collected and duplicate reads marked using the Picard v.2.10.10 CollectMultipleMetrics and CollectRnaSeqMetrics, and MarkDuplicates functions with validation_stringency=silent. For transcriptome quantification, reads were aligned to the GENCODE transcriptome using HISAT2 v2.1.0 with --k 10 --no-mixed --no-softclip --no-discordant --rdg 99999999,99999999 --rfg 99999999,99999999 --no-spliced-alignment options. Gene expression was calculated using RSEM v1.3.0’s rsem-calculate-expression --calc-pme --single-cell-prior. QC metrics, RSEM TPMs and RSEM estimated counts were exported to a single Loom file for each cell. 
All individual Loom files for the entire batch were aggregated into a single Loom file for downstream processing. The final output included the unfiltered Loom and the tagged, unfiltered individual BAM files. +Data preprocessing and count matrix construction for a sample batch (or plate) were performed using the Smart-seq2 Multi Sample v2.1.0 Pipeline. For each cell in the batch, paired- or single-end FASTQ files were first processed with the Smart-seq2 Single Sample v5.0.0 Pipeline. Reads were aligned to the GENCODE mouse (M21) or human (V27) reference genome using HISAT2 v2.1.0 with default parameters in addition to --k 10 options. Metrics were collected and duplicate reads marked using the Picard v.2.10.10 CollectMultipleMetrics and CollectRnaSeqMetrics, and MarkDuplicates functions with validation_stringency=silent. For transcriptome quantification, reads were aligned to the GENCODE transcriptome using HISAT2 v2.1.0 with --k 10 --no-mixed --no-softclip --no-discordant --rdg 99999999,99999999 --rfg 99999999,99999999 --no-spliced-alignment options. Gene expression was calculated using RSEM v1.3.0’s rsem-calculate-expression --calc-pme --single-cell-prior. QC metrics, RSEM TPMs and RSEM estimated counts were exported to a single Loom file for each cell. All individual Loom files for the entire batch were aggregated into a single Loom file for downstream processing. The final output included the unfiltered Loom and the tagged, unfiltered individual BAM files. An example of the pipeline and outputs can be found in Terra (https://app.terra.bio/#workspaces/featured-workspaces-hca/HCA%20Smart-seq2%20Multi%20Sample%20Pipeline) and additional documentation can be found at https://github.com/HumanCellAtlas/skylab/tree/master/pipelines/smartseq2_multisample. 
Examples of genomic references, whitelists, and other inputs are available in the Skylab repository (see the *_example.json files at https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_multisample/human_single_example.json). diff --git a/pipelines/skylab/smartseq2_single_sample/README.md b/pipelines/skylab/smartseq2_single_sample/README.md index 8012a590f6..cb73b229d7 100644 --- a/pipelines/skylab/smartseq2_single_sample/README.md +++ b/pipelines/skylab/smartseq2_single_sample/README.md @@ -1,6 +1,6 @@ | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [smartseq2_v5.0.0](https://github.com/broadinstitute/warp/releases/tag/SmartSeq2SingleSample_v5.0.0) | August, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in skylab or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | +| [smartseq2_v5.0.0](https://github.com/broadinstitute/warp/releases) | August, 2020 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in skylab or contact [Kylee Degatano](mailto:kdegatano@broadinstitute.org) | # Table of Contents - [Smart-seq2 Single Sample Pipeline Overview](#smart-seq2-single-sample-pipeline-overview) From 50bd6ea9588fc262e79d562b4fd6c509f21d3e6e Mon Sep 17 00:00:00 2001 From: Farzaneh Khajouei Date: Wed, 7 Oct 2020 12:01:23 -0400 Subject: [PATCH 2/3] GL-1192-Prepare filenames for downstream tools (#75) * fixing compressed fastq checks * version changes * fastq change * syntax fix * changelog version changes * applying the comments and fixes * added checks for files ending in fastq * added a new task to check fastq compression * changing docker images * fix optimus wdl bug * bug fix * wdl bug fix to HISAT * wdl bug fix to HISAT * wdl bug fix to HISAT * wdl bug fix to HISAT * Jess recommendations * pipeline version fix --- dockers/skylab/hisat2/Dockerfile | 1 - 
pipelines/skylab/optimus/Optimus.changelog.md | 10 +- pipelines/skylab/optimus/Optimus.wdl | 8 +- .../skylab/optimus/scripts/parse_terra_tsv.py | 48 +++++-- .../MultiSampleSmartSeq2.changelog.md | 6 + .../MultiSampleSmartSeq2.wdl | 4 +- .../SmartSeq2SingleSample.changelog.md | 6 + .../SmartSeq2SingleSample.wdl | 2 +- tasks/skylab/FastqToUBam.wdl | 26 +++- tasks/skylab/HISAT2.wdl | 133 ++++++++++++++---- .../pr/test_inputs.json | 4 +- 11 files changed, 182 insertions(+), 66 deletions(-) diff --git a/dockers/skylab/hisat2/Dockerfile b/dockers/skylab/hisat2/Dockerfile index 4c6355a155..ce9d8d9b29 100644 --- a/dockers/skylab/hisat2/Dockerfile +++ b/dockers/skylab/hisat2/Dockerfile @@ -60,4 +60,3 @@ RUN \ cd gffread && \ make && \ cp gffread /usr/local/bin/ -WORKDIR /opt/tools diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index ad8770fe35..207ddfc6c2 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,10 +1,17 @@ +# 4.1.1 + +2020-10-07 (Date of Last Commit) + +* Removed extra trailing slash in output directory from cloud to cloud copy job + +* Removed fastq_suffix optional input - the pipeline now dynamically determines if a file is zipped + # 4.1.0 2020-10-05 (Date of Last Commit) * Updated sctools dockers and made them consistent across the Optimus pipeline - # 4.0.2 2020-09-30 (Date of Last Commit) @@ -18,7 +25,6 @@ * Refactored the pipeline to preprocess fastqs using the task `FastqProcessing`. 
Outputs are identical and the pipeline should be significantly faster - # 4.0.0 2020-08-10 (Date of Last Commit) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index d9b407434a..2608c0124f 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -42,9 +42,6 @@ workflow Optimus { # tenX_v2, tenX_v3 String chemistry = "tenX_v2" - # environment-specific parameters - String fastq_suffix = "" - # Emptydrops lower cutoff Int emptydrops_lower = 100 @@ -58,7 +55,8 @@ workflow Optimus { } # version of this pipeline - String pipeline_version = "4.1.0" + + String pipeline_version = "4.1.1" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays Array[Int] indices = range(length(r1_fastq)) @@ -73,8 +71,6 @@ workflow Optimus { ref_genome_fasta: "genome fasta file (must match star reference)" whitelist: "10x genomics cell barcode whitelist" tenX_v3_chemistry: "assume 10X Genomics v3 chemistry with 12bp UMI (in contrast to default v2 with 10bp UMI)" - fastq_suffix: "when running in green box, need to add '.gz' for picard to detect the compression" - output_zarr: "whether to run the taks that converts the outputs to Zarr format, by default it's true" force_no_check: "Set to true to override input checks and allow pipeline to proceed with invalid input" } diff --git a/pipelines/skylab/optimus/scripts/parse_terra_tsv.py b/pipelines/skylab/optimus/scripts/parse_terra_tsv.py index a625241126..9a8c19391f 100644 --- a/pipelines/skylab/optimus/scripts/parse_terra_tsv.py +++ b/pipelines/skylab/optimus/scripts/parse_terra_tsv.py @@ -13,7 +13,7 @@ Sample set TSV to be uploaded to terra. The samples will be grouped by sample_id. 
""" -def create_output_files(input_file,output_file,output_set): +def create_output_files(input_file,output_file,output_set,output_entity): """ Args: input_file: tsv file from HCA @@ -27,39 +27,53 @@ def create_output_files(input_file,output_file,output_set): #TBD: move this to a function and call on each row of df change. Note: change index 0 to get other participants # for each fastq read, create a lane - n_lanes = r1_fastq.shape[1] #number of fastq reads + n_lanes = r1_fastq.shape[1] - r1_fastq.isnull().sum(axis=1) #number of fastq reads n_participants = r1_fastq.shape[0] #number of participants - - column_names = ['entity:participant_lane_id', 'bundle_uuid', 'sample_id', 'r1_fastq','r2_fastq', 'i1_fastq'] + column_names = ['entity:participant_lane_id', 'input_id', 'input_name','input_id_metadata_field','input_name_metadata_field', 'r1_fastq','r2_fastq', 'i1_fastq'] participant_df = pd.DataFrame(columns = column_names) for j in range(n_participants): a = [] - for i in range(n_lanes): - a.append("lane_"+str(i)+"_participant_"+str(j)+"_"+str(df.sample__provenance__document_id[j])+"_id") + for i in range(n_lanes[j]): + a.append("participant_"+str(j)+"_lane_"+str(i)+"_"+str(df.sequencing_process__provenance__document_id[j])+"_id") + lane_id = pd.DataFrame({"entity:participant_lane_id":a}) lane_fastq_r1 = pd.DataFrame({"fastq1":r1_fastq.iloc[j].to_numpy()}) lane_fastq_r2 = pd.DataFrame({"fastq2":r2_fastq.iloc[j].to_numpy()}) lane_fastq_i1 = pd.DataFrame({"fastqi":i1_fastq.iloc[j].to_numpy()}) - bundle_uuid = pd.DataFrame({"entity:lane_id":np.repeat(df.bundle_uuid[j],n_lanes)}) - sample_id = pd.DataFrame({"sample_id":np.repeat(df.sample__provenance__document_id[j],n_lanes)}) + input_id = pd.DataFrame({"input_id":np.repeat(df.sequencing_process__provenance__document_id[j],n_lanes[j])}) + input_id_metadata_field = pd.DataFrame({"input_id_metadata_field":np.repeat("sequencing_process.provenance.document_id",n_lanes[j])}) + input_name = 
pd.DataFrame({"input_name_metadata_field":np.repeat(df.sequencing_input__biomaterial_core__biomaterial_id[j],n_lanes[j])}) + input_name_metadata_field = pd.DataFrame({"entity:lane_id":np.repeat("sequencing_input.biomaterial_core.biomaterial_id",n_lanes[j])}) + + column_names = ['entity:participant_lane_id', 'input_id', 'input_name','input_id_metadata_field','input_name_metadata_field', 'r1_fastq','r2_fastq', 'i1_fastq'] + lane_df = pd.concat([lane_id, - bundle_uuid, - sample_id, + input_id, + input_name, + input_id_metadata_field, + input_name_metadata_field, lane_fastq_r1, lane_fastq_r2, lane_fastq_i1 ], axis=1) lane_df.columns = column_names - participant_df = participant_df.append(lane_df) - participant_df.to_csv(output_file,sep="\t",index=None) + participant_df = participant_df.append(lane_df) + participant_lane_df = participant_df.dropna() + + participant_lane_df.to_csv(output_file,sep="\t",index=None) #print(out_df.shape,out_df.columns,out_df.r2_fastq) - particpant_set_df = participant_df[['sample_id','entity:participant_lane_id']] + particpant_set_df = participant_df[['input_id','entity:participant_lane_id']] particpant_set_df.columns = ['membership:participant_lane_set_id', 'participant_lane'] particpant_set_df.to_csv(output_set,sep="\t",index=None) + temp = df[['sequencing_process__provenance__document_id','sequencing_input__biomaterial_core__biomaterial_id']] + temp.columns = ['entity:participant_lane_set_id','input_name'] + temp.to_csv(output_entity,sep="\t",index=None) + + def main(): description = """This script converts the tsv file from HCA to data table to be used in terra. 
""" @@ -84,10 +98,16 @@ def main(): required=True, help="Sample set TSV to be uploaded to terra" ) + parser.add_argument( + "--output_entity", + dest="output_entity", + required=True, + help="Additional columns for the sample set TSV to be uploaded to terra" + ) args = parser.parse_args() #print(args.output_file) - create_output_files(args.input_file, args.output_file, args.output_set) + create_output_files(args.input_file, args.output_file, args.output_set,args.output_entity) if __name__ == "__main__": diff --git a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md index 97957b95a3..a2ea940923 100644 --- a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md +++ b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.changelog.md @@ -1,3 +1,9 @@ +# 2.1.1 + +2020-10-01 (Date of Last Commit) + +* Added checks for compressed fastq input files + # 2.1.0 2020-08-10 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl index 29492620f4..4dbbf0d4dd 100644 --- a/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl +++ b/pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl @@ -34,7 +34,7 @@ workflow MultiSampleSmartSeq2 { Boolean paired_end } # Version of this pipeline - String pipeline_version = "2.1.0" + String pipeline_version = "2.1.1" if (false) { String? 
none = "None" @@ -189,4 +189,4 @@ task checkInputArrays { disks: "local-disk 1 HDD" } -} \ No newline at end of file +} diff --git a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md index a3e381f022..be7a24f6a1 100644 --- a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md +++ b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.changelog.md @@ -1,3 +1,9 @@ +# 5.0.1 + +2020-10-01 (Date of Last Commit) + +* Added check to see if input fastq files are compressed in HISAT2.wdl task + # 5.0.0 2020-08-10 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl index fdde8e4a52..c4642f3f8a 100644 --- a/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl +++ b/pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl @@ -38,7 +38,7 @@ workflow SmartSeq2SingleCell { } # version of this pipeline - String pipeline_version = "5.0.0" + String pipeline_version = "5.0.1" parameter_meta { genome_ref_fasta: "Genome reference in fasta format" diff --git a/tasks/skylab/FastqToUBam.wdl b/tasks/skylab/FastqToUBam.wdl index f4e8f8153e..ff66c6dbba 100644 --- a/tasks/skylab/FastqToUBam.wdl +++ b/tasks/skylab/FastqToUBam.wdl @@ -4,7 +4,6 @@ task FastqToUBam { input { File fastq_file String input_id - String fastq_suffix = "" # runtime values String docker = "quay.io/humancellatlas/secondary-analysis-picard:v0.2.2-2.10.10" @@ -25,7 +24,6 @@ task FastqToUBam { parameter_meta { fastq_file: "input fastq file" input_id: "name of sample matching this file, inserted into read group header" - fastq_suffix: "a suffix to add to the fastq file; useful with mangled file IDs, since picard requires that the file end in .gz or it will not detect the gzipping." 
docker: "(optional) the docker image containing the runtime environment for this task" machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" @@ -34,16 +32,30 @@ task FastqToUBam { } command { + set -e - # Adds fastq_suffix if it is passed - if [ ! -z "~{fastq_suffix}" ]; - then - mv "~{fastq_file}" "~{fastq_file}""~{fastq_suffix}" + if (file ~{fastq_file} | grep -q compressed); then + if [[ ~{fastq_file} != *.gz ]]; then + if [[ ~{fastq_file} != *.fastq ]]; then + FQ="~{fastq_file}".fastq.gz + mv "~{fastq_file}" "~{fastq_file}".fastq.gz + else + FQ="~{fastq_file}".gz + mv "~{fastq_file}" "~{fastq_file}".gz + fi + else + FQ=~{fastq_file} + fi + elif [[ ~{fastq_file} != *.fastq ]]; then + FQ="~{fastq_file}".fastq + mv "~{fastq_file}" "~{fastq_file}".fastq + else + FQ="~{fastq_file}" fi java -Xmx~{command_mem_mb}m -jar /usr/picard/picard.jar FastqToSam \ - FASTQ="~{fastq_file}""~{fastq_suffix}" \ + FASTQ=$FQ \ SORT_ORDER=unsorted \ OUTPUT=bamfile.bam \ SAMPLE_NAME="~{input_id}" diff --git a/tasks/skylab/HISAT2.wdl b/tasks/skylab/HISAT2.wdl index a1f16645c2..f52fa711c0 100644 --- a/tasks/skylab/HISAT2.wdl +++ b/tasks/skylab/HISAT2.wdl @@ -43,18 +43,43 @@ task HISAT2PairedEnd { set -e - # fix names if necessary. 
- if [[ "${fastq1}" != *.fastq.gz ]]; then - FQ1=${fastq1}.fastq.gz - mv ${fastq1} ${fastq1}.fastq.gz + # fix names if necessary: make each input's extension (.fastq or .fastq.gz) match what file(1) reports for its content, and record the final path in FQ1 / FQ2 + if (file ~{fastq1} | grep -q compressed); then + if [[ ~{fastq1} != *.gz ]]; then + if [[ ~{fastq1} != *.fastq ]]; then + FQ1=~{fastq1}.fastq.gz + mv ~{fastq1} ~{fastq1}.fastq.gz + else + FQ1=~{fastq1}.gz + mv ~{fastq1} ~{fastq1}.gz + fi + else + FQ1=~{fastq1} + fi + elif [[ ~{fastq1} != *.fastq ]]; then + FQ1=~{fastq1}.fastq + mv ~{fastq1} ~{fastq1}.fastq else - FQ1=${fastq1} + FQ1=~{fastq1} fi - if [[ "${fastq2}" != *.fastq.gz ]]; then - FQ2=${fastq2}.fastq.gz - mv ${fastq2} ${fastq2}.fastq.gz + + if (file ~{fastq2} | grep -q compressed); then + if [[ ~{fastq2} != *.gz ]]; then + if [[ ~{fastq2} != *.fastq ]]; then + FQ2=~{fastq2}.fastq.gz + mv ~{fastq2} ~{fastq2}.fastq.gz + else + FQ2=~{fastq2}.gz + mv ~{fastq2} ~{fastq2}.gz + fi + else + FQ2=~{fastq2} + fi + elif [[ ~{fastq2} != *.fastq ]]; then + FQ2=~{fastq2}.fastq + mv ~{fastq2} ~{fastq2}.fastq else - FQ2=${fastq2} + FQ2=~{fastq2} fi tar --no-same-owner -xvf "${hisat2_ref}" @@ -133,21 +158,46 @@ task HISAT2RSEM { } command { + set -e - # fix names if necessary. 
- if [[ "${fastq1}" != *.fastq.gz ]]; then - FQ1=${fastq1}.fastq.gz - mv ${fastq1} ${fastq1}.fastq.gz + # fix names if necessary + if (file ~{fastq1} | grep -q compressed); then + if [[ ~{fastq1} != *.gz ]]; then + if [[ ~{fastq1} != *.fastq ]]; then + FQ1=~{fastq1}.fastq.gz + mv ~{fastq1} ~{fastq1}.fastq.gz + else + FQ1=~{fastq1}.gz + mv ~{fastq1} ~{fastq1}.gz + fi + else + FQ1="~{fastq1}" + fi + elif [[ ~{fastq1} != *.fastq ]]; then + FQ1=~{fastq1}.fastq + mv ~{fastq1} ~{fastq1}.fastq else - FQ1=${fastq1} + FQ1="~{fastq1}" fi - if [[ "${fastq2}" != *.fastq.gz ]]; then - FQ2=${fastq2}.fastq.gz - mv ${fastq2} ${fastq2}.fastq.gz + if (file ~{fastq2} | grep -q compressed); then + if [[ ~{fastq2} != *.gz ]]; then + if [[ ~{fastq2} != *.fastq ]]; then + FQ2=~{fastq2}.fastq.gz + mv ~{fastq2} ~{fastq2}.fastq.gz + else + FQ2=~{fastq2}.gz + mv ~{fastq2} ~{fastq2}.gz + fi + else + FQ2="~{fastq2}" + fi + elif [[ ~{fastq2} != *.fastq ]]; then + FQ2=~{fastq2}.fastq + mv ~{fastq2} ~{fastq2}.fastq else - FQ2=${fastq2} + FQ2="~{fastq2}" fi tar --no-same-owner -xvf "${hisat2_ref}" @@ -232,15 +282,25 @@ input { command { set -e - tar --no-same-owner -xvf "~{hisat2_ref}" - - # fix file names if necessary. - if [[ "~{fastq}" != *.fastq.gz ]]; then - FQ=~{fastq}.fastq.gz - mv ~{fastq} ~{fastq}.fastq.gz + if (file ~{fastq} | grep -q compressed); then + if [[ ~{fastq} != *.gz ]]; then + if [[ "~{fastq}" != *.fastq ]]; then + FQ=~{fastq}.fastq.gz + mv ~{fastq} ~{fastq}.fastq.gz + else + FQ=~{fastq}.gz + mv ~{fastq} ~{fastq}.gz + fi + else + FQ="~{fastq}" + fi + elif [[ "~{fastq}" != *.fastq ]]; then + FQ=~{fastq}.fastq + mv ~{fastq} ~{fastq}.fastq else - FQ=~{fastq} + FQ="~{fastq}" fi + tar --no-same-owner -xvf "~{hisat2_ref}" # The parameters for this task are copied from the HISAT2PairedEnd task. hisat2 -t \ @@ -358,15 +418,26 @@ task HISAT2RSEMSingleEnd { command { set -e - # fix names if necessary. 
- if [[ "${fastq}" != *.fastq.gz ]]; then - FQ=${fastq}.fastq.gz - mv ${fastq} ${fastq}.fastq.gz + if (file ~{fastq} | grep -q compressed); then + if [[ ~{fastq} != *.gz ]]; then + if [[ "~{fastq}" != *.fastq ]]; then + FQ=~{fastq}.fastq.gz + mv ~{fastq} ~{fastq}.fastq.gz + else + FQ=~{fastq}.gz + mv ~{fastq} ~{fastq}.gz + fi + else + FQ="~{fastq}" + fi + elif [[ "~{fastq}" != *.fastq ]]; then + FQ=~{fastq}.fastq + mv ~{fastq} ~{fastq}.fastq + else - FQ=${fastq} + FQ="~{fastq}" fi - tar --no-same-owner -xvf "${hisat2_ref}" # increase gap alignment penalty to avoid gap alignment diff --git a/tests/skylab/smartseq2_single_sample/pr/test_inputs.json b/tests/skylab/smartseq2_single_sample/pr/test_inputs.json index 82d2603160..a36ccbac97 100644 --- a/tests/skylab/smartseq2_single_sample/pr/test_inputs.json +++ b/tests/skylab/smartseq2_single_sample/pr/test_inputs.json @@ -8,8 +8,8 @@ "TestSmartSeq2SingleCellPR.gene_ref_flat": "gs://hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_gencode.v27.refFlat.txt", "TestSmartSeq2SingleCellPR.hisat2_ref_name": "genome_snp_tran", "TestSmartSeq2SingleCellPR.stranded":"NONE", - "TestSmartSeq2SingleCellPR.fastq1":"gs://hca-dcp-mint-test-data/smartseq2_single_sample/patel_ap/SRR1294925_1.fastq.gz", - "TestSmartSeq2SingleCellPR.fastq2":"gs://hca-dcp-mint-test-data/smartseq2_single_sample/patel_ap/SRR1294925_2.fastq.gz", + "TestSmartSeq2SingleCellPR.fastq1":"gs://hca-dcp-mint-test-data/smartseq2_single_sample/patel_ap/SRR1294925_1", + "TestSmartSeq2SingleCellPR.fastq2":"gs://hca-dcp-mint-test-data/smartseq2_single_sample/patel_ap/SRR1294925_2", "TestSmartSeq2SingleCellPR.input_id":"SRR1294925", "TestSmartSeq2SingleCellPR.output_name":"SRR1294925", "TestSmartSeq2SingleCellPR.expected_counts_hash": "135a3fbb959583db17713dc8b9d7fe33", From 1b6370a3bdcfc8adfdbf17683ad8c0bf120dd1b5 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Fri, 2 Oct 2020 14:24:53 -0400 Subject: [PATCH 3/3] Update Arrays.wdl to call the BafRegress task and store the 
metrics. Updated pipeline version and changelogs for Arrays.wdl and ValidateChip.wdl Added BafRegressMetricsFile as an output of Arrays.wdl Updated documentation to include BafRegress inputs and outputs. --- .../build_arrays_picard_private_docker.sh | 6 +-- ...d_arrays_picard_private_docker_version.tsv | 1 + .../arrays/single_sample/Arrays.changelog.md | 6 +++ .../single_sample/Arrays.documentation.md | 2 + .../broad/arrays/single_sample/Arrays.wdl | 19 +++++++- .../204126290052_R01C01_NA12878.json | 1 + .../validate_chip/ValidateChip.changelog.md | 6 +++ .../arrays/validate_chip/ValidateChip.wdl | 2 +- .../IlluminaGenotypingArray.documentation.md | 2 +- tasks/broad/InternalArraysTasks.wdl | 43 ++++++++++++++++--- 10 files changed, 76 insertions(+), 12 deletions(-) diff --git a/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker.sh b/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker.sh index ebb4aedfb8..04798c0789 100755 --- a/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker.sh +++ b/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker.sh @@ -4,12 +4,12 @@ set -e # Update DOCKER_IMAGE_VERSION after any substantial changes to the # image this builds. # -declare -r DOCKER_IMAGE_VERSION=4.0.9 +declare -r DOCKER_IMAGE_VERSION=4.0.10 # Update this when there is a new release of picard-private to use as the # default jar. # -declare -r PICARD_PRIVATE_VERSION=bca9362254e7cca14c79c1fd8833042a07f133d5 +declare -r PICARD_PRIVATE_VERSION=61af9bff4587783e5981a496f422ea36102482b5 declare -r ARTIFACTORY=https://broadinstitute.jfrog.io/broadinstitute declare -r LIBS_SNAPSHOT_LOCAL=$ARTIFACTORY/libs-snapshot-local @@ -109,7 +109,7 @@ function runDocker () { local -r gcr=us.gcr.io/$project/arrays-picard-private echo -e "$gcr:$tag\t$PICARD_PRIVATE_VERSION" >> ../build_arrays_picard_private_docker_version.tsv docker build $cache -t $gcr:$tag . 
- gcloud docker -- push $gcr:$tag + docker push $gcr:$tag } # Run docker login if cannot pull broadinstitute/dsde-toolbox. diff --git a/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker_version.tsv b/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker_version.tsv index dc5b53abad..5d5727328e 100644 --- a/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker_version.tsv +++ b/dockers/broad/arrays_picard_private/build_arrays_picard_private_docker_version.tsv @@ -9,3 +9,4 @@ us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.7-1568906063 5521430e92e60 us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.8-1591037419 a9453e24777629c29d70fe01911c5f764afbfe00 us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.8-1591647180 62ec51fa1c8dcee4efb2c60fcdb58c2e6efd6098 us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734 bca9362254e7cca14c79c1fd8833042a07f133d5 +us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912 61af9bff4587783e5981a496f422ea36102482b5 diff --git a/pipelines/broad/arrays/single_sample/Arrays.changelog.md b/pipelines/broad/arrays/single_sample/Arrays.changelog.md index d49412f79f..547bf0d914 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.changelog.md +++ b/pipelines/broad/arrays/single_sample/Arrays.changelog.md @@ -1,3 +1,9 @@ +# 2.3.0 +2020-10-07 + +* Added use of BafRegress to the pipeline. BafRegress detects and estimates sample contamination using B allele frequency data from Illumina genotyping arrays using a regression model. +* Updated all internal tasks to use the latest version of picard-private as best practice. 
+ # 2.2.0 2020-10-01 diff --git a/pipelines/broad/arrays/single_sample/Arrays.documentation.md b/pipelines/broad/arrays/single_sample/Arrays.documentation.md index a722337368..8782f469b4 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.documentation.md +++ b/pipelines/broad/arrays/single_sample/Arrays.documentation.md @@ -44,6 +44,7 @@ The Illumina Genotyping Array Pipeline takes the inputs described below. Inputs * Other * call_rate_threshold. A numeric value used for determining whether the pipeline reports this sample as passing or failing If the call rate calculated by the pipeline is greater than this value the sample is reported as passing. * genotype_concordance_threshold. A numeric value used for determining whether a sample with control data passes genotype_concordance. If the genotype concordance calculated by the pipeline is greater than this value the sample is reported as passing genotype concordance. + * minor_allele_frequency_file. The cloud path to a chip-specific text file containing locus-id to minor allele frequency used as an input to the BAFRegress tool used for calculating contamination. * contamination_controls_vcf. The cloud path to a VCF of samples run on this chip type to be used to supplement contamination calling. * subsampled_metrics_interval_list. The cloud path to the subsampled_metrics_interval_list. This file contains a list of sites that can be supplied to the pipeline to have it subset the output VCF and generate metrics specifically for those sites. * disk_size. The default disk size (in GiB) for cloud VMs spun up for the tasks in this pipeline. @@ -77,6 +78,7 @@ The pipeline generates a number of outputs. These are described here. * OutputVcfFile. The VCF generated by the pipeline * OutputVcfIndexFile. The index file of the VCF generated by the pipeline. * GTCFile. The GTC file generated by IlluminaGenotypingArray.Autocall +* BafRegressMetricsFile. 
A metrics file containing the metrics generated by BafRegress * ContaminationMetricsFile. A metrics file containing the metrics generated by VerifyIDIntensity * OutputFingerprintVcfFile. A VCF containing genotypes selected from the output_vcf at certain designated sites * OutputFingerprintVcfIndexFile. The index file of the output_fingerprint_vcf diff --git a/pipelines/broad/arrays/single_sample/Arrays.wdl b/pipelines/broad/arrays/single_sample/Arrays.wdl index 5c9a62d64b..25116f8ef2 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.wdl +++ b/pipelines/broad/arrays/single_sample/Arrays.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/InternalArraysTasks.wdl" as InternalTasks workflow Arrays { - String pipeline_version = "2.2.0" + String pipeline_version = "2.3.0" input { @@ -69,6 +69,9 @@ workflow Arrays { # For Contamination Checking File? contamination_controls_vcf + # For BAFRegress + File? minor_allele_frequency_file + # For HapMap GenotypeConcordance Check: File? control_sample_vcf_file File? 
control_sample_vcf_index_file @@ -146,6 +149,7 @@ workflow Arrays { variant_rsids_file = variant_rsids_file, subsampled_metrics_interval_list = subsampled_metrics_interval_list, contamination_controls_vcf = contamination_controls_vcf, + minor_allele_frequency_file = minor_allele_frequency_file, control_sample_vcf_file = control_sample_vcf_file, control_sample_vcf_index_file = control_sample_vcf_index_file, control_sample_intervals_file = control_sample_intervals_file, @@ -202,6 +206,17 @@ workflow Arrays { disk_size = disk_size, preemptible_tries = preemptible_tries } + + if (defined(IlluminaGenotypingArray.bafregress_results_file)) { + call InternalTasks.CreateBafRegressMetricsFile { + input: + input_file = select_first([IlluminaGenotypingArray.bafregress_results_file]), + output_metrics_basefilename = chip_well_barcode, + disk_size = disk_size, + preemptible_tries = preemptible_tries + } + } + call InternalTasks.UploadArraysMetrics { input: arrays_variant_calling_detail_metrics = select_first([IlluminaGenotypingArray.arrays_variant_calling_detail_metrics]), @@ -213,6 +228,7 @@ workflow Arrays { genotype_concordance_detail_metrics = IlluminaGenotypingArray.genotype_concordance_detail_metrics, genotype_concordance_contingency_metrics = IlluminaGenotypingArray.genotype_concordance_contingency_metrics, verify_id_metrics = IlluminaGenotypingArray.contamination_metrics, + bafregress_metrics = CreateBafRegressMetricsFile.output_metrics_file, disk_size = disk_size, preemptible_tries = preemptible_tries, authentication = authentication_block, @@ -256,6 +272,7 @@ workflow Arrays { File? OutputVcfMd5CloudPath = IlluminaGenotypingArray.output_vcf_md5_cloud_path File? OutputVcfFile = IlluminaGenotypingArray.output_vcf File? OutputVcfIndexFile = IlluminaGenotypingArray.output_vcf_index + File? BafRegressMetricsFile = CreateBafRegressMetricsFile.output_metrics_file File? ContaminationMetricsFile = IlluminaGenotypingArray.contamination_metrics File? 
OutputFingerprintVcfFile = IlluminaGenotypingArray.output_fingerprint_vcf File? OutputFingerprintVcfIndexFile = IlluminaGenotypingArray.output_fingerprint_vcf_index diff --git a/pipelines/broad/arrays/single_sample/test_inputs/Scientific/204126290052_R01C01_NA12878.json b/pipelines/broad/arrays/single_sample/test_inputs/Scientific/204126290052_R01C01_NA12878.json index 2c8899fdc2..26fcff9074 100644 --- a/pipelines/broad/arrays/single_sample/test_inputs/Scientific/204126290052_R01C01_NA12878.json +++ b/pipelines/broad/arrays/single_sample/test_inputs/Scientific/204126290052_R01C01_NA12878.json @@ -28,6 +28,7 @@ "Arrays.dbSNP_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz.tbi", "Arrays.haplotype_database_file": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.haplotype_database.txt", "Arrays.variant_rsids_file": "gs://broad-references-private/hg19/v0/Homo_sapiens_assembly19.haplotype_database.snps.list", + "Arrays.minor_allele_frequency_file": "gs://broad-gotc-test-storage/arrays/metadata/GDA-8v1-0_A5/GDA-8v1-0_A5.MAF.txt", "Arrays.preemptible_tries": 3, "Arrays.vault_token_path": "{VAULT_TOKEN_PATH}", "Arrays.environment": "{ENV}" diff --git a/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md b/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md index ad33322739..45991ca41e 100644 --- a/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md +++ b/pipelines/broad/arrays/validate_chip/ValidateChip.changelog.md @@ -1,3 +1,9 @@ +# 1.12.0 +2020-10-07 + +* Updated task definitions to include a new tool not currently used in ValidateChip wdl +* Updated all internal tasks to use the latest version of picard-private as best practice. 
+ # 1.11.0 2020-10-01 diff --git a/pipelines/broad/arrays/validate_chip/ValidateChip.wdl b/pipelines/broad/arrays/validate_chip/ValidateChip.wdl index 753cb89956..37498233c7 100644 --- a/pipelines/broad/arrays/validate_chip/ValidateChip.wdl +++ b/pipelines/broad/arrays/validate_chip/ValidateChip.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/InternalArraysTasks.wdl" as InternalTasks workflow ValidateChip { - String pipeline_version = "1.11.0" + String pipeline_version = "1.12.0" input { String sample_alias diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md index c999e65754..d3b11719e5 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.documentation.md @@ -190,7 +190,7 @@ The tables below summarize all of the workflow's output according to task. Outpu | chip_well_barcode.vcf.gz | VCF generated by the pipeline | Required | Compressed VCF (vcf.gz) | | chip_well_barcode.vcf.gz.tbi | Index file of the VCF generated by the pipeline | Required | tabix index (vcf.gz.tbi) | | chip_well_barcode.gtc | GTC file generated by Autocall | Required | GTC | -| chip_well_barcode.bafregress_metrics | Text output file generated by BafRegress | Optional | txt | +| chip_well_barcode.bafregress_results_file | Text output file generated by BafRegress | Optional | txt | | chip_well_barcode.verifyidintensity_metrics | File containing metrics generated by VerifyIDIntensity | Required | txt | | chip_well_barcode.arrays_variant_calling_detail_metrics | Detailed metrics file for the output VCF generated by CollectArraysVariantCallingMetrics.detail_metrics | Required | txt | | chip_well_barcode.arrays_variant_calling_summary_metrics | Summary metrics file for the output VCF as generated by CollectArraysVariantCallingMetrics | Required | txt | diff --git 
a/tasks/broad/InternalArraysTasks.wdl b/tasks/broad/InternalArraysTasks.wdl index 24eb10a4fb..2fc2d30b9d 100644 --- a/tasks/broad/InternalArraysTasks.wdl +++ b/tasks/broad/InternalArraysTasks.wdl @@ -38,7 +38,7 @@ task CreateExtendedIlluminaManifest { >>> runtime { - docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734" + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" disks: "local-disk " + disk_size + " HDD" memory: "14 GiB" preemptible: preemptible_tries @@ -78,7 +78,7 @@ task GenerateEmptyVariantCallingMetricsFile { >>> runtime { - docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734" + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" memory: "3.5 GiB" preemptible: preemptible_tries } @@ -118,7 +118,7 @@ task BlacklistBarcode { >>> runtime { - docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734" + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" memory: "3.5 GiB" preemptible: preemptible_tries } @@ -169,7 +169,7 @@ task VcfToMercuryFingerprintJson { >>> runtime { - docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734" + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" disks: "local-disk " + disk_size + " HDD" memory: "3.5 GiB" preemptible: preemptible_tries @@ -180,6 +180,34 @@ task VcfToMercuryFingerprintJson { } } + +task CreateBafRegressMetricsFile { + input { + File input_file + String output_metrics_basefilename + + Int disk_size + Int preemptible_tries + } + + command { + java -Xms2g -Dpicard.useLegacyParser=false -jar /usr/gitc/picard-private.jar \ + CreateBafRegressMetricsFile \ + --INPUT ~{input_file} \ + --OUTPUT ~{output_metrics_basefilename} + } + runtime { + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" + disks: "local-disk " + disk_size + " HDD" + memory: "3.5 GiB" + preemptible: preemptible_tries + } + + 
output { + File output_metrics_file = "~{output_metrics_basefilename}.bafregress_metrics" + } +} + task UploadArraysMetrics { input { File arrays_variant_calling_detail_metrics @@ -191,6 +219,7 @@ task UploadArraysMetrics { File? genotype_concordance_detail_metrics File? genotype_concordance_contingency_metrics File? verify_id_metrics + File? bafregress_metrics Array[String] authentication String service_account_filename @@ -217,6 +246,8 @@ task UploadArraysMetrics { cp ~{genotype_concordance_contingency_metrics} metrics_upload_dir ! [ -z ~{verify_id_metrics} ] && cp ~{verify_id_metrics} metrics_upload_dir + ! [ -z ~{bafregress_metrics} ] && + cp ~{bafregress_metrics} metrics_upload_dir ! [ -z ~{fingerprinting_detail_metrics} ] && cp ~{fingerprinting_detail_metrics} metrics_upload_dir @@ -239,7 +270,7 @@ task UploadArraysMetrics { >>> runtime { - docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734" + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" disks: "local-disk " + disk_size + " HDD" memory: "3.5 GiB" preemptible: preemptible_tries @@ -274,7 +305,7 @@ task UpdateChipWellBarcodeIndex { >>> runtime { - docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734" + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912" disks: "local-disk " + disk_size + " HDD" memory: "3.5 GiB" preemptible: preemptible_tries