Merge pull request #102 from broadinstitute/staging
Staging -> Master
gbggrant authored Oct 9, 2020
2 parents 3e08cd2 + 0677398 commit e5230d6
Showing 31 changed files with 276 additions and 96 deletions.
@@ -4,12 +4,12 @@ set -e
# Update DOCKER_IMAGE_VERSION after any substantial changes to the
# image this builds.
#
-declare -r DOCKER_IMAGE_VERSION=4.0.9
+declare -r DOCKER_IMAGE_VERSION=4.0.10

# Update this when there is a new release of picard-private to use as the
# default jar.
#
-declare -r PICARD_PRIVATE_VERSION=bca9362254e7cca14c79c1fd8833042a07f133d5
+declare -r PICARD_PRIVATE_VERSION=61af9bff4587783e5981a496f422ea36102482b5

declare -r ARTIFACTORY=https://broadinstitute.jfrog.io/broadinstitute
declare -r LIBS_SNAPSHOT_LOCAL=$ARTIFACTORY/libs-snapshot-local
@@ -109,7 +109,7 @@ function runDocker () {
local -r gcr=us.gcr.io/$project/arrays-picard-private
echo -e "$gcr:$tag\t$PICARD_PRIVATE_VERSION" >> ../build_arrays_picard_private_docker_version.tsv
docker build $cache -t $gcr:$tag .
-gcloud docker -- push $gcr:$tag
+docker push $gcr:$tag
}

# Run docker login if we cannot pull broadinstitute/dsde-toolbox.
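A note on the push change above: `gcloud docker -- push` has been deprecated in newer Cloud SDK releases, and a plain `docker push` to us.gcr.io assumes the Docker client has already been authorized for Google Container Registry, typically by running `gcloud auth configure-docker` once so that gcloud acts as a Docker credential helper.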
@@ -9,3 +9,4 @@ us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.7-1568906063 5521430e92e60
us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.8-1591037419 a9453e24777629c29d70fe01911c5f764afbfe00
us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.8-1591647180 62ec51fa1c8dcee4efb2c60fcdb58c2e6efd6098
us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.9-1593023734 bca9362254e7cca14c79c1fd8833042a07f133d5
+us.gcr.io/broad-arrays-prod/arrays-picard-private:4.0.10-1602016912 61af9bff4587783e5981a496f422ea36102482b5
1 change: 0 additions & 1 deletion dockers/skylab/hisat2/Dockerfile
@@ -60,4 +60,3 @@ RUN \
cd gffread && \
make && \
cp gffread /usr/local/bin/
-WORKDIR /opt/tools
6 changes: 6 additions & 0 deletions pipelines/broad/arrays/single_sample/Arrays.changelog.md
@@ -1,3 +1,9 @@
# 2.3.0
2020-10-07

* Added BafRegress to the pipeline. BafRegress detects and estimates sample contamination from the B allele frequency data of Illumina genotyping arrays using a regression model (sketched below).
* Updated all internal tasks to use the latest version of picard-private as best practice.
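
For readers unfamiliar with the tool, the regression idea is roughly the following (our paraphrase of the published BafRegress approach, not text or code from this repository). At a site where the sample is called homozygous, a contamination fraction $\alpha$ pulls the observed B allele frequency (BAF) away from 0 or 1 in proportion to the population allele frequency $p_i$:

$$E[\mathrm{BAF}_i \mid \text{hom ref}] \approx \alpha\,p_i, \qquad E[\mathrm{BAF}_i \mid \text{hom alt}] \approx 1 - \alpha\,(1 - p_i)$$

so regressing the observed BAFs at homozygous calls on the per-locus allele frequencies (supplied to the pipeline through the new minor_allele_frequency_file input) yields a slope that estimates $\alpha$.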

# 2.2.0
2020-10-01

2 changes: 2 additions & 0 deletions pipelines/broad/arrays/single_sample/Arrays.documentation.md
@@ -44,6 +44,7 @@ The Illumina Genotyping Array Pipeline takes the inputs described below.
* Other
* call_rate_threshold. A numeric value used to determine whether the pipeline reports this sample as passing or failing. If the call rate calculated by the pipeline is greater than this value, the sample is reported as passing (see the sketch after this list).
* genotype_concordance_threshold. A numeric value used to determine whether a sample with control data passes genotype concordance. If the genotype concordance calculated by the pipeline is greater than this value, the sample is reported as passing genotype concordance.
* minor_allele_frequency_file. The cloud path to a chip-specific text file that maps locus IDs to minor allele frequencies; it is used as an input to the BAFRegress tool for calculating contamination.
* contamination_controls_vcf. The cloud path to a VCF of samples run on this chip type to be used to supplement contamination calling.
* subsampled_metrics_interval_list. The cloud path to the subsampled_metrics_interval_list. This file contains a list of sites that can be supplied to the pipeline to have it subset the output VCF and generate metrics specifically for those sites.
* disk_size. The default disk size (in GiB) for cloud VMs spun up for the tasks in this pipeline.
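
To make the pass/fail logic above concrete, here is a minimal WDL-style sketch; the variable names are hypothetical, and the real comparisons happen inside the pipeline's metrics tasks rather than as bare declarations like this:

```wdl
# Hypothetical names, for illustration only.
Boolean passes_call_rate = call_rate > call_rate_threshold
Boolean passes_concordance = genotype_concordance > genotype_concordance_threshold
String call_rate_status = if passes_call_rate then "PASS" else "FAIL"
```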
@@ -77,6 +78,7 @@ The pipeline generates a number of outputs. These are described here.
* OutputVcfFile. The VCF generated by the pipeline.
* OutputVcfIndexFile. The index file of the VCF generated by the pipeline.
* GTCFile. The GTC file generated by IlluminaGenotypingArray.Autocall.
* BafRegressMetricsFile. A metrics file containing the metrics generated by BafRegress.
* ContaminationMetricsFile. A metrics file containing the metrics generated by VerifyIDIntensity.
* OutputFingerprintVcfFile. A VCF containing genotypes selected from the output_vcf at certain designated sites.
* OutputFingerprintVcfIndexFile. The index file of the output_fingerprint_vcf.
19 changes: 18 additions & 1 deletion pipelines/broad/arrays/single_sample/Arrays.wdl
@@ -21,7 +21,7 @@ import "../../../../tasks/broad/InternalArraysTasks.wdl" as InternalTasks
workflow Arrays {

String pipeline_version = "2.2.0"
String pipeline_version = "2.3.0"

input {

@@ -69,6 +69,9 @@ workflow Arrays {
# For Contamination Checking
File? contamination_controls_vcf

# For BAFRegress
File? minor_allele_frequency_file

# For HapMap GenotypeConcordance Check:
File? control_sample_vcf_file
File? control_sample_vcf_index_file
@@ -146,6 +149,7 @@ workflow Arrays {
variant_rsids_file = variant_rsids_file,
subsampled_metrics_interval_list = subsampled_metrics_interval_list,
contamination_controls_vcf = contamination_controls_vcf,
minor_allele_frequency_file = minor_allele_frequency_file,
control_sample_vcf_file = control_sample_vcf_file,
control_sample_vcf_index_file = control_sample_vcf_index_file,
control_sample_intervals_file = control_sample_intervals_file,
@@ -202,6 +206,17 @@ workflow Arrays {
disk_size = disk_size,
preemptible_tries = preemptible_tries
}

if (defined(IlluminaGenotypingArray.bafregress_results_file)) {
call InternalTasks.CreateBafRegressMetricsFile {
input:
input_file = select_first([IlluminaGenotypingArray.bafregress_results_file]),
output_metrics_basefilename = chip_well_barcode,
disk_size = disk_size,
preemptible_tries = preemptible_tries
}
}

call InternalTasks.UploadArraysMetrics {
input:
arrays_variant_calling_detail_metrics = select_first([IlluminaGenotypingArray.arrays_variant_calling_detail_metrics]),
@@ -213,6 +228,7 @@
genotype_concordance_detail_metrics = IlluminaGenotypingArray.genotype_concordance_detail_metrics,
genotype_concordance_contingency_metrics = IlluminaGenotypingArray.genotype_concordance_contingency_metrics,
verify_id_metrics = IlluminaGenotypingArray.contamination_metrics,
bafregress_metrics = CreateBafRegressMetricsFile.output_metrics_file,
disk_size = disk_size,
preemptible_tries = preemptible_tries,
authentication = authentication_block,
@@ -256,6 +272,7 @@ workflow Arrays {
File? OutputVcfMd5CloudPath = IlluminaGenotypingArray.output_vcf_md5_cloud_path
File? OutputVcfFile = IlluminaGenotypingArray.output_vcf
File? OutputVcfIndexFile = IlluminaGenotypingArray.output_vcf_index
File? BafRegressMetricsFile = CreateBafRegressMetricsFile.output_metrics_file
File? ContaminationMetricsFile = IlluminaGenotypingArray.contamination_metrics
File? OutputFingerprintVcfFile = IlluminaGenotypingArray.output_fingerprint_vcf
File? OutputFingerprintVcfIndexFile = IlluminaGenotypingArray.output_fingerprint_vcf_index
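The new CreateBafRegressMetricsFile block above relies on a standard WDL idiom that is worth spelling out: wrapping a call in `if (defined(...))` makes the call conditional, `select_first([...])` coerces the optional `File?` into the `File` the task expects, and any output of the conditional call referenced outside the block becomes optional (hence the `File? BafRegressMetricsFile` workflow output). A stripped-down sketch of the same pattern, with hypothetical task and variable names rather than code from this repository:

```wdl
version 1.0

workflow ConditionalMetrics {
  input {
    File? maybe_results   # produced only when an upstream step ran
  }

  # The call is skipped entirely when the optional input is undefined.
  if (defined(maybe_results)) {
    call Summarize {
      input:
        results = select_first([maybe_results])   # File? -> File
    }
  }

  output {
    # Outputs of a conditional call are optional outside the block.
    File? summary = Summarize.summary
  }
}

task Summarize {
  input {
    File results
  }
  command <<<
    wc -l ~{results} > summary.txt
  >>>
  output {
    File summary = "summary.txt"
  }
}
```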
@@ -28,6 +28,7 @@
"Arrays.dbSNP_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz.tbi",
"Arrays.haplotype_database_file": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.haplotype_database.txt",
"Arrays.variant_rsids_file": "gs://broad-references-private/hg19/v0/Homo_sapiens_assembly19.haplotype_database.snps.list",
"Arrays.minor_allele_frequency_file": "gs://broad-gotc-test-storage/arrays/metadata/GDA-8v1-0_A5/GDA-8v1-0_A5.MAF.txt",
"Arrays.preemptible_tries": 3,
"Arrays.vault_token_path": "{VAULT_TOKEN_PATH}",
"Arrays.environment": "{ENV}"
@@ -1,3 +1,9 @@
# 1.12.0
2020-10-07

* Updated task definitions to include a new tool that is not yet used in the ValidateChip WDL
* Updated all internal tasks to use the latest version of picard-private as best practice.

# 1.11.0
2020-10-01

2 changes: 1 addition & 1 deletion pipelines/broad/arrays/validate_chip/ValidateChip.wdl
@@ -21,7 +21,7 @@ import "../../../../tasks/broad/InternalArraysTasks.wdl" as InternalTasks
workflow ValidateChip {

String pipeline_version = "1.11.0"
String pipeline_version = "1.12.0"

input {
String sample_alias
@@ -1,6 +1,6 @@
| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
-| [ExomeGermlineSingleSample_v2.0](ExomeGermlineSingleSample.wdl) | June 10, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in dsde-pipelines or contact [Kylee Degatano](mailto:[email protected]) |
+| [ExomeGermlineSingleSample_v2.0](https://github.com/broadinstitute/warp/releases) | June 10, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in dsde-pipelines or contact [Kylee Degatano](mailto:[email protected]) |

# Table of Contents
- [Introduction to the Exome Germline Single Sample Pipeline](#introduction-to-the-exome-germline-single-sample-pipeline)
@@ -1,6 +1,6 @@
| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
-| [WholeGenomeGermlineSingleSample_v2.0](WholeGenomeGermlineSingleSample.wdl) | June 22, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in WARP or contact [Kylee Degatano](mailto:[email protected]) |
+| [WholeGenomeGermlineSingleSample_v2.0](https://github.com/broadinstitute/warp/releases) | June 22, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in WARP or contact [Kylee Degatano](mailto:[email protected]) |

# Introduction to the Whole Genome Germline Single Sample Pipeline

@@ -1,10 +1,10 @@
| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
-| [Version 1.11.0](IlluminaGenotypingArray.wdl) | Oct 1, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:[email protected]) |
+| [Version 1.11.0](https://github.com/broadinstitute/warp/releases) | Oct 1, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:[email protected]) |

# Table of Contents
- [Illumina Genotyping Array Pipeline Overview](#illumina-genotyping-array-pipeline-overview)
-- [Introduction to the Illumina Genotyping Array] Pipeline](#introduction-to-the-illumina-genotyping-array-pipeline)
+- [Introduction to the Illumina Genotyping Array Pipeline](#introduction-to-the-illumina-genotyping-array-pipeline)
- [Set-up](#set-up)
* [Workflow Installation and Requirements](#workflow-installation-and-requirements)
* [Inputs](#inputs)
@@ -190,7 +190,7 @@ The tables below summarize all of the workflow's output according to task.
| chip_well_barcode.vcf.gz | VCF generated by the pipeline | Required | Compressed VCF (vcf.gz) |
| chip_well_barcode.vcf.gz.tbi | Index file of the VCF generated by the pipeline | Required | tabix index (vcf.gz.tbi) |
| chip_well_barcode.gtc | GTC file generated by Autocall | Required | GTC |
-| chip_well_barcode.bafregress_metrics | Text output file generated by BafRegress | Optional | txt |
+| chip_well_barcode.bafregress_results_file | Text output file generated by BafRegress | Optional | txt |
| chip_well_barcode.verifyidintensity_metrics | File containing metrics generated by VerifyIDIntensity | Required | txt |
| chip_well_barcode.arrays_variant_calling_detail_metrics | Detailed metrics file for the output VCF generated by CollectArraysVariantCallingMetrics.detail_metrics | Required | txt |
| chip_well_barcode.arrays_variant_calling_summary_metrics | Summary metrics file for the output VCF as generated by CollectArraysVariantCallingMetrics | Required | txt |
4 changes: 2 additions & 2 deletions pipelines/cemba/cemba_methylcseq/CEMBA.methods.md
@@ -1,9 +1,9 @@
-# CEMBA_v1.0 Publication Methods
+# CEMBA_v1.0.0 Publication Methods

Below we provide a sample methods section for a publication. For the complete pipeline documentation, see the [CEMBA README](README.md).

### Methods

Data processing was performed with the CEMBA v1.0 Pipeline. Sequencing reads were first trimmed to remove adaptors using Cutadapt 1.18 with the following parameters in paired-end mode: -f fastq -quality-cutoff 20 -minimum-length 62 -a AGATCGGAAGAGCACACGTCTGAAC -A AGATCGGAAGAGCGTCGTGTAGGGA. After trimming the adapters, an unaligned BAM (uBAM) for the trimmed R1 FASTQ was created using Picard v2.18.23. Cell barcodes were then extracted from the trimmed R1 FASTQ and tagged to the R1 uBAM with Single Cell Tools (sctools) v0.3.4a using a barcode whitelist as well as configurable barcode start positions and lengths. Next, for multiplexed samples, the random primer index sequence and Adaptase C/T tail were further removed from the adaptor-trimmed R1 and R2 FASTQs using Cutadapt with the following parameters: -f fastq -quality-cutoff 16 -quality-cutoff -16 -minimum-length 30. The trimmed R1 and R2 reads were then aligned to mouse (mm10) or human (hg19) genomes separately as single-end reads using Bismark v0.21.0 with the parameters --bowtie2 --icpc --X 2000 (paired-end mode) and --pbat (activated for mapping R1 reads). After alignment, the output R1 and R2 BAMs were sorted in coordinate order and duplicates removed using the Picard MarkDuplicates REMOVE_DUPLICATE option. Samtools 1.9 was used to further filter BAMs with a minimum map quality of 30 using the parameter -bhq 30. Methylation reports were produced for the filtered BAMs using Bismark. The barcodes from the R1 uBAM were then attached to the aligned, filtered R1 BAM with Picard. The R1 and R2 BAMs were merged with Samtools. Readnames were added to the merged BAM and a methylated VCF created using MethylationTypeCaller in GATK 4.1.2.0. Samtools was then used to calculate coverage depth for sites with coverage greater than 1 and to create BAM index files. The final outputs included the barcoded aligned BAM, BAM index, a VCF with locus-specific methylation information, VCF index, and methylation reports.
Data processing was performed with the CEMBA v1.0.0 Pipeline. Sequencing reads were first trimmed to remove adaptors using Cutadapt 1.18 with the following parameters in paired-end mode: -f fastq -quality-cutoff 20 -minimum-length 62 -a AGATCGGAAGAGCACACGTCTGAAC -A AGATCGGAAGAGCGTCGTGTAGGGA. After trimming the adapters, an unaligned BAM (uBAM) for the trimmed R1 FASTQ was created using Picard v2.18.23. Cell barcodes were then extracted from the trimmed R1 FASTQ and tagged to the R1 uBAM with Single Cell Tools (sctools) v0.3.4a using a barcode whitelist as well as configurable barcode start positions and lengths. Next, for multiplexed samples, the random primer index sequence and Adaptase C/T tail were further removed from the adaptor-trimmed R1 and R2 FASTQs using Cutadapt with the following parameters: -f fastq -quality-cutoff 16 -quality-cutoff -16 -minimum-length 30. The trimmed R1 and R2 reads were then aligned to mouse (mm10) or human (hg19) genomes separately as single-end reads using Bismark v0.21.0 with the parameters --bowtie2 --icpc --X 2000 (paired-end mode) and --pbat (activated for mapping R1 reads). After alignment, the output R1 and R2 BAMs were sorted in coordinate order and duplicates removed using the Picard MarkDuplicates REMOVE_DUPLICATE option. Samtools 1.9 was used to further filter BAMs with a minimum map quality of 30 using the parameter -bhq 30. Methylation reports were produced for the filtered BAMs using Bismark. The barcodes from the R1 uBAM were then attached to the aligned, filtered R1 BAM with Picard. The R1 and R2 BAMs were merged with Samtools. Readnames were added to the merged BAM and a methylated VCF created using MethylationTypeCaller in GATK 4.1.2.0. Samtools was then used to calculate coverage depth for sites with coverage greater than 1 and to create BAM index files. The final outputs included the barcoded aligned BAM, BAM index, a VCF with locus-specific methylation information, VCF index, and methylation reports.
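
As a rough sketch of how the first trimming step above might be wrapped in a WDL task in the style of this repository (task and file names are hypothetical, and the single-dash long options quoted in the methods text are assumed to be Cutadapt's usual double-dash spellings):

```wdl
task TrimAdapters {
  input {
    File fastq_r1
    File fastq_r2
  }
  command <<<
    # Paired-end adapter trimming with the parameters quoted in the methods text.
    cutadapt -f fastq --quality-cutoff 20 --minimum-length 62 \
      -a AGATCGGAAGAGCACACGTCTGAAC \
      -A AGATCGGAAGAGCGTCGTGTAGGGA \
      -o r1.trimmed.fastq.gz -p r2.trimmed.fastq.gz \
      ~{fastq_r1} ~{fastq_r2}
  >>>
  output {
    File trimmed_r1 = "r1.trimmed.fastq.gz"
    File trimmed_r2 = "r2.trimmed.fastq.gz"
  }
}
```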

An example of the pipeline and its outputs is available on Terra (https://app.terra.bio/#workspaces/brain-initiative-bcdc/Methyl-c-seq_Pipeline). Examples of genomic reference files and other inputs can be found in the pipeline’s [example JSON](example_inputs/CEMBA.inputs.json).
2 changes: 1 addition & 1 deletion pipelines/cemba/cemba_methylcseq/README.md
@@ -1,6 +1,6 @@
| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
-| [CEMBA_v1.0](CEMBA.wdl) | July 28, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:[email protected]) |
+| [CEMBA_v1.0.0](https://github.com/broadinstitute/warp/releases) | July 28, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:[email protected]) |

# Table of Contents

10 changes: 8 additions & 2 deletions pipelines/skylab/optimus/Optimus.changelog.md
@@ -1,10 +1,17 @@
# 4.1.1

2020-10-07 (Date of Last Commit)

* Removed an extra trailing slash from the output directory of the cloud-to-cloud copy job

* Removed the optional fastq_suffix input; the pipeline now dynamically determines whether a file is gzipped (one way to do this is sketched below)
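
One simple way to detect gzip without trusting a filename suffix is to check the two-byte gzip magic number (0x1f 0x8b) at the start of the file. The WDL task below is an illustrative sketch of that idea, not the actual FastqProcessing implementation:

```wdl
task GuessCompression {
  input {
    File fastq
  }
  command <<<
    # gzip streams always begin with the magic bytes 1f 8b.
    if [ "$(od -An -tx1 -N2 '~{fastq}' | tr -d ' \n')" = "1f8b" ]; then
      echo true
    else
      echo false
    fi
  >>>
  output {
    Boolean is_gzipped = read_boolean(stdout())
  }
}
```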

# 4.1.0

2020-10-05 (Date of Last Commit)

* Updated sctools dockers and made them consistent across the Optimus pipeline


# 4.0.2

2020-09-30 (Date of Last Commit)
@@ -18,7 +25,6 @@

* Refactored the pipeline to preprocess fastqs using the task `FastqProcessing`. Outputs are identical and the pipeline should be significantly faster


# 4.0.0

2020-08-10 (Date of Last Commit)
8 changes: 2 additions & 6 deletions pipelines/skylab/optimus/Optimus.wdl
@@ -42,9 +42,6 @@ workflow Optimus {
# tenX_v2, tenX_v3
String chemistry = "tenX_v2"

# environment-specific parameters
String fastq_suffix = ""

# Emptydrops lower cutoff
Int emptydrops_lower = 100

Expand All @@ -58,7 +55,8 @@ workflow Optimus {
}

# version of this pipeline
String pipeline_version = "4.1.0"
String pipeline_version = "4.1.1"

# this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
Array[Int] indices = range(length(r1_fastq))
@@ -73,8 +71,6 @@
ref_genome_fasta: "genome fasta file (must match star reference)"
whitelist: "10x genomics cell barcode whitelist"
tenX_v3_chemistry: "assume 10X Genomics v3 chemistry with 12bp UMI (in contrast to default v2 with 10bp UMI)"
fastq_suffix: "when running in green box, need to add '.gz' for picard to detect the compression"
output_zarr: "whether to run the task that converts the outputs to Zarr format, by default it's true"
force_no_check: "Set to true to override input checks and allow pipeline to proceed with invalid input"
}

2 changes: 1 addition & 1 deletion pipelines/skylab/optimus/README.md
@@ -1,6 +1,6 @@
| Pipeline Version | Date Updated | Documentation Author | Questions or Feedback |
| :----: | :---: | :----: | :--------------: |
-| [optimus_v4.0.1](https://github.com/broadinstitute/warp/releases/tag/Optimus_v4.0.1) | September 14, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:[email protected]) |
+| [optimus_v4.0.2](https://github.com/broadinstitute/warp/releases) | September 14, 2020 | [Elizabeth Kiernan](mailto:[email protected]) | Please file GitHub issues in warp or contact [Kylee Degatano](mailto:[email protected]) |


# Table of Contents
(The remaining changed files in this commit are not shown here.)
