From b7e44b18af900c17ce9b0c88097269fc0a3401a0 Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 6 Jan 2025 17:07:20 -0500 Subject: [PATCH 01/31] Testing framework. --- .dockstore.yml | 8 + .../wdl/test/GvsQuickstartVATIntegration.wdl | 243 ++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 14de032bf53..93da6e43e0d 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -319,6 +319,14 @@ workflows: - vs_1516_yolo tags: - /.*/ + - name: GvsQuickstartVATIntegration + subclass: WDL + primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl + filters: + branches: + - gg_VS-1549_AddVATToIntegrationTests + tags: + - /.*/ - name: GvsIngestTieout subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsIngestTieout.wdl diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl new file mode 100644 index 00000000000..0d7d4078492 --- /dev/null +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -0,0 +1,243 @@ +version 1.0 + +import "../GvsUtils.wdl" as Utils +import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS + +workflow GvsQuickstartVcfIntegration { + input { + String git_branch_or_tag + String? git_hash + String expected_output_prefix + String dataset_suffix + Boolean use_default_dockers = false + Boolean check_expected_cost_and_table_size_outputs = true + String? basic_docker + String? cloud_sdk_docker + String? cloud_sdk_slim_docker + String? variants_docker + String? variants_nirvana_docker + String? gatk_docker + File? gatk_override + String sample_id_column_name + String vcf_files_column_name + String vcf_index_files_column_name + String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time + + String? workspace_bucket + String? 
workspace_id + String? submission_id + + File? target_interval_list + Int? maximum_alternate_alleles + } + String project_id = "gvs-internal" + File reference_fasta = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" + + # WDL 1.0 trick to set a variable ('none') to be undefined. + if (false) { + File? none = "" + } + + call Utils.GetToolVersions { + input: + git_branch_or_tag = git_branch_or_tag, + } + + String effective_basic_docker = select_first([basic_docker, GetToolVersions.basic_docker]) + String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker]) + String effective_cloud_sdk_slim_docker = select_first([cloud_sdk_slim_docker, GetToolVersions.cloud_sdk_slim_docker]) + String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) + String effective_variants_nirvana_docker = select_first([variants_nirvana_docker, GetToolVersions.variants_nirvana_docker]) + String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker]) + String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash]) + + String effective_workspace_bucket = select_first([workspace_bucket, GetToolVersions.workspace_bucket]) + String effective_workspace_id = select_first([workspace_id, GetToolVersions.workspace_id]) + String effective_submission_id = select_first([submission_id, GetToolVersions.submission_id]) + + if (!use_default_dockers && !defined(gatk_override)) { + call Utils.BuildGATKJar { + input: + git_branch_or_tag = git_branch_or_tag, + cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, + } + } + + call Utils.CreateDatasetForTest { + input: + git_branch_or_tag = git_branch_or_tag, + dataset_prefix = "quickit", + dataset_suffix = dataset_suffix, + cloud_sdk_docker = effective_cloud_sdk_docker, + } + + String extract_output_gcs_dir = 
"~{effective_workspace_bucket}/output_vcfs/by_submission_id/~{effective_submission_id}/~{dataset_suffix}" + + call CreateVATFromVDS.GvsCreateVATfromVDS as CreateVATfromVDS { + input: + project_id = project_id, + dataset_name = CreateDatasetForTest.dataset_name, + ancestry_file = "todo", + filter_set_name = "quickit", + vds_path = "todo", + output_path = "todo", + + git_branch_or_tag = git_branch_or_tag, + basic_docker = effective_basic_docker, + cloud_sdk_docker = effective_cloud_sdk_docker, + gatk_docker = effective_gatk_docker, + variants_docker = effective_variants_docker, + variants_nirvana_docker = effective_variants_nirvana_docker, + } + + String expected_prefix = expected_output_prefix + dataset_suffix + "/" +# call AssertIdenticalOutputs { +# input: +# expected_output_prefix = expected_prefix, +# expected_output_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz", +# actual_vcfs = JointVariantCalling.output_vcfs, +# gatk_docker = effective_gatk_docker +# } +# +# if (check_expected_cost_and_table_size_outputs) { +# call AssertCostIsTrackedAndExpected { +# input: +# go = JointVariantCalling.done, +# dataset_name = CreateDatasetForTest.dataset_name, +# project_id = project_id, +# expected_output_csv = expected_prefix + "cost_observability.csv", +# cloud_sdk_docker = effective_cloud_sdk_docker, +# } +# +# call AssertTableSizesAreExpected { +# input: +# go = JointVariantCalling.done, +# dataset_name = CreateDatasetForTest.dataset_name, +# project_id = project_id, +# expected_output_csv = expected_prefix + "table_sizes.csv", +# cloud_sdk_docker = effective_cloud_sdk_docker, +# } +# } + + + output { +# Array[File] output_vcfs = JointVariantCalling.output_vcfs +# Array[File] output_vcf_indexes = JointVariantCalling.output_vcf_indexes +# Float total_vcfs_size_mb = JointVariantCalling.total_vcfs_size_mb +# File manifest = JointVariantCalling.manifest + String dataset_name = CreateDatasetForTest.dataset_name + String filter_set_name = "quickit" + String 
recorded_git_hash = effective_git_hash + Boolean done = true +# Boolean used_tighter_gcp_quotas = JointVariantCalling.used_tighter_gcp_quotas + } +} + +task AssertIdenticalOutputs { + input { + String expected_output_prefix + String expected_output_suffix + Array[File] actual_vcfs + String gatk_docker + } + parameter_meta { + actual_vcfs: { + localization_optional: true + } + } + command <<< + # Prepend date, time and pwd to xtrace log entries. + PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + failures=() + + # Where the current set of expected results lives in the cloud + expected_prefix="~{expected_output_prefix}" + # Remove a trailing slash if there is one + expected_prefix=${expected_prefix%/} + + # Download all the expected data + mkdir expected + cd expected + gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' . + gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix} + cd .. + + mkdir actual + cd actual + touch actual_manifest.txt + # Making the manifest is pretty uninteresting and very noisy so turn off xtrace temporarily. + set +o xtrace + for actual in ~{sep=' ' actual_vcfs} + do + echo $actual >> actual_manifest.txt + done + set -o xtrace + + cat actual_manifest.txt | gcloud storage cp -I . + # Unzip actual result data. + ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d + cd .. + + echo "Header Check" + # Headers first, these can yield useful diagnostics when there are mismatches. + for vcf in $(ls -1 actual | grep -E '\.vcf$') + do + actual="actual/$vcf" + expected="expected/$vcf" + set +o errexit + cmp <(grep '^#' $actual | grep -E -v '^##GATKCommandLine=') <(grep '^#' $expected | grep -E -v '^##GATKCommandLine=') + rc=$? + set -o errexit + if [[ $rc -ne 0 ]]; then + # If there is a mismatch add it to a list of failures but keep on looking for mismatches. 
+ failures+=( $vcf ) + fi + done + + echo "Header Failure Check" + if [[ ${#failures[@]} -ne 0 ]]; then + echo "Error: headers for the following files do not match:" + for failure in ${failures[@]}; do + echo $failure + expected="expected/$failure" + actual="actual/$failure" + diff <(grep '^#' $actual) <(grep '^#' $expected) + done + exit 1 + fi + + echo "Overall Check" + # If the headers all matched look for any mismatches in overall file content. + fail=0 + for vcf in $(ls -1 actual | grep -E '\.vcf$') + do + expected="expected/$vcf" + actual="actual/$vcf" + set +o errexit + cmp <(grep -E -v '^##GATKCommandLine=' $actual) <(grep -E -v '^##GATKCommandLine=' $expected) + rc=$? + set -o errexit + if [[ $rc -ne 0 ]]; then + echo "Error: file contents of expected and actual do not match: $vcf" + fail=1 + fi + done + + if [[ $fail -ne 0 ]]; then + exit 1 + fi + + echo "All vcfs compared and matched!" + >>> + + runtime { + docker: gatk_docker + disks: "local-disk 500 HDD" + } + + output { + Boolean done = true + } +} From db988310ae4e994b9b26fa8c6860d10746ef4d8b Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 6 Jan 2025 17:15:44 -0500 Subject: [PATCH 02/31] A little clean up --- .../wdl/test/GvsQuickstartVATIntegration.wdl | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 0d7d4078492..0e7ea7562b3 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -10,7 +10,6 @@ workflow GvsQuickstartVcfIntegration { String expected_output_prefix String dataset_suffix Boolean use_default_dockers = false - Boolean check_expected_cost_and_table_size_outputs = true String? basic_docker String? cloud_sdk_docker String? cloud_sdk_slim_docker @@ -18,20 +17,12 @@ workflow GvsQuickstartVcfIntegration { String? variants_nirvana_docker String? gatk_docker File? 
gatk_override - String sample_id_column_name - String vcf_files_column_name - String vcf_index_files_column_name - String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time String? workspace_bucket String? workspace_id String? submission_id - - File? target_interval_list - Int? maximum_alternate_alleles } String project_id = "gvs-internal" - File reference_fasta = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { From d11cb2ba05d87eaf3920ea0051b84a02b7d0848b Mon Sep 17 00:00:00 2001 From: ggrant Date: Tue, 7 Jan 2025 11:52:43 -0500 Subject: [PATCH 03/31] Add some required inputs --- .../wdl/test/GvsQuickstartVATIntegration.wdl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 0e7ea7562b3..8a4cf8e3aee 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -7,6 +7,9 @@ workflow GvsQuickstartVcfIntegration { input { String git_branch_or_tag String? 
git_hash + String vds_path + String ancestry_path + String output_path String expected_output_prefix String dataset_suffix Boolean use_default_dockers = false @@ -68,10 +71,10 @@ workflow GvsQuickstartVcfIntegration { input: project_id = project_id, dataset_name = CreateDatasetForTest.dataset_name, - ancestry_file = "todo", + ancestry_file = ancestry_path, filter_set_name = "quickit", - vds_path = "todo", - output_path = "todo", + vds_path = vds_path, + output_path = output_path, git_branch_or_tag = git_branch_or_tag, basic_docker = effective_basic_docker, From 7b9cb648bb82d7b7da2eaa6e7aec42d1734f04b8 Mon Sep 17 00:00:00 2001 From: ggrant Date: Tue, 7 Jan 2025 12:23:09 -0500 Subject: [PATCH 04/31] Defined the scatter count --- scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 8a4cf8e3aee..5b654e6baa0 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -10,6 +10,7 @@ workflow GvsQuickstartVcfIntegration { String vds_path String ancestry_path String output_path + String split_intervals_scatter_count = 10 String expected_output_prefix String dataset_suffix Boolean use_default_dockers = false @@ -75,6 +76,7 @@ workflow GvsQuickstartVcfIntegration { filter_set_name = "quickit", vds_path = vds_path, output_path = output_path, + split_intervals_scatter_count = split_intervals_scatter_count, git_branch_or_tag = git_branch_or_tag, basic_docker = effective_basic_docker, From ea3c08217e9aaee50517946b9001459eb9a1740b Mon Sep 17 00:00:00 2001 From: ggrant Date: Fri, 17 Jan 2025 10:45:22 -0500 Subject: [PATCH 05/31] Minor documentation updated --- .../wdl/test/GvsQuickstartVATIntegration.wdl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git 
a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 5b654e6baa0..b459eb29091 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -7,8 +7,7 @@ workflow GvsQuickstartVcfIntegration { input { String git_branch_or_tag String? git_hash - String vds_path - String ancestry_path + Boolean use_vds = true String output_path String split_intervals_scatter_count = 10 String expected_output_prefix @@ -23,11 +22,16 @@ workflow GvsQuickstartVcfIntegration { File? gatk_override String? workspace_bucket - String? workspace_id String? submission_id } String project_id = "gvs-internal" + File input_data_prefix = "gs://gvs-internal-quickstart/integration/test_data/2025-01-17/" + File ancestry_path = input_data_prefix + "quickstart_ancestry.tsv" + File? vds_path = if (use_vds) then input_data_prefix + "gvs_export.vds" else none + File? sites_only_vcf = if (!use_vds) then input_data_prefix + "todo" else none + File? sites_only_vcf_index = if (!use_vds) then input_data_prefix + "todo" else none + # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { File? 
none = "" @@ -47,7 +51,6 @@ workflow GvsQuickstartVcfIntegration { String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash]) String effective_workspace_bucket = select_first([workspace_bucket, GetToolVersions.workspace_bucket]) - String effective_workspace_id = select_first([workspace_id, GetToolVersions.workspace_id]) String effective_submission_id = select_first([submission_id, GetToolVersions.submission_id]) if (!use_default_dockers && !defined(gatk_override)) { @@ -75,6 +78,8 @@ workflow GvsQuickstartVcfIntegration { ancestry_file = ancestry_path, filter_set_name = "quickit", vds_path = vds_path, + sites_only_vcf = sites_only_vcf, + sites_only_vcf_index = sites_only_vcf_index, output_path = output_path, split_intervals_scatter_count = split_intervals_scatter_count, From de4a19b5efce3d558b31ab10089ea6ecd1064361 Mon Sep 17 00:00:00 2001 From: ggrant Date: Fri, 17 Jan 2025 14:23:01 -0500 Subject: [PATCH 06/31] Try to run the test with a sites only VCF as an alternate input --- .../variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index b459eb29091..9c81f7a4a0e 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -3,11 +3,11 @@ version 1.0 import "../GvsUtils.wdl" as Utils import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS -workflow GvsQuickstartVcfIntegration { +workflow GvsQuickstartVATIntegration { input { String git_branch_or_tag String? git_hash - Boolean use_vds = true + Boolean use_vds = true # If true, use a VDS, otherwise use a sites only VCF. 
String output_path String split_intervals_scatter_count = 10 String expected_output_prefix @@ -29,8 +29,7 @@ workflow GvsQuickstartVcfIntegration { File input_data_prefix = "gs://gvs-internal-quickstart/integration/test_data/2025-01-17/" File ancestry_path = input_data_prefix + "quickstart_ancestry.tsv" File? vds_path = if (use_vds) then input_data_prefix + "gvs_export.vds" else none - File? sites_only_vcf = if (!use_vds) then input_data_prefix + "todo" else none - File? sites_only_vcf_index = if (!use_vds) then input_data_prefix + "todo" else none + File? sites_only_vcf = if (!use_vds) then input_data_prefix + "quickstart_sites_only.vcf.bgz" else none # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { @@ -79,7 +78,6 @@ workflow GvsQuickstartVcfIntegration { filter_set_name = "quickit", vds_path = vds_path, sites_only_vcf = sites_only_vcf, - sites_only_vcf_index = sites_only_vcf_index, output_path = output_path, split_intervals_scatter_count = split_intervals_scatter_count, From 6324fe56f7d21525cbfdb00c770c5d60d5718b79 Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 23 Jan 2025 13:53:40 -0500 Subject: [PATCH 07/31] Adding validation of the VDS and size check on db table --- .../GvsCreateVATfromVDS.wdl | 7 +- .../wdl/test/GvsQuickstartVATIntegration.wdl | 268 +++++++++++------- 2 files changed, 164 insertions(+), 111 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl index 2d3120093de..95dfbdc66a5 100644 --- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl @@ -85,7 +85,7 @@ workflow GvsCreateVATfromVDS { # If the vat version is undefined or v1 then the vat tables would be named like filter_vat, otherwise filter_vat_v2. 
String effective_vat_version = if (defined(vat_version) && select_first([vat_version]) != "v1") then "_" + select_first([vat_version]) else "" - String vat_table_name = filter_set_name + "_vat" + effective_vat_version + String effective_vat_table_name = filter_set_name + "_vat" + effective_vat_version String output_path_without_a_trailing_slash = sub(output_path, "/$", "") String effective_output_path = if (output_path == output_path_without_a_trailing_slash) then output_path + "/" else output_path @@ -261,7 +261,7 @@ workflow GvsCreateVATfromVDS { project_id = project_id, dataset_name = dataset_name, output_path = effective_output_path, - base_vat_table_name = vat_table_name, + base_vat_table_name = effective_vat_table_name, prep_vt_json_done = PrepVtAnnotationJson.done, prep_genes_json_done = PrepGenesAnnotationJson.done, cloud_sdk_docker = effective_cloud_sdk_docker, @@ -270,7 +270,7 @@ workflow GvsCreateVATfromVDS { call DeduplicateVatInBigQuery { input: input_vat_table_name = BigQueryLoadJson.vat_table, - output_vat_table_name = vat_table_name, + output_vat_table_name = effective_vat_table_name, nirvana_schema = MakeSubpopulationFilesAndReadSchemaFiles.vat_schema_json_file, project_id = project_id, dataset_name = dataset_name, @@ -293,6 +293,7 @@ workflow GvsCreateVATfromVDS { } output { + String vat_table_name = effective_vat_table_name String? cluster_name = GenerateSitesOnlyVcf.cluster_name File? dropped_sites_file = MergeTsvs.output_file File? 
final_tsv_file = GvsCreateVATFilesFromBigQuery.final_tsv_file diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 9c81f7a4a0e..c089fdcce6c 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -2,6 +2,7 @@ version 1.0 import "../GvsUtils.wdl" as Utils import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS +import "../../variant-annotations-table/GvsValidateVAT.wdl" as ValidateVAT workflow GvsQuickstartVATIntegration { input { @@ -68,8 +69,6 @@ workflow GvsQuickstartVATIntegration { cloud_sdk_docker = effective_cloud_sdk_docker, } - String extract_output_gcs_dir = "~{effective_workspace_bucket}/output_vcfs/by_submission_id/~{effective_submission_id}/~{dataset_suffix}" - call CreateVATFromVDS.GvsCreateVATfromVDS as CreateVATfromVDS { input: project_id = project_id, @@ -89,6 +88,15 @@ workflow GvsQuickstartVATIntegration { variants_nirvana_docker = effective_variants_nirvana_docker, } + call ValidateVAT.GvsValidateVat { + input: + project_id = project_id, + dataset_name = CreateDatasetForTest.dataset_name, + vat_table_name = CreateVATfromVDS.vat_table_name, + cloud_sdk_docker = effective_cloud_sdk_docker, + variants_docker = effective_variants_docker, + } + String expected_prefix = expected_output_prefix + dataset_suffix + "/" # call AssertIdenticalOutputs { # input: @@ -108,135 +116,179 @@ workflow GvsQuickstartVATIntegration { # cloud_sdk_docker = effective_cloud_sdk_docker, # } # -# call AssertTableSizesAreExpected { -# input: -# go = JointVariantCalling.done, -# dataset_name = CreateDatasetForTest.dataset_name, -# project_id = project_id, -# expected_output_csv = expected_prefix + "table_sizes.csv", -# cloud_sdk_docker = effective_cloud_sdk_docker, -# } -# } - + call AssertTableSizeIsAsExpected { + input: + dataset_name = 
CreateDatasetForTest.dataset_name, + project_id = project_id, + vat_table_name = CreateVATfromVDS.vat_table_name, + expected_output_csv = expected_prefix + "table_sizes.csv", + cloud_sdk_docker = effective_cloud_sdk_docker, + } output { -# Array[File] output_vcfs = JointVariantCalling.output_vcfs -# Array[File] output_vcf_indexes = JointVariantCalling.output_vcf_indexes -# Float total_vcfs_size_mb = JointVariantCalling.total_vcfs_size_mb -# File manifest = JointVariantCalling.manifest + String dataset_name = CreateDatasetForTest.dataset_name String filter_set_name = "quickit" String recorded_git_hash = effective_git_hash Boolean done = true -# Boolean used_tighter_gcp_quotas = JointVariantCalling.used_tighter_gcp_quotas } } -task AssertIdenticalOutputs { - input { - String expected_output_prefix - String expected_output_suffix - Array[File] actual_vcfs - String gatk_docker +#task AssertIdenticalOutputs { +# input { +# String expected_output_prefix +# String expected_output_suffix +# Array[File] actual_vcfs +# String gatk_docker +# } +# parameter_meta { +# actual_vcfs: { +# localization_optional: true +# } +# } +# command <<< +# # Prepend date, time and pwd to xtrace log entries. +# PS4='\D{+%F %T} \w $ ' +# set -o errexit -o nounset -o pipefail -o xtrace +# +# failures=() +# +# # Where the current set of expected results lives in the cloud +# expected_prefix="~{expected_output_prefix}" +# # Remove a trailing slash if there is one +# expected_prefix=${expected_prefix%/} +# +# # Download all the expected data +# mkdir expected +# cd expected +# gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' . +# gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix} +# cd .. +# +# mkdir actual +# cd actual +# touch actual_manifest.txt +# # Making the manifest is pretty uninteresting and very noisy so turn off xtrace temporarily. 
+# set +o xtrace +# for actual in ~{sep=' ' actual_vcfs} +# do +# echo $actual >> actual_manifest.txt +# done +# set -o xtrace +# +# cat actual_manifest.txt | gcloud storage cp -I . +# # Unzip actual result data. +# ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d +# cd .. +# +# echo "Header Check" +# # Headers first, these can yield useful diagnostics when there are mismatches. +# for vcf in $(ls -1 actual | grep -E '\.vcf$') +# do +# actual="actual/$vcf" +# expected="expected/$vcf" +# set +o errexit +# cmp <(grep '^#' $actual | grep -E -v '^##GATKCommandLine=') <(grep '^#' $expected | grep -E -v '^##GATKCommandLine=') +# rc=$? +# set -o errexit +# if [[ $rc -ne 0 ]]; then +# # If there is a mismatch add it to a list of failures but keep on looking for mismatches. +# failures+=( $vcf ) +# fi +# done +# +# echo "Header Failure Check" +# if [[ ${#failures[@]} -ne 0 ]]; then +# echo "Error: headers for the following files do not match:" +# for failure in ${failures[@]}; do +# echo $failure +# expected="expected/$failure" +# actual="actual/$failure" +# diff <(grep '^#' $actual) <(grep '^#' $expected) +# done +# exit 1 +# fi +# +# echo "Overall Check" +# # If the headers all matched look for any mismatches in overall file content. +# fail=0 +# for vcf in $(ls -1 actual | grep -E '\.vcf$') +# do +# expected="expected/$vcf" +# actual="actual/$vcf" +# set +o errexit +# cmp <(grep -E -v '^##GATKCommandLine=' $actual) <(grep -E -v '^##GATKCommandLine=' $expected) +# rc=$? +# set -o errexit +# if [[ $rc -ne 0 ]]; then +# echo "Error: file contents of expected and actual do not match: $vcf" +# fail=1 +# fi +# done +# +# if [[ $fail -ne 0 ]]; then +# exit 1 +# fi +# +# echo "All vcfs compared and matched!" 
+# >>> +# +# runtime { +# docker: gatk_docker +# disks: "local-disk 500 HDD" +# } +# +# output { +# Boolean done = true +# } +#} + +task AssertTableSizeIsAsExpected { + meta { + # we want to check the database each time this runs + volatile: true } - parameter_meta { - actual_vcfs: { - localization_optional: true - } + + input { + String dataset_name + String project_id + String vat_table_name + File expected_output_csv + String cloud_sdk_docker } + command <<< # Prepend date, time and pwd to xtrace log entries. PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - failures=() - - # Where the current set of expected results lives in the cloud - expected_prefix="~{expected_output_prefix}" - # Remove a trailing slash if there is one - expected_prefix=${expected_prefix%/} - - # Download all the expected data - mkdir expected - cd expected - gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' . - gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix} - cd .. - - mkdir actual - cd actual - touch actual_manifest.txt - # Making the manifest is pretty uninteresting and very noisy so turn off xtrace temporarily. - set +o xtrace - for actual in ~{sep=' ' actual_vcfs} - do - echo $actual >> actual_manifest.txt - done - set -o xtrace - - cat actual_manifest.txt | gcloud storage cp -I . - # Unzip actual result data. - ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d - cd .. - - echo "Header Check" - # Headers first, these can yield useful diagnostics when there are mismatches. - for vcf in $(ls -1 actual | grep -E '\.vcf$') - do - actual="actual/$vcf" - expected="expected/$vcf" - set +o errexit - cmp <(grep '^#' $actual | grep -E -v '^##GATKCommandLine=') <(grep '^#' $expected | grep -E -v '^##GATKCommandLine=') - rc=$? - set -o errexit - if [[ $rc -ne 0 ]]; then - # If there is a mismatch add it to a list of failures but keep on looking for mismatches. 
- failures+=( $vcf ) - fi - done - - echo "Header Failure Check" - if [[ ${#failures[@]} -ne 0 ]]; then - echo "Error: headers for the following files do not match:" - for failure in ${failures[@]}; do - echo $failure - expected="expected/$failure" - actual="actual/$failure" - diff <(grep '^#' $actual) <(grep '^#' $expected) - done - exit 1 - fi + mkdir output - echo "Overall Check" - # If the headers all matched look for any mismatches in overall file content. - fail=0 - for vcf in $(ls -1 actual | grep -E '\.vcf$') - do - expected="expected/$vcf" - actual="actual/$vcf" - set +o errexit - cmp <(grep -E -v '^##GATKCommandLine=' $actual) <(grep -E -v '^##GATKCommandLine=' $expected) - rc=$? - set -o errexit - if [[ $rc -ne 0 ]]; then - echo "Error: file contents of expected and actual do not match: $vcf" - fail=1 - fi - done - - if [[ $fail -ne 0 ]]; then - exit 1 - fi + echo "project_id = ~{project_id}" > ~/.bigqueryrc + bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false \ + SELECT 'vat_total' AS total_name, sum(total_billable_bytes) AS total_bytes \ + FROM \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ + WHERE table_name = '~{vat_table_name}'" > output/table_sizes.csv + + set +o errexit + diff -w output/table_sizes.csv ~{expected_output_csv} > differences.txt + set -o errexit - echo "All vcfs compared and matched!" 
+ if [[ -s differences.txt ]]; then + echo "Differences found:" + cat differences.txt + exit 1 + fi >>> runtime { - docker: gatk_docker - disks: "local-disk 500 HDD" + docker: cloud_sdk_docker + disks: "local-disk 10 HDD" } output { - Boolean done = true + File table_sizes_output_csv = "output/table_sizes.csv" + File differences = "differences.txt" } } + From ad8bbfcf79c752131c5f71a319f40c146c5dd5cb Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 23 Jan 2025 15:59:54 -0500 Subject: [PATCH 08/31] Fix two bugs in validation --- .../GvsValidateVAT.wdl | 32 +++++++++++-------- .../wdl/test/GvsQuickstartVATIntegration.wdl | 2 +- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl index 3b9e30958a1..e339cc19beb 100644 --- a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl @@ -8,6 +8,7 @@ workflow GvsValidateVat { String project_id String dataset_name String vat_table_name + Boolean? is_small_callset String? cloud_sdk_docker String? 
variants_docker } @@ -25,20 +26,23 @@ workflow GvsValidateVat { String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker]) String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) - call Utils.GetBQTableLastModifiedDatetime as SampleDateTime { - input: - project_id = project_id, - fq_table = fq_vat_table, - cloud_sdk_docker = effective_cloud_sdk_docker, - } + # Definining is_small_callset allows us to run this WDL on a dataset that has not had samples loaded (for testing) + if (!defined(is_small_callset)) { + call Utils.GetBQTableLastModifiedDatetime as SampleDateTime { + input: + project_id = project_id, + fq_table = fq_sample_table, + cloud_sdk_docker = effective_cloud_sdk_docker, + } - call Utils.GetNumSamplesLoaded { - input: - fq_sample_table = fq_sample_table, - project_id = project_id, - sample_table_timestamp = SampleDateTime.last_modified_timestamp, - control_samples = false, - cloud_sdk_docker = effective_cloud_sdk_docker, + call Utils.GetNumSamplesLoaded { + input: + fq_sample_table = fq_sample_table, + project_id = project_id, + sample_table_timestamp = SampleDateTime.last_modified_timestamp, + control_samples = false, + cloud_sdk_docker = effective_cloud_sdk_docker, + } } call Utils.GetBQTableLastModifiedDatetime as VatDateTime { @@ -153,7 +157,7 @@ workflow GvsValidateVat { } # only check certain things if the callset is larger than 10,000 samples (a guess) - Boolean callset_is_small = GetNumSamplesLoaded.num_samples < 10000 + Boolean callset_is_small = select_first([is_small_callset, select_first([GetNumSamplesLoaded.num_samples]) < 10000]) if (!callset_is_small) { call ClinvarSignificance { input: diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index c089fdcce6c..e67198893ba 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ 
b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -266,7 +266,7 @@ task AssertTableSizeIsAsExpected { echo "project_id = ~{project_id}" > ~/.bigqueryrc bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false \ - SELECT 'vat_total' AS total_name, sum(total_billable_bytes) AS total_bytes \ + "SELECT 'vat_total' AS total_name, sum(total_billable_bytes) AS total_bytes \ FROM \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ WHERE table_name = '~{vat_table_name}'" > output/table_sizes.csv From bfbcb7c7dcc08fe885c1217bdf213f7f85b9a7ef Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 23 Jan 2025 16:02:26 -0500 Subject: [PATCH 09/31] Forgot to pass that parameter --- scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index e67198893ba..4bb3388deae 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -93,6 +93,7 @@ workflow GvsQuickstartVATIntegration { project_id = project_id, dataset_name = CreateDatasetForTest.dataset_name, vat_table_name = CreateVATfromVDS.vat_table_name, + is_small_callset = true, cloud_sdk_docker = effective_cloud_sdk_docker, variants_docker = effective_variants_docker, } From 9e991e0f3c905510312c5b50b378520f871e19ca Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 23 Jan 2025 21:50:57 -0500 Subject: [PATCH 10/31] Trying to fix call to GvsValidateVAT.wdl --- .../variantstore/variant-annotations-table/GvsValidateVAT.wdl | 2 +- scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl index e339cc19beb..d02fcd7bdae 100644 --- 
a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl @@ -157,7 +157,7 @@ workflow GvsValidateVat { } # only check certain things if the callset is larger than 10,000 samples (a guess) - Boolean callset_is_small = select_first([is_small_callset, select_first([GetNumSamplesLoaded.num_samples]) < 10000]) + Boolean callset_is_small = select_first([is_small_callset, select_first([GetNumSamplesLoaded.num_samples, 1]) < 10000]) if (!callset_is_small) { call ClinvarSignificance { input: diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 4bb3388deae..80891d87457 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -4,6 +4,8 @@ import "../GvsUtils.wdl" as Utils import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS import "../../variant-annotations-table/GvsValidateVAT.wdl" as ValidateVAT +# A comment for debugging + workflow GvsQuickstartVATIntegration { input { String git_branch_or_tag From fba4bd7e85e564068e0262432e0e802df3daff33 Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 23 Jan 2025 21:58:06 -0500 Subject: [PATCH 11/31] Trying to fix call to GvsValidateVAT.wdl --- scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 80891d87457..5899fe585b7 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -4,7 +4,7 @@ import "../GvsUtils.wdl" as Utils import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS import 
"../../variant-annotations-table/GvsValidateVAT.wdl" as ValidateVAT -# A comment for debugging +# A comment for debugging. workflow GvsQuickstartVATIntegration { input { From 879599efee01f71dfdd61415c9a223b971b4b454 Mon Sep 17 00:00:00 2001 From: ggrant Date: Fri, 24 Jan 2025 11:29:40 -0500 Subject: [PATCH 12/31] Added size check. --- .../wdl/test/GvsQuickstartVATIntegration.wdl | 177 +++++------------- 1 file changed, 46 insertions(+), 131 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 5899fe585b7..f300827ef52 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -4,7 +4,7 @@ import "../GvsUtils.wdl" as Utils import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS import "../../variant-annotations-table/GvsValidateVAT.wdl" as ValidateVAT -# A comment for debugging. +# A comment for debugging! 
workflow GvsQuickstartVATIntegration { input { @@ -71,7 +71,7 @@ workflow GvsQuickstartVATIntegration { cloud_sdk_docker = effective_cloud_sdk_docker, } - call CreateVATFromVDS.GvsCreateVATfromVDS as CreateVATfromVDS { + call CreateVATFromVDS.GvsCreateVATfromVDS as CreateVATFromVDS { input: project_id = project_id, dataset_name = CreateDatasetForTest.dataset_name, @@ -94,42 +94,31 @@ workflow GvsQuickstartVATIntegration { input: project_id = project_id, dataset_name = CreateDatasetForTest.dataset_name, - vat_table_name = CreateVATfromVDS.vat_table_name, + vat_table_name = CreateVATFromVDS.vat_table_name, is_small_callset = true, cloud_sdk_docker = effective_cloud_sdk_docker, variants_docker = effective_variants_docker, } String expected_prefix = expected_output_prefix + dataset_suffix + "/" -# call AssertIdenticalOutputs { -# input: -# expected_output_prefix = expected_prefix, -# expected_output_suffix = if (bgzip_output_vcfs) then ".bgz" else ".gz", -# actual_vcfs = JointVariantCalling.output_vcfs, -# gatk_docker = effective_gatk_docker -# } -# -# if (check_expected_cost_and_table_size_outputs) { -# call AssertCostIsTrackedAndExpected { -# input: -# go = JointVariantCalling.done, -# dataset_name = CreateDatasetForTest.dataset_name, -# project_id = project_id, -# expected_output_csv = expected_prefix + "cost_observability.csv", -# cloud_sdk_docker = effective_cloud_sdk_docker, -# } -# + call AssertIdenticalOutputs { + input: + actual_file = select_first([CreateVATFromVDS.final_tsv_file]), + expected_file = expected_prefix + "vat_complete.bgz.tsv.gz", + gatk_docker = effective_gatk_docker + } + + call AssertTableSizeIsAsExpected { input: dataset_name = CreateDatasetForTest.dataset_name, project_id = project_id, - vat_table_name = CreateVATfromVDS.vat_table_name, + vat_table_name = CreateVATFromVDS.vat_table_name, expected_output_csv = expected_prefix + "table_sizes.csv", cloud_sdk_docker = effective_cloud_sdk_docker, } output { - String dataset_name = 
CreateDatasetForTest.dataset_name String filter_set_name = "quickit" String recorded_git_hash = effective_git_hash @@ -137,114 +126,40 @@ workflow GvsQuickstartVATIntegration { } } -#task AssertIdenticalOutputs { -# input { -# String expected_output_prefix -# String expected_output_suffix -# Array[File] actual_vcfs -# String gatk_docker -# } -# parameter_meta { -# actual_vcfs: { -# localization_optional: true -# } -# } -# command <<< -# # Prepend date, time and pwd to xtrace log entries. -# PS4='\D{+%F %T} \w $ ' -# set -o errexit -o nounset -o pipefail -o xtrace -# -# failures=() -# -# # Where the current set of expected results lives in the cloud -# expected_prefix="~{expected_output_prefix}" -# # Remove a trailing slash if there is one -# expected_prefix=${expected_prefix%/} -# -# # Download all the expected data -# mkdir expected -# cd expected -# gcloud storage cp -r "${expected_prefix}"'/*.vcf~{expected_output_suffix}' . -# gzip -S ~{expected_output_suffix} -d *~{expected_output_suffix} -# cd .. -# -# mkdir actual -# cd actual -# touch actual_manifest.txt -# # Making the manifest is pretty uninteresting and very noisy so turn off xtrace temporarily. -# set +o xtrace -# for actual in ~{sep=' ' actual_vcfs} -# do -# echo $actual >> actual_manifest.txt -# done -# set -o xtrace -# -# cat actual_manifest.txt | gcloud storage cp -I . -# # Unzip actual result data. -# ls -1 | grep -E '\.vcf\~{expected_output_suffix}$' | xargs gzip -S ~{expected_output_suffix} -d -# cd .. -# -# echo "Header Check" -# # Headers first, these can yield useful diagnostics when there are mismatches. -# for vcf in $(ls -1 actual | grep -E '\.vcf$') -# do -# actual="actual/$vcf" -# expected="expected/$vcf" -# set +o errexit -# cmp <(grep '^#' $actual | grep -E -v '^##GATKCommandLine=') <(grep '^#' $expected | grep -E -v '^##GATKCommandLine=') -# rc=$? -# set -o errexit -# if [[ $rc -ne 0 ]]; then -# # If there is a mismatch add it to a list of failures but keep on looking for mismatches. 
-# failures+=( $vcf ) -# fi -# done -# -# echo "Header Failure Check" -# if [[ ${#failures[@]} -ne 0 ]]; then -# echo "Error: headers for the following files do not match:" -# for failure in ${failures[@]}; do -# echo $failure -# expected="expected/$failure" -# actual="actual/$failure" -# diff <(grep '^#' $actual) <(grep '^#' $expected) -# done -# exit 1 -# fi -# -# echo "Overall Check" -# # If the headers all matched look for any mismatches in overall file content. -# fail=0 -# for vcf in $(ls -1 actual | grep -E '\.vcf$') -# do -# expected="expected/$vcf" -# actual="actual/$vcf" -# set +o errexit -# cmp <(grep -E -v '^##GATKCommandLine=' $actual) <(grep -E -v '^##GATKCommandLine=' $expected) -# rc=$? -# set -o errexit -# if [[ $rc -ne 0 ]]; then -# echo "Error: file contents of expected and actual do not match: $vcf" -# fail=1 -# fi -# done -# -# if [[ $fail -ne 0 ]]; then -# exit 1 -# fi -# -# echo "All vcfs compared and matched!" -# >>> -# -# runtime { -# docker: gatk_docker -# disks: "local-disk 500 HDD" -# } -# -# output { -# Boolean done = true -# } -#} +task AssertIdenticalOutputs { + input { + File actual_file + File expected_file + String gatk_docker + } + + command <<< + # Prepend date, time and pwd to xtrace log entries. + PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + cat ~{actual_file} | gunzip | wc > actual_wc.txt + cat ~{expected_file} | gunzip | wc > expected_wc.txt + set +o errexit + diff actual_wc.txt expected_wc.txt + rc=$? + set -o errexit + if [[ $rc -ne 0 ]]; then + echo "The observed file ~{actual_file} differs from the expected ~{expected_file} in wc output!" + exit 1; + fi + echo "REMOVE ME! The observed file ~{actual_file} does not differ from the expected ~{expected_file} in wc output!" 
+ >>> + + runtime { + docker: gatk_docker + disks: "local-disk 500 HDD" + } + + output { + Boolean done = true + } +} task AssertTableSizeIsAsExpected { meta { From ebf544ea2ecb2c0b8e0662588bf9accf5fa7d788 Mon Sep 17 00:00:00 2001 From: ggrant Date: Fri, 24 Jan 2025 13:41:13 -0500 Subject: [PATCH 13/31] A little more checking. --- .../variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index f300827ef52..33863878d6a 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -4,8 +4,6 @@ import "../GvsUtils.wdl" as Utils import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS import "../../variant-annotations-table/GvsValidateVAT.wdl" as ValidateVAT -# A comment for debugging! - workflow GvsQuickstartVATIntegration { input { String git_branch_or_tag @@ -140,6 +138,8 @@ task AssertIdenticalOutputs { cat ~{actual_file} | gunzip | wc > actual_wc.txt cat ~{expected_file} | gunzip | wc > expected_wc.txt + cat actual_wc.txt + cat expected_wc.txt set +o errexit diff actual_wc.txt expected_wc.txt rc=$? @@ -148,7 +148,7 @@ task AssertIdenticalOutputs { echo "The observed file ~{actual_file} differs from the expected ~{expected_file} in wc output!" exit 1; fi - echo "REMOVE ME! The observed file ~{actual_file} does not differ from the expected ~{expected_file} in wc output!" 
+ echo "No differences found" >>> runtime { From e87c58895082b8a5fa3b0c297cd810a0a07c2af6 Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 11:22:45 -0500 Subject: [PATCH 14/31] Add VAT integration test to main integration test --- .../wdl/test/GvsQuickstartIntegration.wdl | 28 +++++++++++++++++++ .../wdl/test/GvsQuickstartVATIntegration.wdl | 15 +++------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index e639cea5428..4b1cdc9172b 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -2,9 +2,12 @@ version 1.0 import "GvsQuickstartVcfIntegration.wdl" as QuickstartVcfIntegration import "GvsQuickstartHailIntegration.wdl" as QuickstartHailIntegration +import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils +# A comment + workflow GvsQuickstartIntegration { input { String git_branch_or_tag @@ -14,6 +17,8 @@ workflow GvsQuickstartIntegration { Boolean run_exome_integration = true Boolean run_beta_integration = true Boolean run_bge_integration = true + Boolean run_vat_integration = true + Boolean run_vat_integration_test_from_vds = true # If false, will use sites-only VCF String sample_id_column_name = "sample_id" String vcf_files_column_name = "hg38_reblocked_gvcf" String vcf_index_files_column_name = "hg38_reblocked_gvcf_index" @@ -25,6 +30,7 @@ workflow GvsQuickstartIntegration { String? cloud_sdk_docker String? cloud_sdk_slim_docker String? variants_docker + String? variants_nirvana_docker String? gatk_docker String? 
hail_version Boolean chr20_X_Y_only = true @@ -52,6 +58,7 @@ workflow GvsQuickstartIntegration { String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker]) String effective_cloud_sdk_slim_docker = select_first([cloud_sdk_slim_docker, GetToolVersions.cloud_sdk_slim_docker]) String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) + String effective_variants_nirvana_docker = select_first([variants_nirvana_docker, GetToolVersions.variants_nirvana_docker]) String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker]) String effective_hail_version = select_first([hail_version, GetToolVersions.hail_version]) @@ -317,6 +324,27 @@ workflow GvsQuickstartIntegration { } } + if (run_vat_integration) { + String extract_output_gcs_dir = "~{workspace_bucket}/output_vat/by_submission_id/~{submission_id}/vat" + + call QuickstartVATIntegration.GvsQuickstartVATIntegration as GvsQuickstartVATIntegration { + input: + git_branch_or_tag = git_branch_or_tag, + git_hash = GetToolVersions.git_hash, + use_default_dockers = use_default_dockers, + expected_output_prefix = expected_output_prefix, + dataset_suffix = "vat", + output_path = extract_output_gcs_dir, + use_vds = run_vat_integration_test_from_vds, + basic_docker = effective_basic_docker, + cloud_sdk_docker = effective_cloud_sdk_docker, + cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, + variants_docker = effective_variants_docker, + variants_nirvana_docker = effective_variants_nirvana_docker, + gatk_docker = effective_gatk_docker, + } + } + output { String recorded_git_hash = GetToolVersions.git_hash } diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 33863878d6a..5693c8b36b7 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ 
-8,22 +8,18 @@ workflow GvsQuickstartVATIntegration { input { String git_branch_or_tag String? git_hash + Boolean use_default_dockers = false + String expected_output_prefix + String dataset_suffix Boolean use_vds = true # If true, use a VDS, otherwise use a sites only VCF. String output_path String split_intervals_scatter_count = 10 - String expected_output_prefix - String dataset_suffix - Boolean use_default_dockers = false String? basic_docker String? cloud_sdk_docker String? cloud_sdk_slim_docker String? variants_docker String? variants_nirvana_docker String? gatk_docker - File? gatk_override - - String? workspace_bucket - String? submission_id } String project_id = "gvs-internal" @@ -50,10 +46,7 @@ workflow GvsQuickstartVATIntegration { String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker]) String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash]) - String effective_workspace_bucket = select_first([workspace_bucket, GetToolVersions.workspace_bucket]) - String effective_submission_id = select_first([submission_id, GetToolVersions.submission_id]) - - if (!use_default_dockers && !defined(gatk_override)) { + if (!use_default_dockers) { call Utils.BuildGATKJar { input: git_branch_or_tag = git_branch_or_tag, From 05cdc135db001a3720299642f669da5a53a1b07a Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 11:26:11 -0500 Subject: [PATCH 15/31] Add branch to .dockstore.yml --- .dockstore.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.dockstore.yml b/.dockstore.yml index 93da6e43e0d..3ebdf28d0db 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -317,6 +317,7 @@ workflows: - master - ah_var_store - vs_1516_yolo + - gg_VS-1549_AddVATToIntegrationTests tags: - /.*/ - name: GvsQuickstartVATIntegration @@ -324,6 +325,8 @@ workflows: primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl filters: branches: + - master + - ah_var_store - gg_VS-1549_AddVATToIntegrationTests 
tags: - /.*/ From 747ca566e91a6ed692243798b38b2af155025b64 Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 11:28:47 -0500 Subject: [PATCH 16/31] Fix wdl syntax error --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index 4b1cdc9172b..e8ce34fde7f 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -325,7 +325,7 @@ workflow GvsQuickstartIntegration { } if (run_vat_integration) { - String extract_output_gcs_dir = "~{workspace_bucket}/output_vat/by_submission_id/~{submission_id}/vat" + String extract_vat_output_gcs_dir = "~{workspace_bucket}/output_vat/by_submission_id/~{submission_id}/vat" call QuickstartVATIntegration.GvsQuickstartVATIntegration as GvsQuickstartVATIntegration { input: @@ -334,7 +334,7 @@ workflow GvsQuickstartIntegration { use_default_dockers = use_default_dockers, expected_output_prefix = expected_output_prefix, dataset_suffix = "vat", - output_path = extract_output_gcs_dir, + output_path = extract_vat_output_gcs_dir, use_vds = run_vat_integration_test_from_vds, basic_docker = effective_basic_docker, cloud_sdk_docker = effective_cloud_sdk_docker, From 9a673f86b24f51903a223284a548e58b7af7a2cc Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 12:16:19 -0500 Subject: [PATCH 17/31] Need to define workspace_id and submission_id --- .../wdl/test/GvsQuickstartIntegration.wdl | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index e8ce34fde7f..de641c98d74 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ 
-6,7 +6,7 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# A comment +# A comment! workflow GvsQuickstartIntegration { input { @@ -79,7 +79,12 @@ workflow GvsQuickstartIntegration { } } - # Note for `GvsQuickstartIntegration` we use the git_branch_or_tag *input* and its corresponding git hash. This is not + String workspace_bucket = GetToolVersions.workspace_bucket + String workspace_id = GetToolVersions.workspace_id + String submission_id = GetToolVersions.submission_id + + + # Note for `GvsQuickstartIntegration` we use the git_branch_or_tag *input* and its corresponding git hash. This is not # necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow, # though in practice likely they are the same. if (run_hail_integration) { @@ -105,9 +110,9 @@ workflow GvsQuickstartIntegration { cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, - workspace_bucket = GetToolVersions.workspace_bucket, - workspace_id = GetToolVersions.workspace_id, - submission_id = GetToolVersions.submission_id, + workspace_bucket = workspace_bucket, + workspace_id = workspace_id, + submission_id = submission_id, hail_version = effective_hail_version, maximum_alternate_alleles = maximum_alternate_alleles, } @@ -144,9 +149,9 @@ workflow GvsQuickstartIntegration { cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, - workspace_bucket = GetToolVersions.workspace_bucket, - workspace_id = GetToolVersions.workspace_id, - submission_id = GetToolVersions.submission_id, + workspace_bucket = workspace_bucket, + workspace_id = workspace_id, + submission_id = submission_id, maximum_alternate_alleles = maximum_alternate_alleles, } call 
QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRIntegration { @@ -171,9 +176,9 @@ workflow GvsQuickstartIntegration { cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, - workspace_bucket = GetToolVersions.workspace_bucket, - workspace_id = GetToolVersions.workspace_id, - submission_id = GetToolVersions.submission_id, + workspace_bucket = workspace_bucket, + workspace_id = workspace_id, + submission_id = submission_id, maximum_alternate_alleles = maximum_alternate_alleles, } @@ -217,9 +222,9 @@ workflow GvsQuickstartIntegration { cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, - workspace_bucket = GetToolVersions.workspace_bucket, - workspace_id = GetToolVersions.workspace_id, - submission_id = GetToolVersions.submission_id, + workspace_bucket = workspace_bucket, + workspace_id = workspace_id, + submission_id = submission_id, maximum_alternate_alleles = maximum_alternate_alleles, target_interval_list = target_interval_list, } @@ -256,9 +261,9 @@ workflow GvsQuickstartIntegration { cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, - workspace_bucket = GetToolVersions.workspace_bucket, - workspace_id = GetToolVersions.workspace_id, - submission_id = GetToolVersions.submission_id, + workspace_bucket = workspace_bucket, + workspace_id = workspace_id, + submission_id = submission_id, maximum_alternate_alleles = maximum_alternate_alleles, target_interval_list = target_interval_list, } @@ -275,8 +280,6 @@ workflow GvsQuickstartIntegration { if (run_beta_integration) { String project_id = "gvs-internal" - String workspace_bucket = GetToolVersions.workspace_bucket - String submission_id = GetToolVersions.submission_id String extract_output_gcs_dir = 
"~{workspace_bucket}/output_vcfs/by_submission_id/~{submission_id}/beta" Boolean collect_variant_calling_metrics = true @@ -303,9 +306,9 @@ workflow GvsQuickstartIntegration { cloud_sdk_docker = effective_cloud_sdk_docker, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, - workspace_bucket = GetToolVersions.workspace_bucket, - workspace_id = GetToolVersions.workspace_id, - submission_id = GetToolVersions.submission_id, + workspace_bucket = workspace_bucket, + workspace_id = workspace_id, + submission_id = submission_id, maximum_alternate_alleles = maximum_alternate_alleles, git_branch_or_tag = git_branch_or_tag, sample_id_column_name = sample_id_column_name, From 57d2e77ff07e32d0ef176359e14f9975136bdf00 Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 13:55:06 -0500 Subject: [PATCH 18/31] Remove comment --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index de641c98d74..1e6dd7cfabf 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,8 +6,6 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# A comment! 
- workflow GvsQuickstartIntegration { input { String git_branch_or_tag From 1e7f0f99ead110377a71f546dd3ab117f9974e77 Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 14:06:49 -0500 Subject: [PATCH 19/31] Stuff for final work --- .dockstore.yml | 2 +- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 +- .../variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index a8e4567a571..e4cb3f104d4 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -316,7 +316,7 @@ workflows: branches: - master - ah_var_store - - vs_1516_yolo + - gg_VS-1549_AddVATToIntegrationTests tags: - /.*/ - name: GvsQuickstartVATIntegration diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index 1e6dd7cfabf..2a7668d3901 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -82,7 +82,7 @@ workflow GvsQuickstartIntegration { String submission_id = GetToolVersions.submission_id - # Note for `GvsQuickstartIntegration` we use the git_branch_or_tag *input* and its corresponding git hash. This is not + # Note for `GvsQuickstartIntegration` we use the git_branch_or_tag *input* and its corresponding git hash. This is not # necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow, # though in practice likely they are the same. 
if (run_hail_integration) { diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 5693c8b36b7..33b8d97982f 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -92,7 +92,7 @@ workflow GvsQuickstartVATIntegration { } String expected_prefix = expected_output_prefix + dataset_suffix + "/" - call AssertIdenticalOutputs { + call AssertKindaIdenticalOutputs { input: actual_file = select_first([CreateVATFromVDS.final_tsv_file]), expected_file = expected_prefix + "vat_complete.bgz.tsv.gz", @@ -117,7 +117,9 @@ workflow GvsQuickstartVATIntegration { } } -task AssertIdenticalOutputs { +# Note - the tsv generated by the export of the VAT table is not consistently sorted (I think it's sorted by vid, but not by transcript?) +# So, a diff or cmp doesn't work. In this method I just verify that the files have the same counts (from wc) +task AssertKindaIdenticalOutputs { input { File actual_file File expected_file From 99dc3bd4a754b857c5ff9db8b29bdd89c17cdeb7 Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 27 Jan 2025 14:09:01 -0500 Subject: [PATCH 20/31] Stuff for final work --- .dockstore.yml | 12 ++++++------ .../wdl/test/GvsQuickstartIntegration.wdl | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index e4cb3f104d4..35084851b7e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -323,12 +323,12 @@ workflows: subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl filters: - branches: - - master - - ah_var_store - - gg_VS-1549_AddVATToIntegrationTests - tags: - - /.*/ + branches: + - master + - ah_var_store + - gg_VS-1549_AddVATToIntegrationTests + tags: + - /.*/ - name: GvsIngestTieout subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsIngestTieout.wdl diff --git 
a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index 2a7668d3901..ca3adcc2253 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -81,7 +81,6 @@ workflow GvsQuickstartIntegration { String workspace_id = GetToolVersions.workspace_id String submission_id = GetToolVersions.submission_id - # Note for `GvsQuickstartIntegration` we use the git_branch_or_tag *input* and its corresponding git hash. This is not # necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow, # though in practice likely they are the same. From 8f56aa8eb5ee79ebd390fe78a40043b352eb9fca Mon Sep 17 00:00:00 2001 From: George Grant Date: Tue, 28 Jan 2025 17:59:55 -0500 Subject: [PATCH 21/31] Update scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl Co-authored-by: Miguel Covarrubias --- .../variantstore/variant-annotations-table/GvsValidateVAT.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl index d02fcd7bdae..8ff5ab06ea2 100644 --- a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl @@ -26,7 +26,7 @@ workflow GvsValidateVat { String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker]) String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) - # Definining is_small_callset allows us to run this WDL on a dataset that has not had samples loaded (for testing) + # Defining is_small_callset allows us to run this WDL on a dataset that has not had samples loaded (for testing) if (!defined(is_small_callset)) { call Utils.GetBQTableLastModifiedDatetime as 
SampleDateTime { input: From f9fd0428a126a4ea6d7322cd60e95b852245a286 Mon Sep 17 00:00:00 2001 From: ggrant Date: Wed, 29 Jan 2025 10:38:43 -0500 Subject: [PATCH 22/31] Address code review comments --- .../wdl/test/GvsQuickstartIntegration.wdl | 6 ++-- .../wdl/test/GvsQuickstartVATIntegration.wdl | 32 ++++++++----------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index ca3adcc2253..31bac489fb9 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -39,6 +39,7 @@ workflow GvsQuickstartIntegration { File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list" String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else "" File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-10-29/" + expected_subdir + File truth_data_prefix = "gs://gvs-internal-quickstart/integration/test_data/2025-01-17/" # WDL 1.0 trick to set a variable ('none') to be undefined. 
if (false) { @@ -327,15 +328,16 @@ workflow GvsQuickstartIntegration { if (run_vat_integration) { String extract_vat_output_gcs_dir = "~{workspace_bucket}/output_vat/by_submission_id/~{submission_id}/vat" - call QuickstartVATIntegration.GvsQuickstartVATIntegration as GvsQuickstartVATIntegration { + call QuickstartVATIntegration.GvsQuickstartVATIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, use_default_dockers = use_default_dockers, + truth_data_prefix = truth_data_prefix, expected_output_prefix = expected_output_prefix, dataset_suffix = "vat", output_path = extract_vat_output_gcs_dir, - use_vds = run_vat_integration_test_from_vds, + use_vds_as_input = run_vat_integration_test_from_vds, basic_docker = effective_basic_docker, cloud_sdk_docker = effective_cloud_sdk_docker, cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 33b8d97982f..cfc69d63240 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -9,9 +9,10 @@ workflow GvsQuickstartVATIntegration { String git_branch_or_tag String? git_hash Boolean use_default_dockers = false + String truth_data_prefix String expected_output_prefix String dataset_suffix - Boolean use_vds = true # If true, use a VDS, otherwise use a sites only VCF. + Boolean use_vds_as_input = true # If true, use a VDS, otherwise use a sites only VCF. String output_path String split_intervals_scatter_count = 10 String? basic_docker @@ -23,10 +24,9 @@ workflow GvsQuickstartVATIntegration { } String project_id = "gvs-internal" - File input_data_prefix = "gs://gvs-internal-quickstart/integration/test_data/2025-01-17/" - File ancestry_path = input_data_prefix + "quickstart_ancestry.tsv" - File? 
vds_path = if (use_vds) then input_data_prefix + "gvs_export.vds" else none - File? sites_only_vcf = if (!use_vds) then input_data_prefix + "quickstart_sites_only.vcf.bgz" else none + File ancestry_path = truth_data_prefix + "quickstart_ancestry.tsv" + File? vds_path = if (use_vds_as_input) then truth_data_prefix + "gvs_export.vds" else none + File? sites_only_vcf = if (!use_vds_as_input) then truth_data_prefix + "quickstart_sites_only.vcf.bgz" else none # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { @@ -72,7 +72,6 @@ workflow GvsQuickstartVATIntegration { sites_only_vcf = sites_only_vcf, output_path = output_path, split_intervals_scatter_count = split_intervals_scatter_count, - git_branch_or_tag = git_branch_or_tag, basic_docker = effective_basic_docker, cloud_sdk_docker = effective_cloud_sdk_docker, @@ -92,7 +91,7 @@ workflow GvsQuickstartVATIntegration { } String expected_prefix = expected_output_prefix + dataset_suffix + "/" - call AssertKindaIdenticalOutputs { + call AssertIdenticalOutputs { input: actual_file = select_first([CreateVATFromVDS.final_tsv_file]), expected_file = expected_prefix + "vat_complete.bgz.tsv.gz", @@ -111,44 +110,39 @@ workflow GvsQuickstartVATIntegration { output { String dataset_name = CreateDatasetForTest.dataset_name - String filter_set_name = "quickit" String recorded_git_hash = effective_git_hash - Boolean done = true } } -# Note - the tsv generated by the export of the VAT table is not consistently sorted (I think it's sorted by vid, but not by transcript?) -# So, a diff or cmp doesn't work. In this method I just verify that the files have the same counts (from wc) -task AssertKindaIdenticalOutputs { +task AssertIdenticalOutputs { input { File actual_file File expected_file String gatk_docker } + Int disk_size_gb = ceil(10 * size(actual_file, "GiB") + 10 * size(expected_file)) + 200 + command <<< # Prepend date, time and pwd to xtrace log entries. 
PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - cat ~{actual_file} | gunzip | wc > actual_wc.txt - cat ~{expected_file} | gunzip | wc > expected_wc.txt - cat actual_wc.txt - cat expected_wc.txt + cat ~{actual_file} | gunzip > actual_file.txt + cat ~{expected_file} | gunzip > expected_file.txt set +o errexit - diff actual_wc.txt expected_wc.txt + cmp actual_file.txt expected_file.txt rc=$? set -o errexit if [[ $rc -ne 0 ]]; then echo "The observed file ~{actual_file} differs from the expected ~{expected_file} in wc output!" exit 1; fi - echo "No differences found" >>> runtime { docker: gatk_docker - disks: "local-disk 500 HDD" + disks: "local-disk ${disk_size_gb} HDD" } output { From 31210ec3a749dad584635f6dec98ecd438426c55 Mon Sep 17 00:00:00 2001 From: ggrant Date: Wed, 29 Jan 2025 11:38:09 -0500 Subject: [PATCH 23/31] debugging comment! --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index caee90d29f6..8e68cf2cd83 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,6 +6,8 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils +# Debugging comment! + workflow GvsQuickstartIntegration { input { String git_branch_or_tag From 26fc0f058923328d159c5f8feff63b0425f0555a Mon Sep 17 00:00:00 2001 From: ggrant Date: Wed, 29 Jan 2025 16:42:14 -0500 Subject: [PATCH 24/31] debugging comment!!! 
--- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index 8e68cf2cd83..0d1bd7b0370 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,7 +6,7 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# Debugging comment! +# Debugging comment!!! workflow GvsQuickstartIntegration { input { From 53385a7492e00b72bd9fbabc8b1eaa0554def46a Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 30 Jan 2025 10:45:42 -0500 Subject: [PATCH 25/31] Fix disk sizing error --- scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index cfc69d63240..28fedfa1caa 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -121,7 +121,7 @@ task AssertIdenticalOutputs { String gatk_docker } - Int disk_size_gb = ceil(10 * size(actual_file, "GiB") + 10 * size(expected_file)) + 200 + Int disk_size_gb = ceil(10 * size(actual_file, "GiB") + 10 * size(expected_file, "GiB")) + 200 command <<< # Prepend date, time and pwd to xtrace log entries. 
From 7929086697ea3e9435fd06bf3754f6713b8a16be Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 30 Jan 2025 10:46:24 -0500 Subject: [PATCH 26/31] debugging comment --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index 0d1bd7b0370..f044ce7c583 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,7 +6,7 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# Debugging comment!!! +# Debugging comment. workflow GvsQuickstartIntegration { input { From bb957808fb675b504267052d6aa13fd8f1b0a2fc Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 30 Jan 2025 14:23:02 -0500 Subject: [PATCH 27/31] debugging comment --- scripts/variantstore/wdl/GvsUtils.wdl | 2 +- .../variantstore/wdl/test/GvsQuickstartVATIntegration.wdl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 521770a3a1a..5f82cae5470 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -84,7 +84,7 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handlful of tasks that require the larger GNU libc-based `slim`. 
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-01-27-alpine-46895d996b6b" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-01-30-alpine-a29a91e48317" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-11-28-gatkbase-b71132a18899" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl index 28fedfa1caa..d49b8815b0d 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl @@ -128,14 +128,14 @@ task AssertIdenticalOutputs { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - cat ~{actual_file} | gunzip > actual_file.txt - cat ~{expected_file} | gunzip > expected_file.txt + cat ~{actual_file} | gunzip | sort > actual_file.txt + cat ~{expected_file} | gunzip | sort > expected_file.txt set +o errexit cmp actual_file.txt expected_file.txt rc=$? set -o errexit if [[ $rc -ne 0 ]]; then - echo "The observed file ~{actual_file} differs from the expected ~{expected_file} in wc output!" + echo "The observed file ~{actual_file} differs from the expected ~{expected_file}!" exit 1; fi >>> From 4efd1f6ef996722b57425656e501adc8c59328c5 Mon Sep 17 00:00:00 2001 From: ggrant Date: Thu, 30 Jan 2025 14:24:18 -0500 Subject: [PATCH 28/31] debugging comment.. 
--- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index f044ce7c583..fb3b57da06d 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,7 +6,7 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# Debugging comment. +# Debugging comment.. workflow GvsQuickstartIntegration { input { From b6cdfed026c83577acb9bbc8611f241fe500559f Mon Sep 17 00:00:00 2001 From: ggrant Date: Fri, 31 Jan 2025 09:30:37 -0500 Subject: [PATCH 29/31] Use standard variants docker --- .dockstore.yml | 3 +++ scripts/variantstore/wdl/GvsUtils.wdl | 2 +- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index 82be0eb42fc..8e11d8bfe88 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -317,6 +317,8 @@ workflows: branches: - master - ah_var_store + - vs_1418_ploidy_for_foxtrot_vds + - gg_VS-1549_AddVATToIntegrationTests tags: - /.*/ - name: GvsIngestTieout @@ -344,6 +346,7 @@ workflows: branches: - master - ah_var_store + - vs_1418_ploidy_for_foxtrot_vds tags: - /.*/ - name: GvsCallsetStatistics diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 5f82cae5470..521770a3a1a 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -84,7 +84,7 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handlful of tasks that require the larger GNU libc-based `slim`. 
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-01-30-alpine-a29a91e48317" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-01-27-alpine-46895d996b6b" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-11-28-gatkbase-b71132a18899" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index fb3b57da06d..83a3604d73a 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,7 +6,7 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# Debugging comment.. +# Debugging comment!! workflow GvsQuickstartIntegration { input { From ecf4e2b58ccb95dbd30286ce34b451807f74451d Mon Sep 17 00:00:00 2001 From: ggrant Date: Fri, 31 Jan 2025 14:20:36 -0500 Subject: [PATCH 30/31] Remove debugging comment --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index 83a3604d73a..caee90d29f6 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -6,8 +6,6 @@ import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration import "../GvsJointVariantCalling.wdl" as JointVariantCalling import "../GvsUtils.wdl" as Utils -# Debugging comment!! 
- workflow GvsQuickstartIntegration { input { String git_branch_or_tag From 97acc254e54cfc3b1376c49630f3c5c876ed606c Mon Sep 17 00:00:00 2001 From: ggrant Date: Mon, 3 Feb 2025 13:32:53 -0500 Subject: [PATCH 31/31] Added a comment, removed branch from .dockstore.yml --- .dockstore.yml | 1 - .../variantstore/variant-annotations-table/GvsValidateVAT.wdl | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index 8e11d8bfe88..e5a98118246 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -318,7 +318,6 @@ workflows: - master - ah_var_store - vs_1418_ploidy_for_foxtrot_vds - - gg_VS-1549_AddVATToIntegrationTests tags: - /.*/ - name: GvsIngestTieout diff --git a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl index 8ff5ab06ea2..a49bc65a655 100644 --- a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl +++ b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl @@ -156,7 +156,8 @@ workflow GvsValidateVat { cloud_sdk_docker = effective_cloud_sdk_docker, } - # only check certain things if the callset is larger than 10,000 samples (a guess) + # Check if the input boolean `is_small_callset` is defined, + # if not use the `GetNumSamplesLoaded` task to find the number of samples in the callset and set the flag if it's < 10000 Boolean callset_is_small = select_first([is_small_callset, select_first([GetNumSamplesLoaded.num_samples, 1]) < 10000]) if (!callset_is_small) { call ClinvarSignificance {