diff --git a/BALSAMIC/assets/scripts/extend_bedfile.py b/BALSAMIC/assets/scripts/extend_bedfile.py
new file mode 100644
index 000000000..feb2eec05
--- /dev/null
+++ b/BALSAMIC/assets/scripts/extend_bedfile.py
@@ -0,0 +1,44 @@
+import click
+
+
+@click.command()
+@click.argument("input_bedfile", type=click.Path(exists=True))
+@click.argument("output_bedfile", type=click.Path())
+@click.option(
+    "--extend-to-min-region-size",
+    default=100,
+    help="Will extend regions shorter than the specified size to this minimum size.",
+)
+def extend_bedfile(
+    input_bedfile: str, output_bedfile: str, extend_to_min_region_size: int
+):
+    """
+    Process a BED file to ensure regions are at least a minimum size.
+
+    Args:
+        input_bedfile (str): Input BED file path.
+        output_bedfile (str): Output BED file path.
+        extend_to_min_region_size (int): Minimum region size to enforce.
+    """
+    with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
+        for line in infile:
+            fields = line.strip().split("\t")
+
+            chrom: str = fields[0]
+            start = int(fields[1])
+            end = int(fields[2])
+
+            region_length: int = end - start
+            if region_length < extend_to_min_region_size:
+                center = (start + end) // 2
+                half_size = extend_to_min_region_size // 2
+                start = max(0, center - half_size)
+                end = center + half_size
+                if extend_to_min_region_size % 2 != 0:
+                    end += 1
+
+            outfile.write(f"{chrom}\t{start}\t{end}\n")
+
+
+if __name__ == "__main__":
+    extend_bedfile()
diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
new file mode 100644
index 000000000..00c23d9e6
--- /dev/null
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -0,0 +1,102 @@
+import click
+from BALSAMIC.utils.io import read_csv, write_list_of_strings
+
+
+def calculate_log2_ratio(purity, log2_ratio, ploidy):
+    """Adjusts log2 ratio according to purity and ploidy.
+
+    Based on method in PureCN: https://github.com/lima1/PureCN/issues/40
+
+    Method is not used currently due to questionable results.
+    """
+    # Ensure that the inputs are within valid ranges
+    if not (0 <= purity <= 1):
+        raise ValueError("Purity must be between 0 and 1")
+
+    if ploidy <= 0:
+        raise ValueError("Ploidy must be a positive number")
+
+    # Ploidy and purity adjustment formula
+    log2_adjusted = (
+        purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity)
+    ) / (purity * ploidy)
+
+    return log2_adjusted
+
+
+@click.command()
+@click.option(
+    "-o",
+    "--output-file",
+    required=True,
+    type=click.Path(exists=False),
+    help="Name of output-file.",
+)
+@click.option(
+    "-c",
+    "--normalised-coverage-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input CNVkit .cnr result.",
+)
+@click.option(
+    "-p",
+    "--tumor-purity-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Tumor purity file from PureCN",
+)
+def create_gens_cov_file(
+    output_file: str, normalised_coverage_path: str, tumor_purity_path: str
+):
+    """Post-processes the CNVkit .cnr output for upload to GENS.
+
+    Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats.
+
+    Args:
+        output_file: Path to GENS output.cov file
+        normalised_coverage_path: Path to input CNVkit cnr file.
+ tumor_purity_path: Path to PureCN purity estimate csv file + """ + log2_data = [] + + # Process CNVkit file + cnr_dict_list: list[dict] = read_csv( + csv_path=normalised_coverage_path, delimeter="\t" + ) + + # Process PureCN purity file + purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",") + purity = float(purecn_dict_list[0]["Purity"]) + ploidy = float(purecn_dict_list[0]["Ploidy"]) + + for row in cnr_dict_list: + if row["gene"] == "Antitarget": + continue + + # find midpoint + start: int = int(row["start"]) + end: int = int(row["end"]) + region_size: int = end - start + midpoint: int = start + int(region_size / 2) + + # adjust log2 ratio + log2: float = float(row["log2"]) + + # De-activate purity and ploidy adjustment + # log2: float = calculate_log2_ratio(purity, log2, ploidy) + # log2: float = round(log2, 4) + + # store values in list + log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") + + # Write log2 data to output file + resolutions = ["o", "a", "b", "c", "d"] + resolution_log2_lines = [ + f"{resolution}_{line}" for resolution in resolutions for line in log2_data + ] + write_list_of_strings(resolution_log2_lines, output_file) + + +if __name__ == "__main__": + create_gens_cov_file() diff --git a/BALSAMIC/assets/scripts/preprocess_gens.py b/BALSAMIC/assets/scripts/preprocess_gens.py index 0e146a3f1..e3c07a5e0 100755 --- a/BALSAMIC/assets/scripts/preprocess_gens.py +++ b/BALSAMIC/assets/scripts/preprocess_gens.py @@ -26,7 +26,7 @@ "-s", "--sequencing-type", required=True, - type=click.Choice([SequencingType.WGS]), + type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]), help="Sequencing type used.", ) @click.pass_context diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index d505f17c1..0c001b1a7 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -54,6 +54,7 @@ get_bioinfo_tools_version, get_panel_chrom, get_sample_list, + get_gens_references, ) from BALSAMIC.utils.io import read_json, write_json from BALSAMIC.utils.utils import get_absolute_paths_dict @@ -131,29 +132,22 @@ def case_config( if cadd_annotations: references.update(cadd_annotations_path) - if any([genome_interval, gens_coverage_pon, gnomad_min_af5]): - if panel_bed: - raise click.BadParameter( - "GENS is currently not compatible with TGA analysis, only WGS." - ) - if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]): - raise click.BadParameter( - "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS." 
- ) - - gens_ref_files = { - "genome_interval": genome_interval, - "gens_coverage_pon": gens_coverage_pon, - "gnomad_min_af5": gnomad_min_af5, - } - - references.update( - { - gens_file: path - for gens_file, path in gens_ref_files.items() - if path is not None - } + if analysis_workflow is not AnalysisWorkflow.BALSAMIC_QC: + gens_references: dict[str, str] = get_gens_references( + genome_interval=genome_interval, + gens_coverage_pon=gens_coverage_pon, + gnomad_min_af5=gnomad_min_af5, + panel_bed=panel_bed, ) + if gens_references: + # Update references dictionary with GENS reference-files + references.update( + { + gens_file: path + for gens_file, path in gens_references.items() + if path is not None + } + ) variants_observations = { "clinical_snv_observations": clinical_snv_observations, diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index 9dbeafe9e..236da2337 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -11,7 +11,7 @@ "time": "00:15:00", "n": 1 }, - "gens_preprocessing": { + "gens_preprocess": { "time": "01:00:00", "n": 4 }, @@ -23,6 +23,18 @@ "time": "05:00:00", "n": 10 }, + "extend_short_bedregions": { + "time": "01:00:00", + "n": 1 + }, + "cnvkit_create_coverage": { + "time": "6:00:00", + "n": 18 + }, + "cnvkit_create_targets": { + "time": "6:00:00", + "n": 2 + }, "finalize_gens_outputfiles": { "time": "01:00:00", "n": 2 @@ -96,7 +108,7 @@ "time": "10:00:00", "n": 5 }, - "gatk_denoisereadcounts":{ + "gatk_denoise_read_counts":{ "time": "10:00:00", "n": 10 }, @@ -156,6 +168,10 @@ "time": "24:00:00", "n": 36 }, + "sentieon_DNAscope_gnomad_tga": { + "time": "24:00:00", + "n": 12 + }, "sentieon_TNhaplotyper": { "time": "24:00:00", "n": 36 @@ -416,6 +432,10 @@ "time": "01:00:00", "n": 1 }, + "bedtools_merge": { + "time": "01:00:00", + "n": 1 + }, "bam_compress": { "time": "04:00:00", "n": 20 diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index a82e9f01d..1e9fc1e49 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -75,6 +75,8 @@ "snakemake_rules/umi/sentieon_consensuscall.rule", ], "varcall": [ + "snakemake_rules/variant_calling/extend_bed.rule", + "snakemake_rules/variant_calling/cnvkit_preprocess.rule", "snakemake_rules/variant_calling/germline_tga.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule", @@ -107,6 +109,8 @@ "snakemake_rules/umi/umi_sentieon_alignment.rule", ], "varcall": [ + "snakemake_rules/variant_calling/extend_bed.rule", + "snakemake_rules/variant_calling/cnvkit_preprocess.rule", "snakemake_rules/variant_calling/germline_tga.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_tumor_normal.rule", diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py index 197650df4..0fe61fc4a 100644 --- a/BALSAMIC/constants/tools.py +++ b/BALSAMIC/constants/tools.py @@ -40,6 +40,9 @@ "c": 1000, "d": 100, }, - } + }, + SequencingType.TARGETED: { + "BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0}, + }, }, } diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 0428db129..a3095b640 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -121,6 +121,9 @@ "bam_post_processing": { "manta_max_base_quality": 70, }, + "bed_pre_processing": { + "minimum_region_size": 100, + }, "common": { "header_per_lane": 
"'@RG\\tID:{fastq_pattern}\\tSM:{sample_type}\\tPL:ILLUMINAi'", "header_per_sample": "'@RG\\tID:{sample}\\tSM:{sample_type}\\tPL:ILLUMINAi'", diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py index a4f6a686c..d08962aa9 100644 --- a/BALSAMIC/models/config.py +++ b/BALSAMIC/models/config.py @@ -436,7 +436,7 @@ def get_final_bam_name( if self.analysis.analysis_type == AnalysisType.PON: # Only dedup is necessary for panel of normals - final_bam_suffix = "dedup" + final_bam_suffix = "dedup.fixmate" elif self.analysis.sequencing_type == SequencingType.TARGETED: # Only dedup is necessary for TGA final_bam_suffix = "dedup.fixmate" diff --git a/BALSAMIC/models/params.py b/BALSAMIC/models/params.py index 8c1790e9b..16a2f134f 100644 --- a/BALSAMIC/models/params.py +++ b/BALSAMIC/models/params.py @@ -185,7 +185,7 @@ class UMIParamsTNscope(BaseModel): class BAMPostProcessingParams(BaseModel): - """This class defines the params settings used as constants bam post processing rules + """This class defines the params settings used as constants bam post-processing rules. Attributes: manta_max_base_quality: int (required); the maximum base quality in bamfile used downstream in Manta rules @@ -194,6 +194,16 @@ class BAMPostProcessingParams(BaseModel): manta_max_base_quality: int +class BEDPreProcessingParams(BaseModel): + """This class defines the params settings used as constants in bed pre-processing rules. + + Attributes: + minimum_region_size: int (required); the minimum region size in input bedfiles for CNV analysis + """ + + minimum_region_size: int + + class BalsamicWorkflowConfig(BaseModel): """Defines set of rules in balsamic workflow @@ -216,6 +226,7 @@ class BalsamicWorkflowConfig(BaseModel): """ bam_post_processing: BAMPostProcessingParams + bed_pre_processing: BEDPreProcessingParams common: ParamsCommon insert_size_metrics: ParamsInsertSizeMetrics manta: ParamsManta diff --git a/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule b/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule index 1d4dbd541..a306c18f8 100644 --- a/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule +++ b/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule @@ -1,42 +1,5 @@ """Rules for creation of CNVkit PON.""" -rule create_target: - input: - target_bait = target_bed, - refgene_flat = refgene_flat, - access_bed = access_5kb_hg19, - wake_up = result_dir + "start_analysis", - output: - target_bed = cnv_dir + "target.bed", - offtarget_bed = cnv_dir + "antitarget.bed" - singularity: - Path(singularity_image, "cnvkit.sif").as_posix() - benchmark: - Path(benchmark_dir, "cnvkit.targets.tsv").as_posix() - shell: - """ -cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.target_bed}; -cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.offtarget_bed}; - """ - -rule create_coverage: - input: - bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), - target_bed = cnv_dir + "target.bed", - antitarget_bed = cnv_dir + "antitarget.bed" - output: - target_cnn = cnv_dir + "{sample}.targetcoverage.cnn", - antitarget_cnn = cnv_dir + "{sample}.antitargetcoverage.cnn" - singularity: - Path(singularity_image, "cnvkit.sif").as_posix() - benchmark: - Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix() - shell: - """ -cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn}; -cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn}; - """ - rule create_reference: input: 
cnn = expand(cnv_dir + "{sample}.{prefix}coverage.cnn", sample=config_model.get_all_sample_names(), prefix=["target", "antitarget"]), diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule new file mode 100644 index 000000000..38dcc5490 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule @@ -0,0 +1,45 @@ +rule create_target: + input: + bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(), + refgene_flat = config_model.reference["refgene_flat"], + access_bed = config_model.reference["access_regions"], + wake_up = result_dir + "start_analysis", + output: + targets = cnv_dir + "targets.bed", + antitargets = cnv_dir + "antitarget.bed", + singularity: + Path(singularity_image, "cnvkit.sif").as_posix() + threads: + get_threads(cluster_config, "cnvkit_create_targets") + benchmark: + Path(benchmark_dir, "cnvkit.targets.tsv").as_posix() + shell: + """ +cnvkit.py target {input.bed_expanded_merged} --annotate {input.refgene_flat} --split -o {output.targets}; +cnvkit.py antitarget {input.bed_expanded_merged} -g {input.access_bed} -o {output.antitargets}; + """ + +rule create_coverage: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + target_bed = cnv_dir + "targets.bed", + antitarget_bed = cnv_dir + "antitarget.bed" + output: + target_cnn = cnv_dir + "{sample}.targetcoverage.cnn", + antitarget_cnn = cnv_dir + "{sample}.antitargetcoverage.cnn" + benchmark: + Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix() + params: + case_name = config["analysis"]["case_id"], + min_mapq = params.common.min_mapq, + singularity: + Path(singularity_image, "cnvkit.sif").as_posix() + threads: + get_threads(cluster_config, "cnvkit_create_coverage") + message: + "Segmenting genomic regions using CNVkit for {params.case_name}" + shell: + """ +cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn} --min-mapq {params.min_mapq} --processes {threads} ; +cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn} --min-mapq {params.min_mapq} --processes {threads} ; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/extend_bed.rule b/BALSAMIC/snakemake_rules/variant_calling/extend_bed.rule new file mode 100644 index 000000000..453730107 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/extend_bed.rule @@ -0,0 +1,42 @@ + +rule extend_short_bedregions: + input: + baits_bed = config_model.panel.capture_kit, + wake_up= result_dir + "start_analysis", + output: + baits_bed_expanded=Path(cnv_dir + "capture_kit_expanded.bed").as_posix(), + benchmark: + Path(benchmark_dir,"extend_short_bedregions.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("pysam") + ".sif").as_posix() + params: + bedfile_extend_script = get_script_path("extend_bedfile.py"), + minimum_region_size = params.bed_pre_processing.minimum_region_size + threads: + get_threads(cluster_config, "extend_short_bedregions") + message: + "Extending regions in bedfile to a minimum size of {params.minimum_region_size}." 
+ shell: + """ +python {params.bedfile_extend_script} --extend-to-min-region-size {params.minimum_region_size} {input.baits_bed} {output.baits_bed_expanded} ; + """ + + +rule bedtools_sort_and_merge: + input: + bed_expanded = Path(cnv_dir + "capture_kit_expanded.bed").as_posix(), + output: + bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(), + benchmark: + Path(benchmark_dir, 'bedtools_merge_expanded_bedfile.tsv').as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() + threads: + get_threads(cluster_config, "bedtools_merge") + message: + "Running bedtools sort and merge to merge potentially overlapping regions." + shell: + """ +bedtools sort -i {input.bed_expanded} > {input.bed_expanded}_sorted.bed ; +bedtools merge -i {input.bed_expanded}_sorted.bed > {output.bed_expanded_merged} + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule b/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule index 9fa5ccd47..63f32ec82 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule @@ -3,7 +3,7 @@ rule gatk_collectreadcounts: input: - bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample, specified_suffix="dedup"), genome_interval = config["reference"]["genome_interval"] output: readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5" diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index 26083c6b6..d5ee5dc80 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -1,98 +1,163 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -rule sentieon_DNAscope_gnomad: - input: - ref = config["reference"]["reference_genome"], - gnomad_af5= config["reference"]["gnomad_min_af5"], - bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample) - output: - gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - pcr_model = params.common.pcr_model, - sentieon_exec = config_model.sentieon.sentieon_exec, - sentieon_lic = config_model.sentieon.sentieon_license, - sentieon_ml_dnascope = config_model.sentieon.dnascope_model, - sample = "{sample}" - benchmark: - Path(benchmark_dir, "sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() - threads: - get_threads(cluster_config, "sentieon_DNAscope_gnomad") - message: - "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" - shell: - """ -export TMPDIR={params.tmpdir}; -export SENTIEON_TMPDIR={params.tmpdir}; -export SENTIEON_LICENSE={params.sentieon_lic}; -export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; +if config["analysis"]["sequencing_type"] == SequencingType.WGS: + rule sentieon_DNAscope_gnomad: + input: + ref = config["reference"]["reference_genome"], + gnomad_af5= config["reference"]["gnomad_min_af5"], + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample) + output: + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + params: + tmpdir 
= tempfile.mkdtemp(prefix=tmp_dir), + pcr_model = params.common.pcr_model, + sentieon_exec = config_model.sentieon.sentieon_exec, + sentieon_lic = config_model.sentieon.sentieon_license, + sentieon_ml_dnascope = config_model.sentieon.dnascope_model, + sample = "{sample}" + benchmark: + Path(benchmark_dir, "sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "sentieon_DNAscope_gnomad") + message: + "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" + shell: + """ + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; + export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; + + {params.sentieon_exec} driver \ + -t {threads} \ + -r {input.ref} \ + -i {input.bam} \ + --algo DNAscope \ + --pcr_indel_mode {params.pcr_model} \ + --given {input.gnomad_af5} {output.gnomad_af5_vcf}; + + rm -rf {params.tmpdir}; + """ -{params.sentieon_exec} driver \ --t {threads} \ --r {input.ref} \ --i {input.bam} \ ---algo DNAscope \ ---pcr_indel_mode {params.pcr_model} \ ---given {input.gnomad_af5} {output.gnomad_af5_vcf}; + rule gatk_denoise_read_counts: + input: + gens_pon=config["reference"]["gens_coverage_pon"], + readcounts_hdf5=cnv_dir + "{sample}.collectreadcounts.hdf5" + output: + denoised_cr=cnv_dir + "{sample}.denoisedCR.tsv", + standardized_cr=cnv_dir + "{sample}.standardizedCR.tsv" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + sample="{sample}" + benchmark: + Path(benchmark_dir,"gatk_denoise_read_counts_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() + threads: + get_threads(cluster_config,"gatk_denoise_read_counts") + message: + "Running GATK DenoiseReadCounts on {params.sample} for GENS." + shell: + """ + export TMPDIR={params.tmpdir}; -rm -rf {params.tmpdir}; - """ + gatk --java-options "-Xmx60g" DenoiseReadCounts \ + -I {input.readcounts_hdf5} \ + --count-panel-of-normals {input.gens_pon} \ + --tmp-dir {params.tmpdir} \ + --standardized-copy-ratios {output.standardized_cr} \ + --denoised-copy-ratios {output.denoised_cr} ; -rule gatk_denoisereadcounts: - input: - gens_pon = config["reference"]["gens_coverage_pon"], - readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5" - output: - denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", - standardized_cr = cnv_dir + "{sample}.standardizedCR.tsv" - params: - tmpdir=tempfile.mkdtemp(prefix=tmp_dir), - sample="{sample}" - benchmark: - Path(benchmark_dir, "gatk_denoisereadcounts_{sample}.tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() - threads: - get_threads(cluster_config,"gatk_denoisereadcounts") - message: - "Running GATK DenoiseReadCounts on {params.sample} for GENS." 
- shell: - """ -export TMPDIR={params.tmpdir}; + rm -rf {params.tmpdir} + """ + + rule gens_preprocess_wgs: + input: + denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + output: + gens_baf_bed = cnv_dir + "{sample}.baf.bed", + gens_cov_bed = cnv_dir + "{sample}.cov.bed" + params: + gens_preprocess = get_script_path("preprocess_gens.py"), + sequencing_type = sequencing_type, + sample="{sample}" + benchmark: + Path(benchmark_dir, "gens_preprocess_wgs_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "gens_preprocess") + message: + "Formatting output for GENS for sample: {params.sample}." + shell: + """ + python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; + python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} + """ +else: + rule sentieon_DNAscope_gnomad_tga: + input: + bed=config["panel"]["capture_kit"], + ref=config["reference"]["reference_genome"], + gnomad_af5=config["reference"]["gnomad_min_af5"], + bam=lambda wildcards: config_model.get_final_bam_name(bam_dir=bam_dir,sample_name=wildcards.sample) + output: + gnomad_af5_vcf=cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + pcr_model=params.common.pcr_model, + sentieon_exec = config_model.sentieon.sentieon_exec, + sentieon_lic = config_model.sentieon.sentieon_license, + sample="{sample}" + benchmark: + Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config,"sentieon_DNAscope_gnomad_tga") + message: + "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" + shell: + """ + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; -gatk --java-options "-Xmx60g" DenoiseReadCounts \ --I {input.readcounts_hdf5} \ ---count-panel-of-normals {input.gens_pon} \ ---tmp-dir {params.tmpdir} \ ---standardized-copy-ratios {output.standardized_cr} \ ---denoised-copy-ratios {output.denoised_cr} + {params.sentieon_exec} driver \ + -t {threads} \ + -r {input.ref} \ + -i {input.bam} \ + --interval {input.bed} \ + --interval_padding 20 \ + --algo DNAscope \ + --pcr_indel_mode {params.pcr_model} \ + --given {input.gnomad_af5} {output.gnomad_af5_vcf}; -rm -rf {params.tmpdir} - """ + rm -rf {params.tmpdir}; + """ -rule gens_preprocessing: - input: - denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", - gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", - output: - gens_baf_bed = cnv_dir + "{sample}.baf.bed", - gens_cov_bed = cnv_dir + "{sample}.cov.bed" - params: - gens_preprocessing = get_script_path("preprocess_gens.py"), - sequencing_type = sequencing_type, - sample="{sample}" - benchmark: - Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() - threads: - get_threads(cluster_config, "gens_preprocessing") - message: - "Formatting output for GENS for sample: {params.sample}." 
- shell: - """ -python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} -python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} - """ + rule gens_preprocess_tga: + input: + cnvkit_cnr = cnv_dir + "tumor.merged.cnr", + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + purecn_purity_csv = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv", + output: + gens_baf_bed = cnv_dir + "{sample}.baf.bed", + gens_cov_bed = cnv_dir + "{sample}.cov.bed" + params: + sequencing_type = sequencing_type, + gens_preprocess = get_script_path("preprocess_gens.py"), + gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"), + sample="{sample}" + benchmark: + Path(benchmark_dir, "gens_preprocess_tga_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "gens_preprocess") + message: + "Formatting output for GENS for sample: {params.sample}." + shell: + """ + python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; + python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} --tumor-purity-path {input.purecn_purity_csv} + """ rule finalize_gens_outputfiles: input: @@ -113,7 +178,7 @@ rule finalize_gens_outputfiles: "Bgzip and index GENS output: {params.gens_input} for sample: {params.sample_id}." shell: """ -bgzip {input.gens_input} +bgzip {input.gens_input} ; tabix {input.gens_input}.gz """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule index 7c1ee0f6c..622d28024 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule @@ -43,20 +43,15 @@ rm -rf {params.tmpdir}; rule cnvkit_segment_CNV_research: input: access_bed = config["reference"]["access_regions"], - baits_bed = config["panel"]["capture_kit"], fasta = config["reference"]["reference_genome"], refgene_flat = config["reference"]["refgene_flat"], - bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), - bamN = config_model.get_final_bam_name(bam_dir = bam_dir,sample_name = normal_sample), + tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn", sample=tumor_sample), + tumor_antitarget_cnn=expand(cnv_dir + "{sample}.antitargetcoverage.cnn", sample=tumor_sample), + normal_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn",sample=normal_sample), + normal_antitarget_cnn=expand(cnv_dir + "{sample}.antitargetcoverage.cnn",sample=normal_sample), output: cns_initial = cnv_dir + "tumor.initial.cns", cnr = cnv_dir + "tumor.merged.cnr", - targets = cnv_dir + "targets.bed", - antitargets = cnv_dir + "antitarget.bed", - tumor_target_coverage = cnv_dir + "tumor.targetcoverage.cnn", - tumor_antitarget_coverage = cnv_dir + "tumor.antitargetcoverage.cnn", - normal_target_coverage = cnv_dir + "normal.targetcoverage.cnn", - normal_antitarget_coverage= cnv_dir + "normal.antitargetcoverage.cnn", segment = cnv_dir + "tumor.seg", benchmark: Path(f"{benchmark_dir}/cnvkit_segment_{config['analysis']['case_id']}.tsv").as_posix() @@ -70,69 +65,33 @@ rule cnvkit_segment_CNV_research: case_name = 
config["analysis"]["case_id"], sample_id = "TUMOR", normal_id = "NORMAL", - min_mapq = params.common.min_mapq, normal_reference_cnn = cnv_dir + "normal_reference.cnn", pon = pon_cnn, message: "Segmenting genomic regions using CNVkit for {params.case_name}" shell: """ -# create target and anti-target bed files -cnvkit.py target {input.baits_bed} \ ---annotate {input.refgene_flat} \ ---split \ ---output {output.targets}; - -cnvkit.py antitarget {input.baits_bed} \ ---access {input.access_bed} \ ---output {output.antitargets}; - -# calculate coverage in the given regions from BAM read depths -cnvkit.py coverage {input.bamT} \ -{output.targets} \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {output.tumor_target_coverage}; - -cnvkit.py coverage {input.bamT} \ -{output.antitargets} \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {output.tumor_antitarget_coverage}; - -cnvkit.py coverage {input.bamN} \ -{output.targets} \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {output.normal_target_coverage}; - -cnvkit.py coverage {input.bamN} \ -{output.antitargets} \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {output.normal_antitarget_coverage}; - # Combine the uncorrected target and antitarget coverage tables (.cnn) and # correct for biases in regional coverage and GC content, according to the given normal or PON reference if [[ ! -f "{params.pon}" ]]; then # Compile a coverage reference from the given list of files cnvkit.py reference \ -{output.normal_target_coverage} \ -{output.normal_antitarget_coverage} \ +{input.normal_target_cnn} \ +{input.normal_antitarget_cnn} \ --fasta {input.fasta} \ --output {params.normal_reference_cnn}; -cnvkit.py fix {output.tumor_target_coverage} \ -{output.tumor_antitarget_coverage} \ +cnvkit.py fix {input.tumor_target_cnn} \ +{input.tumor_antitarget_cnn} \ {params.normal_reference_cnn} \ --output {output.cnr}; else echo "PON reference exists- Using it for coverage correction" -cnvkit.py fix {output.tumor_target_coverage} \ -{output.tumor_antitarget_coverage} \ +cnvkit.py fix {input.tumor_target_cnn} \ +{input.tumor_antitarget_cnn} \ {params.pon} \ --output {output.cnr}; @@ -243,14 +202,11 @@ rule cnvkit_call_CNV_research: input: access_bed = config["reference"]["access_regions"], fasta = config["reference"]["reference_genome"], - baits_bed = config["panel"]["capture_kit"], refgene_flat = config["reference"]["refgene_flat"], purity_ploidy = cnv_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".purecn.purity.csv", cns_initial = cnv_dir + "tumor.initial.cns", cnr = cnv_dir + "tumor.merged.cnr", snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz", - bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), - bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), output: cns = cnv_dir + "tumor.merged.cns", gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks", diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule index b95d46df3..155e630cf 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule @@ -4,17 +4,13 @@ rule cnvkit_segment_CNV_research: input: access_bed = config["reference"]["access_regions"], - baits_bed = config["panel"]["capture_kit"], fasta = config["reference"]["reference_genome"], refgene_flat = config["reference"]["refgene_flat"], - bamT = config_model.get_final_bam_name(bam_dir = bam_dir,sample_name=tumor_sample), + tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn",sample=tumor_sample), + tumor_antitarget_cnn=expand(cnv_dir + "{sample}.antitargetcoverage.cnn",sample=tumor_sample), output: cns_initial = cnv_dir + "tumor.initial.cns", cnr = cnv_dir + "tumor.merged.cnr", - targets = cnv_dir + "targets.bed", - antitargets = cnv_dir + "antitarget.bed", - tumor_target_coverage = cnv_dir + "tumor.targetcoverage.cnn", - tumor_antitarget_coverage = cnv_dir + "tumor.antitargetcoverage.cnn", segment = cnv_dir + "tumor.seg", benchmark: Path(f"{benchmark_dir}/cnvkit_segment_{config['analysis']['case_id']}.tsv").as_posix() @@ -24,7 +20,6 @@ rule cnvkit_segment_CNV_research: get_threads(cluster_config,"cnvkit_segment_CNV_research") params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, - min_mapq = params.common.min_mapq, case_name = config["analysis"]["case_id"], sample_id = "TUMOR", flat_reference_cnn = cnv_dir + "flat_reference.cnn", @@ -33,47 +28,24 @@ rule cnvkit_segment_CNV_research: ("Segmenting genomic regions using CNVkit for {params.case_name}") shell: """ -# create target and anti-target bed files -cnvkit.py target {input.baits_bed} \ ---annotate {input.refgene_flat} \ ---split \ ---output {output.targets}; - -cnvkit.py antitarget {input.baits_bed} \ ---access {input.access_bed} \ ---output {output.antitargets}; - -# calculate coverage in the given regions from BAM read depths -cnvkit.py coverage {input.bamT} \ -{output.targets} \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {output.tumor_target_coverage}; - -cnvkit.py coverage {input.bamT} \ -{output.antitargets} \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {output.tumor_antitarget_coverage}; - # Combine the uncorrected target and antitarget coverage tables (.cnn) and # correct for biases in regional coverage and GC content, according to the given reference if [[ ! 
-f "{params.pon}" ]]; then cnvkit.py reference --output {params.flat_reference_cnn} \ --fasta {input.fasta} \ ---targets {output.targets} \ ---antitargets {output.antitargets}; +--targets {input.tumor_target_cnn} \ +--antitargets {input.tumor_antitarget_cnn}; -cnvkit.py fix {output.tumor_target_coverage} \ -{output.tumor_antitarget_coverage} \ +cnvkit.py fix {input.tumor_target_cnn} \ +{input.tumor_antitarget_cnn} \ {params.flat_reference_cnn} \ --output {output.cnr}; else echo "PON reference exists- Using it for coverage correction" -cnvkit.py fix {output.tumor_target_coverage} \ -{output.tumor_antitarget_coverage} \ +cnvkit.py fix {input.tumor_target_cnn} \ +{input.tumor_antitarget_cnn} \ {params.pon} \ --output {output.cnr}; @@ -180,7 +152,6 @@ rule cnvkit_call_CNV_research: input: access_bed = config["reference"]["access_regions"], fasta = config["reference"]["reference_genome"], - baits_bed = config["panel"]["capture_kit"], refgene_flat = config["reference"]["refgene_flat"], purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv", cns_initial= cnv_dir + "tumor.initial.cns", diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 1a292de9c..a146fa4d7 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -215,6 +215,41 @@ def bioinfo_tool_version_conda( return conda_bioinfo_version +def get_gens_references( + genome_interval: Optional[str], + gens_coverage_pon: Optional[str], + gnomad_min_af5: Optional[str], + panel_bed: Optional[str], +) -> Dict[str, str] | None: + """ + Assigns reference-files required for GENS if they have been supplied, else exists with error message. + + Args: + genome_interval: Optional[str] Coverage-regions. (required for WGS GENS) + gens_coverage_pon: Optional[str] PON for GATK CollectReadCounts. (required for WGS GENS) + gnomad_min_af5: Optional[str] gnomad VCF filtered to keep variants above 5% VAF (required for WGS and TGA GENS). + panel_bed: Optional[str] Bedfile supplied for TGA analyses. 
+ + Returns: + Dict[str, str] with paths to GENS reference files or None + """ + + if panel_bed and gnomad_min_af5: + return {"gnomad_min_af5": gnomad_min_af5} + + if gnomad_min_af5 and genome_interval and gens_coverage_pon: + return { + "genome_interval": genome_interval, + "gens_coverage_pon": gens_coverage_pon, + "gnomad_min_af5": gnomad_min_af5, + } + if any([gnomad_min_af5, genome_interval, gens_coverage_pon]): + error_message = "GENS for WGS requires all arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" + LOG.error(error_message) + raise BalsamicError(error_message) + return None + + def get_bioinfo_tools_version( bioinfo_tools: dict, container_conda_env_path: Path ) -> dict: diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py index ccc4d8a20..977c6e3b3 100644 --- a/BALSAMIC/utils/io.py +++ b/BALSAMIC/utils/io.py @@ -1,5 +1,6 @@ """Input/Output utility methods.""" +import csv import gzip import json import logging @@ -66,6 +67,20 @@ def write_json(json_obj: dict, path: str) -> None: raise OSError(f"Error while writing JSON file: {path}, error: {error}") +def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]: + """Read data from a csv file.""" + with open(csv_path, mode="r") as csv_file: + csv_reader = csv.DictReader(csv_file, delimiter=delimeter) + return [row for row in csv_reader] + + +def write_list_of_strings(list_of_strings: list[str], output_file: str): + """Writes a list of strings to a file, each on a new line.""" + with open(output_file, "w") as file: + for string in list_of_strings: + file.write(f"{string}\n") + + def read_yaml(yaml_path: str) -> dict: """Read data from a yaml file.""" if Path(yaml_path).exists(): diff --git a/BALSAMIC/workflows/PON.smk b/BALSAMIC/workflows/PON.smk index 543736aa9..8c37287b5 100644 --- a/BALSAMIC/workflows/PON.smk +++ b/BALSAMIC/workflows/PON.smk @@ -68,6 +68,8 @@ if sequence_type == SequencingType.TARGETED: rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule") rules_to_include.append("snakemake_rules/align/tga_sentieon_alignment.rule") rules_to_include.append("snakemake_rules/align/tga_bam_postprocess.rule") + rules_to_include.append("snakemake_rules/variant_calling/extend_bed.rule") + rules_to_include.append("snakemake_rules/variant_calling/cnvkit_preprocess.rule") else: rules_to_include.append("snakemake_rules/quality_control/fastp_wgs.rule") rules_to_include.append("snakemake_rules/align/wgs_sentieon_alignment.rule") diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 044a49e1d..b02ffd98d 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -9,7 +9,11 @@ from pathlib import Path from typing import Dict, List from BALSAMIC.constants.constants import FileType -from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType +from BALSAMIC.constants.analysis import ( + FastqName, + MutationType, + SampleType, + SequencingType) from BALSAMIC.constants.paths import BALSAMIC_DIR from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.variant_filters import ( @@ -90,7 +94,6 @@ delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" umi_dir: str = Path(result_dir, "umi").as_posix() + "/" umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/" - # Annotations research_annotations = [] clinical_annotations = [] @@ -495,9 +498,10 @@ if "dragen" in config: rules_to_include.append("snakemake_rules/concatenation/concatenation.rule") # Add rule for GENS -if "gens_coverage_pon" in 
config["reference"]:
-    rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
+if "gnomad_min_af5" in config["reference"]:
     rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule")
+if "gnomad_min_af5" in config["reference"] and sequence_type == SequencingType.WGS:
+    rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
 
 LOG.info(f"The following rules will be included in the workflow: {rules_to_include}")
 LOG.info(
@@ -661,10 +665,7 @@ if config["analysis"]["analysis_type"] == "single":
     )
 
 # GENS Outputs
-if (
-    config["analysis"]["sequencing_type"] == "wgs"
-    and "gens_coverage_pon" in config["reference"]
-):
+if "gnomad_min_af5" in config["reference"]:
     analysis_specific_results.extend(
         expand(
             cnv_dir + "{sample}.{gens_input}.bed.gz",
@@ -673,6 +674,7 @@ if (
         )
     )
 
+
 # Dragen
 if (
     config["analysis"]["sequencing_type"] == "wgs"
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 8fd850de3..d07b58c82 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,10 @@ Added:
 * MSI analysis to the tumor-normal workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1454
 * Sentieon install directory path to case config arguments https://github.com/Clinical-Genomics/BALSAMIC/pull/1461
 * UMI extraction and deduplication to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1358
+* GENS input files for TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1448
+* Padding of bed-regions for CNVkit to minimum 100 bases https://github.com/Clinical-Genomics/BALSAMIC/pull/1469
+* Added min mapq 20 to CNVkit PON workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1465
+* CNVkit PONs for Exome comprehensive 10.2, GMSsolid 15.2, GMCKsolid 4.2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1465
 
 Changed:
 ^^^^^^^^
@@ -14,6 +18,8 @@ Changed:
 * `SLEEP_BEFORE_START` to 600s https://github.com/Clinical-Genomics/BALSAMIC/pull/1372
 * Updated Multiqc to version 1.22.3 https://github.com/Clinical-Genomics/BALSAMIC/pull/1441
 * Upgrade `vcf2cytosure` version to 0.9.1 and remove hardcoded versions https://github.com/Clinical-Genomics/BALSAMIC/pull/1456
+* Create new PONs for GMCKSolid v4.1, GMSMyeloid v5.3, and GMSlymphoid v7.3 https://github.com/Clinical-Genomics/BALSAMIC/pull/1465
+* Refactored CNVkit rules https://github.com/Clinical-Genomics/BALSAMIC/pull/1465
 
 Removed:
 ^^^^^^^^
diff --git a/docs/balsamic_pon.rst b/docs/balsamic_pon.rst
index cdc13a473..4b11eaa8e 100644
--- a/docs/balsamic_pon.rst
+++ b/docs/balsamic_pon.rst
@@ -54,6 +54,9 @@ When creating a new PON reference file, the next steps have to be followed:
     /path/analysis/analysis_PON_finish
     /path/analysis/cnv/_CNVkit_PON_reference_.cnn
 
+.. warning::
+    The BED file from the bait set is padded to the minimum BED region size set in BALSAMIC, both when the PON is generated and during the analysis with CNVkit, to avoid CNVkit filtering out short regions.
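The warning above describes the padding performed by ``BALSAMIC/assets/scripts/extend_bedfile.py`` (added in this PR): a region shorter than the minimum size is re-centred on its midpoint and grown symmetrically until it reaches the minimum length. The sketch below mirrors that logic; ``pad_region`` is an illustrative name rather than a BALSAMIC function, and the example coordinates are taken from the test data in ``tests/scripts/test_extend_bedfile.py``.

.. code-block:: python

    def pad_region(start: int, end: int, min_size: int = 100) -> tuple:
        """Grow a BED region around its midpoint until it is at least min_size long."""
        if end - start >= min_size:
            return start, end  # long enough already, keep as-is
        center = (start + end) // 2
        half = min_size // 2
        new_start = max(0, center - half)
        # odd minimum sizes get the extra base added on the right
        new_end = center + half + (min_size % 2)
        return new_start, new_end

    # A 1 bp probe at 1:10005000-10005001 becomes the 100 bp region 1:10004950-10005050.
    print(pad_region(10005000, 10005001))  # (10004950, 10005050)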
+ Using the PON during analysis ----------------------------- diff --git a/herey b/herey new file mode 100644 index 000000000..9004fd246 --- /dev/null +++ b/herey @@ -0,0 +1,15 @@ +o_1 834184 834185 -0.015159 +o_1 1718821 1718822 -0.120548 +o_1 1720598 1720599 -0.213593 +a_1 834184 834185 -0.015159 +a_1 1718821 1718822 -0.120548 +a_1 1720598 1720599 -0.213593 +b_1 834184 834185 -0.015159 +b_1 1718821 1718822 -0.120548 +b_1 1720598 1720599 -0.213593 +c_1 834184 834185 -0.015159 +c_1 1718821 1718822 -0.120548 +c_1 1720598 1720599 -0.213593 +d_1 834184 834185 -0.015159 +d_1 1718821 1718822 -0.120548 +d_1 1720598 1720599 -0.213593 diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index 97736767d..aa76ca6a2 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -17,8 +17,7 @@ def test_tumor_normal_config( fastq_dir_tumor_normal_parameterize: str, balsamic_cache: str, panel_bed_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test tumor normal balsamic config case command.""" @@ -44,11 +43,8 @@ def test_tumor_normal_config( tumor_sample_name, "--normal-sample-name", normal_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN a config should be created and exist @@ -66,8 +62,7 @@ def test_tumor_only_config( fastq_dir_tumor_only: str, balsamic_cache: str, panel_bed_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test tumor only balsamic config case command.""" @@ -90,11 +85,8 @@ def test_tumor_only_config( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN a config should be created and exist @@ -199,8 +191,7 @@ def test_tumor_only_umi_config_background_file( balsamic_cache: str, panel_bed_file: str, background_variant_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test balsamic umi config case providing a background variants file.""" @@ -227,11 +218,8 @@ def test_tumor_only_umi_config_background_file( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga ) # THEN program exits and checks for filepath @@ -250,6 +238,7 @@ def test_pon_cnn_file( case_id_tumor_only: str, sentieon_license: str, sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test balsamic config case with a PON reference.""" @@ -274,11 +263,8 @@ def test_pon_cnn_file( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN program exits and checks for filepath @@ -390,7 +376,7 @@ def test_config_graph_failed( assert case_result.exit_code == 1 -def test_missing_required_gens_arguments( +def test_missing_required_gens_arguments_wgs( invoke_cli, tumor_sample_name: str, analysis_dir: str, @@ -432,13 +418,53 @@ def test_missing_required_gens_arguments( ], ) # THEN the CLI should exit code 2 and display an informative error message - assert result.exit_code == 2 + assert result.exit_code == 1 assert ( - "All three arguments 
(genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS." + "GENS for WGS requires all arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" in result.output ) +def test_missing_gens_arguments_tga( + invoke_cli, + tumor_sample_name: str, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_tumor_only: str, + panel_bed_file: str, + case_id_tumor_only: str, + sentieon_license: str, + sentieon_install_dir: str, +): + """Test balsamic config case without required GENS arguments.""" + + # GIVEN CLI arguments including optional GENS input-files + + # WHEN invoking the config case command + result = invoke_cli( + [ + "config", + "case", + "--case-id", + case_id_tumor_only, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, + "--sentieon-install-dir", + sentieon_install_dir, + "--sentieon-license", + sentieon_license, + ], + ) + # THEN the CLI should exit code 0 + assert result.exit_code == 0 + + def test_config_with_gens_arguments( invoke_cli, tumor_sample_name: str, @@ -523,12 +549,8 @@ def test_config_with_gens_arguments_for_tga( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--gens-coverage-pon", - gens_cov_pon_file, "--gnomad-min-af5", gens_min_5_af_gnomad_file, - "--genome-interval", - gens_hg19_interval_list, "-p", panel_bed_file, "--sentieon-install-dir", @@ -537,11 +559,11 @@ def test_config_with_gens_arguments_for_tga( sentieon_license, ], ) - # THEN config should fail with error message - assert result.exit_code == 2 - assert ( - "GENS is currently not compatible with TGA analysis, only WGS." in result.output - ) + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path( + analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}" + ).exists() def test_config_wgs_with_exome( @@ -593,8 +615,7 @@ def test_config_tga_with_exome( fastq_dir_tumor_only: str, case_id_tumor_only: str, panel_bed_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test balsamic config case with GENS arguments for TGA.""" @@ -618,11 +639,8 @@ def test_config_tga_with_exome( "-p", panel_bed_file, "--exome", - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN a config should be created and exist assert result.exit_code == 0 diff --git a/tests/conftest.py b/tests/conftest.py index e50ea436f..0b31397cc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -534,6 +534,26 @@ def gens_dummy_cov_bed(test_data_dir: str) -> str: ).as_posix() +@pytest.fixture(scope="session") +def gens_dummy_cnvkit_cnr(test_data_dir: str) -> str: + """Return path CNVkit cnr path for GENS TGA pre-processing script test.""" + return Path( + test_data_dir, + "gens_files", + "tumor.merged.cnr", + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_dummy_cov_bed_expected(test_data_dir: str) -> str: + """Return path expected dummy result-file created from GENS pre-processing TGA test.""" + return Path( + test_data_dir, + "gens_files", + "dummy_gens_tga.cov.bed", + ).as_posix() + + @pytest.fixture(scope="session") def gens_dummy_denoised_cov(test_data_dir: str) -> str: """Return path dummy coverage file for GENS pre-processing test.""" @@ -667,6 +687,12 @@ def references_dir(test_data_dir) -> Path: return Path(test_data_dir, "references") +@pytest.fixture(scope="session") +def 
bedfile_path(test_data_dir) -> Path: + """Return bedfile path.""" + return Path(test_data_dir, "bedfiles", "test_bedfile.bed") + + @pytest.fixture(scope="session") def purity_csv_path(test_data_dir) -> Path: """Return pureCN purity CSV path.""" @@ -1226,7 +1252,60 @@ def balsamic_pon_model( @pytest.fixture(scope="session") -def config_case_cli( +def config_case_cli_wgs( + balsamic_cache: str, + background_variant_file: str, + cadd_annotations: str, + swegen_snv_frequency_path: str, + swegen_sv_frequency_path: str, + clinical_snv_observations_path: str, + clinical_sv_observations_path: str, + somatic_sv_observations_path: str, + cancer_germline_snv_observations_path: str, + cancer_somatic_snv_observations_path: str, + sentieon_license: str, + sentieon_install_dir: str, + gens_hg19_interval_list: str, + gens_min_5_af_gnomad_file: str, + gens_cov_pon_file: str, +) -> List[str]: + """Return common config case CLI.""" + return [ + "--balsamic-cache", + balsamic_cache, + "--background-variants", + background_variant_file, + "--cadd-annotations", + cadd_annotations, + "--swegen-snv", + swegen_snv_frequency_path, + "--swegen-sv", + swegen_sv_frequency_path, + "--clinical-snv-observations", + clinical_snv_observations_path, + "--clinical-sv-observations", + clinical_sv_observations_path, + "--cancer-somatic-sv-observations", + somatic_sv_observations_path, + "--cancer-germline-snv-observations", + cancer_germline_snv_observations_path, + "--cancer-somatic-snv-observations", + cancer_somatic_snv_observations_path, + "--sentieon-install-dir", + sentieon_install_dir, + "--sentieon-license", + sentieon_license, + "--genome-interval", + gens_hg19_interval_list, + "--gnomad-min-af5", + gens_min_5_af_gnomad_file, + "--gens-coverage-pon", + gens_cov_pon_file, + ] + + +@pytest.fixture(scope="session") +def config_case_cli_tga( balsamic_cache: str, background_variant_file: str, cadd_annotations: str, @@ -1239,6 +1318,7 @@ def config_case_cli( cancer_somatic_snv_observations_path: str, sentieon_license: str, sentieon_install_dir: str, + gens_min_5_af_gnomad_file: str, ) -> List[str]: """Return common config case CLI.""" return [ @@ -1266,6 +1346,8 @@ def config_case_cli( sentieon_install_dir, "--sentieon-license", sentieon_license, + "--gnomad-min-af5", + gens_min_5_af_gnomad_file, ] @@ -1276,7 +1358,7 @@ def tumor_only_config_qc( fastq_dir_tumor_only_qc: str, tumor_sample_name: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" @@ -1299,7 +1381,7 @@ def tumor_only_config_qc( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1313,8 +1395,9 @@ def tumor_normal_config_qc( tumor_sample_name: str, normal_sample_name: str, analysis_dir: str, + panel_bed_file: str, fastq_dir_tumor_normal_qc: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA QC workflow.""" @@ -1336,8 +1419,10 @@ def tumor_normal_config_qc( tumor_sample_name, "--normal-sample-name", normal_sample_name, + "-p", + panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1354,7 +1439,7 @@ def tumor_normal_config_qc_wgs( fastq_dir_tumor_normal_qc_wgs: str, tumor_sample_name: str, normal_sample_name: str, - config_case_cli: List[str], + config_case_cli_wgs: List[str], ) -> str: """Invoke balsamic config sample to create sample 
configuration file for tumor-normal WGS QC workflow.""" @@ -1377,7 +1462,7 @@ def tumor_normal_config_qc_wgs( "--normal-sample-name", normal_sample_name, ] - + config_case_cli, + + config_case_cli_wgs, ) return Path( @@ -1394,7 +1479,7 @@ def tumor_only_config( analysis_dir: str, fastq_dir_tumor_only: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" @@ -1415,7 +1500,7 @@ def tumor_only_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( analysis_dir, @@ -1432,7 +1517,7 @@ def tumor_normal_config( analysis_dir: str, fastq_dir_tumor_normal: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA.""" @@ -1455,7 +1540,7 @@ def tumor_normal_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1472,7 +1557,7 @@ def tumor_only_umi_config( analysis_dir: str, fastq_dir_tumor_only: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" @@ -1495,7 +1580,7 @@ def tumor_only_umi_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1513,7 +1598,7 @@ def tumor_normal_umi_config( analysis_dir: str, fastq_dir_tumor_normal: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA.""" @@ -1540,7 +1625,7 @@ def tumor_normal_umi_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1556,7 +1641,7 @@ def tumor_only_wgs_config( analysis_dir: str, fastq_dir_tumor_only_wgs: str, tumor_sample_name: str, - config_case_cli: List[str], + config_case_cli_wgs: List[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only WGS.""" @@ -1575,7 +1660,7 @@ def tumor_only_wgs_config( "--tumor-sample-name", tumor_sample_name, ] - + config_case_cli, + + config_case_cli_wgs, ) return Path( @@ -1592,7 +1677,7 @@ def tumor_normal_wgs_config( fastq_dir_tumor_normal_wgs: str, tumor_sample_name: str, normal_sample_name: str, - config_case_cli: str, + config_case_cli_wgs: List[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS.""" @@ -1613,7 +1698,7 @@ def tumor_normal_wgs_config( "--normal-sample-name", normal_sample_name, ] - + config_case_cli, + + config_case_cli_wgs, ) return Path( @@ -1632,8 +1717,7 @@ def tumor_only_config_dummy_vep( fastq_dir_tumor_only_dummy_vep: str, panel_bed_file: str, background_variant_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA with dummy VEP file.""" @@ -1657,11 +1741,8 @@ def tumor_only_config_dummy_vep( background_variant_file, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) return Path( analysis_dir, @@ -1679,8 +1760,7 @@ def tumor_only_pon_config( panel_bed_file: str, pon_cnn_path: str, balsamic_cache: str, - 
sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: List[str], ) -> str: """Invoke balsamic PON config sample to create sample configuration file for tumor-only TGA.""" @@ -1700,15 +1780,10 @@ def tumor_only_pon_config( panel_bed_file, "--pon-cnn", pon_cnn_path, - "--balsamic-cache", - balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) return Path( diff --git a/tests/models/test_config_models.py b/tests/models/test_config_models.py index 21cb5a2b1..9793133e2 100644 --- a/tests/models/test_config_models.py +++ b/tests/models/test_config_models.py @@ -418,5 +418,5 @@ def test_get_final_bam_name_pon(balsamic_pon_model: ConfigModel): ) # Then retrieved final bam names should match the expected format and be identical regardless of request parameter - expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam" + expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.fixmate.bam" assert expected_final_bam_name == bam_name_sample_name diff --git a/tests/scripts/test_extend_bedfile.py b/tests/scripts/test_extend_bedfile.py new file mode 100644 index 000000000..a68a1e29c --- /dev/null +++ b/tests/scripts/test_extend_bedfile.py @@ -0,0 +1,57 @@ +"""Test extending bedfile.""" +from pathlib import Path + +from click.testing import CliRunner, Result + +from BALSAMIC.assets.scripts.extend_bedfile import extend_bedfile +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS + + +def test_extend_bedfile( + bedfile_path: Path, tmp_path: Path, cli_runner: CliRunner +) -> None: + """Test extending a bedfile containing regions shorter than the minimum_region_size used in CNV analysis.""" + + # GIVEN an input bedfile with regions smaller than minimum region size + + # GIVEN a minimum region size + minimum_region_size = WORKFLOW_PARAMS["bed_pre_processing"]["minimum_region_size"] + + # GIVEN an output bedfile + extended_bed_path: Path = Path(tmp_path, "test_extended_bed.bed") + + # WHEN running the extend bedfile script + result: Result = cli_runner.invoke( + extend_bedfile, + [ + "--extend-to-min-region-size", + minimum_region_size, + bedfile_path.as_posix(), + extended_bed_path.as_posix(), + ], + ) + + # THEN the output bedfile should exist + assert result.exit_code == 0 + assert extended_bed_path.is_file() + + with open(bedfile_path, "r") as input_bedfile: + lines_before = [] + for line in input_bedfile: + chrom_start_end = line.strip().split("\t") + lines_before.append(chrom_start_end) + + with open(extended_bed_path, "r") as output_bedfile: + lines_after = [] + for line in output_bedfile: + chrom_start_end = line.strip().split("\t") + lines_after.append(chrom_start_end) + + # THEN the first row should be unchanged + assert lines_before[0] == lines_after[0] + + # THEN the second row should be extended by 99 bases + assert lines_before[1] == ["1", "10005000", "10005001"] + assert lines_after[1] == ["1", "10004950", "10005050"] + region_size = int(lines_after[1][2]) - int(lines_after[1][1]) + assert region_size == minimum_region_size diff --git a/tests/scripts/test_postprocess_gens_cnvkit.py b/tests/scripts/test_postprocess_gens_cnvkit.py new file mode 100644 index 000000000..5ceb7781d --- /dev/null +++ b/tests/scripts/test_postprocess_gens_cnvkit.py @@ -0,0 +1,50 @@ +"""Test postprocessing of CNVkit output for GENS.""" +from pathlib import Path + +from click.testing import CliRunner, Result + +from BALSAMIC.assets.scripts.postprocess_gens_cnvkit
import create_gens_cov_file + + +def test_create_gens_cov_file( + gens_dummy_cnvkit_cnr, + purity_csv_path, + gens_dummy_cov_bed_expected, + tmp_path: Path, + cli_runner: CliRunner, +) -> None: + """Test postprocess CNVkit TGA output for GENS script.""" + + # GIVEN an input cnvkit cnr file and a pureCN purity csv + + # GIVEN a path to an outputfile + gens_cnvkit_cov: Path = Path(tmp_path, "gens_tga.cov.bed") + + # WHEN running the postprocess gens cnvkit script + result: Result = cli_runner.invoke( + create_gens_cov_file, + [ + "--output-file", + gens_cnvkit_cov.as_posix(), + "--normalised-coverage-path", + gens_dummy_cnvkit_cnr, + "--tumor-purity-path", + purity_csv_path, + ], + ) + + # THEN the output bedfile should exist + assert result.exit_code == 0 + assert gens_cnvkit_cov.is_file() + + # WHEN reading produced output file and expected output file + with open(gens_cnvkit_cov, "r") as actual_file: + test_output = actual_file.read() + + with open(gens_dummy_cov_bed_expected, "r") as expected_file: + expected_output = expected_file.read() + + # THEN test file and expected test file should be identical + assert ( + test_output == expected_output + ), "The expected and produced TGA GENS files do not match." diff --git a/tests/test_data/bedfiles/test_bedfile.bed b/tests/test_data/bedfiles/test_bedfile.bed new file mode 100644 index 000000000..12616d701 --- /dev/null +++ b/tests/test_data/bedfiles/test_bedfile.bed @@ -0,0 +1,2 @@ +1 100050 10005000 +1 10005000 10005001 diff --git a/tests/test_data/gens_files/dummy_gens_tga.cov.bed b/tests/test_data/gens_files/dummy_gens_tga.cov.bed new file mode 100644 index 000000000..9004fd246 --- /dev/null +++ b/tests/test_data/gens_files/dummy_gens_tga.cov.bed @@ -0,0 +1,15 @@ +o_1 834184 834185 -0.015159 +o_1 1718821 1718822 -0.120548 +o_1 1720598 1720599 -0.213593 +a_1 834184 834185 -0.015159 +a_1 1718821 1718822 -0.120548 +a_1 1720598 1720599 -0.213593 +b_1 834184 834185 -0.015159 +b_1 1718821 1718822 -0.120548 +b_1 1720598 1720599 -0.213593 +c_1 834184 834185 -0.015159 +c_1 1718821 1718822 -0.120548 +c_1 1720598 1720599 -0.213593 +d_1 834184 834185 -0.015159 +d_1 1718821 1718822 -0.120548 +d_1 1720598 1720599 -0.213593 diff --git a/tests/test_data/gens_files/tumor.merged.cnr b/tests/test_data/gens_files/tumor.merged.cnr new file mode 100644 index 000000000..dac0a16f4 --- /dev/null +++ b/tests/test_data/gens_files/tumor.merged.cnr @@ -0,0 +1,5 @@ +chromosome start end gene log2 depth weight +1 521868 563949 Antitarget 0.261883 2.07707 0.854028 +1 834125 834245 - -0.015159 4345.75 0.796757 +1 1718694 1718951 GNB1 -0.120548 3439.94 0.970694 +1 1720416 1720783 GNB1 -0.213593 3005.26 0.905559 \ No newline at end of file diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index eb0d7fbe6..3dfde35b0 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -46,8 +46,10 @@ from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError from BALSAMIC.utils.io import ( read_json, + read_csv, read_vcf_file, read_yaml, + write_list_of_strings, write_finish_file, write_json, write_sacct_to_yaml, @@ -455,6 +457,31 @@ def test_write_json(tmp_path, reference): assert len(list(tmp.iterdir())) == 1 +def test_write_list_of_strings(tmp_path): + """Test writing list of strings to file""" + + # GIVEN a list of strings and an output file path + list_of_strings = ["header1\theader2\theader3", "row1_col1\trow1_col2\trow1_col3"] + tmp = tmp_path / "tmp" + tmp.mkdir() + output_file: Path = Path(tmp / "output.csv") + + # WHEN writing list of 
strings + write_list_of_strings(list_of_strings, output_file.as_posix()) + + # THEN file should have been written + assert output_file.exists() + + # AND contain the same information + read_written_file: list[Dict] = read_csv( + csv_path=output_file.as_posix(), delimeter="\t" + ) + assert read_written_file == [ + {"header1": "row1_col1", "header2": "row1_col2", "header3": "row1_col3"} + ] + assert len(read_written_file) == 1 + + def test_write_json_error(tmp_path: Path): """Test JSON write error.""" @@ -479,6 +506,20 @@ def test_read_json(config_path: str): assert type(config_dict) is dict +def test_read_csv(purity_csv_path: str): + """Test data extraction from a CSV file.""" + + # GIVEN a CSV path + + # WHEN reading the CSV file + csv_list: list[Dict] = read_csv(purity_csv_path) + + # THEN the CSV file should be correctly parsed + assert len(csv_list) == 1 + assert csv_list[0]["Purity"] == "0.64" + assert csv_list[0]["Sampleid"] == "tumor.initial" + + def test_read_json_error(): """Test data extraction from a BALSAMIC config JSON file for an invalid path."""
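Note on the coordinates asserted in test_extend_bedfile: the second region in tests/test_data/bedfiles/test_bedfile.bed is a single base (10005000-10005001), and extending it symmetrically around its midpoint to the minimum region size yields the asserted 10004950-10005050 interval. A minimal sketch of that arithmetic, assuming the bed_pre_processing minimum_region_size resolves to 100 (the test reads the real value from WORKFLOW_PARAMS):

# Sanity check of the coordinates asserted in test_extend_bedfile (assumes a 100 bp minimum).
start, end = 10005000, 10005001    # second region of test_bedfile.bed
min_size = 100                     # assumed value of WORKFLOW_PARAMS["bed_pre_processing"]["minimum_region_size"]
center = (start + end) // 2        # 10005000
half = min_size // 2               # 50
new_start = max(0, center - half)  # 10004950
new_end = center + half            # 10005050 (even minimum size, so no extra base is needed)
assert (new_start, new_end) == (10004950, 10005050)
assert new_end - new_start == min_size  # region grows by 99 bases, from 1 bp to 100 bp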
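Similarly, the expected rows in tests/test_data/gens_files/dummy_gens_tga.cov.bed can be reproduced by hand from tumor.merged.cnr: each non-Antitarget record contributes a single-base interval ending at the region midpoint, carrying the unadjusted log2 value, repeated once per GENS resolution prefix (o, a, b, c, d). A small sketch for the first such record, under those assumptions:

# Derive the expected GENS .cov rows for the first non-Antitarget record of tumor.merged.cnr.
chrom, start, end, log2 = "1", 834125, 834245, -0.015159  # .cnr row with gene "-"
midpoint = start + (end - start) // 2                     # 834185
record = f"{chrom}\t{midpoint - 1}\t{midpoint}\t{log2}"   # 1, 834184, 834185, -0.015159
expected = [f"{prefix}_{record}" for prefix in ("o", "a", "b", "c", "d")]
# These match the o_1/a_1/b_1/c_1/d_1 834184 834185 -0.015159 lines in dummy_gens_tga.cov.bed.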
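On the conftest refactor itself: the per-workflow fixtures (config_case_cli_tga, config_case_cli_wgs) are flat lists of CLI option/value strings that each test concatenates onto its own base invocation before calling cli_runner.invoke. A purely schematic sketch of that composition; the option names and values below are illustrative only, not the full fixture contents:

# Schematic of the shared-fixture composition (illustrative names and paths, not the real fixture).
def config_case_cli_tga_example() -> list[str]:
    # mirrors the real fixture shape: a flat list of "--option value" pairs shared by TGA tests
    return ["--gnomad-min-af5", "af5.vcf.gz"]

base_invocation = ["--case-id", "tga_case", "-p", "panel.bed"]  # illustrative per-test arguments
full_invocation = base_invocation + config_case_cli_tga_example()
# cli_runner.invoke(<config case command>, full_invocation) then receives both the per-test
# arguments and the shared TGA options in a single list.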