feat: update cnvkit pons (#1465)

#### Added - Added extension of target bed regions to a minimum size of 100 for CNV analysis - PON for: Exome comprehensive 10.2 - PON for: GMSsolid 15.2 - PON for: GMCKsolid 4.2 #### Changed - updated PON for GMCKSolid v4.1 - updated PON for GMSMyeloid v5.3 - updated PON for GMSlymphoid v7.3
Clinical-Genomics · Oct 11, 2024 · 0df968c · 0df968c
1 parent a50ff90
commit 0df968c
Show file tree

Hide file tree

Showing 33 changed files with 897 additions and 333 deletions.
diff --git a/BALSAMIC/assets/scripts/extend_bedfile.py b/BALSAMIC/assets/scripts/extend_bedfile.py
@@ -0,0 +1,44 @@
+import click
+
+
+@click.command()
+@click.argument("input_bedfile", type=click.Path(exists=True))
+@click.argument("output_bedfile", type=click.Path())
+@click.option(
+    "--extend-to-min-region-size",
+    default=100,
+    help="Will extend regions shorter than the specified size to this minimum size.",
+)
+def extend_bedfile(
+    input_bedfile: str, output_bedfile: str, extend_to_min_region_size: int
+):
+    """
+    Process a BED file to ensure regions are at least a minimum size.
+
+    Args:
+        input_bedfile (str): Input BED file path.
+        output_bedfile (str): Output BED file path.
+        min_region_size (int): Minimum region size to enforce.
+    """
+    with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
+        for line in infile:
+            fields = line.strip().split("\t")
+
+            chrom: str = fields[0]
+            start = int(fields[1])
+            end = int(fields[2])
+
+            region_length: int = end - start
+            if region_length < extend_to_min_region_size:
+                center = (start + end) // 2
+                half_size = extend_to_min_region_size // 2
+                start = max(0, center - half_size)
+                end = center + half_size
+                if extend_to_min_region_size % 2 != 0:
+                    end += 1
+
+            outfile.write(f"{chrom}\t{start}\t{end}\n")
+
+
+if __name__ == "__main__":
+    extend_bedfile()
diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -0,0 +1,102 @@
+import click
+from BALSAMIC.utils.io import read_csv, write_list_of_strings
+
+
+def calculate_log2_ratio(purity, log2_ratio, ploidy):
+    """Adjuts log2 ratio according to purity and ploidy.
+
+    Based on method in PureCN: https://github.com/lima1/PureCN/issues/40
+
+    Method is not used currently due to questionable results.
+    """
+    # Ensure that the inputs are within valid ranges
+    if not (0 <= purity <= 1):
+        raise ValueError("Purity must be between 0 and 1")
+
+    if ploidy <= 0:
+        raise ValueError("Ploidy must be a positive number")
+
+    # Ploidy and purity adjustment formula
+    log2_adjusted = (
+        purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity)
+    ) / (purity * ploidy)
+
+    return log2_adjusted
+
+
+@click.command()
+@click.option(
+    "-o",
+    "--output-file",
+    required=True,
+    type=click.Path(exists=False),
+    help="Name of output-file.",
+)
+@click.option(
+    "-c",
+    "--normalised-coverage-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input CNVkit cnr. result.",
+)
+@click.option(
+    "-p",
+    "--tumor-purity-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Tumor purity file from PureCN",
+)
+def create_gens_cov_file(
+    output_file: str, normalised_coverage_path: str, tumor_purity_path: str
+):
+    """Post-processes the CNVkit .cnr output for upload to GENS.
+
+    Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats.
+
+    Args:
+        output_file: Path to GENS output.cov file
+        normalised_coverage_path: Path to input CNVkit cnr file.
+        tumor_purity_path: Path to PureCN purity estimate csv file
+    """
+    log2_data = []
+
+    # Process CNVkit file
+    cnr_dict_list: list[dict] = read_csv(
+        csv_path=normalised_coverage_path, delimeter="\t"
+    )
+
+    # Process PureCN purity file
+    purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
+    purity = float(purecn_dict_list[0]["Purity"])
+    ploidy = float(purecn_dict_list[0]["Ploidy"])
+
+    for row in cnr_dict_list:
+        if row["gene"] == "Antitarget":
+            continue
+
+        # find midpoint
+        start: int = int(row["start"])
+        end: int = int(row["end"])
+        region_size: int = end - start
+        midpoint: int = start + int(region_size / 2)
+
+        # adjust log2 ratio
+        log2: float = float(row["log2"])
+
+        # De-activate purity and ploidy adjustment
+        # log2: float = calculate_log2_ratio(purity, log2, ploidy)
+        # log2: float = round(log2, 4)
+
+        # store values in list
+        log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
+
+    # Write log2 data to output file
+    resolutions = ["o", "a", "b", "c", "d"]
+    resolution_log2_lines = [
+        f"{resolution}_{line}" for resolution in resolutions for line in log2_data
+    ]
+    write_list_of_strings(resolution_log2_lines, output_file)
+
+
+if __name__ == "__main__":
+    create_gens_cov_file()
diff --git a/BALSAMIC/assets/scripts/preprocess_gens.py b/BALSAMIC/assets/scripts/preprocess_gens.py
@@ -26,7 +26,7 @@
     "-s",
     "--sequencing-type",
     required=True,
-    type=click.Choice([SequencingType.WGS]),
+    type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]),
     help="Sequencing type used.",
 )
 @click.pass_context

diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
@@ -54,6 +54,7 @@
     get_bioinfo_tools_version,
     get_panel_chrom,
     get_sample_list,
+    get_gens_references,
 )
 from BALSAMIC.utils.io import read_json, write_json
 from BALSAMIC.utils.utils import get_absolute_paths_dict
@@ -131,29 +132,22 @@ def case_config(
     if cadd_annotations:
         references.update(cadd_annotations_path)
 
-    if any([genome_interval, gens_coverage_pon, gnomad_min_af5]):
-        if panel_bed:
-            raise click.BadParameter(
-                "GENS is currently not compatible with TGA analysis, only WGS."
-            )
-        if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]):
-            raise click.BadParameter(
-                "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS."
-            )
-
-        gens_ref_files = {
-            "genome_interval": genome_interval,
-            "gens_coverage_pon": gens_coverage_pon,
-            "gnomad_min_af5": gnomad_min_af5,
-        }
-
-        references.update(
-            {
-                gens_file: path
-                for gens_file, path in gens_ref_files.items()
-                if path is not None
-            }
+    if analysis_workflow is not AnalysisWorkflow.BALSAMIC_QC:
+        gens_references: dict[str, str] = get_gens_references(
+            genome_interval=genome_interval,
+            gens_coverage_pon=gens_coverage_pon,
+            gnomad_min_af5=gnomad_min_af5,
+            panel_bed=panel_bed,
         )
+        if gens_references:
+            # Update references dictionary with GENS reference-files
+            references.update(
+                {
+                    gens_file: path
+                    for gens_file, path in gens_references.items()
+                    if path is not None
+                }
+            )
 
     variants_observations = {
         "clinical_snv_observations": clinical_snv_observations,

diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json
@@ -11,7 +11,7 @@
 		"time": "00:15:00",
 		"n": 1
 	},
-	"gens_preprocessing": {
+	"gens_preprocess": {
 		"time": "01:00:00",
 		"n": 4
 	},
@@ -23,6 +23,18 @@
 		"time": "05:00:00",
 		"n": 10
 	},
+	"extend_short_bedregions": {
+		"time": "01:00:00",
+		"n": 1
+	},
+	"cnvkit_create_coverage": {
+		"time": "6:00:00",
+		"n": 18
+	},
+	"cnvkit_create_targets": {
+		"time": "6:00:00",
+		"n": 2
+	},
 	"finalize_gens_outputfiles": {
 		"time": "01:00:00",
 		"n": 2
@@ -96,7 +108,7 @@
 		"time": "10:00:00",
 		"n": 5
 	},
-	"gatk_denoisereadcounts":{
+	"gatk_denoise_read_counts":{
 		"time": "10:00:00",
 		"n": 10
 	},
@@ -156,6 +168,10 @@
 		"time": "24:00:00",
 		"n": 36
 	},
+	"sentieon_DNAscope_gnomad_tga": {
+		"time": "24:00:00",
+		"n": 12
+	},
 	"sentieon_TNhaplotyper": {
 		"time": "24:00:00",
 		"n": 36
@@ -416,6 +432,10 @@
 		"time": "01:00:00",
 		"n": 1
 	},
+	"bedtools_merge": {
+		"time": "01:00:00",
+		"n": 1
+	},
 	"bam_compress": {
 		"time": "04:00:00",
 		"n": 20

diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py
@@ -75,6 +75,8 @@
             "snakemake_rules/umi/sentieon_consensuscall.rule",
         ],
         "varcall": [
+            "snakemake_rules/variant_calling/extend_bed.rule",
+            "snakemake_rules/variant_calling/cnvkit_preprocess.rule",
             "snakemake_rules/variant_calling/germline_tga.rule",
             "snakemake_rules/variant_calling/split_bed.rule",
             "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
@@ -107,6 +109,8 @@
             "snakemake_rules/umi/umi_sentieon_alignment.rule",
         ],
         "varcall": [
+            "snakemake_rules/variant_calling/extend_bed.rule",
+            "snakemake_rules/variant_calling/cnvkit_preprocess.rule",
             "snakemake_rules/variant_calling/germline_tga.rule",
             "snakemake_rules/variant_calling/split_bed.rule",
             "snakemake_rules/variant_calling/somatic_tumor_normal.rule",

diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py
@@ -40,6 +40,9 @@
                 "c": 1000,
                 "d": 100,
             },
-        }
+        },
+        SequencingType.TARGETED: {
+            "BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0},
+        },
     },
 }
diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py
@@ -121,6 +121,9 @@
     "bam_post_processing": {
         "manta_max_base_quality": 70,
     },
+    "bed_pre_processing": {
+        "minimum_region_size": 100,
+    },
     "common": {
         "header_per_lane": "'@RG\\tID:{fastq_pattern}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
         "header_per_sample": "'@RG\\tID:{sample}\\tSM:{sample_type}\\tPL:ILLUMINAi'",

diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py
@@ -436,7 +436,7 @@ def get_final_bam_name(
 
         if self.analysis.analysis_type == AnalysisType.PON:
             # Only dedup is necessary for panel of normals
-            final_bam_suffix = "dedup"
+            final_bam_suffix = "dedup.fixmate"
         elif self.analysis.sequencing_type == SequencingType.TARGETED:
             # Only dedup is necessary for TGA
             final_bam_suffix = "dedup.fixmate"

diff --git a/BALSAMIC/models/params.py b/BALSAMIC/models/params.py
@@ -185,7 +185,7 @@ class UMIParamsTNscope(BaseModel):
 
 
 class BAMPostProcessingParams(BaseModel):
-    """This class defines the params settings used as constants bam post processing rules
+    """This class defines the params settings used as constants bam post-processing rules.
 
     Attributes:
        manta_max_base_quality: int (required); the maximum base quality in bamfile used downstream in Manta rules
@@ -194,6 +194,16 @@ class BAMPostProcessingParams(BaseModel):
     manta_max_base_quality: int
 
 
+class BEDPreProcessingParams(BaseModel):
+    """This class defines the params settings used as constants in bed pre-processing rules.
+
+    Attributes:
+       minimum_region_size: int (required); the minimum region size in input bedfiles for CNV analysis
+    """
+
+    minimum_region_size: int
+
+
 class BalsamicWorkflowConfig(BaseModel):
     """Defines set of rules in balsamic workflow
 
@@ -216,6 +226,7 @@ class BalsamicWorkflowConfig(BaseModel):
     """
 
     bam_post_processing: BAMPostProcessingParams
+    bed_pre_processing: BEDPreProcessingParams
     common: ParamsCommon
     insert_size_metrics: ParamsInsertSizeMetrics
     manta: ParamsManta

diff --git a/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule b/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule
@@ -1,42 +1,5 @@
 """Rules for creation of CNVkit PON."""
 
-rule create_target:
-    input:
-        target_bait = target_bed,
-        refgene_flat = refgene_flat,
-        access_bed = access_5kb_hg19,
-        wake_up = result_dir + "start_analysis",
-    output:
-        target_bed = cnv_dir + "target.bed",
-        offtarget_bed = cnv_dir + "antitarget.bed"
-    singularity:
-        Path(singularity_image, "cnvkit.sif").as_posix()
-    benchmark:
-        Path(benchmark_dir, "cnvkit.targets.tsv").as_posix()
-    shell:
-        """
-cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.target_bed};
-cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.offtarget_bed};
-        """
-
-rule create_coverage:
-    input:
-        bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample),
-        target_bed = cnv_dir + "target.bed",
-        antitarget_bed = cnv_dir + "antitarget.bed"
-    output:
-        target_cnn = cnv_dir + "{sample}.targetcoverage.cnn",
-        antitarget_cnn = cnv_dir + "{sample}.antitargetcoverage.cnn"
-    singularity:
-        Path(singularity_image, "cnvkit.sif").as_posix()
-    benchmark:
-        Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix()
-    shell:
-        """
-cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn};
-cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn};
-        """
-
 rule create_reference:
     input:
         cnn = expand(cnv_dir + "{sample}.{prefix}coverage.cnn", sample=config_model.get_all_sample_names(), prefix=["target", "antitarget"]),