Skip to content

Commit

Permalink
feat: update cnvkit pons (#1465)
Browse files Browse the repository at this point in the history
#### Added

- Added extension of target bed regions to a minimum size of 100 for CNV analysis
- PON for: Exome comprehensive 10.2 
- PON for: GMSsolid 15.2 
- PON for: GMCKsolid 4.2

#### Changed

- updated PON for GMCKSolid v4.1 
- updated PON for GMSMyeloid v5.3 
- updated PON for GMSlymphoid v7.3
  • Loading branch information
mathiasbio authored Oct 11, 2024
1 parent a50ff90 commit 0df968c
Show file tree
Hide file tree
Showing 33 changed files with 897 additions and 333 deletions.
44 changes: 44 additions & 0 deletions BALSAMIC/assets/scripts/extend_bedfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import click


@click.command()
@click.argument("input_bedfile", type=click.Path(exists=True))
@click.argument("output_bedfile", type=click.Path())
@click.option(
"--extend-to-min-region-size",
default=100,
help="Will extend regions shorter than the specified size to this minimum size.",
)
def extend_bedfile(
input_bedfile: str, output_bedfile: str, extend_to_min_region_size: int
):
"""
Process a BED file to ensure regions are at least a minimum size.
Args:
input_bedfile (str): Input BED file path.
output_bedfile (str): Output BED file path.
min_region_size (int): Minimum region size to enforce.
"""
with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
for line in infile:
fields = line.strip().split("\t")

chrom: str = fields[0]
start = int(fields[1])
end = int(fields[2])

region_length: int = end - start
if region_length < extend_to_min_region_size:
center = (start + end) // 2
half_size = extend_to_min_region_size // 2
start = max(0, center - half_size)
end = center + half_size
if extend_to_min_region_size % 2 != 0:
end += 1

outfile.write(f"{chrom}\t{start}\t{end}\n")


if __name__ == "__main__":
extend_bedfile()
102 changes: 102 additions & 0 deletions BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import click
from BALSAMIC.utils.io import read_csv, write_list_of_strings


def calculate_log2_ratio(purity, log2_ratio, ploidy):
"""Adjuts log2 ratio according to purity and ploidy.
Based on method in PureCN: https://github.com/lima1/PureCN/issues/40
Method is not used currently due to questionable results.
"""
# Ensure that the inputs are within valid ranges
if not (0 <= purity <= 1):
raise ValueError("Purity must be between 0 and 1")

if ploidy <= 0:
raise ValueError("Ploidy must be a positive number")

# Ploidy and purity adjustment formula
log2_adjusted = (
purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity)
) / (purity * ploidy)

return log2_adjusted


@click.command()
@click.option(
"-o",
"--output-file",
required=True,
type=click.Path(exists=False),
help="Name of output-file.",
)
@click.option(
"-c",
"--normalised-coverage-path",
required=True,
type=click.Path(exists=True),
help="Input CNVkit cnr. result.",
)
@click.option(
"-p",
"--tumor-purity-path",
required=True,
type=click.Path(exists=True),
help="Tumor purity file from PureCN",
)
def create_gens_cov_file(
output_file: str, normalised_coverage_path: str, tumor_purity_path: str
):
"""Post-processes the CNVkit .cnr output for upload to GENS.
Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats.
Args:
output_file: Path to GENS output.cov file
normalised_coverage_path: Path to input CNVkit cnr file.
tumor_purity_path: Path to PureCN purity estimate csv file
"""
log2_data = []

# Process CNVkit file
cnr_dict_list: list[dict] = read_csv(
csv_path=normalised_coverage_path, delimeter="\t"
)

# Process PureCN purity file
purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
purity = float(purecn_dict_list[0]["Purity"])
ploidy = float(purecn_dict_list[0]["Ploidy"])

for row in cnr_dict_list:
if row["gene"] == "Antitarget":
continue

# find midpoint
start: int = int(row["start"])
end: int = int(row["end"])
region_size: int = end - start
midpoint: int = start + int(region_size / 2)

# adjust log2 ratio
log2: float = float(row["log2"])

# De-activate purity and ploidy adjustment
# log2: float = calculate_log2_ratio(purity, log2, ploidy)
# log2: float = round(log2, 4)

# store values in list
log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")

# Write log2 data to output file
resolutions = ["o", "a", "b", "c", "d"]
resolution_log2_lines = [
f"{resolution}_{line}" for resolution in resolutions for line in log2_data
]
write_list_of_strings(resolution_log2_lines, output_file)


if __name__ == "__main__":
create_gens_cov_file()
2 changes: 1 addition & 1 deletion BALSAMIC/assets/scripts/preprocess_gens.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"-s",
"--sequencing-type",
required=True,
type=click.Choice([SequencingType.WGS]),
type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]),
help="Sequencing type used.",
)
@click.pass_context
Expand Down
38 changes: 16 additions & 22 deletions BALSAMIC/commands/config/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
get_bioinfo_tools_version,
get_panel_chrom,
get_sample_list,
get_gens_references,
)
from BALSAMIC.utils.io import read_json, write_json
from BALSAMIC.utils.utils import get_absolute_paths_dict
Expand Down Expand Up @@ -131,29 +132,22 @@ def case_config(
if cadd_annotations:
references.update(cadd_annotations_path)

if any([genome_interval, gens_coverage_pon, gnomad_min_af5]):
if panel_bed:
raise click.BadParameter(
"GENS is currently not compatible with TGA analysis, only WGS."
)
if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]):
raise click.BadParameter(
"All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS."
)

gens_ref_files = {
"genome_interval": genome_interval,
"gens_coverage_pon": gens_coverage_pon,
"gnomad_min_af5": gnomad_min_af5,
}

references.update(
{
gens_file: path
for gens_file, path in gens_ref_files.items()
if path is not None
}
if analysis_workflow is not AnalysisWorkflow.BALSAMIC_QC:
gens_references: dict[str, str] = get_gens_references(
genome_interval=genome_interval,
gens_coverage_pon=gens_coverage_pon,
gnomad_min_af5=gnomad_min_af5,
panel_bed=panel_bed,
)
if gens_references:
# Update references dictionary with GENS reference-files
references.update(
{
gens_file: path
for gens_file, path in gens_references.items()
if path is not None
}
)

variants_observations = {
"clinical_snv_observations": clinical_snv_observations,
Expand Down
24 changes: 22 additions & 2 deletions BALSAMIC/constants/cluster_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"time": "00:15:00",
"n": 1
},
"gens_preprocessing": {
"gens_preprocess": {
"time": "01:00:00",
"n": 4
},
Expand All @@ -23,6 +23,18 @@
"time": "05:00:00",
"n": 10
},
"extend_short_bedregions": {
"time": "01:00:00",
"n": 1
},
"cnvkit_create_coverage": {
"time": "6:00:00",
"n": 18
},
"cnvkit_create_targets": {
"time": "6:00:00",
"n": 2
},
"finalize_gens_outputfiles": {
"time": "01:00:00",
"n": 2
Expand Down Expand Up @@ -96,7 +108,7 @@
"time": "10:00:00",
"n": 5
},
"gatk_denoisereadcounts":{
"gatk_denoise_read_counts":{
"time": "10:00:00",
"n": 10
},
Expand Down Expand Up @@ -156,6 +168,10 @@
"time": "24:00:00",
"n": 36
},
"sentieon_DNAscope_gnomad_tga": {
"time": "24:00:00",
"n": 12
},
"sentieon_TNhaplotyper": {
"time": "24:00:00",
"n": 36
Expand Down Expand Up @@ -416,6 +432,10 @@
"time": "01:00:00",
"n": 1
},
"bedtools_merge": {
"time": "01:00:00",
"n": 1
},
"bam_compress": {
"time": "04:00:00",
"n": 20
Expand Down
4 changes: 4 additions & 0 deletions BALSAMIC/constants/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@
"snakemake_rules/umi/sentieon_consensuscall.rule",
],
"varcall": [
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
Expand Down Expand Up @@ -107,6 +109,8 @@
"snakemake_rules/umi/umi_sentieon_alignment.rule",
],
"varcall": [
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_tumor_normal.rule",
Expand Down
5 changes: 4 additions & 1 deletion BALSAMIC/constants/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
"c": 1000,
"d": 100,
},
}
},
SequencingType.TARGETED: {
"BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0},
},
},
}
3 changes: 3 additions & 0 deletions BALSAMIC/constants/workflow_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@
"bam_post_processing": {
"manta_max_base_quality": 70,
},
"bed_pre_processing": {
"minimum_region_size": 100,
},
"common": {
"header_per_lane": "'@RG\\tID:{fastq_pattern}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
"header_per_sample": "'@RG\\tID:{sample}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
Expand Down
2 changes: 1 addition & 1 deletion BALSAMIC/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ def get_final_bam_name(

if self.analysis.analysis_type == AnalysisType.PON:
# Only dedup is necessary for panel of normals
final_bam_suffix = "dedup"
final_bam_suffix = "dedup.fixmate"
elif self.analysis.sequencing_type == SequencingType.TARGETED:
# Only dedup is necessary for TGA
final_bam_suffix = "dedup.fixmate"
Expand Down
13 changes: 12 additions & 1 deletion BALSAMIC/models/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class UMIParamsTNscope(BaseModel):


class BAMPostProcessingParams(BaseModel):
"""This class defines the params settings used as constants bam post processing rules
"""This class defines the params settings used as constants bam post-processing rules.
Attributes:
manta_max_base_quality: int (required); the maximum base quality in bamfile used downstream in Manta rules
Expand All @@ -194,6 +194,16 @@ class BAMPostProcessingParams(BaseModel):
manta_max_base_quality: int


class BEDPreProcessingParams(BaseModel):
"""This class defines the params settings used as constants in bed pre-processing rules.
Attributes:
minimum_region_size: int (required); the minimum region size in input bedfiles for CNV analysis
"""

minimum_region_size: int


class BalsamicWorkflowConfig(BaseModel):
"""Defines set of rules in balsamic workflow
Expand All @@ -216,6 +226,7 @@ class BalsamicWorkflowConfig(BaseModel):
"""

bam_post_processing: BAMPostProcessingParams
bed_pre_processing: BEDPreProcessingParams
common: ParamsCommon
insert_size_metrics: ParamsInsertSizeMetrics
manta: ParamsManta
Expand Down
37 changes: 0 additions & 37 deletions BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule
Original file line number Diff line number Diff line change
@@ -1,42 +1,5 @@
"""Rules for creation of CNVkit PON."""

rule create_target:
input:
target_bait = target_bed,
refgene_flat = refgene_flat,
access_bed = access_5kb_hg19,
wake_up = result_dir + "start_analysis",
output:
target_bed = cnv_dir + "target.bed",
offtarget_bed = cnv_dir + "antitarget.bed"
singularity:
Path(singularity_image, "cnvkit.sif").as_posix()
benchmark:
Path(benchmark_dir, "cnvkit.targets.tsv").as_posix()
shell:
"""
cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.target_bed};
cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.offtarget_bed};
"""

rule create_coverage:
input:
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample),
target_bed = cnv_dir + "target.bed",
antitarget_bed = cnv_dir + "antitarget.bed"
output:
target_cnn = cnv_dir + "{sample}.targetcoverage.cnn",
antitarget_cnn = cnv_dir + "{sample}.antitargetcoverage.cnn"
singularity:
Path(singularity_image, "cnvkit.sif").as_posix()
benchmark:
Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix()
shell:
"""
cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn};
cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn};
"""

rule create_reference:
input:
cnn = expand(cnv_dir + "{sample}.{prefix}coverage.cnn", sample=config_model.get_all_sample_names(), prefix=["target", "antitarget"]),
Expand Down
Loading

0 comments on commit 0df968c

Please sign in to comment.