From f8b55cfb58cd24f998751651f3a6c237f224677b Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 21 Nov 2022 10:16:49 +0100 Subject: [PATCH 01/25] bumo version --- vafator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vafator/__init__.py b/vafator/__init__.py index 51abfa5..61fe77d 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1 +1 @@ -VERSION='2.1.0' \ No newline at end of file +VERSION='2.2.0' \ No newline at end of file From e4cd471ba090e76435b0f88f478c9604168f77e7 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 21 Nov 2022 11:28:35 +0100 Subject: [PATCH 02/25] add counts of ambiguous bases --- README.md | 17 +++--- vafator/__init__.py | 5 +- vafator/annotator.py | 24 +++++++- vafator/pileups.py | 8 +++ vafator/tests/test_annotator.py | 103 +++++++++++++------------------- 5 files changed, 85 insertions(+), 72 deletions(-) mode change 100644 => 100755 vafator/pileups.py diff --git a/README.md b/README.md index 9cde21b..84f9c73 100755 --- a/README.md +++ b/README.md @@ -20,25 +20,28 @@ somatic variant calling, tumor evolution with multiple samples at different time |-----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------|-----------------| | Allele frequency (AF) | Ratio of reads supporting the alternate allele | {sample}_af | float | A | | Allele count (AC) | Count of reads supporting the alternate allele | {sample}_ac | int | A | +| Count ambiguous bases (N) | Count of ambiguous bases (N + all IUPAC ambiguity codes) overlapping the variant | {sample}_n | int | 1 | | Depth of coverage (DP) | Count of reads covering the position of the variant | {sample}_dp | int | 1 | | Expected allele frequency | Expected allele frequency assuming a multiplicity of the mutation m=1 (the number of DNA copies bearing a mutation) considering the given purity and ploidy/copy numbers | {sample}_eaf | float | 1 | | Probability undetected | Probability that a given somatic mutation is undetected given the total coverage, supporting reads and expected allele frequency | {sample}_pu | float | A | | Power | Power to detect a somatic mutation given the total coverage and expected allele frequency (Carter, 2012) | {sample}_pw | float | 1 | | k | Minimum number of supporting reads such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined false positive rate (FPR) (Carter, 2012) | {sample}_k | float | 1 | | Mapping quality median | Median mapping quality of reads supporting each of the reference and the alternate alleles | {sample}_mq | float | R | -| Mapping quality rank sum test | Rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rsmq | float | A | -| Mapping quality rank sum test p-value | Significance of the rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsmq_pv | float | A | +| Mapping quality rank sum test (§§) | Rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions. | {sample}_rsmq | float | A | +| Mapping quality rank sum test p-value (§§) | Significance of the rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsmq_pv | float | A | | Base call quality median | Median base call quality of reads supporting each of the reference and the alternate alleles (not available for deletions) | {sample}_bq | float | R | -| Base call quality rank sum test | Rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rsbq | float | A | -| Base call quality rank sum test p-value | Significance of the rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsbq_pv | float | A | +| Base call quality rank sum test (§§) | Rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rsbq | float | A | +| Base call quality rank sum test p-value (§§) | Significance of the rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsbq_pv | float | A | | Position median | Median position within reads supporting each of the reference and the alternate alleles (in indels this is the start position) | {sample}_pos | float | R | -| Position rank sum test | Rank sum test comparing the position distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rspos | float | A | -| Position rank sum test p-value | Significance of the rank sum test comparing the position distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rspos_pv | float | A | - +| Position rank sum test (§§) | Rank sum test comparing the position distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rspos | float | A | +| Position rank sum test p-value (§§) | Significance of the rank sum test comparing the position distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rspos_pv | float | A | § cardinality is defined as in the VCF format specification: `A` refers to one value per alternate allele, `R` refers to one value per possible allele (including the reference), `1` refers to one value. +§§ rank sum tests require at least one read supporting the reference and one read supporting the alternate + + VAFator uses cyvcf2 (Pederson, 2017) to read/write VCF files and pysam (https://github.com/pysam-developers/pysam) to read BAM files. Both libraries are cython wrappers around HTSlib (Bonfield, 2021). diff --git a/vafator/__init__.py b/vafator/__init__.py index 61fe77d..d7b3f80 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1 +1,4 @@ -VERSION='2.2.0' \ No newline at end of file +VERSION='2.2.0' + + +AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] diff --git a/vafator/annotator.py b/vafator/annotator.py index 533caaf..3f0cfdc 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -94,6 +94,13 @@ def _get_headers(input_bams: dict): 'Type': 'Integer', 'Number': 'A' }) + headers.append({ + 'ID': "{}_n".format(s), + 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " + "in the {} sample/s".format(s), + 'Type': 'Integer', + 'Number': '1' + }) headers.append({ 'ID': "{}_pu".format(s), 'Description': "Probability of an undetected mutation given the observed supporting reads (AC), " @@ -208,6 +215,10 @@ def _get_headers(input_bams: dict): {'ID': "{}_ac_{}".format(s, i), 'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n), 'Type': 'Integer', 'Number': 'A'}, + {'ID': "{}_n_{}".format(s, i), + 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " + "in the {} sample {}".format(s, n), + 'Type': 'Integer', 'Number': '1'}, {'ID': "{}_pu_{}".format(s, i), 'Description': "Probability of an undetected mutation given the observed supporting " "reads (AC), the observed total coverage (DP) and the provided tumor " @@ -289,9 +300,13 @@ def _add_stats(self, variant: Variant): coverage_metrics = get_metrics(variant=variant, pileups=pileups) if coverage_metrics is not None: if len(bams) > 1: - variant.INFO["{}_af_{}".format(sample, i + 1)] = ",".join( - [str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) for alt in variant.ALT]) - variant.INFO["{}_ac_{}".format(sample, i + 1)] = ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT]) + variant.INFO["{}_af_{}".format(sample, i + 1)] = \ + ",".join([str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) + for alt in variant.ALT]) + variant.INFO["{}_ac_{}".format(sample, i + 1)] = \ + ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT]) + variant.INFO["{}_n_{}".format(sample, i + 1)] = \ + str(sum([coverage_metrics.ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp variant.INFO["{}_pu_{}".format(sample, i + 1)] = ",".join( [str(self.power.calculate_power( @@ -337,6 +352,7 @@ def _add_stats(self, variant: Variant): variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT]) variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT]) + variant.INFO["{}_n".format(sample)] = str(sum([global_ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) variant.INFO["{}_dp".format(sample)] = global_dp variant.INFO["{}_eaf".format(sample)] = str(self.power.calculate_expected_vaf( sample=sample, variant=variant)) @@ -354,6 +370,8 @@ def _add_stats(self, variant: Variant): variant.INFO["{}_pos".format(sample)] = ",".join( [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT]) + # for these rank sum tests it is required at least one value for the alternate and one value for the + # reference otherwise it cannot be calculated pvalues, stats = get_rank_sum_tests(global_all_mqs, variant) if stats: variant.INFO["{}_rsmq".format(sample)] = ",".join(stats) diff --git a/vafator/pileups.py b/vafator/pileups.py old mode 100644 new mode 100755 index 5eac256..50d42fa --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -33,13 +33,21 @@ def get_variant_pileup( @dataclass class CoverageMetrics: + # number supporting reads of each base, including the reference ac: dict + # total depth of coverage dp: int + # median base call quality of each base, including the reference bqs: dict = None + # median mapping quality of each alternate base, including the reference mqs: dict = None + # median position within the read of each alternate base, including the reference positions: dict = None + # base call quality distribution of each base, including the reference all_bqs: dict = None + # mapping quality distribution of each base, including the reference all_mqs: dict = None + # position within the read distribution of each base, including the reference all_positions: dict = None diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py index 5a3daf6..688d3d8 100755 --- a/vafator/tests/test_annotator.py +++ b/vafator/tests/test_annotator.py @@ -10,6 +10,18 @@ from logzero import logger +EXPECTED_ANNOTATIONS = [ + 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'eaf', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', + 'rspos_pv' +] + +# replicates do not have EAF annotation +EXPECTED_ANNOTATIONS_REPLICATES = [ + 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', + 'rspos_pv' +] + + class TestAnnotator(TestCase): def test_annotator(self): @@ -27,12 +39,9 @@ def test_annotator(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("tumor_af" in info_annotations) - self.assertTrue("normal_af" in info_annotations) - self.assertTrue("tumor_ac" in info_annotations) - self.assertTrue("normal_ac" in info_annotations) - self.assertTrue("tumor_dp" in info_annotations) - self.assertTrue("normal_dp" in info_annotations) + for a in EXPECTED_ANNOTATIONS: + self.assertTrue("tumor_{}".format(a) in info_annotations) + self.assertTrue("normal_{}".format(a) in info_annotations) def test_annotator_with_multiple_bams(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -49,18 +58,11 @@ def test_annotator_with_multiple_bams(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("tumor_af_1" in info_annotations) - self.assertTrue("normal_af_1" in info_annotations) - self.assertTrue("tumor_ac_1" in info_annotations) - self.assertTrue("normal_ac_1" in info_annotations) - self.assertTrue("tumor_dp_1" in info_annotations) - self.assertTrue("normal_dp_1" in info_annotations) - self.assertTrue("tumor_af_2" in info_annotations) - self.assertTrue("normal_af_2" in info_annotations) - self.assertTrue("tumor_ac_2" in info_annotations) - self.assertTrue("normal_ac_2" in info_annotations) - self.assertTrue("tumor_dp_2" in info_annotations) - self.assertTrue("normal_dp_2" in info_annotations) + for a in EXPECTED_ANNOTATIONS_REPLICATES: + self.assertTrue("tumor_{}_1".format(a) in info_annotations, + "Missing annotation tumor_{}_1".format(a)) + self.assertTrue("normal_{}_1".format(a) in info_annotations, + "Missing annotation normal_{}_1".format(a)) def test_annotator_with_prefix(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -78,18 +80,11 @@ def test_annotator_with_prefix(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("RNA_tumor_af_1" in info_annotations) - self.assertTrue("RNA_normal_af_1" in info_annotations) - self.assertTrue("RNA_tumor_ac_1" in info_annotations) - self.assertTrue("RNA_normal_ac_1" in info_annotations) - self.assertTrue("RNA_tumor_dp_1" in info_annotations) - self.assertTrue("RNA_normal_dp_1" in info_annotations) - self.assertTrue("RNA_tumor_af_2" in info_annotations) - self.assertTrue("RNA_normal_af_2" in info_annotations) - self.assertTrue("RNA_tumor_ac_2" in info_annotations) - self.assertTrue("RNA_normal_ac_2" in info_annotations) - self.assertTrue("RNA_tumor_dp_2" in info_annotations) - self.assertTrue("RNA_normal_dp_2" in info_annotations) + for a in EXPECTED_ANNOTATIONS_REPLICATES: + self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a)) + self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a)) def test_annotator_with_mnvs(self): input_file = pkg_resources.resource_filename(__name__, "resources/test_tumor_normal.vcf") @@ -107,18 +102,11 @@ def test_annotator_with_mnvs(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("RNA_tumor_af_1" in info_annotations) - self.assertTrue("RNA_normal_af_1" in info_annotations) - self.assertTrue("RNA_tumor_ac_1" in info_annotations) - self.assertTrue("RNA_normal_ac_1" in info_annotations) - self.assertTrue("RNA_tumor_dp_1" in info_annotations) - self.assertTrue("RNA_normal_dp_1" in info_annotations) - self.assertTrue("RNA_tumor_af_2" in info_annotations) - self.assertTrue("RNA_normal_af_2" in info_annotations) - self.assertTrue("RNA_tumor_ac_2" in info_annotations) - self.assertTrue("RNA_normal_ac_2" in info_annotations) - self.assertTrue("RNA_tumor_dp_2" in info_annotations) - self.assertTrue("RNA_normal_dp_2" in info_annotations) + for a in EXPECTED_ANNOTATIONS_REPLICATES: + self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a)) + self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a)) def _get_info_at(self, input_file, chromosome, position, annotation): vcf = VCF(input_file) @@ -218,7 +206,6 @@ def test_nist(self): self.assertEqual(variant.INFO['normal_pos'][0], 95.0) self.assertEqual(variant.INFO['normal_pos'][1], 56.0) - def test_annotator_bams_order(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") @@ -236,8 +223,15 @@ def test_annotator_bams_order(self): vcf_2 = VCF(output_vcf_2) for v, v2 in zip(vcf, vcf_2): - self.assertEqual(v.INFO["normal_dp"], v2.INFO["normal_dp"]) - self.assertEqual(v.INFO["tumor_dp"], v2.INFO["tumor_dp"]) + for a in EXPECTED_ANNOTATIONS: + self.assertEqual( + v.INFO.get("normal_{}".format(a), ""), + v2.INFO.get("normal_{}".format(a), ""), + "Variant {}:{}:{}>{} is missing annotation normal_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) + self.assertEqual( + v.INFO.get("tumor_{}".format(a), ""), + v2.INFO.get("tumor_{}".format(a), ""), + "Variant {}:{}:{}>{} is missing annotation tumor_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) def test_annotator_with_purities(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -256,22 +250,9 @@ def test_annotator_with_purities(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("tumor_af" in info_annotations) - self.assertTrue("normal_af" in info_annotations) - self.assertTrue("tumor_ac" in info_annotations) - self.assertTrue("normal_ac" in info_annotations) - self.assertTrue("tumor_dp" in info_annotations) - self.assertTrue("normal_dp" in info_annotations) - self.assertTrue("tumor_pu" in info_annotations) - self.assertTrue("normal_pu" in info_annotations) - self.assertTrue("normal_eaf" in info_annotations) - self.assertTrue("tumor_eaf" in info_annotations) - self.assertTrue("tumor_pw" in info_annotations) - self.assertTrue("normal_pw" in info_annotations) - self.assertTrue("tumor_bq" in info_annotations) - self.assertTrue("normal_bq" in info_annotations) - self.assertTrue("tumor_mq" in info_annotations) - self.assertTrue("normal_mq" in info_annotations) + for a in EXPECTED_ANNOTATIONS: + self.assertTrue("tumor_{}".format(a) in info_annotations) + self.assertTrue("normal_{}".format(a) in info_annotations) annotator = Annotator( input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}, From 87c90265a28f841136a1cf393a7fac725cabeea5 Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 11:34:41 +0100 Subject: [PATCH 03/25] make scipy dependency less strict --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 requirements.txt diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 index 89d2b56..aa18723 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ pysam~=0.19.1 cyvcf2~=0.30.14 logzero~=1.7.0 pybedtools~=0.9.0 -scipy~=1.8.1 \ No newline at end of file +scipy>=1.0.0,<2.0.0 \ No newline at end of file From 0ed6f918e7f4a291c23cc40ac8b925ae2ff2b643 Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 11:59:55 +0100 Subject: [PATCH 04/25] add option to exclude ambiguous bases from the depth of coverage --- README.md | 49 +++++++++++++++++++++++++---------------- vafator/annotator.py | 8 +++++-- vafator/command_line.py | 5 ++++- vafator/pileups.py | 13 +++++++---- 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 84f9c73..168fc51 100755 --- a/README.md +++ b/README.md @@ -67,9 +67,26 @@ vafator --input-vcf /path/yo/your.vcf \ ``` This will add annotations for each of the three samples `normal`, `primary` and `metastasis`: `normal_ac`, -`normal_dp`, `normal_af`, `normal_pw`, `primary_ac`, `primary_dp`, etc. +`normal_dp`, `normal_af`, `normal_pw`, `primary_ac`, `primary_dp`, etc. -## Support for technical replicates +### Support for indels + +VAFator provides equivalent annotations for indels. Depth of coverage and allele frequency are calculated on the +position immediately before the indel. Only insertions and deletions as recorded in the CIGAR matching the respective +coordinates and sequence from the VCF file are taken into account. Any read supporting a similar but not identical indel +is not counted. + +**NOTE**: multiallelic mutations are not supported for indels, the indel in the multiallelic position will be +annotated with null values. This problem can be circumvented by using the Nextflow normalization pipeline described above. + +### Support for MNVs + +Not supported at the moment when not decomposed. + +If running the nextflow pipeline indicated above, MNVs and complex variants are by default decomposed and hence +correctly annotated by VAFator. + +### Support for technical replicates If more than one BAM for the same sample is provided then the annotations are calculated across all BAMs and for also each of them separately (eg: `primary_af` provides the allele frequency across all primary tumor BAMs, @@ -172,6 +189,17 @@ The statistic value will be close to zero for similar distributions and further The significance value corresponds to the null hypothesis of similar distributions. No multiple test correction is applied over this p-value. +### Ambiguous bases + +Some reads may contain ambiguous bases with high base call quality scores. +The count of all reads passing the quality thresholds that contain an +ambiguous base overlapping the mutation is annotated. +All IUPAC ambiguity codes are taken into account. + +Furthermore, these reads supporting ambiguous bases are taken into account in the depth of coverage (DP) +and hence they may dilute the VAF values. In order to exclude those from the depth of coverage use the flag +`--exclude-ambiguous-bases`. Only SNVs are supported. + ## Understanding the output The output is a VCF with the some new annotations in the INFO field for the provided sample names. @@ -211,23 +239,6 @@ nextflow run tron-bioinformatics/tronflow-vcf-postprocessing -r 2.2.0 -profile c See https://github.com/TRON-Bioinformatics/tronflow-vcf-postprocessing for more details -## Support for indels - -VAFator provides equivalent annotations for indels. Depth of coverage and allele frequency are calculated on the -position immediately before the indel. Only insertions and deletions as recorded in the CIGAR matching the respective -coordinates and sequence from the VCF file are taken into account. Any read supporting a similar but not identical indel -is not counted. - -**NOTE**: multiallelic mutations are not supported for indels, the indel in the multiallelic position will be -annotated with null values. This problem can be circumvented by using the Nextflow normalization pipeline described above. - -## Support for MNVs - -Not supported at the moment when not decomposed. - -If running the nextflow pipeline indicated above, MNVs and complex variants are by default decomposed and hence -correctly annotated by VAFator. - ## Bibliography - Pedersen, B. S., & Quinlan, A. R. (2017). cyvcf2: fast, flexible variant analysis with Python. Bioinformatics, 33(12), 1867–1869. https://doi.org/10.1093/BIOINFORMATICS/BTX057 diff --git a/vafator/annotator.py b/vafator/annotator.py index 3f0cfdc..baf28f8 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -42,13 +42,15 @@ def __init__(self, input_vcf: str, output_vcf: str, tumor_ploidies: dict = {}, normal_ploidy=2, fpr=DEFAULT_FPR, - error_rate=DEFAULT_ERROR_RATE): + error_rate=DEFAULT_ERROR_RATE, + exclude_ambiguous_bases=False): self.mapping_quality_threshold = mapping_qual_thr self.base_call_quality_threshold = base_call_qual_thr self.purities = purities self.tumor_ploidies = tumor_ploidies self.normal_ploidy = normal_ploidy + self.exclude_ambiguous_bases = exclude_ambiguous_bases self.power = PowerCalculator( normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities, error_rate=error_rate, fpr=fpr) @@ -64,6 +66,7 @@ def __init__(self, input_vcf: str, output_vcf: str, self.vafator_header["purities"] = ";".join(["{}:{}".format(s, p) for s, p in purities.items()]) self.vafator_header["normal_ploidy"] = normal_ploidy self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p) for s, p in tumor_ploidies.items()]) + self.vafator_header["exclude_ambiguous_bases"] = self.exclude_ambiguous_bases self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) # adds to the header all the names of the annotations for a in Annotator._get_headers(input_bams): @@ -297,7 +300,8 @@ def _add_stats(self, variant: Variant): variant=variant, bam=bam, min_base_quality=self.base_call_quality_threshold, min_mapping_quality=self.mapping_quality_threshold) - coverage_metrics = get_metrics(variant=variant, pileups=pileups) + coverage_metrics = get_metrics(variant=variant, pileups=pileups, + exclude_ambiguous_bases=self.exclude_ambiguous_bases) if coverage_metrics is not None: if len(bams) > 1: variant.INFO["{}_af_{}".format(sample, i + 1)] = \ diff --git a/vafator/command_line.py b/vafator/command_line.py index edcdec3..cafe343 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -48,6 +48,8 @@ def annotator(): help="False Positive Rate (FPR) to use in the power calculation") parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float, help="Error rate to use in the power calculation") + parser.add_argument("--exclude-ambiguous-bases", dest="exclude_ambiguous_bases", action='store_true', + help="Flag indicating to exclude ambiguous bases from the DP calculation") args = parser.parse_args() @@ -96,7 +98,8 @@ def annotator(): tumor_ploidies=tumor_ploidies, normal_ploidy=int(args.normal_ploidy), fpr=args.fpr, - error_rate=args.error_rate + error_rate=args.error_rate, + exclude_ambiguous_bases=args.exclude_ambiguous_bases ) annotator.run() except Exception as e: diff --git a/vafator/pileups.py b/vafator/pileups.py index 50d42fa..288d2dc 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -3,6 +3,8 @@ from typing import Union from cyvcf2 import Variant from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile + +from vafator import AMBIGUOUS_BASES from vafator.tests.utils import VafatorVariant import numpy as np @@ -51,9 +53,9 @@ class CoverageMetrics: all_positions: dict = None -def get_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics: +def get_metrics(variant: Variant, pileups: IteratorColumnRegion, exclude_ambiguous_bases=False) -> CoverageMetrics: if is_snp(variant): - return get_snv_metrics(pileups) + return get_snv_metrics(pileups, exclude_ambiguous_bases) elif is_insertion(variant): return get_insertion_metrics(variant, pileups) elif is_deletion(variant): @@ -166,7 +168,7 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov ) -def get_snv_metrics(pileups: IteratorColumnRegion) -> CoverageMetrics: +def get_snv_metrics(pileups: IteratorColumnRegion, exclude_ambiguous_bases=False) -> CoverageMetrics: try: pileup = next(pileups) bases = [s.upper() for s in pileup.get_query_sequences()] @@ -178,7 +180,10 @@ def get_snv_metrics(pileups: IteratorColumnRegion) -> CoverageMetrics: all_mqs = aggregate_list_per_base(bases, pileup.get_mapping_qualities()) all_positions = aggregate_list_per_base(bases, pileup.get_query_positions()) - dp = len(bases) + if exclude_ambiguous_bases: + dp = len([b for b in bases if b not in AMBIGUOUS_BASES]) + else: + dp = len(bases) ac = Counter(bases) except StopIteration: # no reads From 77549e33d13cf0efbf562b9bfed92c618c9445d3 Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 12:06:43 +0100 Subject: [PATCH 05/25] change installation of bedtools in CI environment to conda --- .github/workflows/integration_tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 .github/workflows/integration_tests.yml diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml old mode 100644 new mode 100755 index 35f503f..434bb53 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -19,9 +19,11 @@ jobs: - name: Install dependencies run: | sudo apt-get update - sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev bedtools + sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev python -m pip install --upgrade pip pip install setuptools wheel + # this is needed by pybedtools + conda install --yes bedtools # this is needed by cyvcf2 and pysam conda install --yes htslib=1.14 - name: Install vafator From 207369e8610f4b370170b3f0a08ab3e9b9fbcdf8 Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 13:40:05 +0100 Subject: [PATCH 06/25] try installing pybedtools through conda --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 434bb53..8d5ade3 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -23,7 +23,7 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - conda install --yes bedtools + conda install --yes bedtools pybedtools # this is needed by cyvcf2 and pysam conda install --yes htslib=1.14 - name: Install vafator From 9ac36d711f21d168855fa45f319edcbc37402880 Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 14:05:29 +0100 Subject: [PATCH 07/25] change how dependencies are installed --- .github/workflows/integration_tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 8d5ade3..17cc1bd 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -23,13 +23,15 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - conda install --yes bedtools pybedtools + conda install --yes bedtools # this is needed by cyvcf2 and pysam conda install --yes htslib=1.14 + # install python requirements + pip install -r requirements.txt - name: Install vafator run: | python setup.py bdist_wheel - pip install dist/* + pip install dist/vafator-*.whl - name: Run integration tests run: | make clean integration_tests \ No newline at end of file From 51f4328063beacedd991669f7a8a072356eab405 Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 15:15:05 +0100 Subject: [PATCH 08/25] #39 fix issue with wrong pvalues in replicates --- vafator/annotator.py | 4 +- vafator/tests/test_annotator.py | 67 ++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index baf28f8..bdcb4c6 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -338,12 +338,12 @@ def _add_stats(self, variant: Variant): pvalues, stats = get_rank_sum_tests(coverage_metrics.all_bqs, variant) if stats: variant.INFO["{}_rsbq_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(stats) + variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(pvalues) pvalues, stats = get_rank_sum_tests(coverage_metrics.all_positions, variant) if stats: variant.INFO["{}_rspos_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(stats) + variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(pvalues) global_ac.update(coverage_metrics.ac) global_bq.update(coverage_metrics.bqs) diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py index 688d3d8..98ab92d 100755 --- a/vafator/tests/test_annotator.py +++ b/vafator/tests/test_annotator.py @@ -1,4 +1,6 @@ import os + +import numpy as np import pkg_resources from unittest import TestCase from cyvcf2 import VCF @@ -132,7 +134,8 @@ def test_nist(self): duration = time.time() - start logger.info("Duration {} seconds".format(round(duration, 3))) - self.assertTrue(os.path.exists(output_vcf)) + self._assert_vafator_vcf(output_vcf, sample_name='normal') + n_variants_input = test_utils._get_count_variants(input_file) n_variants_output = test_utils._get_count_variants(output_vcf) self.assertTrue(n_variants_input == n_variants_output) @@ -206,6 +209,24 @@ def test_nist(self): self.assertEqual(variant.INFO['normal_pos'][0], 95.0) self.assertEqual(variant.INFO['normal_pos'][1], 56.0) + def test_nist_with_replicates(self): + input_file = pkg_resources.resource_filename( + __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf") + bam_file = pkg_resources.resource_filename( + __name__, + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + start = time.time() + annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]}) + annotator.run() + duration = time.time() - start + logger.info("Duration {} seconds".format(round(duration, 3))) + + self._assert_vafator_vcf(output_vcf, sample_name='normal') + self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=1) + self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=2) + def test_annotator_bams_order(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") @@ -219,6 +240,11 @@ def test_annotator_bams_order(self): self.assertTrue(os.path.exists(output_vcf)) self.assertTrue(os.path.exists(output_vcf_2)) + self._assert_vafator_vcf(output_vcf, sample_name='normal') + self._assert_vafator_vcf(output_vcf, sample_name='tumor') + self._assert_vafator_vcf(output_vcf_2, sample_name='normal') + self._assert_vafator_vcf(output_vcf_2, sample_name='tumor') + vcf = VCF(output_vcf) vcf_2 = VCF(output_vcf_2) @@ -259,3 +285,42 @@ def test_annotator_with_purities(self): purities={"tumor": 0.2}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)} ) annotator.run() + + def _assert_vafator_vcf(self, vcf_filename, sample_name, replicate=None): + self.assertTrue(os.path.exists(vcf_filename)) + vcf = VCF(vcf_filename) + for v in vcf: + # p-values or VAFs + self._assert_probability(v.INFO.get(self._get_annotation_name('rsbq_pv', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('rsmq_pv', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('rspos_pv', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('eaf', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('af', sample_name, replicate=replicate), 0)) + + # positive integer annotations + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('ac', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('dp', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('mq', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('bq', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('pos', sample_name, replicate=replicate), 0)) + vcf.close() + + def _get_annotation_name(self, annotation_name, sample_name, replicate=None): + annotation_name = "{}_{}".format(sample_name, annotation_name) + if replicate: + annotation_name = "{}_{}".format(annotation_name, replicate) + return annotation_name + + def _assert_probability(self, annotation): + if isinstance(annotation, list) or isinstance(annotation, tuple): + for a in annotation: + self.assertTrue(0.0 <= float(a) <= 1.0, "Expected probability has a value of {}".format(a)) + else: + self.assertTrue(0.0 <= float(annotation) <= 1.0, "Expected probability has a value of {}".format(annotation)) + + def _assert_positive_integer(self, annotation): + if isinstance(annotation, list) or isinstance(annotation, tuple): + for a in annotation: + self.assertTrue(np.isnan(a) or 0.0 <= a, "Expected positive integer has a value of {}".format(a)) + else: + self.assertTrue(np.isnan(annotation) or 0.0 <= annotation, "Expected positive integer has a value of {}".format(annotation)) From ff1da8da1a4fac9ce1582a536d792ef3e5b47c9b Mon Sep 17 00:00:00 2001 From: priesgo Date: Mon, 21 Nov 2022 15:30:25 +0100 Subject: [PATCH 09/25] #35 fix vafator header about tumor ploidies --- vafator/annotator.py | 5 ++++- vafator/ploidies.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) mode change 100644 => 100755 vafator/ploidies.py diff --git a/vafator/annotator.py b/vafator/annotator.py index bdcb4c6..ae803f2 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -10,6 +10,7 @@ import asyncio import time +from vafator.ploidies import DEFAULT_PLOIDY from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR from vafator.pileups import get_variant_pileup, get_metrics @@ -65,7 +66,9 @@ def __init__(self, input_vcf: str, output_vcf: str, self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr self.vafator_header["purities"] = ";".join(["{}:{}".format(s, p) for s, p in purities.items()]) self.vafator_header["normal_ploidy"] = normal_ploidy - self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p) for s, p in tumor_ploidies.items()]) + self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p.report_value) + for s, p in tumor_ploidies.items()]) \ + if tumor_ploidies else DEFAULT_PLOIDY self.vafator_header["exclude_ambiguous_bases"] = self.exclude_ambiguous_bases self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) # adds to the header all the names of the annotations diff --git a/vafator/ploidies.py b/vafator/ploidies.py old mode 100644 new mode 100755 index b114d85..b9f9099 --- a/vafator/ploidies.py +++ b/vafator/ploidies.py @@ -15,6 +15,7 @@ def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = D if local_copy_numbers is not None and not os.path.exists(local_copy_numbers): raise ValueError('The provided tumor ploidy is neither a copy number value or a BED file with copy ' 'numbers') + self.report_value = local_copy_numbers if local_copy_numbers else genome_wide_ploidy self.bed = pd.read_csv(local_copy_numbers, sep='\t', names=['chromosome', 'start', 'end', 'copy_number']) \ if local_copy_numbers is not None else None self.ploidy = genome_wide_ploidy From 516759ad04be8692201120de822c659fe5730ccd Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 16 Dec 2022 18:17:43 +0100 Subject: [PATCH 10/25] switch from defaulting to exclude ambiguous bases --- vafator/annotator.py | 8 ++++---- vafator/command_line.py | 6 +++--- vafator/pileups.py | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index ae803f2..00f8293 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -44,14 +44,14 @@ def __init__(self, input_vcf: str, output_vcf: str, normal_ploidy=2, fpr=DEFAULT_FPR, error_rate=DEFAULT_ERROR_RATE, - exclude_ambiguous_bases=False): + include_ambiguous_bases=False): self.mapping_quality_threshold = mapping_qual_thr self.base_call_quality_threshold = base_call_qual_thr self.purities = purities self.tumor_ploidies = tumor_ploidies self.normal_ploidy = normal_ploidy - self.exclude_ambiguous_bases = exclude_ambiguous_bases + self.include_ambiguous_bases = include_ambiguous_bases self.power = PowerCalculator( normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities, error_rate=error_rate, fpr=fpr) @@ -69,7 +69,7 @@ def __init__(self, input_vcf: str, output_vcf: str, self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p.report_value) for s, p in tumor_ploidies.items()]) \ if tumor_ploidies else DEFAULT_PLOIDY - self.vafator_header["exclude_ambiguous_bases"] = self.exclude_ambiguous_bases + self.vafator_header["include_ambiguous_bases"] = self.include_ambiguous_bases self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) # adds to the header all the names of the annotations for a in Annotator._get_headers(input_bams): @@ -304,7 +304,7 @@ def _add_stats(self, variant: Variant): min_base_quality=self.base_call_quality_threshold, min_mapping_quality=self.mapping_quality_threshold) coverage_metrics = get_metrics(variant=variant, pileups=pileups, - exclude_ambiguous_bases=self.exclude_ambiguous_bases) + include_ambiguous_bases=self.include_ambiguous_bases) if coverage_metrics is not None: if len(bams) > 1: variant.INFO["{}_af_{}".format(sample, i + 1)] = \ diff --git a/vafator/command_line.py b/vafator/command_line.py index cafe343..06cdd85 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -48,8 +48,8 @@ def annotator(): help="False Positive Rate (FPR) to use in the power calculation") parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float, help="Error rate to use in the power calculation") - parser.add_argument("--exclude-ambiguous-bases", dest="exclude_ambiguous_bases", action='store_true', - help="Flag indicating to exclude ambiguous bases from the DP calculation") + parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true', + help="Flag indicating to include ambiguous bases from the DP calculation") args = parser.parse_args() @@ -99,7 +99,7 @@ def annotator(): normal_ploidy=int(args.normal_ploidy), fpr=args.fpr, error_rate=args.error_rate, - exclude_ambiguous_bases=args.exclude_ambiguous_bases + include_ambiguous_bases=args.include_ambiguous_bases ) annotator.run() except Exception as e: diff --git a/vafator/pileups.py b/vafator/pileups.py index 288d2dc..d285bf4 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -53,9 +53,9 @@ class CoverageMetrics: all_positions: dict = None -def get_metrics(variant: Variant, pileups: IteratorColumnRegion, exclude_ambiguous_bases=False) -> CoverageMetrics: +def get_metrics(variant: Variant, pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: if is_snp(variant): - return get_snv_metrics(pileups, exclude_ambiguous_bases) + return get_snv_metrics(pileups, include_ambiguous_bases) elif is_insertion(variant): return get_insertion_metrics(variant, pileups) elif is_deletion(variant): @@ -168,7 +168,7 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov ) -def get_snv_metrics(pileups: IteratorColumnRegion, exclude_ambiguous_bases=False) -> CoverageMetrics: +def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: try: pileup = next(pileups) bases = [s.upper() for s in pileup.get_query_sequences()] @@ -180,10 +180,10 @@ def get_snv_metrics(pileups: IteratorColumnRegion, exclude_ambiguous_bases=False all_mqs = aggregate_list_per_base(bases, pileup.get_mapping_qualities()) all_positions = aggregate_list_per_base(bases, pileup.get_query_positions()) - if exclude_ambiguous_bases: - dp = len([b for b in bases if b not in AMBIGUOUS_BASES]) - else: + if include_ambiguous_bases: dp = len(bases) + else: + dp = len([b for b in bases if b not in AMBIGUOUS_BASES]) ac = Counter(bases) except StopIteration: # no reads From 2d09976399187197259cfd29ec4db2ef7fd1aebf Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 16 Dec 2022 18:19:56 +0100 Subject: [PATCH 11/25] update documentation with last change --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 168fc51..0e40335 100755 --- a/README.md +++ b/README.md @@ -193,12 +193,12 @@ No multiple test correction is applied over this p-value. Some reads may contain ambiguous bases with high base call quality scores. The count of all reads passing the quality thresholds that contain an -ambiguous base overlapping the mutation is annotated. +ambiguous base overlapping the mutation are annotated. All IUPAC ambiguity codes are taken into account. -Furthermore, these reads supporting ambiguous bases are taken into account in the depth of coverage (DP) -and hence they may dilute the VAF values. In order to exclude those from the depth of coverage use the flag -`--exclude-ambiguous-bases`. Only SNVs are supported. +Also, these reads supporting ambiguous bases are not taken into account in the depth of coverage (DP) +as they may dilute the VAF values. In order to include those into the depth of coverage use the flag +`--include-ambiguous-bases`. Only SNVs are supported. ## Understanding the output From c7b10ca9d95520e76de9bfbe95af75bcd81cda7a Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 10:31:05 +0100 Subject: [PATCH 12/25] fix style codacy issue --- vafator/tests/test_annotator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py index 98ab92d..7bcb716 100755 --- a/vafator/tests/test_annotator.py +++ b/vafator/tests/test_annotator.py @@ -305,7 +305,8 @@ def _assert_vafator_vcf(self, vcf_filename, sample_name, replicate=None): self._assert_positive_integer(v.INFO.get(self._get_annotation_name('pos', sample_name, replicate=replicate), 0)) vcf.close() - def _get_annotation_name(self, annotation_name, sample_name, replicate=None): + @staticmethod + def _get_annotation_name(annotation_name, sample_name, replicate=None): annotation_name = "{}_{}".format(sample_name, annotation_name) if replicate: annotation_name = "{}_{}".format(annotation_name, replicate) From b1fa4136c7718ff59b2a6f847c306303c1481c4b Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 10:40:19 +0100 Subject: [PATCH 13/25] remove support for python 3.10 --- setup.py | 1 - tox.ini | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1025dd9..1e63e44 100755 --- a/setup.py +++ b/setup.py @@ -42,7 +42,6 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3 :: Only', "License :: OSI Approved :: MIT License", "Operating System :: Unix" diff --git a/tox.ini b/tox.ini index b14bfb1..3c15d80 100644 --- a/tox.ini +++ b/tox.ini @@ -1,12 +1,11 @@ [tox] -envlist = {py37, py38, py39, py310} +envlist = {py37, py38, py39} [gh-actions] python = 3.7: py37 3.8: py38 3.9: py39 - 3.10: py310 [testenv] wheel = true From 7eb18afd457f2fd5495939a875e4a91b40c3ae93 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 10:42:49 +0100 Subject: [PATCH 14/25] remove support for python 3.10 --- .github/workflows/integration_tests.yml | 2 +- .github/workflows/unit_tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 17cc1bd..a918939 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9, 3.10 ] + python-version: [ 3.7, 3.8, 3.9 ] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 855d23c..1fe39fd 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9, 3.10 ] + python-version: [ 3.7, 3.8, 3.9 ] steps: - uses: actions/checkout@v2 From 623f36b3c8393fca4e0355feef8ae626332781d9 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 10:59:38 +0100 Subject: [PATCH 15/25] try removing some dependencies to fix the integration tests --- .github/workflows/integration_tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index a918939..d2380a1 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -19,13 +19,13 @@ jobs: - name: Install dependencies run: | sudo apt-get update - sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev + #sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - conda install --yes bedtools + #conda install --yes bedtools # this is needed by cyvcf2 and pysam - conda install --yes htslib=1.14 + #conda install --yes htslib=1.14 # install python requirements pip install -r requirements.txt - name: Install vafator From 7d692f30451596b756637f6c581d8e9318827a73 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 11:17:26 +0100 Subject: [PATCH 16/25] add coverage to unit tests + install some basic dependencies for integration tests --- .github/workflows/integration_tests.yml | 2 +- .github/workflows/unit_tests.yml | 6 ++++-- tox.ini | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index d2380a1..496082d 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -19,7 +19,7 @@ jobs: - name: Install dependencies run: | sudo apt-get update - #sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev + sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1fe39fd..24b99e3 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -15,8 +15,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel + pip install setuptools wheel coverage pip install virtualenv tox==3.26.0 tox-wheel==1.0.0 tox-gh-actions - name: Build, install and run unit tests run: | - tox \ No newline at end of file + tox + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 \ No newline at end of file diff --git a/tox.ini b/tox.ini index 3c15d80..2e9ba56 100644 --- a/tox.ini +++ b/tox.ini @@ -12,4 +12,4 @@ wheel = true passenv = * commands= pip install -r requirements.txt - python -m unittest discover vafator.tests \ No newline at end of file + coverage run -m unittest discover vafator.tests \ No newline at end of file From 7f1a589cc537557fc5ddfa0e00337c6f3cabe776 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 11:54:48 +0100 Subject: [PATCH 17/25] add back in bedtools dependency --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 496082d..6664274 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -23,7 +23,7 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - #conda install --yes bedtools + conda install --yes bedtools # this is needed by cyvcf2 and pysam #conda install --yes htslib=1.14 # install python requirements From 708ca675e46a1004a092fae1b6ef493567a34427 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 13:48:11 +0100 Subject: [PATCH 18/25] update the matrix of python versions --- .github/workflows/integration_tests.yml | 7 +++++-- .github/workflows/unit_tests.yml | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 6664274..128f355 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -11,7 +11,10 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true @@ -23,7 +26,7 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - conda install --yes bedtools + #conda install --yes bedtools # this is needed by cyvcf2 and pysam #conda install --yes htslib=1.14 # install python requirements diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 24b99e3..131e817 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,7 +11,10 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip From da02eeb5e69227844e04cc27f20fdd34088102f0 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 13:50:36 +0100 Subject: [PATCH 19/25] remove the use of tox --- .github/workflows/unit_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 131e817..6800733 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -22,6 +22,7 @@ jobs: pip install virtualenv tox==3.26.0 tox-wheel==1.0.0 tox-gh-actions - name: Build, install and run unit tests run: | - tox + pip install -r requirements.txt + coverage run -m unittest discover vafator.tests - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 \ No newline at end of file From baf692264fc948ad8cd9549f742ef7cde9c1cd94 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 13:56:12 +0100 Subject: [PATCH 20/25] add bedtools back in... --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 128f355..9cb5905 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -26,7 +26,7 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - #conda install --yes bedtools + conda install --yes bedtools # this is needed by cyvcf2 and pysam #conda install --yes htslib=1.14 # install python requirements From f743c3a9cadcde922f992740aab2d795a7b00cb1 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 14:42:17 +0100 Subject: [PATCH 21/25] install bedtools with apt-get --- .github/workflows/integration_tests.yml | 3 ++- README.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 9cb5905..ace5bb7 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -26,7 +26,8 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - conda install --yes bedtools + #conda install --yes bedtools + apt-get --assume-yes install bedtools # this is needed by cyvcf2 and pysam #conda install --yes htslib=1.14 # install python requirements diff --git a/README.md b/README.md index 0e40335..63efa1d 100755 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![PyPI version](https://badge.fury.io/py/vafator.svg)](https://badge.fury.io/py/vafator) [![Anaconda-Server Badge](https://anaconda.org/bioconda/vafator/badges/version.svg)](https://anaconda.org/bioconda/vafator) [![Run unit tests](https://github.com/TRON-Bioinformatics/vafator/actions/workflows/unit_tests.yml/badge.svg?branch=master)](https://github.com/TRON-Bioinformatics/vafator/actions/workflows/unit_tests.yml) +[![codecov](https://codecov.io/gh/TRON-Bioinformatics/vafator/branch/master/graph/badge.svg?token=QLK84NI44G)](https://codecov.io/gh/TRON-Bioinformatics/vafator) [![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT) From 7b7d07f66c920a755fc980ab4a95c6bed3c0932c Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 14:48:11 +0100 Subject: [PATCH 22/25] install bedtools with sudo --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index ace5bb7..dff4446 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -27,7 +27,7 @@ jobs: pip install setuptools wheel # this is needed by pybedtools #conda install --yes bedtools - apt-get --assume-yes install bedtools + sudo apt-get --assume-yes install bedtools # this is needed by cyvcf2 and pysam #conda install --yes htslib=1.14 # install python requirements From f6b3db019b71b9191cc57ca23972331e2fd43159 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 15:00:44 +0100 Subject: [PATCH 23/25] try python 10 again --- .github/workflows/integration_tests.yml | 9 +-------- .github/workflows/unit_tests.yml | 2 +- tox.ini | 15 --------------- 3 files changed, 2 insertions(+), 24 deletions(-) delete mode 100644 tox.ini diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index dff4446..6125779 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9 ] + python-version: [ 3.7, 3.8, 3.9, 3.10 ] steps: - uses: actions/checkout@v2 @@ -15,10 +15,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - channels: defaults,conda-forge,bioconda - name: Install dependencies run: | sudo apt-get update @@ -26,10 +22,7 @@ jobs: python -m pip install --upgrade pip pip install setuptools wheel # this is needed by pybedtools - #conda install --yes bedtools sudo apt-get --assume-yes install bedtools - # this is needed by cyvcf2 and pysam - #conda install --yes htslib=1.14 # install python requirements pip install -r requirements.txt - name: Install vafator diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6800733..e9f01ab 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9 ] + python-version: [ 3.7, 3.8, 3.9, 3.10 ] steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 2e9ba56..0000000 --- a/tox.ini +++ /dev/null @@ -1,15 +0,0 @@ -[tox] -envlist = {py37, py38, py39} - -[gh-actions] -python = - 3.7: py37 - 3.8: py38 - 3.9: py39 - -[testenv] -wheel = true -passenv = * -commands= - pip install -r requirements.txt - coverage run -m unittest discover vafator.tests \ No newline at end of file From 850202b2cfc118ad0d141758ff13b8907e774425 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 15:15:24 +0100 Subject: [PATCH 24/25] try python 10 again properly --- .github/workflows/integration_tests.yml | 2 +- .github/workflows/unit_tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 6125779..bd8f232 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9, 3.10 ] + python-version: [ '3.7', '3.8', '3.9', '3.10' ] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index e9f01ab..473fa02 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9, 3.10 ] + python-version: [ '3.7', '3.8', '3.9', '3.10' ] steps: - uses: actions/checkout@v2 From 1e9be7850ae09b2c39e7166ef726400f994892f0 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 30 Jan 2023 15:20:05 +0100 Subject: [PATCH 25/25] add python 3.10 support to package metadata --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1e63e44..1025dd9 100755 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3 :: Only', "License :: OSI Approved :: MIT License", "Operating System :: Unix"