Improve cluster and merge (#42)

* Add function syscall * Add function normalise_vcf; move tests to root and add tox * Fix how to run tests * Add singularity file * Run tox instead of pytest * Split normalise_vcf into separate commands * New merging implemented * Implement clustering * Run black and remove pyflakes errors * Require pysam * Add python3-setuptools * specify python3.6 * Install tabix * Add vcflib to path * Simplify VCF before normalizing * Remove debug prints * More efficient checking if too many combinations * Bug fix ignoring excluded variants because of long ref * Optimise clustering VCF * Remove commented out code * Run black * Tidy up logging * default to temp dir inside root dir * Delete temp_dir if it was created * Put PASS in every line of filter column, for gramtools * Add options to keep ref call records * Version bump 0.12.0 * Add variant_tracking
iqbal-lab-org · Jun 30, 2020 · 27b891c · 27b891c
1 parent fd038d6
commit 27b891c
Show file tree

Hide file tree

Showing 87 changed files with 1,456 additions and 14 deletions.
diff --git a/.ci/install_dependencies.sh b/.ci/install_dependencies.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+set -vexu
+
+install_root=$1
+
+apt-get install -y software-properties-common
+apt-add-repository universe
+apt-get update
+
+apt-get install -y \
+  build-essential \
+  cmake \
+  git \
+  libbz2-dev \
+  libcurl4-gnutls-dev \
+  liblzma-dev \
+  libssl-dev \
+  python3-pip \
+  python3-setuptools \
+  tabix \
+  zlib1g-dev
+
+
+
+if [ ! -d $install_root ]; then
+  mkdir $install_root
+fi
+cd $install_root
+
+
+#________________________ vt __________________________________#
+cd $install_root
+git clone https://github.com/atks/vt.git vt-git
+cd vt-git
+git checkout 2187ff6347086e38f71bd9f8ca622cd7dcfbb40c
+make
+cd ..
+cp -s vt-git/vt .
+
+
+#______________________vcflib _________________________________#
+cd $install_root
+git clone --recursive https://github.com/vcflib/vcflib.git
+cd vcflib
+make
+
+
+#______________________ python ________________________________#
+pip3 install tox "six>=1.14.0"
diff --git a/.travis.yml b/.travis.yml
@@ -1,10 +1,14 @@
 language: python
 python:
 - '3.6'
-sudo: false
+
+install: sudo ./.ci/install_dependencies.sh $HOME/tools
+
+before_script:
+- export PATH=$HOME/tools:$HOME/tools/vcflib/bin:$PATH
 
 script:
-- python setup.py test
+- tox
 
 deploy:
   provider: pypi

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include requirements.txt
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ For further information and a description of optional arguments, see `help(clust
 
 ## Develop
 
-Run `python3 -m unittest` from the top-level directory to run all unit tests.
+Run `tox` (or `pytest`) from the top-level directory to run all unit tests.
 
 # License
 MIT
diff --git a/Singularity.def b/Singularity.def
@@ -0,0 +1,21 @@
+BootStrap: debootstrap
+OSVersion: bionic
+MirrorURL: http://us.archive.ubuntu.com/ubuntu/
+
+%environment
+export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH
+
+
+%setup
+    mkdir $SINGULARITY_ROOTFS/cluster_vcf_records
+    rsync -a .ci/install_dependencies.sh MANIFEST.in cluster_vcf_records requirements.txt setup.py tests tox.ini $SINGULARITY_ROOTFS/cluster_vcf_records
+
+
+%post
+    #_____________________ setup $PATH _______________________#
+    export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH
+
+    /cluster_vcf_records/install_dependencies.sh /bioinf-tools
+    cd /cluster_vcf_records
+    tox
+    pip3 install .
diff --git a/cluster_vcf_records/__init__.py b/cluster_vcf_records/__init__.py
@@ -7,6 +7,9 @@
 
 
 __all__ = [
+    "allele_combinations",
+    "utils",
+    "variant_tracking",
     "vcf_clusterer",
     "vcf_file_read",
     "vcf_merge",

diff --git a/cluster_vcf_records/allele_combinations.py b/cluster_vcf_records/allele_combinations.py
@@ -0,0 +1,143 @@
+import copy
+import itertools
+from operator import attrgetter
+
+from cluster_vcf_records import variant_tracking
+
+
+def var_cluster_to_coords_and_snps_and_non_snps(variants):
+    assert len(variants) > 0
+    nucleotides = {"A", "C", "G", "T"}
+    snps = {}  # position => set of alts (and the ref nucleotide)
+    non_snps = []  #  list of variants
+    start = float("Inf")
+    end = -1
+
+    for var in variants:
+        start = min(start, var.pos)
+        end = max(end, var.pos + len(var.ref) - 1)
+
+        if var.ref in nucleotides and var.alt in nucleotides:
+            if var.pos not in snps:
+                snps[var.pos] = {var.ref}
+            snps[var.pos].add(var.alt)
+        else:
+            ref = var.ref
+            alt = var.alt
+            i = 0
+            while i < len(ref) - 1 and i < len(alt) and ref[i] == alt[i]:
+                i += 1
+            non_snps.append(
+                variant_tracking.Variant(var.seq_id, var.pos + i, ref[i:], alt[i:])
+            )
+
+    return start, end, snps, non_snps
+
+
+def any_vars_overlap(variants):
+    if len(variants) < 2:
+        return False
+
+    for v1, v2 in itertools.combinations(variants, 2):
+        if variant_tracking.variants_overlap(v1, v2):
+            return True
+
+    return False
+
+
+def var_cluster_to_coords_and_alts(variants, ref_seq, max_alleles=None):
+    if len(variants) == 1:
+        end = variants[0].pos + len(variants[0].ref) - 1
+        return variants[0].pos, end, {variants[0].alt}
+
+    if max_alleles is None:
+        max_alleles = float("Inf")
+    (
+        final_start,
+        final_end,
+        snps,
+        non_snps,
+    ) = var_cluster_to_coords_and_snps_and_non_snps(variants)
+
+    # generate all the allele combinations from the SNPs.
+    snp_positions = []
+    snp_nucleotides = []
+    for position in sorted(snps):
+        snp_positions.append(position)
+        snp_nucleotides.append(sorted(list(snps[position])))
+
+    # work out min total alleles without making them. Making them could
+    # take a long time if too many! Can only put lower bound on
+    # the final unique number because all the combinations may have duplicates.
+    total_alleles_lower_bound = 1
+    for x in snp_nucleotides:
+        total_alleles_lower_bound *= len(x)
+    if len(non_snps):
+        total_alleles_lower_bound *= len(non_snps)
+
+    if total_alleles_lower_bound > max_alleles:
+        return final_start, final_end, None
+
+    alts = set()
+    ref_seq_for_vcf = ref_seq[final_start : final_end + 1]
+
+    # This generates all possible combinations of indels. Checks if
+    # each combination contains any that overlap, in which case that combination
+    # is not used because can't apply two conflicting indels.
+    overlaps = {}
+    for v1, v2 in itertools.combinations(non_snps, 2):
+        if variant_tracking.variants_overlap(v1, v2):
+            key1, key2 = sorted([v1, v2])
+            if key1 not in overlaps:
+                overlaps[key1] = set()
+            overlaps[key1].add(key2)
+
+    # snp_nucleotides has the ref and all the alt snps.
+    # This loops over all combinations of them, so we're getting every
+    # possibility of using each snp or not.
+    for combination in itertools.product(*snp_nucleotides):
+        alt_seq = list(ref_seq_for_vcf)
+        for i, position in enumerate(snp_positions):
+            alt_seq[position - final_start] = combination[i]
+        alts.add("".join(alt_seq))
+        if len(alts) > max_alleles:
+            return final_start, final_end, None
+
+        number_of_combos_used = 1
+        for number_of_non_snps in range(1, len(non_snps) + 1):
+            # If the previous combination of number_of_non_snps of non_snps
+            # resulted in none getting used because of overlapping non_snps,
+            # then taking more than number_of_non_snps won't work either so stop
+            if number_of_combos_used == 0:
+                break
+            else:
+                number_of_combos_used = 0
+
+            for non_snp_combo in itertools.combinations(non_snps, number_of_non_snps):
+                overlap = False
+                if number_of_non_snps > 1:
+                    for v1 in non_snps:
+                        if v1 in overlaps:
+                            for v2 in non_snps:
+                                if v2 in overlaps[v1]:
+                                    overlap = True
+                                    break
+
+                        if overlap:
+                            break
+
+                if overlap:
+                    continue
+
+                number_of_combos_used += 1
+                non_snp_combo = sorted(list(non_snp_combo), key=attrgetter("pos"))
+                new_alt = copy.copy(alt_seq)
+                for non_snp in reversed(non_snp_combo):
+                    start_pos = non_snp.pos - final_start
+                    new_alt[start_pos : start_pos + len(non_snp.ref)] = non_snp.alt
+                    alts.add("".join(new_alt))
+                    if len(alts) > max_alleles:
+                        return final_start, final_end, None
+
+    alts.discard(ref_seq_for_vcf)
+    return final_start, final_end, alts
diff --git a/cluster_vcf_records/utils.py b/cluster_vcf_records/utils.py
@@ -0,0 +1,86 @@
+import logging
+import os
+import re
+import subprocess
+import sys
+
+from cluster_vcf_records import vcf_record
+
+
+def syscall(command):
+    logging.debug(f"Run command: {command}")
+    completed_process = subprocess.run(
+        command,
+        shell=True,
+        stderr=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        universal_newlines=True,
+    )
+    if completed_process.returncode != 0:
+        print("Error running this command:", command, file=sys.stderr)
+        print("Return code:", completed_process.returncode, file=sys.stderr)
+        print(
+            "\nOutput from stdout:", completed_process.stdout, sep="\n", file=sys.stderr
+        )
+        print(
+            "\nOutput from stderr:", completed_process.stderr, sep="\n", file=sys.stderr
+        )
+        raise Exception("Error in system call. Cannot continue")
+
+    return completed_process
+
+
+def simplify_vcf(infile, outfile, keep_ref_calls=False, max_ref_call_alleles=10):
+    """Removes records that are all null calls and (optionally) ref calls.
+    If record has GT, removes all non-called alleles and replaces FORMAT
+    with just GT"""
+    with open(infile) as f_in, open(outfile, "w") as f_out:
+        for line in f_in:
+            if line.startswith("#"):
+                print(line, end="", file=f_out)
+            else:
+                record = vcf_record.VcfRecord(line)
+                if "GT" in record.FORMAT:
+                    gt_indexes = set(re.split("[/|]", record.FORMAT["GT"]))
+                    if "." in gt_indexes:
+                        continue
+                    gt_indexes = {int(x) for x in gt_indexes}
+                    record.FORMAT.clear()
+                    if gt_indexes == {0}:
+                        if keep_ref_calls and len(record.ALT) <= max_ref_call_alleles:
+                            record.set_format_key_value("GT", "0/0")
+                        else:
+                            continue
+                    else:
+                        gt_indexes = sorted(list(gt_indexes))
+                        record.ALT = [record.ALT[i - 1] for i in gt_indexes if i > 0]
+                        if len(gt_indexes) == 1:
+                            record.set_format_key_value("GT", "1/1")
+                        else:
+                            assert len(gt_indexes) == 2
+                            if 0 in gt_indexes:
+                                record.set_format_key_value("GT", "0/1")
+                            else:
+                                record.set_format_key_value("GT", "1/2")
+                print(record, file=f_out)
+
+
+def normalise_vcf(vcf_in, ref_fasta, vcf_out):
+    # Would be nice to pipe all these to save disk IO. ie
+    # f"vcfbreakmulti {vcf_in} | vcfallelicprimitives -L 10000 | vt normalize -r {ref_fasta} - | vcfuniq > {vcf_out}"
+    # But am concerned about errors, where error code getting lost in pipes.
+    # So run one by one using temp files and then clean up.
+    vcf_breakmulti = f"{vcf_out}.1.breakmulti.vcf"
+    vcf_allelic = f"{vcf_out}.2.allelicprimitives.vcf"
+    vcf_normalize = f"{vcf_out}.3.normalize.vcf"
+    syscall(f"vcfbreakmulti {vcf_in} > {vcf_breakmulti}")
+    syscall(f"vcfallelicprimitives -L 10000 {vcf_breakmulti} > {vcf_allelic}")
+    syscall(f"vt normalize -r {ref_fasta} {vcf_allelic} > {vcf_normalize}")
+    syscall(f"vcfuniq {vcf_normalize} > {vcf_out}")
+    os.unlink(vcf_breakmulti)
+    os.unlink(vcf_allelic)
+    os.unlink(vcf_normalize)
+
+
+def rm_rf(filename):
+    syscall(f"rm -rf {filename}")