Skip to content

Commit

Permalink
Improve cluster and merge (#42)
Browse files Browse the repository at this point in the history
* Add function syscall

* Add function normalise_vcf; move tests to root and add tox

* Fix how to run tests

* Add singularity file

* Run tox instead of pytest

* Split normalise_vcf into separate commands

* New merging implemented

* Implement clustering

* Run black and remove pyflakes errors

* Require pysam

* Add python3-setuptools

* specify python3.6

* Install tabix

* Add vcflib to path

* Simplify VCF before normalizing

* Remove debug prints

* More efficient checking if too many combinations

* Bug fix ignoring excluded variants because of long ref

* Optimise clustering VCF

* Remove commented out code

* Run black

* Tidy up logging

* default to temp dir inside root dir

* Delete temp_dir if it was created

* Put PASS in every line of filter column, for gramtools

* Add options to keep ref call records

* Version bump 0.12.0

* Add variant_tracking
  • Loading branch information
martinghunt authored Jun 30, 2020
1 parent fd038d6 commit 27b891c
Show file tree
Hide file tree
Showing 87 changed files with 1,456 additions and 14 deletions.
49 changes: 49 additions & 0 deletions .ci/install_dependencies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
set -vexu

install_root=$1

apt-get install -y software-properties-common
apt-add-repository universe
apt-get update

apt-get install -y \
build-essential \
cmake \
git \
libbz2-dev \
libcurl4-gnutls-dev \
liblzma-dev \
libssl-dev \
python3-pip \
python3-setuptools \
tabix \
zlib1g-dev



if [ ! -d $install_root ]; then
mkdir $install_root
fi
cd $install_root


#________________________ vt __________________________________#
cd $install_root
git clone https://github.com/atks/vt.git vt-git
cd vt-git
git checkout 2187ff6347086e38f71bd9f8ca622cd7dcfbb40c
make
cd ..
cp -s vt-git/vt .


#______________________vcflib _________________________________#
cd $install_root
git clone --recursive https://github.com/vcflib/vcflib.git
cd vcflib
make


#______________________ python ________________________________#
pip3 install tox "six>=1.14.0"
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
language: python
python:
- '3.6'
sudo: false

install: sudo ./.ci/install_dependencies.sh $HOME/tools

before_script:
- export PATH=$HOME/tools:$HOME/tools/vcflib/bin:$PATH

script:
- python setup.py test
- tox

deploy:
provider: pypi
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include requirements.txt
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ For further information and a description of optional arguments, see `help(clust

## Develop

Run `python3 -m unittest` from the top-level directory to run all unit tests.
Run `tox` (or `pytest`) from the top-level directory to run all unit tests.

# License
MIT
21 changes: 21 additions & 0 deletions Singularity.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
BootStrap: debootstrap
OSVersion: bionic
MirrorURL: http://us.archive.ubuntu.com/ubuntu/

%environment
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH


%setup
mkdir $SINGULARITY_ROOTFS/cluster_vcf_records
rsync -a .ci/install_dependencies.sh MANIFEST.in cluster_vcf_records requirements.txt setup.py tests tox.ini $SINGULARITY_ROOTFS/cluster_vcf_records


%post
#_____________________ setup $PATH _______________________#
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH

/cluster_vcf_records/install_dependencies.sh /bioinf-tools
cd /cluster_vcf_records
tox
pip3 install .
3 changes: 3 additions & 0 deletions cluster_vcf_records/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@


__all__ = [
"allele_combinations",
"utils",
"variant_tracking",
"vcf_clusterer",
"vcf_file_read",
"vcf_merge",
Expand Down
143 changes: 143 additions & 0 deletions cluster_vcf_records/allele_combinations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import copy
import itertools
from operator import attrgetter

from cluster_vcf_records import variant_tracking


def var_cluster_to_coords_and_snps_and_non_snps(variants):
assert len(variants) > 0
nucleotides = {"A", "C", "G", "T"}
snps = {} # position => set of alts (and the ref nucleotide)
non_snps = [] #  list of variants
start = float("Inf")
end = -1

for var in variants:
start = min(start, var.pos)
end = max(end, var.pos + len(var.ref) - 1)

if var.ref in nucleotides and var.alt in nucleotides:
if var.pos not in snps:
snps[var.pos] = {var.ref}
snps[var.pos].add(var.alt)
else:
ref = var.ref
alt = var.alt
i = 0
while i < len(ref) - 1 and i < len(alt) and ref[i] == alt[i]:
i += 1
non_snps.append(
variant_tracking.Variant(var.seq_id, var.pos + i, ref[i:], alt[i:])
)

return start, end, snps, non_snps


def any_vars_overlap(variants):
if len(variants) < 2:
return False

for v1, v2 in itertools.combinations(variants, 2):
if variant_tracking.variants_overlap(v1, v2):
return True

return False


def var_cluster_to_coords_and_alts(variants, ref_seq, max_alleles=None):
if len(variants) == 1:
end = variants[0].pos + len(variants[0].ref) - 1
return variants[0].pos, end, {variants[0].alt}

if max_alleles is None:
max_alleles = float("Inf")
(
final_start,
final_end,
snps,
non_snps,
) = var_cluster_to_coords_and_snps_and_non_snps(variants)

# generate all the allele combinations from the SNPs.
snp_positions = []
snp_nucleotides = []
for position in sorted(snps):
snp_positions.append(position)
snp_nucleotides.append(sorted(list(snps[position])))

# work out min total alleles without making them. Making them could
# take a long time if too many! Can only put lower bound on
# the final unique number because all the combinations may have duplicates.
total_alleles_lower_bound = 1
for x in snp_nucleotides:
total_alleles_lower_bound *= len(x)
if len(non_snps):
total_alleles_lower_bound *= len(non_snps)

if total_alleles_lower_bound > max_alleles:
return final_start, final_end, None

alts = set()
ref_seq_for_vcf = ref_seq[final_start : final_end + 1]

# This generates all possible combinations of indels. Checks if
# each combination contains any that overlap, in which case that combination
# is not used because can't apply two conflicting indels.
overlaps = {}
for v1, v2 in itertools.combinations(non_snps, 2):
if variant_tracking.variants_overlap(v1, v2):
key1, key2 = sorted([v1, v2])
if key1 not in overlaps:
overlaps[key1] = set()
overlaps[key1].add(key2)

# snp_nucleotides has the ref and all the alt snps.
# This loops over all combinations of them, so we're getting every
# possibility of using each snp or not.
for combination in itertools.product(*snp_nucleotides):
alt_seq = list(ref_seq_for_vcf)
for i, position in enumerate(snp_positions):
alt_seq[position - final_start] = combination[i]
alts.add("".join(alt_seq))
if len(alts) > max_alleles:
return final_start, final_end, None

number_of_combos_used = 1
for number_of_non_snps in range(1, len(non_snps) + 1):
# If the previous combination of number_of_non_snps of non_snps
# resulted in none getting used because of overlapping non_snps,
# then taking more than number_of_non_snps won't work either so stop
if number_of_combos_used == 0:
break
else:
number_of_combos_used = 0

for non_snp_combo in itertools.combinations(non_snps, number_of_non_snps):
overlap = False
if number_of_non_snps > 1:
for v1 in non_snps:
if v1 in overlaps:
for v2 in non_snps:
if v2 in overlaps[v1]:
overlap = True
break

if overlap:
break

if overlap:
continue

number_of_combos_used += 1
non_snp_combo = sorted(list(non_snp_combo), key=attrgetter("pos"))
new_alt = copy.copy(alt_seq)
for non_snp in reversed(non_snp_combo):
start_pos = non_snp.pos - final_start
new_alt[start_pos : start_pos + len(non_snp.ref)] = non_snp.alt
alts.add("".join(new_alt))
if len(alts) > max_alleles:
return final_start, final_end, None

alts.discard(ref_seq_for_vcf)
return final_start, final_end, alts
86 changes: 86 additions & 0 deletions cluster_vcf_records/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import logging
import os
import re
import subprocess
import sys

from cluster_vcf_records import vcf_record


def syscall(command):
logging.debug(f"Run command: {command}")
completed_process = subprocess.run(
command,
shell=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
universal_newlines=True,
)
if completed_process.returncode != 0:
print("Error running this command:", command, file=sys.stderr)
print("Return code:", completed_process.returncode, file=sys.stderr)
print(
"\nOutput from stdout:", completed_process.stdout, sep="\n", file=sys.stderr
)
print(
"\nOutput from stderr:", completed_process.stderr, sep="\n", file=sys.stderr
)
raise Exception("Error in system call. Cannot continue")

return completed_process


def simplify_vcf(infile, outfile, keep_ref_calls=False, max_ref_call_alleles=10):
"""Removes records that are all null calls and (optionally) ref calls.
If record has GT, removes all non-called alleles and replaces FORMAT
with just GT"""
with open(infile) as f_in, open(outfile, "w") as f_out:
for line in f_in:
if line.startswith("#"):
print(line, end="", file=f_out)
else:
record = vcf_record.VcfRecord(line)
if "GT" in record.FORMAT:
gt_indexes = set(re.split("[/|]", record.FORMAT["GT"]))
if "." in gt_indexes:
continue
gt_indexes = {int(x) for x in gt_indexes}
record.FORMAT.clear()
if gt_indexes == {0}:
if keep_ref_calls and len(record.ALT) <= max_ref_call_alleles:
record.set_format_key_value("GT", "0/0")
else:
continue
else:
gt_indexes = sorted(list(gt_indexes))
record.ALT = [record.ALT[i - 1] for i in gt_indexes if i > 0]
if len(gt_indexes) == 1:
record.set_format_key_value("GT", "1/1")
else:
assert len(gt_indexes) == 2
if 0 in gt_indexes:
record.set_format_key_value("GT", "0/1")
else:
record.set_format_key_value("GT", "1/2")
print(record, file=f_out)


def normalise_vcf(vcf_in, ref_fasta, vcf_out):
# Would be nice to pipe all these to save disk IO. ie
# f"vcfbreakmulti {vcf_in} | vcfallelicprimitives -L 10000 | vt normalize -r {ref_fasta} - | vcfuniq > {vcf_out}"
# But am concerned about errors, where error code getting lost in pipes.
# So run one by one using temp files and then clean up.
vcf_breakmulti = f"{vcf_out}.1.breakmulti.vcf"
vcf_allelic = f"{vcf_out}.2.allelicprimitives.vcf"
vcf_normalize = f"{vcf_out}.3.normalize.vcf"
syscall(f"vcfbreakmulti {vcf_in} > {vcf_breakmulti}")
syscall(f"vcfallelicprimitives -L 10000 {vcf_breakmulti} > {vcf_allelic}")
syscall(f"vt normalize -r {ref_fasta} {vcf_allelic} > {vcf_normalize}")
syscall(f"vcfuniq {vcf_normalize} > {vcf_out}")
os.unlink(vcf_breakmulti)
os.unlink(vcf_allelic)
os.unlink(vcf_normalize)


def rm_rf(filename):
syscall(f"rm -rf {filename}")
Loading

0 comments on commit 27b891c

Please sign in to comment.