-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add function syscall * Add function normalise_vcf; move tests to root and add tox * Fix how to run tests * Add singularity file * Run tox instead of pytest * Split normalise_vcf into separate commands * New merging implemented * Implement clustering * Run black and remove pyflakes errors * Require pysam * Add python3-setuptools * specify python3.6 * Install tabix * Add vcflib to path * Simplify VCF before normalizing * Remove debug prints * More efficient checking if too many combinations * Bug fix ignoring excluded variants because of long ref * Optimise clustering VCF * Remove commented out code * Run black * Tidy up logging * default to temp dir inside root dir * Delete temp_dir if it was created * Put PASS in every line of filter column, for gramtools * Add options to keep ref call records * Version bump 0.12.0 * Add variant_tracking
- Loading branch information
1 parent
fd038d6
commit 27b891c
Showing
87 changed files
with
1,456 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env bash | ||
set -vexu | ||
|
||
install_root=$1 | ||
|
||
apt-get install -y software-properties-common | ||
apt-add-repository universe | ||
apt-get update | ||
|
||
apt-get install -y \ | ||
build-essential \ | ||
cmake \ | ||
git \ | ||
libbz2-dev \ | ||
libcurl4-gnutls-dev \ | ||
liblzma-dev \ | ||
libssl-dev \ | ||
python3-pip \ | ||
python3-setuptools \ | ||
tabix \ | ||
zlib1g-dev | ||
|
||
|
||
|
||
if [ ! -d $install_root ]; then | ||
mkdir $install_root | ||
fi | ||
cd $install_root | ||
|
||
|
||
#________________________ vt __________________________________# | ||
cd $install_root | ||
git clone https://github.com/atks/vt.git vt-git | ||
cd vt-git | ||
git checkout 2187ff6347086e38f71bd9f8ca622cd7dcfbb40c | ||
make | ||
cd .. | ||
cp -s vt-git/vt . | ||
|
||
|
||
#______________________vcflib _________________________________# | ||
cd $install_root | ||
git clone --recursive https://github.com/vcflib/vcflib.git | ||
cd vcflib | ||
make | ||
|
||
|
||
#______________________ python ________________________________# | ||
pip3 install tox "six>=1.14.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
include requirements.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
BootStrap: debootstrap | ||
OSVersion: bionic | ||
MirrorURL: http://us.archive.ubuntu.com/ubuntu/ | ||
|
||
%environment | ||
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH | ||
|
||
|
||
%setup | ||
mkdir $SINGULARITY_ROOTFS/cluster_vcf_records | ||
rsync -a .ci/install_dependencies.sh MANIFEST.in cluster_vcf_records requirements.txt setup.py tests tox.ini $SINGULARITY_ROOTFS/cluster_vcf_records | ||
|
||
|
||
%post | ||
#_____________________ setup $PATH _______________________# | ||
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH | ||
|
||
/cluster_vcf_records/install_dependencies.sh /bioinf-tools | ||
cd /cluster_vcf_records | ||
tox | ||
pip3 install . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
import copy | ||
import itertools | ||
from operator import attrgetter | ||
|
||
from cluster_vcf_records import variant_tracking | ||
|
||
|
||
def var_cluster_to_coords_and_snps_and_non_snps(variants): | ||
assert len(variants) > 0 | ||
nucleotides = {"A", "C", "G", "T"} | ||
snps = {} # position => set of alts (and the ref nucleotide) | ||
non_snps = [] # list of variants | ||
start = float("Inf") | ||
end = -1 | ||
|
||
for var in variants: | ||
start = min(start, var.pos) | ||
end = max(end, var.pos + len(var.ref) - 1) | ||
|
||
if var.ref in nucleotides and var.alt in nucleotides: | ||
if var.pos not in snps: | ||
snps[var.pos] = {var.ref} | ||
snps[var.pos].add(var.alt) | ||
else: | ||
ref = var.ref | ||
alt = var.alt | ||
i = 0 | ||
while i < len(ref) - 1 and i < len(alt) and ref[i] == alt[i]: | ||
i += 1 | ||
non_snps.append( | ||
variant_tracking.Variant(var.seq_id, var.pos + i, ref[i:], alt[i:]) | ||
) | ||
|
||
return start, end, snps, non_snps | ||
|
||
|
||
def any_vars_overlap(variants): | ||
if len(variants) < 2: | ||
return False | ||
|
||
for v1, v2 in itertools.combinations(variants, 2): | ||
if variant_tracking.variants_overlap(v1, v2): | ||
return True | ||
|
||
return False | ||
|
||
|
||
def var_cluster_to_coords_and_alts(variants, ref_seq, max_alleles=None): | ||
if len(variants) == 1: | ||
end = variants[0].pos + len(variants[0].ref) - 1 | ||
return variants[0].pos, end, {variants[0].alt} | ||
|
||
if max_alleles is None: | ||
max_alleles = float("Inf") | ||
( | ||
final_start, | ||
final_end, | ||
snps, | ||
non_snps, | ||
) = var_cluster_to_coords_and_snps_and_non_snps(variants) | ||
|
||
# generate all the allele combinations from the SNPs. | ||
snp_positions = [] | ||
snp_nucleotides = [] | ||
for position in sorted(snps): | ||
snp_positions.append(position) | ||
snp_nucleotides.append(sorted(list(snps[position]))) | ||
|
||
# work out min total alleles without making them. Making them could | ||
# take a long time if too many! Can only put lower bound on | ||
# the final unique number because all the combinations may have duplicates. | ||
total_alleles_lower_bound = 1 | ||
for x in snp_nucleotides: | ||
total_alleles_lower_bound *= len(x) | ||
if len(non_snps): | ||
total_alleles_lower_bound *= len(non_snps) | ||
|
||
if total_alleles_lower_bound > max_alleles: | ||
return final_start, final_end, None | ||
|
||
alts = set() | ||
ref_seq_for_vcf = ref_seq[final_start : final_end + 1] | ||
|
||
# This generates all possible combinations of indels. Checks if | ||
# each combination contains any that overlap, in which case that combination | ||
# is not used because can't apply two conflicting indels. | ||
overlaps = {} | ||
for v1, v2 in itertools.combinations(non_snps, 2): | ||
if variant_tracking.variants_overlap(v1, v2): | ||
key1, key2 = sorted([v1, v2]) | ||
if key1 not in overlaps: | ||
overlaps[key1] = set() | ||
overlaps[key1].add(key2) | ||
|
||
# snp_nucleotides has the ref and all the alt snps. | ||
# This loops over all combinations of them, so we're getting every | ||
# possibility of using each snp or not. | ||
for combination in itertools.product(*snp_nucleotides): | ||
alt_seq = list(ref_seq_for_vcf) | ||
for i, position in enumerate(snp_positions): | ||
alt_seq[position - final_start] = combination[i] | ||
alts.add("".join(alt_seq)) | ||
if len(alts) > max_alleles: | ||
return final_start, final_end, None | ||
|
||
number_of_combos_used = 1 | ||
for number_of_non_snps in range(1, len(non_snps) + 1): | ||
# If the previous combination of number_of_non_snps of non_snps | ||
# resulted in none getting used because of overlapping non_snps, | ||
# then taking more than number_of_non_snps won't work either so stop | ||
if number_of_combos_used == 0: | ||
break | ||
else: | ||
number_of_combos_used = 0 | ||
|
||
for non_snp_combo in itertools.combinations(non_snps, number_of_non_snps): | ||
overlap = False | ||
if number_of_non_snps > 1: | ||
for v1 in non_snps: | ||
if v1 in overlaps: | ||
for v2 in non_snps: | ||
if v2 in overlaps[v1]: | ||
overlap = True | ||
break | ||
|
||
if overlap: | ||
break | ||
|
||
if overlap: | ||
continue | ||
|
||
number_of_combos_used += 1 | ||
non_snp_combo = sorted(list(non_snp_combo), key=attrgetter("pos")) | ||
new_alt = copy.copy(alt_seq) | ||
for non_snp in reversed(non_snp_combo): | ||
start_pos = non_snp.pos - final_start | ||
new_alt[start_pos : start_pos + len(non_snp.ref)] = non_snp.alt | ||
alts.add("".join(new_alt)) | ||
if len(alts) > max_alleles: | ||
return final_start, final_end, None | ||
|
||
alts.discard(ref_seq_for_vcf) | ||
return final_start, final_end, alts |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import logging | ||
import os | ||
import re | ||
import subprocess | ||
import sys | ||
|
||
from cluster_vcf_records import vcf_record | ||
|
||
|
||
def syscall(command): | ||
logging.debug(f"Run command: {command}") | ||
completed_process = subprocess.run( | ||
command, | ||
shell=True, | ||
stderr=subprocess.PIPE, | ||
stdout=subprocess.PIPE, | ||
universal_newlines=True, | ||
) | ||
if completed_process.returncode != 0: | ||
print("Error running this command:", command, file=sys.stderr) | ||
print("Return code:", completed_process.returncode, file=sys.stderr) | ||
print( | ||
"\nOutput from stdout:", completed_process.stdout, sep="\n", file=sys.stderr | ||
) | ||
print( | ||
"\nOutput from stderr:", completed_process.stderr, sep="\n", file=sys.stderr | ||
) | ||
raise Exception("Error in system call. Cannot continue") | ||
|
||
return completed_process | ||
|
||
|
||
def simplify_vcf(infile, outfile, keep_ref_calls=False, max_ref_call_alleles=10): | ||
"""Removes records that are all null calls and (optionally) ref calls. | ||
If record has GT, removes all non-called alleles and replaces FORMAT | ||
with just GT""" | ||
with open(infile) as f_in, open(outfile, "w") as f_out: | ||
for line in f_in: | ||
if line.startswith("#"): | ||
print(line, end="", file=f_out) | ||
else: | ||
record = vcf_record.VcfRecord(line) | ||
if "GT" in record.FORMAT: | ||
gt_indexes = set(re.split("[/|]", record.FORMAT["GT"])) | ||
if "." in gt_indexes: | ||
continue | ||
gt_indexes = {int(x) for x in gt_indexes} | ||
record.FORMAT.clear() | ||
if gt_indexes == {0}: | ||
if keep_ref_calls and len(record.ALT) <= max_ref_call_alleles: | ||
record.set_format_key_value("GT", "0/0") | ||
else: | ||
continue | ||
else: | ||
gt_indexes = sorted(list(gt_indexes)) | ||
record.ALT = [record.ALT[i - 1] for i in gt_indexes if i > 0] | ||
if len(gt_indexes) == 1: | ||
record.set_format_key_value("GT", "1/1") | ||
else: | ||
assert len(gt_indexes) == 2 | ||
if 0 in gt_indexes: | ||
record.set_format_key_value("GT", "0/1") | ||
else: | ||
record.set_format_key_value("GT", "1/2") | ||
print(record, file=f_out) | ||
|
||
|
||
def normalise_vcf(vcf_in, ref_fasta, vcf_out): | ||
# Would be nice to pipe all these to save disk IO. ie | ||
# f"vcfbreakmulti {vcf_in} | vcfallelicprimitives -L 10000 | vt normalize -r {ref_fasta} - | vcfuniq > {vcf_out}" | ||
# But am concerned about errors, where error code getting lost in pipes. | ||
# So run one by one using temp files and then clean up. | ||
vcf_breakmulti = f"{vcf_out}.1.breakmulti.vcf" | ||
vcf_allelic = f"{vcf_out}.2.allelicprimitives.vcf" | ||
vcf_normalize = f"{vcf_out}.3.normalize.vcf" | ||
syscall(f"vcfbreakmulti {vcf_in} > {vcf_breakmulti}") | ||
syscall(f"vcfallelicprimitives -L 10000 {vcf_breakmulti} > {vcf_allelic}") | ||
syscall(f"vt normalize -r {ref_fasta} {vcf_allelic} > {vcf_normalize}") | ||
syscall(f"vcfuniq {vcf_normalize} > {vcf_out}") | ||
os.unlink(vcf_breakmulti) | ||
os.unlink(vcf_allelic) | ||
os.unlink(vcf_normalize) | ||
|
||
|
||
def rm_rf(filename): | ||
syscall(f"rm -rf {filename}") |
Oops, something went wrong.