Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dockerfile -> RHEL 8 + intrahost tn93 script #44

Merged
merged 10 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Docker image for a tn93 development environment
FROM oraclelinux:8
# Docker image for a tn93 development environment (Red Hat Universal Base Image 8.10)
FROM redhat/ubi8:8.10

# Set up environment and install dependencies
RUN yum -y update && \
yum install -y cmake gcc-c++ gcc-toolset-10 git make oracle-epel-release-el8 && \
echo 'source /opt/rh/gcc-toolset-10/enable' > ~/.bashrc && \
yum install -y cmake gcc-c++ gcc-toolset-12 git && \
echo 'source /opt/rh/gcc-toolset-12/enable' > ~/.bashrc && \
source ~/.bashrc

# To compile tn93 within the development environment:
Expand Down
101 changes: 101 additions & 0 deletions scripts/intrahost_dists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#! /usr/bin/env python3
'''
Calculate all pairwise distances between sequences from the same individual
'''

# imports
from gzip import open as gopen
from subprocess import check_output, DEVNULL, PIPE
from sys import argv
import argparse

# constants
TN93_BASE_COMMAND = [
'tn93',
'-l', '1', # at least 1 overlap
]

# parse user args
def parse_args():
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--input', required=False, type=str, default='stdin', help="Input FASTA")
parser.add_argument('-t', '--threshold', required=False, type=float, default=1., help="TN93 Distance Threshold")
parser.add_argument('-a', '--ambigs', required=False, type=str, default='resolve', help="TN93 Resolve Ambigs")
parser.add_argument('-g', '--ambig_fraction', required=False, type=float, default=1., help="TN93 Ambiguity Fraction")
parser.add_argument('-o', '--output', required=False, type=str, default='stdout', help="Output File")
args = parser.parse_args()
return args

# load FASTA sequences from a file
def load_fasta(fn):
# set up input file stream
if fn.strip().lower() == 'stdin': # standard input
from sys import stdin as f
elif fn.strip().lower().endswith('.gz'): # gzip-compressed file
f = gopen(fn, 'rt')
else: # uncompressed file
f = open(fn, 'r')

# parse FASTA
seqs = dict(); curr_ID = None; curr_seq = ''
for line in f:
l = line.strip()
if len(l) == 0: # skip empty lines
continue
if l[0] == '>': # header line
if len(curr_seq) != 0:
seqs[curr_ID] = curr_seq
curr_ID = l; curr_seq = ''
else: # sequence line
curr_seq += l
if len(curr_seq) != 0:
seqs[curr_ID] = curr_seq
return seqs

# return dictionary where seqs_per_person[person] = set of sequence IDs from `person`
def load_seqs_per_person(seqs):
seqs_per_person = dict()
for k in seqs:
person = k.lstrip('>').split('|')[0].strip()
if person not in seqs_per_person:
seqs_per_person[person] = set()
seqs_per_person[person].add(k)
return seqs_per_person

# calculate all pairwise distances between sequences from the same individual (this is the main logic of this script)
# currently not parallelizing across people because tn93 parallelizes pairwise distance calculations,
# but we could easily parallelize across people as well
def calc_intrahost_dists(seqs, seqs_per_person, out_fn, tn93_t, tn93_a, tn93_g):
# open output file
if out_fn.strip().lower() == 'stdout':
from sys import stdout as output
elif out_fn.strip().lower().endswith('.gz'):
output = gopen(out_fn, 'wt')
else:
output = open(out_fn, 'w')

# build tn93 command
tn93_command = TN93_BASE_COMMAND # start with base command
tn93_command += ['-t', str(tn93_t)] # add max distance threshold
tn93_command += ['-a', str(tn93_a)] # add ambig resolve
tn93_command += ['-g', str(tn93_g)] # add ambiguity fraction

# run on all people
for person_num, person in enumerate(seqs_per_person):
if len(seqs_per_person[person]) < 2:
continue # skip people with just 1 sequence
curr_fasta = '\n'.join('%s\n%s' % (k,seqs[k]) for k in seqs_per_person[person])
o = check_output(tn93_command, stderr=DEVNULL, input=curr_fasta, encoding='ascii')
if person_num == 0:
output.write(o.strip()); output.write('\n')
else:
for l in o.splitlines()[1:]:
output.write(l.strip()); output.write('\n')
output.close()

# run tool
if __name__ == "__main__":
args = parse_args()
seqs = load_fasta(args.input)
seqs_per_person = load_seqs_per_person(seqs)
calc_intrahost_dists(seqs, seqs_per_person, args.output, args.threshold, args.ambigs, args.ambig_fraction)
Loading