-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter.py
185 lines (154 loc) · 9.47 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import os
import subprocess
from pathlib import Path
from shutil import copy
from Bio import SeqIO
from Datasnakes.Orthologs.Align.QualityControl.guidance2 import Guidance2Commandline
from Datasnakes.Orthologs.Align.pal2nal import Pal2NalCommandline
from Datasnakes.Orthologs.Genbank.utils import multi_fasta_manipulator
# TODO-ROB: Create appropriate class variables for Filtered Tree to inherit
# TODO-ROB: Create proper class variables to make simpler and for above todo
class FilteredAlignment(object):
"""Filters one alignment per gene at a time."""
def __init__(self, na_fasta, aa_fasta, gene_name=None, home=os.getcwd(), msaProgram='CLUSTALW', na_bootstraps=1,
aa_bootstraps=1, na_seqCutoff=0.6, aa_seqCutoff=0.6, na_colCutoff=0, aa_colCutoff=0.88):
if gene_name is None:
gene_name = Path(na_fasta).stem
# Initialize the default command line arguments
self.G2C_args = dict(outOrder='as_input', dataset=gene_name, msaProgram=msaProgram)
self.P2N_args = dict(nogap=True, nomismatch=True)
# Initialize
self.home = Path(home)
self.na_guidance_path = self.home / Path('NA_Guidance2')
self.aa_guidance_path = self.home / Path('AA_Guidance2')
self.gene = Path(na_fasta).stem
na_seqFile = str(self.home / Path(na_fasta)) # Guidance NA sequence file
aa_seqFile = str(self.home / Path(aa_fasta)) # Guidance AA sequence file
na_fasta = str(self.home / Path(self.gene + '_G2.ffn')) # Pal2Nal NA sequence file
na_alignment = str(Path(self.gene + '_P2N_na')) # Pal2Nal output file
# Guidance2 iterations over the nucleic acid sequences.
# Returns the file name that contains the filtered sequences.
rem_na_seqFile = self.nucleic_acid_guidance(seqFile=na_seqFile,
outDir=str(self.na_guidance_path),
bootstraps=na_bootstraps,
seqCutoff=na_seqCutoff,
colCutoff=na_colCutoff)
# Guidance 2 amino acid alignment filter. Filters the sequences based on the NA_Guidance2 runs.
# Returns the file name that contains the filtered alignment.
aa_alignment = self.amino_acid_guidance(seqFile=aa_seqFile,
remFile=rem_na_seqFile,
outDir=str(self.aa_guidance_path),
bootstraps=aa_bootstraps,
seqCutoff=aa_seqCutoff,
colCutoff=aa_colCutoff)
g2_seqFile = str(self.home / Path(self.gene + '_G2.ffn'))
# PAL2NAL nucleic acid alignment
self.pal2nal_conversion(str(aa_alignment), g2_seqFile, na_alignment)
def nucleic_acid_guidance(self, seqFile, outDir, bootstraps, seqCutoff, colCutoff):
seqType = 'nuc'
iteration_flag = True
iteration = 1
while iteration_flag is True:
iterDir = Path(outDir) / Path('iter_%s' % iteration) # /home/NA_Guidance2/iter_n
Path.mkdir(iterDir, parents=True, exist_ok=True)
g2_seqFile = str(self.home / Path(self.gene + '_G2.ffn')) # Need for all iterations
g2_rem_file = str(Path(iterDir) / Path('Seqs.Orig.fas.FIXED.Removed_Seq.With_Names')) # Need for all iterations
rem_file = str(self.home / Path(self.gene + '_G2_removed.ffn')) # Need for all iterations
if iteration == 1:
# seqFile is the given input
G2Cmd = Guidance2Commandline(self.G2C_args, seqFile=seqFile,
seqType=seqType, outDir=str(iterDir),
bootstraps=bootstraps,
seqCutoff=seqCutoff, colCutoff=colCutoff)
print(G2Cmd)
subprocess.check_call([str(G2Cmd)], stderr=subprocess.STDOUT, shell=True)
# Copy the Guidance removed seq file and paste it to the home directory
# Creates the rem_file
# Files without any removed don't have the file *.With_Names
if os.path.isfile(g2_rem_file) is False:
g2_rem_file = str(Path(iterDir) / Path('Seqs.Orig.fas.FIXED.Removed_Seq'))
SeqIO.write(SeqIO.parse(g2_rem_file, 'fasta'), rem_file, 'fasta') # Need for iter_1
# Filter the input NA fasta file using Guidance output
# Creates the g2_seqFile
multi_fasta_manipulator(seqFile, g2_rem_file, g2_seqFile, manipulation='remove') # Do after copying (iter_1) or adding (iter_n)
iteration_flag = True
elif iteration > 1:
# seqFile changes to g2_seqFile and the cutoffs change
seqCutoff = 0.7
colCutoff = 0.1
G2Cmd = Guidance2Commandline(self.G2C_args, seqFile=g2_seqFile, seqType=seqType, outDir=str(iterDir), bootstraps=bootstraps,
seqCutoff=seqCutoff, colCutoff=colCutoff)
print(G2Cmd)
subprocess.check_call([str(G2Cmd)], stderr=subprocess.STDOUT, shell=True)
# Get the removed sequence count
rem_count = 0
# Files without any removed don't have the file *.With_Names
if os.path.isfile(g2_rem_file) is False:
g2_rem_file = str(Path(iterDir) / Path('Seqs.Orig.fas.FIXED.Removed_Seq'))
for rec in SeqIO.parse(g2_rem_file, 'fasta'): # Need for all iterations
rem_count += 1
if rem_count > 0:
# Add new sequences to the rem_file
multi_fasta_manipulator(rem_file, g2_rem_file, rem_file, manipulation='add')
# Filter the input NA fasta file using the updated rem_file
multi_fasta_manipulator(seqFile, rem_file, g2_seqFile, manipulation='remove')
iteration_flag = True
else:
iteration_flag = False
iteration += 1
return rem_file
def amino_acid_guidance(self, seqFile, remFile, outDir, bootstraps, seqCutoff, colCutoff):
seqType = 'aa'
g2_seqFile = str(self.home / Path(self.gene + '_G2.faa'))
multi_fasta_manipulator(seqFile, remFile, g2_seqFile)
G2Cmd = Guidance2Commandline(self.G2C_args, seqFile=g2_seqFile, seqType=seqType, outDir=outDir, bootstraps=bootstraps,
seqCutoff=seqCutoff, colCutoff=colCutoff)
print(G2Cmd)
subprocess.check_call([str(G2Cmd)], stderr=subprocess.STDOUT, shell=True)
# DO not remove any columns.
filtered_alignment = Path(outDir) / Path('%s.CLUSTALW.aln.Sorted.With_Names' % self.G2C_args['dataset'])
renamed_alignment = copy(str(filtered_alignment), str(Path(self.gene + '_G2_aa.aln')))
renamed_alignment = multi_fasta_manipulator(str(renamed_alignment), str(seqFile), str(renamed_alignment), manipulation='sort')
print('Align the filtered amino acid sequences using guidance 2')
return Path(renamed_alignment)
def pal2nal_conversion(self, aa_alignment, na_fasta, output_file):
# TODO-ROB: Add a filter step so if error[0] = Error: #--- ERROR: inconsistency between the following pep and nuc seqs ---#
# TODO-ROB: ....then remove error[2] which is the sequence it can't read
removed = []
# Create output directory for PAL2NAL
outDir = self.home / Path('PAL2NAL')
Path.mkdir(outDir, exist_ok=True)
output_file = str(outDir / Path(output_file))
# Create an alignment for paml input
P2Ncmd = Pal2NalCommandline(self.P2N_args, pepaln=aa_alignment, nucfasta=na_fasta, output_file=output_file + '.paml.aln',
output='paml')
print(P2Ncmd)
pal2nal_flag = True
while pal2nal_flag is True:
pal2nal = subprocess.Popen([str(P2Ncmd)], stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, encoding='utf-8')
error = pal2nal.stderr.readlines()
out = pal2nal.stdout.readlines()
pal2nal.wait()
if 'ERROR: inconsistency between the following pep and nuc seqs' in error[0]:
print('Caught the pal2nal error!')
print(error[0])
for err in error:
if '>' in err:
removed.append(err.strip('>' '\n'))
multi_fasta_manipulator(na_fasta, removed, na_fasta)
multi_fasta_manipulator(aa_alignment, removed, aa_alignment)
else:
pal2nal_flag = False
print('Error: ' + str(error))
print('Out: ' + str(out))
# Create an alignment for iqtree input
P2Ncmd = Pal2NalCommandline(self.P2N_args, pepaln=aa_alignment, nucfasta=na_fasta, output_file=output_file + '.iqtree.aln',
output='fasta')
print(P2Ncmd)
pal2nal = subprocess.Popen([str(P2Ncmd)], stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, encoding='utf-8')
error = pal2nal.stderr.read()
out = pal2nal.stdout.read()
pal2nal.wait()
print('Error: ' + str(error))
print('Out: ' + str(out))
print('Align the nucleic acids using the amino acid alignment.')