-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_engine.py
219 lines (178 loc) · 11.9 KB
/
main_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import sys, os
import argparse
import utils
from refExon import ReferenceExon
from queryExon import QueryExonSeqs
from alignmentClass import Alignment
from alignedSeqs import AlignedSeqPair
if __name__ == "__main__":
## argument parsing business
parser = argparse.ArgumentParser(description = "CESAR multiexon annotation engine")
parser.add_argument("input", help = 'Transcript/Gene identifier', type = str)
parser.add_argument("maf_index", help = 'The path to the mafIndex file', type = str)
parser.add_argument("gene_pred", help = 'The master genepred file that contains information about all the genes/transcripts', type = str)
parser.add_argument("reference", help = 'Reference species', type = str)
parser.add_argument("species_list", help = 'A file containing a list of query species', type = str)
parser.add_argument("two_bit_dir", help = 'Path to two bit diectory that contains two bit files for every query species', type = str)
parser.add_argument("cesar_dir", help = 'Path to the directory that contains CESAR binary', type = str)
parser.add_argument("output_fasta", help = 'Path to the directory that contains CESAR generated alignments for every input transcript/gene', type = str)
parser.add_argument("output_annotation", help = 'Path to the directory that contains the annotation output', type = str)
parser.add_argument("-l", "--list", help = 'If the input is a list', action = 'store_true') ## if the input is a list of genes
parser.add_argument('--verbosity', type = int, default = 0, choices = [0, 1, 2], help=('Level of details to print out along with the progress.'))
args = parser.parse_args()
input_gene = args.input
maf_index = args.maf_index
gene_pred = args.gene_pred
reference = args.reference
species_list = args.species_list
two_bit_dir = args.two_bit_dir
cesar_dir = args.cesar_dir
output_fasta_dir = args.output_fasta
output_anno_dir = args.output_annotation
input_list = args.list
verbosity = args.verbosity
clade = 'human'
######################################################################
############## Do some sanity checks ##############
######################################################################
assert os.path.isfile(maf_index), f'Aborting!! The maf_index file {maf_index} does not exist'
assert os.path.isfile(gene_pred), f'Aborting!! The gene_pred file {gene_pred} does not exist'
assert os.path.isdir(two_bit_dir), f'Aborting!! The two_bit_dir {two_bit_dir} does not exist'
assert os.path.isdir(cesar_dir), f'Aborting!! The CESAR binary directory {cesar_dir} does not exist'
os.makedirs(output_fasta_dir, exist_ok = True)
os.makedirs(output_anno_dir, exist_ok = True)
##############################################################################################################################
## check if two bit files are present for every query species:
with open(species_list, 'r') as FI:
query_species_list = FI.readlines()
query_species_list = [species.rstrip('\n') for species in query_species_list]
for species in query_species_list:
two_bit_file = os.path.join(two_bit_dir, species, f'{species}.2bit')
assert os.path.isfile(two_bit_file), f'Aborting, the 2bit file for {species} i.e {two_bit_file} is missing'
two_bit_files_dict = {species:os.path.join(two_bit_dir, species, f'{species}.2bit') for species in query_species_list}
two_bit_file_ref = os.path.join(two_bit_dir, reference, f'{reference}.2bit')
##############################################################################################################################
## if the input is a file that contains a list of transcripts, read this file into a list
genes_list = list()
if input_list:
with open(input_gene, 'r') as FI:
genes_list = [gene.rstrip('\n') for gene in FI.readlines()]
else:
genes_list.append(input_gene)
transcripts_info_dict = dict()
with open(gene_pred, 'r') as FI:
for line in FI:
line_list = line.rstrip('\n').split('\t')
transcript_id = line_list.pop(0)
if transcript_id in genes_list:
transcripts_info_dict[transcript_id] = '\t'.join(line_list)
if len([gene for gene in genes_list if gene in transcripts_info_dict]) != len(genes_list):
print(f'Aborting. Not all the transcripts from the input list are found in the input gene pred file {gene_pred}')
for gene in genes_list:
if gene not in transcripts_info_dict:
print(f'The following transcript {gene} is not present in the master annotation file {gene_pred}')
sys.exit()
if verbosity > 1:
print("Sanity checks done")
print(f"The following gene(s) will be subjected to this realignment procedure {genes_list}.")
print(f"The gene info dictionary looks like the following: {transcripts_info_dict}")
print(f"The two bit directory is {two_bit_dir}, the reference species is {reference}")
######################################################################
############## sanity checks passed. Time for business ##############
######################################################################
## create all temp files upfront and store them in temp_files_list
maf_file = utils.create_temp_devshm()
cesar_input = utils.create_temp_devshm()
temp_files_list = [maf_file, cesar_input]
######################################################################
for transcript_id, transcript_info in transcripts_info_dict.items():
output_fasta = os.path.join(output_fasta_dir, f'{transcript_id}.fa')
output_anno_file = os.path.join(output_anno_dir, f'{transcript_id}.txt')
chromosome, strand, *_, exons_list_str = transcript_info.split('\t')
exons_list = exons_list_str.split(',')
number_of_exons = len(exons_list)
if strand == '-':
exons_list = exons_list[::-1]
ref_exon = ReferenceExon(exons_list, chromosome, strand, os.path.join(two_bit_dir, reference, f'{reference}.2bit'), verbosity)
exon_seqs, ss_dict, exon_phases_list = ref_exon.extract_seqs_splice_sites()
if verbosity > 1:
print("The following object is created for the reference_exon: BEGIN #####\n")
print(f'{vars(ref_exon)}\n{exon_seqs}\n{ss_dict}')
print("###### END\n\n")
query_obj = QueryExonSeqs(reference, exons_list, chromosome, strand, two_bit_dir, query_species_list, maf_index, verbosity)
query_cds_dict, valid_seqs_ct = query_obj.get_query_seqs()
if verbosity > 1:
print("The following is the information for the query_sequence_object: BEGIN #####\n")
print(f'{vars(query_obj)}\n{query_cds_dict}')
print("###### END\n\n")
if valid_seqs_ct == 0:
continue
######################################################################
############## Prepare the input file for CESAR ##############
######################################################################
if verbosity > 0:
print("Preparing the input for CESAR : running mafExtract and then twoBitToFa on the query species\n")
with open(cesar_input, 'w') as FO:
for exon_id, exon_seq in exon_seqs.items():
_, exon_ct = exon_id.split('_')
exon_ct = int(exon_ct)
acc_profile_def = os.path.join("extra", "tables", f'{clade}', "acc_profile.txt")
do_profile_def = os.path.join("extra", "tables", f'{clade}', "do_profile.txt")
acc_profile = os.path.join("extra", "tables", f'{clade}', "firstCodon_profile.txt") if exon_ct == 0 else acc_profile_def
do_profile = os.path.join("extra", "tables", f'{clade}', "lastCodon_profile.txt") if exon_ct == number_of_exons - 1 else do_profile_def
if ss_dict["acc"][exon_id] == "AC" and exon_ct != 0:
acc_profile = os.path.join("extra", "tables", f'{clade}', "u12_acc_profile.txt")
if ss_dict["donor"][exon_id] == "AT" and exon_ct != len(exons_list) - 1:
do_profile = os.path.join("extra", "tables", f'{clade}', "u12_donor_profile.txt")
FO.write(f">{transcript_id}_{exon_ct}\t{acc_profile}\t{do_profile}\n{exon_seq}\n")
######################################################
#### now populate this file with query sequences ####
######################################################
FO.write("####\n")
species_for_alignment = 0
for query_species, query_species_cds in query_cds_dict.items():
_, cds_start, cds_stop, query_strand, query_chr = query_species_cds.split('#') #5235109#5202543#-#JH863841
cds_start = int(cds_start) - 500
cds_stop = int(cds_stop) + 500
if cds_start > cds_stop:
if verbosity > 1:
print(f'Omitting the species {query_species} because the cds_start is greater than the cds_stop {cds_start} > {cds_stop}')
continue
query_sequence = utils.get_sequence_from_2bit(cds_start, cds_stop, query_chr, two_bit_files_dict[query_species])
if query_strand != strand:
query_sequence = utils.reverse_comp(query_sequence)
FO.write(f">{query_species_cds}\n{query_sequence}\n")
species_for_alignment += 1
######################################################################
############## Run CESAR ##############
######################################################################
if species_for_alignment == 0:
print("Nothing to run CESAR on!! All query species are problematic - either not on the same scaffold/chromosome or the query locus has some issues with the genome assembly.")
continue
if verbosity > 0:
print('Running CESAR now!!!')
cesar_call = f'cesar {cesar_input} -x 32'
cesar_output = utils.run_subprocess(cesar_call, True)
if verbosity > 0:
print(f"CESAR run finished. This was the call {cesar_call}\n")
### process CESAR's output
alignments_obj = Alignment(cesar_output, output_fasta, reference)
alignments_split_exon_wise = alignments_obj.process_split_alignments()
######################################################################
########### CESAR output processing ###########
######################################################################
## multiple-alignment file for RELAX/Hyphy has already been generated and stored in output_fasta ##
## now annotate exons or check for exon intactness ##
print("Now looking into the annotation business")
with open(output_anno_file, 'w') as FO:
for species, alignments_dict in alignments_split_exon_wise.items():
for exon_index, alignment_block in alignments_dict.items():
ref_acceptor = ss_dict["acc"][f'exon_{exon_index}']
ref_donor = ss_dict["donor"][f'exon_{exon_index}']
aligned_ref, query_header, aligned_query = alignment_block.split('\n')
phase_string = exon_phases_list[exon_index]
aligned_ref_query_pair = AlignedSeqPair(aligned_ref, aligned_query, query_header, phase_string, ref_acceptor, ref_donor, exon_index, number_of_exons, strand, verbosity)
exon_cds = aligned_ref_query_pair.check_exon_intactness()
FO.write(f'{transcript_id}\t{species}\t{exon_index}\t{exon_cds}\n')
for f in temp_files_list:
os.remove(f)