-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmagicRE_count.py
274 lines (226 loc) · 13.3 KB
/
magicRE_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
"""
Perform an alignment of reads to the pseudogenome,
Count unique, fractional, and total reads mapped.
Write output to tsv files.
"""
import argparse
import shlex
import subprocess
from collections import defaultdict
from pathlib import Path
def align_to_pseudogenome(out_dir, pseudogenome_db, sample_name, read_type, read_handle, read_handle2, threads, paired_end):
"""Align reads to the pseudogenome using magicBLAST"""
outfile = Path(out_dir, f"__{sample_name}_magicBLAST_results.tab")
if read_type == "sra":
align_cmd = f"magicblast -sra {read_handle} -db {pseudogenome_db} -num_threads {threads} -outfmt tabular -no_unaligned -splice F -limit_lookup F > {outfile}"
elif read_type == "fasta":
if paired_end:
align_cmd = f"magicblast -query {read_handle} -query_mate {read_handle2} -db {pseudogenome_db} -num_threads {threads} -outfmt tabular -no_unaligned -no_discordant -splice F -limit_lookup F > {outfile}"
else:
align_cmd = f"magicblast -query {read_handle} -db {pseudogenome_db} -num_threads {threads} -outfmt tabular -no_unaligned -splice F -limit_lookup F > {outfile}"
elif read_type == "fastq":
if paired_end:
align_cmd = f"magicblast -query {read_handle} -query_mate {read_handle2} -db {pseudogenome_db} -infmt fastq -num_threads {threads} -outfmt tabular -no_unaligned -no_discordant -splice F -limit_lookup F > {outfile}"
else:
align_cmd = f"magicblast -query {read_handle} -db {pseudogenome_db} -infmt fastq -num_threads {threads} -outfmt tabular -no_unaligned -splice F -limit_lookup F > {outfile}"
else:
raise TypeError("Unknown read_type: must be one of [sra, fasta, fastq]")
# run the alignment
print("Aligning sample to the pseudogenome. Handing off to magicblast...")
print(f"Effective magicBLAST command:\n{align_cmd}")
subprocess.run(align_cmd, shell=True)
return outfile
def combine_counts(counts1, counts2):
"""Generic function for combining two dictionaries of count data"""
combined_counts = defaultdict(lambda : defaultdict(lambda : defaultdict(float)))
for class_ in counts1.keys():
for family in counts1[class_].keys():
for elem in counts1[class_][family].keys():
combined_counts[class_][family][elem] += counts1[class_][family][elem]
for class_ in counts2.keys():
for family in counts2[class_].keys():
for elem in counts2[class_][family].keys():
combined_counts[class_][family][elem] += counts2[class_][family][elem]
return combined_counts
def count_magicBLAST_result(magicBLAST_outfile, id_threshold, debug):
"""Count the unique, total and fractional counts from the magicBLAST results"""
print("Generating counts from alignment to the pseudogenome...")
uniq_counts = defaultdict(float)
total_counts = defaultdict(float)
frac_counts = defaultdict(float)
read_counts = defaultdict(lambda: defaultdict(float))
prop_counts = defaultdict(float)
with open(magicBLAST_outfile, 'r') as infile:
for line in infile:
if line.startswith("#"): # skip header lines
continue
else:
l = line.strip().split()
read_id =l[0]
sub_family = l[1]
percent_identity = float(l[2])
N_alignments = int(l[17])
# calculate total counts
if percent_identity >= id_threshold:
total_counts[sub_family] += 1
# calculate unique counts
if percent_identity >= id_threshold and N_alignments == 1:
uniq_counts[sub_family] += 1
# calculate fractional counts -- still must be combined with uniq counts
counted = set()
if percent_identity >= id_threshold and N_alignments > 1:
if read_id not in counted:
frac_counts[sub_family] += (1 / N_alignments)
counted.add(read_id)
# count the sub families proportionally
if percent_identity >= id_threshold and N_alignments > 1:
read_counts[read_id][sub_family] += 1
read_counts[read_id]['N'] = N_alignments
# calculate the proportional counts
for read_id in read_counts:
for sub_family in read_counts[read_id]:
if sub_family == 'N':
continue
prop_counts[sub_family] += read_counts[read_id][sub_family] / read_counts[read_id]['N']
# remove the large intermediate files
if not debug:
print("Removing large intermediate files...")
subprocess.run(f"rm {magicBLAST_outfile}", shell=True)
return uniq_counts, frac_counts, total_counts, prop_counts
def create_element_mapping(repnames_bedfile):
"""Create a mapping of the element names to their classes and families"""
elem_key = defaultdict(lambda : defaultdict(str))
with open(repnames_bedfile, "r") as bed:
for line in bed:
l = line.strip().split("\t")
name = l[3]
class_ = l[4]
family = l[5]
elem_key[name]["class"] = class_
elem_key[name]["family"] = family
return elem_key
def map_counts(counts, element_mapper):
"""Create a dictionary of reads using the class > family > element hierarchy.
Since the output from the alignment step function returns element names and their counts,
these element names must be re-associated with the classes and families they are part of.
This association is performed by the element_mapper.
"""
hierarchy = defaultdict(lambda : defaultdict(lambda : defaultdict(float)))
for elem in counts.keys():
class_ = element_mapper[elem]['class']
family = element_mapper[elem]['family']
hierarchy[class_][family][elem] += counts[elem]
return hierarchy
def summarize(count_data):
"""Collapse the counts to the class and family level."""
class_data = defaultdict(float)
family_data = defaultdict(lambda : defaultdict(float))
# class level counts
for class_ in count_data:
for family in count_data[class_]:
for elem in count_data[class_][family]:
count = float(count_data[class_][family][elem])
class_data[class_] += count
# family level counts
for class_ in count_data:
for family in count_data[class_]:
for elem in count_data[class_][family]:
count = float(count_data[class_][family][elem])
family_data[class_][family] += count
return class_data, family_data
def write_class_summary(class_counts, outfile):
"""Write class-level counts to outfile. Output truncates floats to ints"""
with open(outfile, "w") as f:
for class_ in class_counts.keys():
count = int(class_counts[class_])
f.write(f"{class_}\t{count}\n")
def write_family_summary(family_counts, outfile):
"""Write family-level counts to outfile. Output truncates floats to ints."""
with open(outfile, "w") as f:
for class_ in family_counts.keys():
for family in family_counts[class_]:
count = int(family_counts[class_][family])
f.write(f"{class_}\t{family}\t{count}\n")
def write_output(count_data, outfile):
"""Write count data to the outfile. Output truncates floats to ints."""
print(f"Writing results to {outfile}...")
with open(outfile, "w") as f:
for class_ in count_data:
for family in count_data[class_]:
for elem in count_data[class_][family]:
count = int(count_data[class_][family][elem])
f.write(f"{class_}\t{family}\t{elem}\t{count}\n")
def main():
parser = argparse.ArgumentParser(description="Align reads to pseudogenome using magicBLAST and return counts of repeat elements")
parser.add_argument('sampleName', help='The name of the sample to be processed.')
parser.add_argument('setupDir', help="Path to the magicRE_Setup directory.")
parser.add_argument('queryType', default='fastq', choices=['sra', 'fasta', 'fastq'], help='Defines the input format to be aligned. One of [SRA, fasta, fastq]. Defines the input format to be aligned.')
parser.add_argument('query', help="Either the SRA accession number or the path to the fasta or fastq file to be aligned to the pseudogenome.")
parser.add_argument('--query2', default=None, help="If pairedEnd is set, designate this to be the path to second read pair.")
parser.add_argument('--percentIdentity', default=90, type=float, metavar='90.0', help='Threshold at which to include mapped reads in the count data. Mapped reads must be >= percentIdentity to be included.')
parser.add_argument('--threads', default=1, type=int, help='Number of threads to use for alignment.')
parser.add_argument('--pairedEnd', dest='pairedEnd', action='store_true', help='Designate this option for paired-end sequencing.')
parser.add_argument('--summarize', dest='summarize', action='store_true', help='In addition to the repeat name level output, produce collapsed counts at the class and family levels for each of the count types.')
parser.add_argument('--outDirectory', default=".", metavar='sampleName_magicRE_results', help="Specify where to save the count results. Defaults to the current_directory/sampleName_magicRE_results.")
parser.add_argument('--debug', dest='debug', action='store_true', help='Select this option to prevent the removal of temporary files; useful for debugging.')
parser.set_defaults(pairedEnd=False, summarize=False, debug=False)
args = parser.parse_args()
sample_name = args.sampleName
setup_dir = args.setupDir
query_type = args.queryType
query = args.query
query2 = args.query2
perc_id = args.percentIdentity
threads = args.threads
paired_end = args.pairedEnd
summarize_data = args.summarize
out_path = args.outDirectory
debug = args.debug
# setup file i/o ----------------------------------------------------------
out_dir = Path(out_path, sample_name + "_magicRE_results")
pseudogenome_db = Path(setup_dir, "pseudogenome")
repnames_bedfile = Path(setup_dir, "repnames.bed")
total_counts_outfile = Path(out_dir, sample_name + "_total_counts.tsv")
uniq_counts_outfile = Path(out_dir, sample_name + "_unique_counts.tsv")
prop_counts_outfile = Path(out_dir, sample_name + "_propFractional_counts.tsv")
fractional_counts_outfile = Path(out_dir, sample_name + "_fractional_counts.tsv")
class_total_counts_outfile = Path(out_dir, sample_name + "_class_total_counts.tsv")
class_uniq_counts_outfile = Path(out_dir, sample_name + "_class_unique_counts.tsv")
class_fractional_counts_outfile = Path(out_dir, sample_name + "_class_fractional_counts.tsv")
class_prop_counts_outfile = Path(out_dir, sample_name + "_class_propFractional_counts.tsv")
family_total_counts_outfile = Path(out_dir, sample_name + "_family_total_counts.tsv")
family_uniq_counts_outfile = Path(out_dir, sample_name + "_family_unique_counts.tsv")
family_fractional_counts_outfile = Path(out_dir, sample_name + "_family_fractional_counts.tsv")
family_prop_counts_outfile = Path(out_dir, sample_name + "_family_propFractional_counts.tsv")
if not out_dir.exists():
out_dir.mkdir(parents=True)
# main routine ------------------------------------------------------------
res_path = align_to_pseudogenome(out_dir, pseudogenome_db, sample_name, query_type, query, query2, threads, paired_end)
raw_uniq, raw_frac, raw_total, raw_prop = count_magicBLAST_result(res_path, perc_id, debug)
elem_mapper = create_element_mapping(repnames_bedfile)
uniq_counts = map_counts(raw_uniq, elem_mapper)
frac_counts = map_counts(raw_frac, elem_mapper)
total_counts = map_counts(raw_total, elem_mapper)
prop_counts = map_counts(raw_prop, elem_mapper)
fractional_counts = combine_counts(uniq_counts, frac_counts) # fractional counts must be combined with unique counts before writing out
prop_frac_counts = combine_counts(uniq_counts, prop_counts)
write_output(fractional_counts, fractional_counts_outfile)
write_output(total_counts, total_counts_outfile)
write_output(uniq_counts, uniq_counts_outfile)
write_output(prop_frac_counts, prop_counts_outfile)
if summarize_data:
print("Writing class-level and family-level summarized counts...")
class_total, family_total = summarize(total_counts)
class_uniq, family_uniq = summarize(uniq_counts)
class_fractional, family_fractional = summarize(fractional_counts)
class_prop, family_prop = summarize(prop_frac_counts)
write_class_summary(class_total, class_total_counts_outfile)
write_class_summary(class_uniq, class_uniq_counts_outfile)
write_class_summary(class_fractional, class_fractional_counts_outfile)
write_class_summary(class_prop, class_prop_counts_outfile)
write_family_summary(family_total, family_total_counts_outfile)
write_family_summary(family_uniq, family_uniq_counts_outfile)
write_family_summary(family_fractional, family_fractional_counts_outfile)
write_family_summary(family_prop, family_prop_counts_outfile)
if __name__ == '__main__':
main()