-
Notifications
You must be signed in to change notification settings - Fork 0
/
Reduplicate.py
49 lines (42 loc) · 1.51 KB
/
Reduplicate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python
import sys
import os
import os.path
import shutil
import subprocess
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
Barrnap = "/home/j/jparkins/mobolaji/Tools/Barrnap/bin/barrnap"
Infernal = "/home/j/jparkins/mobolaji/Tools/Infernal/infernal-1.1.2-linux-intel-gcc/binaries/cmsearch"
Rfam = "/home/j/jparkins/mobolaji/Databases/Rfam_rRNA.cm"
reference_file = sys.argv[1]
reference_sequences = SeqIO.to_dict(SeqIO.parse(reference_file, "fastq"))
dedeplicated_file = sys.argv[2]
dedeplicated_sequences = SeqIO.index(dedeplicated_file, "fastq")
cluster_file = sys.argv[3]
cluster_map = {}
reduplicated_file = sys.argv[4]
reduplicated_ids = set()
reduplicated_seqs = []
with open(cluster_file, "r") as clustr_read:
rep = ""
seq_id = ""
for line in clustr_read:
if line.startswith(">"):
continue
elif line.startswith("0"):
rep = line[line.find(">") + 1:line.find("...")]
seq_id = rep
cluster_map[rep] = [seq_id]
elif len(line) > 5:
seq_id = line[line.find(">") + 1:line.find("...")]
cluster_map[rep].append(seq_id)
for sequence in dedeplicated_sequences:
if len(cluster_map[sequence]) > 1:
for seq_id in cluster_map[sequence]:
reduplicated_ids.add(seq_id)
else:
reduplicated_ids.add(sequence)
reduplicated_seqs = [reference_sequences[seq_id] for seq_id in sorted(reduplicated_ids)]
with open(reduplicated_file, "w") as out:
SeqIO.write(reduplicated_seqs, out, "fastq")