-
Notifications
You must be signed in to change notification settings - Fork 0
/
File_splitter.py
110 lines (102 loc) · 4.18 KB
/
File_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
import sys
import os
import os.path
import shutil
import subprocess
import multiprocessing
from Bio import SeqIO
# Command-line interface:
#     File_splitter.py <records_per_file> <input_file> <output_folder>
if len(sys.argv) < 4:
    # Fail with a usage message instead of a bare IndexError traceback.
    sys.exit("Usage: File_splitter.py <records_per_file> <input_file> <output_folder>")
# Maximum number of records written to each split file.
splitlen = int(sys.argv[1])
if splitlen < 1:
    # Zero would divide by zero when the number of chunks is computed, and a
    # negative value cannot describe a chunk size.
    sys.exit("records_per_file must be a positive integer")
# Path of the FASTQ/FASTA file to split.
input_file = sys.argv[2]
# Directory the split files are written into.
output_folder = sys.argv[3]
def check_file(ending, count):
    """Report whether a split file for chunk ``count`` already exists.

    Two candidate names inside ``output_folder`` are tried, both built from
    the extension-less base name (the "stem") of ``input_file``:

    * ``<stem>_split_<NNN><ending>``
    * ``<stem without its last character>_split_<NNN>_<last character><ending>``

    ``<NNN>`` is ``count`` zero-padded to three digits.

    Args:
        ending: File extension to append, including the dot (e.g. ".fastq").
        count: Chunk number used in the file name.

    Returns:
        True if either candidate path exists, otherwise False.
    """
    stem = os.path.splitext(os.path.basename(input_file))[0]
    chunk_tag = "_split_" + str(count).zfill(3)
    candidates = (
        os.path.join(output_folder, stem + chunk_tag + ending),
        os.path.join(output_folder, stem[:-1] + chunk_tag + "_" + stem[-1:] + ending),
    )
    return any(os.path.exists(path) for path in candidates)
def write_fastq(fastq_dict, count, mode):
    """Write the given reads to one numbered FASTQ split file.

    The file is ``<stem>_split_<NNN>.fastq`` inside ``output_folder``, where
    ``<stem>`` is the base name of ``input_file`` without its extension and
    ``<NNN>`` is ``count`` zero-padded to three digits.

    Args:
        fastq_dict: Mapping of read id to a sequence whose element 0 is the
            base string and element 1 is the quality string.
        count: Chunk number used in the file name.
        mode: Mode string handed to ``open`` (e.g. "w").
    """
    stem = os.path.splitext(os.path.basename(input_file))[0]
    target = os.path.join(output_folder, stem + "_split_" + str(count).zfill(3) + ".fastq")
    with open(target, mode) as handle:
        for read_id in fastq_dict:
            record = fastq_dict[read_id]
            # Four-line record: header, bases, separator, qualities.
            handle.writelines([
                "@" + read_id + "\n",
                record[0] + "\n",
                "+" + "\n",
                record[1] + "\n",
            ])
def write_fasta(fasta_dict, count, mode):
    """Write the given sequences to one numbered FASTA split file.

    The file is ``<stem>_split_<NNN>.fasta`` inside ``output_folder``, where
    ``<stem>`` is the base name of ``input_file`` without its extension and
    ``<NNN>`` is ``count`` zero-padded to three digits.

    Args:
        fasta_dict: Mapping of sequence id to a sequence whose element 0 is
            the full residue string (written on a single line).
        count: Chunk number used in the file name.
        mode: Mode string handed to ``open`` (e.g. "w").
    """
    stem = os.path.splitext(os.path.basename(input_file))[0]
    target = os.path.join(output_folder, stem + "_split_" + str(count).zfill(3) + ".fasta")
    with open(target, mode) as handle:
        for record_id in fasta_dict:
            # Two-line record: header, then the whole sequence.
            handle.writelines([
                ">" + record_id + "\n",
                fasta_dict[record_id][0] + "\n",
            ])
# ---------------------------------------------------------------------------
# Script body: split the input file into numbered chunk files holding at most
# `splitlen` records each.  Files whose name ends in "q" are handled as FASTQ,
# files ending in "a" as FASTA; anything else is rejected.
# ---------------------------------------------------------------------------
# An ordered mapping keeps records in input order on every Python version
# (plain dicts are unordered on Python 2).
from collections import OrderedDict

# Resolve the path BEFORE changing directory: a relative path such as
# "data/reads.fastq" would no longer point at the file after the chdir, and a
# bare file name has an empty dirname, which os.chdir rejects.
input_file = os.path.abspath(input_file)
# Work from the input file's directory, so a relative output_folder is
# resolved against it.
os.chdir(os.path.dirname(input_file))

if input_file.endswith("q"):
    # First pass: derive the number of reads from the line count.  The code
    # treats every four lines as one record.
    read_count = 0
    with open(input_file, "r") as infile:
        for line in infile:
            read_count += 1
    # Floor division keeps the count an int on Python 3 as well as Python 2.
    read_count = read_count // 4
    # Ceiling division: no spurious empty trailing file when read_count is an
    # exact multiple of splitlen.  At least one file is still produced, so an
    # input without records yields a single empty chunk as before.
    File_count = max(1, (read_count + splitlen - 1) // splitlen)
    for n in range(File_count):
        Start = n * splitlen   # index of the first read belonging to this chunk
        m = n + 1              # 1-based chunk number used in the file name
        Stop = m * splitlen    # index one past the last read of this chunk
        Temp_sequences = OrderedDict()
        # NOTE(review): this tests for chunk m + 1, not m -- presumably so a
        # chunk is skipped only when its successor exists, i.e. it was fully
        # written by an earlier run.  Confirm before changing.
        if check_file(".fastq", m + 1):
            continue
        # The input is re-read from the top for every chunk.
        with open(input_file, "r") as infile:
            seq_id = ""
            seq_count = 0
            for line_count, line in enumerate(infile):
                line_class = line_count % 4  # 0 header, 1 bases, 2 "+", 3 qualities
                if seq_count < Start:
                    # Still before this chunk: count headers only.
                    if line_class == 0:
                        seq_count += 1
                elif seq_count < Stop:
                    if line_class == 0 and line.startswith("@"):
                        # The id is the header text up to the first space.
                        seq_id = line[1:].split(" ")[0].strip("\n")
                        Temp_sequences[seq_id] = ["", ""]
                    elif seq_id != "":
                        # seq_id is still empty while the tail of the last
                        # skipped record is being read, so those lines are
                        # ignored.
                        if line_class == 1 and Temp_sequences[seq_id][0] == "":
                            Temp_sequences[seq_id][0] = line.strip("\n")
                        elif line_class == 3 and Temp_sequences[seq_id][1] == "":
                            Temp_sequences[seq_id][1] = line.strip("\n")
                            # A read counts as consumed once its quality line
                            # has been stored.
                            seq_count += 1
                else:
                    # seq_count == Stop: the chunk is complete.
                    break
        write_fastq(Temp_sequences, m, "w")
elif input_file.endswith("a"):
    # First pass: count records by their ">" header lines.
    prot_count = 0
    with open(input_file, "r") as infile:
        for line in infile:
            if line.startswith(">"):
                prot_count += 1
    # Ceiling division, with at least one output file (see the FASTQ branch).
    File_count = max(1, (prot_count + splitlen - 1) // splitlen)
    for n in range(File_count):
        Start = n * splitlen   # index of the first record belonging to this chunk
        m = n + 1              # 1-based chunk number used in the file name
        Stop = m * splitlen    # index one past the last record of this chunk
        Temp_sequences = OrderedDict()
        # NOTE(review): tests for chunk m + 1, not m -- presumably resume
        # logic, as in the FASTQ branch.  Confirm before changing.
        if check_file(".fasta", m + 1):
            continue
        with open(input_file, "r") as infile:
            seq_id = ""
            seq_count = 0
            for line in infile:
                if line.startswith(">"):
                    if seq_count < Start:
                        # Header of a record that precedes this chunk.
                        seq_count += 1
                    elif seq_count < Stop:
                        # The id is the header text up to the first space.
                        seq_id = line[1:].split(" ")[0].strip("\n")
                        Temp_sequences[seq_id] = [""]
                        seq_count += 1
                    else:
                        # seq_count == Stop: the chunk is complete.
                        break
                elif seq_id != "":
                    # Sequence lines are concatenated into a single string.
                    Temp_sequences[seq_id][0] += line.strip("\n")
        write_fasta(Temp_sequences, m, "w")
else:
    # Call form prints the same text on Python 2 and Python 3.
    print("Unrecognized File Type")