-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFastq_stat.py
74 lines (56 loc) · 2 KB
/
Fastq_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/python
import gzip
import sys
from Bio import SeqIO
import argparse
'''
filterFastqNReads.py
FUNCTION:
This script will take the next generation sequencing reads in fastq format.
Then it will filter out the sequence which has unknown base pair Ns.
The reads without N will be output to the result file while those with Ns will be discarded.
This will retrieve the perfect reads without missing data.
Statistcal summary will be generated for each input seq file
HELP: python3 Fastq_stat.py -h [--help]
USAGE:
python3 Fastq_stat.py -i input_fastq_file -o out_statistical_result_file
Example on testing data:
python3 Fastq_stat.py -i testdata_illumina_1.fq -o fastq.stat.txt
Path information could be added before the input and output file name if files are not in current directory.
Version 2021-June-15th
'''
__author__ ="Xuewen Wang"
parser = argparse.ArgumentParser()
parser.add_argument("-i","--input",required=True, help="The input file containing reads in fastq format")
parser.add_argument("-o","--output", default='fastq.stat.txt', help="Statistical of read file")
args=parser.parse_args()
ind=args.input
otd=args.output
lengths = []
sumlen = 0
sumraw=0
ct=0
ctgood=0
subword="N"
# outf = open(otd, 'w')
with gzip.open(ind,"rt") as IN:
for record in SeqIO.parse(IN, "fastq"):
ct +=1
sumraw += len(record.seq)
if subword in record.seq:
next
else:
# SeqIO.write(record, outf, "fastq")
sumlen += len(record.seq)
ctgood +=1
#average length
lenmean=float("{:.1f}".format(sumraw/ct))
# Report stats
# outf.write("Sequence file:\t"+ind)
print ("Sequence file:\t", ind)
print("Total number of input sequences/reads:\t", ct)
print("Total length (bp) of input sequences/reads:\t", sumraw)
print("Mean length (bp) of input sequences/reads:\t", lenmean)
print("Total number of sequences/reads after filtering:\t", ctgood)
print("Total length (bp) of filtered sequences/reads:\t", sumlen)
#print("Result:\t",otd)