-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfastqc-summary
71 lines (61 loc) · 2.72 KB
/
fastqc-summary
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
"""
Search a directory tree for FastQC ZIP files and
output a summary in tab-delimited format.
"""
import argparse
import io
import os
import re
import sys
import zipfile
import pandas
def _parse_fastqc_module(body_lines):
    """Parse the body of one FastQC module (everything after its title line).

    Lines starting with ``#`` are treated as metadata: the last one is the
    column header of the data table, any earlier ones are tab-separated
    ``key<TAB>value`` pairs. All remaining lines are the table rows.

    Returns:
        A ``(extra_data, table)`` tuple. ``extra_data`` is a dict of the
        key/value pairs; ``table`` is a DataFrame indexed by its first
        column, or ``None`` when the body has no ``#`` header line.
    """
    comment_lines = [line.strip() for line in body_lines if line.startswith("#")]
    if not comment_lines:
        # Nothing to use as a column header, so there is no table to build.
        return {}, None

    extra_data = {}
    for text in comment_lines[:-1]:
        # Drop the leading "#" and split on the first tab only, so a value
        # that itself contains a tab does not break the pairing.
        key, _, value = text[1:].partition("\t")
        extra_data[key] = value

    header = comment_lines[-1][1:]
    records = [line.strip() for line in body_lines if not line.startswith("#")]
    buffer = io.StringIO(header + "\n" + "\n".join(records))
    table = pandas.read_table(buffer, sep="\t", index_col=0)
    return extra_data, table


def process_fastqc(path, detailed=False):
    """Summarise one FastQC ZIP archive as a pandas Series.

    The archive member whose name ends with ``fastqc_data.txt`` is read and
    split into its ``>>name<TAB>status ... >>END_MODULE`` modules. Every
    module contributes ``row[name] = status``; a few modules add extra
    fields:

    * "Basic Statistics": ``Filename``, ``Total Sequences`` and
      ``Sequence Length``.
    * "Per base sequence quality": the mean of the ``Median`` column and,
      when *detailed* is true, the ``Mean`` value of the first five and
      last five rows.
    * "Sequence Duplication Levels": ``Total Deduplicated Percentage`` as a
      float, when the report provides that key.

    Args:
        path: Path (or file-like object) of the FastQC ZIP archive.
        detailed: Include the per-position quality columns.

    Returns:
        A ``pandas.Series`` keyed by field name.

    Raises:
        IndexError: The archive contains no ``fastqc_data.txt`` member.
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path) as archive:
        members = [m for m in archive.namelist() if m.endswith("fastqc_data.txt")]
        if not members:
            raise IndexError("no fastqc_data.txt found in {}".format(path))
        # UTF-8 is a superset of ASCII; undecodable bytes (e.g. in a file
        # name) are replaced rather than aborting the whole summary.
        report = archive.read(members[0]).decode("utf-8", errors="replace")

    row = {}
    for section in re.findall(r">>(.+?)>>END_MODULE", report, re.DOTALL):
        lines = section.strip().split("\n")
        name, score = lines[0].strip().split("\t")
        row[name] = score

        # Parsed fresh for every module so that nothing leaks over from the
        # previous one when this module has no body.
        extra_data, table = _parse_fastqc_module(lines[1:])

        if name == "Basic Statistics" and table is not None:
            row["Filename"] = table.loc["Filename", "Value"]
            row["Total Sequences"] = table.loc["Total Sequences", "Value"]
            row["Sequence Length"] = table.loc["Sequence length", "Value"]
        elif name == "Per base sequence quality" and table is not None:
            row["Sequence Quality - Global Mean of Median"] = table["Median"].mean()
            if detailed:
                positions = list(table.index[:5]) + list(table.index[-5:])
                for base in positions:
                    # The index is numeric when every position is a plain
                    # integer, so convert explicitly before concatenating.
                    row["Sequence Quality - " + str(base)] = table.loc[base, "Mean"]
        elif name == "Sequence Duplication Levels":
            if "Total Deduplicated Percentage" in extra_data:
                row["Total Deduplicated Percentage"] = float(
                    extra_data["Total Deduplicated Percentage"]
                )
    return pandas.Series(row)
def main(argv):
    """Walk a directory tree and print a tab-separated FastQC summary.

    Every file whose name ends with ``fastqc.zip`` below the base directory
    is passed to ``process_fastqc``; the resulting rows are written to
    standard output as a tab-delimited table indexed by ``Filename``.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        None. When no archive is found, a note is written to standard error
        and nothing is written to standard output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("basedir", nargs="?",
                        default=os.getcwd(),
                        help="The root directory to be searched for FastQC results. If not specified, uses the current directory.")
    parser.add_argument("-s", "--simple",
                        action="store_true",
                        help="Output simple quality information.")
    opts = parser.parse_args(argv)

    rows = []
    for root, dirs, files in os.walk(opts.basedir):
        # os.walk yields entries in arbitrary order; sorting in place makes
        # the traversal, and therefore the output row order, reproducible.
        dirs.sort()
        for filename in sorted(files):
            if filename.endswith("fastqc.zip"):
                path = os.path.join(root, filename)
                rows.append(process_fastqc(path, not opts.simple))

    if not rows:
        # An empty DataFrame has no "Filename" column to index by.
        sys.stderr.write("No FastQC ZIP files found under {}\n".format(opts.basedir))
        return

    df = pandas.DataFrame(rows)
    # set_index moves "Filename" out of the columns and into the index,
    # replacing the separate index assignment and positional-axis drop.
    df.set_index("Filename").to_csv(sys.stdout, sep="\t")
# Script entry point: forward the command-line arguments (minus the program
# name) to main() when the file is executed directly rather than imported.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)