-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNC_summarizer.py
156 lines (131 loc) · 6.92 KB
/
NC_summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
This script combines taxonomic classification results of multiple NanoCLUST runs into 1 CSV file.
"""
import os
import csv
import argparse
from rich import print as rprint
def process_directory(input_directory, output_file, taxonomic_level):
    """
    Combine the relative-abundance tables of several NanoCLUST runs into one CSV file.

    Every subdirectory of ``input_directory`` is treated as one run. The run name is
    the subdirectory name up to its first ``.``, and the table is read from
    ``<subdir>/<name>/rel_abundance_<name>_<code>.csv``, where ``<code>`` is the value
    returned by ``taxonomic_code(taxonomic_level)``. The first line of each table is
    skipped as a header; every other non-blank row is copied to the output as
    ``runname, taxid, rel_abundance``. Runs are processed in sorted name order so the
    output is reproducible.

    If anything goes wrong after the output file has been created, the partial
    output file is removed before the error is re-raised.

    Parameters:
    input_directory (str): The path to the input directory.
    output_file (str): The path to the output CSV file (must not exist yet).
    taxonomic_level (str): The taxonomic level to consider.

    Raises:
    FileNotFoundError: If the input directory does not exist or if an expected input file does not exist.
    NotADirectoryError: If the input path exists but is not a directory.
    FileExistsError: If the output file already exists.
    ValueError: If a data row of an input file does not have exactly two columns.
    """
    # Validate the paths up front so nothing is created when the arguments are wrong
    if not os.path.exists(input_directory):
        raise FileNotFoundError(f'The input directory {input_directory} does not exist.')
    if not os.path.isdir(input_directory):
        raise NotADirectoryError(f'The input path {input_directory} is not a directory.')
    if os.path.exists(output_file):
        raise FileExistsError(f'The output file {output_file} already exists.')

    # One-letter code that appears in the NanoCLUST file names
    tax = taxonomic_code(taxonomic_level)
    # Number of run tables merged so far
    count = 0

    # 'x' (exclusive creation) guarantees an existing file is never overwritten,
    # even if it appeared after the check above
    summary_handle = open(output_file, 'x', newline='')
    try:
        with summary_handle:
            output_csv = csv.writer(summary_handle)
            output_csv.writerow(["runname", "taxid", "rel_abundance"])

            for subdir in sorted(os.listdir(input_directory)):
                subdir_path = os.path.join(input_directory, subdir)
                # Plain files in the input directory are not runs
                if not os.path.isdir(subdir_path):
                    continue

                # Run name is the directory name without extensions such as .fastq
                name = subdir.split(".")[0]
                input_file_path = os.path.join(subdir_path, name, f"rel_abundance_{name}_{tax}.csv")
                if not os.path.exists(input_file_path):
                    raise FileNotFoundError(
                        f"\nExpected a file named {input_file_path}. "
                        "Make sure that the input directory you specified contains the direct output directories from NanoCLUST, "
                        "with the classification data output in a folder with the same name as your run.\n"
                    )

                count += 1
                with open(input_file_path, 'r', newline='') as input_file:
                    input_csv = csv.reader(input_file)
                    # Skip the header; the default keeps an empty file from raising StopIteration
                    next(input_csv, None)
                    for row in input_csv:
                        # csv.reader yields [] for blank lines; they carry no data
                        if not row:
                            continue
                        if len(row) != 2:
                            raise ValueError(
                                f"Expected 2 columns (taxid, rel_abundance) in {input_file_path}, "
                                f"but found {len(row)}: {row}"
                            )
                        taxid, rel_abundance = row
                        output_csv.writerow([name, taxid, rel_abundance])
    except BaseException:
        # The output handle is already closed here, so the file can be removed on
        # every platform. Cleanup is best-effort: the original error matters more.
        try:
            os.remove(output_file)
        except OSError:
            pass
        raise

    # Print summary
    print(f"Processed {count} NanoCLUST output files, and created {output_file}")
# Map a taxonomic level name (or its one-letter abbreviation) to the one-letter code used in the input file names
def taxonomic_code(taxonomic_level):
    """
    Returns the taxonomic 1 letter code for the given taxonomic level.

    The lookup ignores letter case and surrounding whitespace, and accepts both the
    full level name and its one-letter abbreviation.

    Parameters:
    taxonomic_level (str): The taxonomic level ('species'/'s', 'genus'/'g', 'family'/'f', 'order'/'o').

    Returns:
    str: The taxonomic code ('S' for species, 'G' for genus, 'F' for family, 'O' for order).

    Raises:
    ValueError: If taxonomic_level is not a string or is not one of the supported levels.
    """
    codes = {
        'species': 'S',
        's': 'S',
        'genus': 'G',
        'g': 'G',
        'family': 'F',
        'f': 'F',
        'order': 'O',
        'o': 'O',
    }

    if not isinstance(taxonomic_level, str):
        raise ValueError(f"Taxonomic level must be a string, got {taxonomic_level!r}.")

    # Fail loudly instead of returning None, which would otherwise end up
    # inside a file name further down the line
    try:
        return codes[taxonomic_level.strip().lower()]
    except KeyError:
        raise ValueError(
            f"Unsupported taxonomic level {taxonomic_level!r}; "
            "choose from 'species', 'genus', 'family' or 'order' (or 's', 'g', 'f', 'o')."
        ) from None
# Main function: parse the command-line arguments and execute process_directory
def main():
    """
    Entry point of the NanoCLUST results summarizer program.

    Prints the logo, builds the argparse parser, parses the command line arguments, and
    calls the `process_directory` function to summarize the NanoCLUST taxonomic
    classification results from multiple runs.
    """
    # Raw strings: the ASCII art is full of backslashes that are not meant as escape
    # sequences (a SyntaxWarning on recent Python versions), and a backslash at the
    # end of an art line must not act as a line continuation.
    title = r"""
_ _ ____ _ _ _ ____ _____
| \ | | __ _ _ __ ___ / ___| | | | | / ___|_ _|
| \| |/ _` | '_ \ / _ \| | | | | | | \___ \ | |
| |\ | (_| | | | | (_) | |___| |__| |_| |___) || |
|_| \_|\__,_|_| |_|\___/ \____|_____\___/|____/ |_|
"""
    title_summarizer = r"""
____ _ _ __ __ __ __ _ ____ ___ __________ ____
/ ___|| | | | \/ | \/ | / \ | _ \|_ _|__ / ____| _ \
\___ \| | | | |\/| | |\/| | / _ \ | |_) || | / /| _| | |_) |
___) | |_| | | | | | | |/ ___ \| _ < | | / /_| |___| _ <
|____/ \___/|_| |_|_| |_/_/ \_\_| \_\___/____|_____|_| \_\
"""

    # Print pretty ASCII logo and explanation to script
    rprint("\n[yellow]Welcome to[/yellow]")
    rprint(f'[green]{title}[magenta]{title_summarizer}[/green]')
    rprint('[blue]This program combines taxonomic classification results of multiple NanoCLUST runs into 1 CSV file.[/blue]\n')

    # Build the command line parser
    parser = argparse.ArgumentParser(
        prog='NanoCLUST results summarizer',
        description='This program combines taxonomic classification results of multiple NanoCLUST runs into 1 CSV file.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        epilog='Thanks for using this script!'
    )

    # Positional argument: where the NanoCLUST run directories live
    parser.add_argument('input_directory', type=str,
                        help='Path to directory containing the output directories from NanoCLUST')
    # Optional arguments: output location and taxonomic level
    parser.add_argument('-o', '--outfile', dest='output_file', type=str,
                        default='./NCsummary.csv',
                        help='Path to output .csv file')
    parser.add_argument('-l', '--level', dest='taxonomic_level', type=str,
                        default='genus',
                        choices=['species', 's', 'genus', 'g', 'family', 'f', 'order', 'o'],
                        help='The taxonomic level you want the classification results summarized of')

    # Parse the command line and run the summarizer
    args = parser.parse_args()
    process_directory(args.input_directory, args.output_file, args.taxonomic_level)
# Run main() only when this file is executed as a script, not when it is imported as a module
if __name__ == "__main__":
    main()