From 6dfb7ba85702d988c678d5c2c3f17c252b05d6bf Mon Sep 17 00:00:00 2001
From: iquasere
Date: Fri, 8 Jan 2021 12:14:04 +0000
Subject: [PATCH] Now reports total time

* also fixed Smart relational
* and lotsa stuff
---
 recognizer.py | 192 +++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 102 deletions(-)

diff --git a/recognizer.py b/recognizer.py
index 74c1943..cde2dd5 100644
--- a/recognizer.py
+++ b/recognizer.py
@@ -15,6 +15,8 @@
 import shutil
 import subprocess
 import sys
+import time
+import datetime
 from multiprocessing import Pool
 from time import gmtime, strftime
 
@@ -300,11 +302,20 @@ def write_table(table, output, out_format='excel', header=True):
         table.to_csv(output + '.tsv', index=False, sep='\t', header=header)
 
 
+def multi_sheet_excel(writer, data, sheet_name='Sheet', lines=1000000, index=False):
+    i = 0
+    j = 1
+    while i + lines < len(data):
+        data.iloc[i:(i + lines)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, str(j)), index=index)
+        i, j = i + lines, j + 1
+    data.iloc[i:len(data)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, str(j)), index=index)
+    return writer
+
+
 def create_krona_plot(tsv, output=None):
     if output is None:
         output = tsv.replace('.tsv', '.html')
-    conda_exec = subprocess.check_output('which conda'.split()).decode('utf8')
-    run_command('ktImportText {} -o {}'.format(conda_exec.split('/bin')[0], tsv, output))
+    run_command('ktImportText {} -o {}'.format(tsv, output))
 
 
 def main():
@@ -374,113 +385,90 @@ def main():
     '''
     if inputted_db:
         exit()
-    '''
-    cddid = parse_cddid('{}/cddid_all.tbl'.format(args.resources_directory))
-
-    timed_message("Converting CDD IDs to databases' IDs")
-    pbar = ProgressBar()
-    for db in pbar(args.databases):
-        db_report = parse_blast('{}/{}_aligned.blast'.format(args.output, db))
-        db_report = pd.merge(db_report, cddid, left_on='sseqid', right_on='CDD ID', how='left')
-        del db_report['CDD ID']
-        db_report.to_csv('{}/{}_report.tsv'.format(args.output, db), sep='\t', index=False)
+    # Load the relational tables
+    cddid = parse_cddid('{}/cddid_all.tbl'.format(args.resources_directory))
     ncbi_table = pd.read_csv('{}/hmm_PGAP.tsv'.format(args.resources_directory), sep='\t', usecols=[1, 10, 12, 15])
     ncbi_table['source_identifier'] = [ide.split('.')[0] for ide in ncbi_table['source_identifier']]
     ncbi_table['source_identifier'] = ncbi_table['source_identifier'].str.replace('PF', 'pfam')
-
-    for db in ['CDD', 'Pfam', 'NCBIfam', 'Protein_Clusters', 'TIGRFAM']:
-        if db in args.databases:
-            report = pd.read_csv('{}/{}_report.tsv'.format(args.output, db), sep='\t')
-            report = pd.merge(report, ncbi_table, left_on='DB ID', right_on='source_identifier', how='left')
-            report.to_csv('{}/{}_report.tsv'.format(args.output, db), sep='\t', index=False)
-
-    if 'Smart' in args.databases:
-        smart_table = pd.read_csv('{}/descriptions.pl'.format(args.resources_directory),
-                                  sep='\t', skiprows=2, header=None, usecols=[1, 2])
-        smart_table.columns = ['Smart ID', 'Smart description']
-        report = pd.read_csv('{}/Smart_report.tsv'.format(args.output), sep='\t')
-        report = pd.merge(report, smart_table, left_on='DB ID', right_on='Smart ID', how='left')
-        report.to_csv('{}/Smart_report.tsv'.format(args.output, db), sep='\t', index=False)
-    '''
     fun = pd.read_csv('{}/fun.tsv'.format(args.resources_directory), sep='\t')
 
-    if 'KOG' in args.databases:
-        kog_table = parse_kog('{}/kog'.format(args.resources_directory))
-        kog_table = pd.merge(kog_table, fun, left_on='KOG functional category (letter)',
-                             right_on='COG functional category (letter)', how='left')
-        report = pd.read_csv('{}/KOG_report.tsv'.format(args.output), sep='\t')
-        report = pd.merge(report, kog_table, left_on='DB ID', right_on='kog', how='left')
-        report.to_csv('{}/KOG_report.tsv'.format(args.output), sep='\t', index=False)
-
-    if 'COG' in args.databases:
-        # Add COG categories to BLAST info
-        cog_table = parse_whog('{}/cog-20.def.tab'.format(args.resources_directory))
-        cog_table = pd.merge(cog_table, fun, on='COG functional category (letter)', how='left')
-        report = pd.read_csv('{}/COG_report.tsv'.format(args.output), sep='\t')
-        print(report)
-        print(cog_table)
-        report = pd.merge(report, cog_table, left_on='DB ID', right_on='cog', how='left')
-        report.to_csv('{}/COG_report.tsv'.format(args.output), sep='\t', index=False)
-
-        # cog2ec
-        timed_message('Converting COG IDs to EC numbers.')
-        report = cog2ec(report, table=args.resources_directory + '/cog2ec.tsv',
-                        resources_dir=args.resources_directory)
-
-        # cog2ko
-        timed_message('Converting COG IDs to KEGG Orthologs.')
-        report = cog2ko(report, cog2ko=args.resources_directory + '/cog2ko.tsv')
-
-    '''
-    if not args.no_blast_info:
-        final_result = final_result[['qseqid', 'CDD ID', 'COG general functional category',
-                                     'COG functional category', 'COG protein description', 'cog', 'pident',
-                                     'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send',
-                                     'evalue', 'bitscore']]
-    else:
-        final_result = final_result[['qseqid', 'CDD ID', 'COG general functional category',
-                                     'COG functional category', 'COG protein description', 'cog']]
-
-
-
-    # adding protein sequences if requested
-    if not args.no_output_sequences:
-        fasta = parse_fasta(args.file)
-        fasta = pd.DataFrame.from_dict(fasta, orient='index')
-        fasta.columns = ['Sequence']
-        final_result = pd.merge(final_result, fasta, left_on='qseqid',
-                                right_index=True, how='left')
-
-    # write protein to COG assignment
-    out_format = 'tsv' if args.tsv else 'excel'
-    write_table(final_result,
-                args.output + '/protein2cog',
-                out_format=out_format)
-
-    timed_message('Protein ID to COG and EC number is available at {}.'.format(
-        args.output + '/protein2cog'))
-
-    # quantify COG categories
-    timed_message('Quantifying COG categories.')
-    cog_quantification = final_result.groupby(['COG general functional category',
-        'COG functional category', 'COG protein description', 'cog']).size(
-        ).reset_index().rename(columns={0: 'count'})
-    write_table(cog_quantification,
-                args.output + '/cog_quantification',
-                out_format=out_format)
-    timed_message('COG categories quantification is available at {}.'.format(
-        args.output + '/cog_quantification'))
-
-    # represent that quantification in krona plot
-    timed_message('Creating Krona plot representation.')
-    write_table(cog_quantification[['count'] + cog_quantification.columns.tolist()[:-1]],
-                args.output + '/krona',
-                header=False,
-                out_format='tsv')
-    create_krona_plot(args.output + '/krona.tsv', args.output + '/cog_quantification.html')
-    '''
+    timed_message("Organizing annotation results")
+    blast_cols = ['pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
+
+    i = 1
+    j = len(args.databases)
+    for db in args.databases:
+        cols = ['qseqid', 'sseqid', 'DB ID', 'DB description']
+        print('[{}/{}] Handling {} identifications'.format(i, j, db))
+        report = parse_blast('{}/{}_aligned.blast'.format(args.output, db))
+        report = pd.merge(report, cddid, left_on='sseqid', right_on='CDD ID', how='left')
+        del report['CDD ID']
+
+        # adding protein sequences if requested
+        if not args.no_output_sequences:
+            fasta = parse_fasta(args.file)
+            fasta = pd.DataFrame.from_dict(fasta, orient='index')
+            fasta.columns = ['Sequence']
+            report = pd.merge(report, fasta, left_on='qseqid', right_index=True, how='left')
+            cols += ['Sequence']
+
+        if db in ['CDD', 'Pfam', 'NCBIfam', 'Protein_Clusters', 'TIGRFAM']:
+            report = pd.merge(report, ncbi_table, left_on='DB ID', right_on='source_identifier', how='left')
+            cols += ['product_name', 'ec_number', 'taxonomic_range_name']
+
+        elif db == 'Smart':
+            smart_table = pd.read_csv('{}/descriptions.pl'.format(args.resources_directory),
+                                      sep='\t', skiprows=2, header=None, usecols=[1, 2])
+            smart_table.columns = ['Smart ID', 'Smart description']
+            smart_table['Smart ID'] = smart_table['Smart ID'].str.replace('SM', 'smart')
+            report = pd.merge(report, smart_table, left_on='DB ID', right_on='Smart ID', how='left')
+            cols += ['Smart description']
+
+        elif db == 'KOG':
+            kog_table = parse_kog('{}/kog'.format(args.resources_directory))
+            kog_table = pd.merge(kog_table, fun, left_on='KOG functional category (letter)',
+                                 right_on='COG functional category (letter)', how='left')
+            report = pd.merge(report, kog_table, left_on='DB ID', right_on='kog', how='left')
+            cols += ['COG general functional category', 'COG functional category', 'KOG protein description']
+
+        else:
+            cog_table = parse_whog('{}/cog-20.def.tab'.format(args.resources_directory))
+            cog_table = pd.merge(cog_table, fun, on='COG functional category (letter)', how='left')
+            report = pd.merge(report, cog_table, left_on='DB ID', right_on='cog', how='left')
+            report.to_csv('{}/COG_report.tsv'.format(args.output), sep='\t', index=False)
+            # cog2ec
+            report = cog2ec(report, table=args.resources_directory + '/cog2ec.tsv',
+                            resources_dir=args.resources_directory)
+            # cog2ko
+            report = cog2ko(report, cog2ko=args.resources_directory + '/cog2ko.tsv')
+
+            cols += ['COG general functional category', 'COG functional category', 'COG protein description',
+                     'EC number', 'KO']
+
+            # COG quantification
+            cog_quantification = report.groupby(
+                ['COG general functional category', 'COG functional category', 'COG protein description', 'cog']).size(
+                ).reset_index().rename(columns={0: 'count'})
+            cog_quantification.to_excel('{}/COG_quantification.xlsx'.format(args.output))
+            cog_quantification[['count'] + cog_quantification.columns.tolist()[:-1]].to_csv(
+                '{}/krona.tsv'.format(args.output), sep='\t', index=False, header=False)
+            create_krona_plot('{}/krona.tsv'.format(args.output), '{}/cog_quantification.html'.format(args.output))
+
+        cols += blast_cols
+
+        report[cols].to_csv('{}/{}_report.tsv'.format(args.output, db), sep='\t', index=False)
+        i += 1
+
+    timed_message('Organizing all results at {}/reCOGnizer_results.xlsx'.format(args.output))
+    writer = pd.ExcelWriter('{}/reCOGnizer_results.xlsx'.format(args.output), engine='xlsxwriter')
+    pbar = ProgressBar()
+    for base in pbar(args.databases):
+        multi_sheet_excel(writer, pd.read_csv('{}/{}_report.tsv'.format(args.output, base), sep='\t'), sheet_name=base)
+    writer.save()
 
 
 if __name__ == '__main__':
+    start_time = time.time()
     main()
+    print('reCOGnizer analysis finished in {}'.format(strftime("%Hh%Mm%Ss", gmtime(time.time() - start_time))))
\ No newline at end of file
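
For reference, a minimal usage sketch of the multi_sheet_excel helper introduced above, assuming pandas with the xlsxwriter engine installed. The toy DataFrame, the lines=10 chunk size and the example_results.xlsx path are illustrative only and not part of the patch. The helper fills one workbook with consecutive sheets named "<name> (1)", "<name> (2)", ..., starting a new sheet every `lines` rows so that reports longer than Excel's per-sheet row limit (about 1,048,576 rows, hence the 1,000,000 default) still fit in a single file.

    import pandas as pd

    # toy report: 25 rows, far below Excel's real limit, so a small chunk size
    # (lines=10) is used just to show the sheet-splitting behaviour
    data = pd.DataFrame({'qseqid': ['seq{}'.format(n) for n in range(25)], 'evalue': [1e-5] * 25})

    def multi_sheet_excel(writer, data, sheet_name='Sheet', lines=1000000, index=False):
        # mirrors the helper in the patch: one sheet per `lines` rows
        i = 0
        j = 1
        while i + lines < len(data):
            data.iloc[i:(i + lines)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, str(j)), index=index)
            i, j = i + lines, j + 1
        data.iloc[i:len(data)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, str(j)), index=index)
        return writer

    writer = pd.ExcelWriter('example_results.xlsx', engine='xlsxwriter')
    multi_sheet_excel(writer, data, sheet_name='COG', lines=10)
    writer.close()  # writes sheets "COG (1)", "COG (2)" and "COG (3)"

In the patch itself, main() calls this helper once per database so that every <db>_report.tsv ends up as its own set of sheets inside reCOGnizer_results.xlsx.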