Skip to content

Commit

Permalink
Added two new parameters
Browse files Browse the repository at this point in the history
--remove-spaces replaces spaces with underscores to keep the full IDs (BLAST disregards everything after a space)
--output-sequences will output protein2cog with a new column, "Sequence", containing the sequences of the input proteins
  • Loading branch information
iquasere committed Jun 3, 2020
1 parent 4c80f91 commit a17bbc8
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 9 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ sudo reCOGnizer/install.bash

reCOGnizer needs an input file, but that is all it needs!
```
usage: python recognizer.py [-h] [-t THREADS] [-o OUTPUT] [-rd RESOURCES_DIRECTORY]
usage: recognizer.py [-h] [-t THREADS] [-o OUTPUT] [-rd RESOURCES_DIRECTORY]
[-db DATABASE] [--custom-database]
[-seqs MAX_TARGET_SEQS] [--tsv] [-v] -f FILE
[-seqs MAX_TARGET_SEQS] [--tsv] [--remove-spaces]
[--output-sequences] [-v] -f FILE
reCOGnizer - a tool for domain based annotation with the COG database
Expand All @@ -45,6 +46,12 @@ optional arguments:
Number of maximum identifications for each protein.
Default is 1.
--tsv Tables will be produced in TSV format (and not EXCEL).
--remove-spaces BLAST ignores the part of a sequence ID after the first space.
This option changes all spaces to underscores to keep
the full IDs.
--output-sequences Protein sequences from the FASTA input will be stored
in their own column. This produces considerably larger
files.
-v, --version show program's version number and exit
required named arguments:
Expand All @@ -69,7 +76,7 @@ Krona plot with the quantification of COGs identified in the simulated dataset u

reCOGnizer already has its own image! To use it, just pull the image and run it!
```
docker pull iquasere/recognizer:1.2.2
docker pull iquasere/recognizer:1.2.3
docker run -it -v /absolute/path/to/fasta_folder:/input_folder -v /absolute/path/to/output_folder:/output_folder --rm iquasere/recognizer -f /input_folder/input_file.fasta -o /output_folder [other arguments]
```

Expand Down
55 changes: 49 additions & 6 deletions recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from time import gmtime, strftime
import argparse, sys, os, multiprocessing, glob, subprocess, pathlib

__version__ = '1.2.2'
__version__ = '1.2.3'

def get_arguments():
parser = argparse.ArgumentParser(description="reCOGnizer - a tool for domain based annotation with the COG database",
Expand All @@ -35,6 +35,12 @@ def get_arguments():
Default is 1.""", default = "1")
parser.add_argument("--tsv", action = "store_true", default = False,
help="Tables will be produced in TSV format (and not EXCEL).")
parser.add_argument("--remove-spaces", action = "store_true", default = False,
help="""BLAST ignores sequences IDs after the first space.
This option changes all spaces to underscores to keep the full IDs.""")
parser.add_argument("--output-sequences", action = "store_true", default = False,
help="""Protein sequences from the FASTA input will be stored
in their own column. This produces considerably larger files.""")
parser.add_argument('-v', '--version', action='version', version='reCOGnizer ' + __version__)

requiredNamed = parser.add_argument_group('required named arguments')
Expand Down Expand Up @@ -74,6 +80,31 @@ def run_command(bashCommand, print_command = True, stdout = None):
print(bashCommand)
subprocess.run(bashCommand.split(), stdout = stdout)

def run_pipe_command(bashCommand, file = '', mode = 'w', sep = ' ', print_message = True):
    '''
    Input:
        bashCommand: str - command line handed to the shell as one string
        file, mode, sep: not used by this function; kept in the signature so
            calls that pass them keep working
        print_message: bool - print the command before running it
    Output:
        runs the command through the shell and waits for it to finish;
        a non-zero exit status is reported on stderr
    '''
    if print_message:
        print(bashCommand)
    # shell=True lets the shell interpret quotes, pipes and redirections, so
    # bashCommand must never be assembled from untrusted text
    process = subprocess.Popen(bashCommand, stdin=subprocess.PIPE, shell=True)
    process.communicate()
    # the exit status used to be discarded, hiding failures of the command
    if process.returncode != 0:
        print('Command exited with status {}: {}'.format(
            process.returncode, bashCommand), file=sys.stderr)

def parse_fasta(file):
    '''
    Input:
        file: str - filename of FASTA file to parse
    Output:
        return dict = {protein ID : protein sequence}
        The protein ID is the whole header line without the leading '>'.
        Sequence lines of one record are concatenated. Lines found before
        the first header are ignored. If a header is repeated, the last
        record with that header is the one kept.
    '''
    sequences = dict()
    name = None
    chunks = []
    # the with-block closes the file even if reading fails
    with open(file) as handle:
        for line in handle:
            # also drop '\r' so files with Windows line endings parse cleanly
            line = line.rstrip('\r\n')
            if line.startswith('>'):
                if name is not None:
                    sequences[name] = ''.join(chunks)
                name = line[1:]
                chunks = []
            elif name is not None:
                chunks.append(line)
            # else: text before the first header - skipped (this case used
            # to leave the parser looping forever)
    if name is not None:
        sequences[name] = ''.join(chunks)
    return sequences

'''
Input:
database_directory: str - directory to store files for database construction
Expand Down Expand Up @@ -325,8 +356,7 @@ def main():
databases = args.database.split(',')
for database in databases:
if not validate_database(args.database):
print('Database not valid!')
exit()
exit('Database not valid!')
else:
# check if necessary files exist to build database
download_resources(args.resources_directory)
Expand All @@ -339,9 +369,14 @@ def main():
# set database(s)
databases = [pn.split('.pn')[0] for pn in glob.glob('{}/COG_{}_*.pn'.format(
args.resources_directory, args.threads))]

# Replacing spaces with underscores
timed_message('Replacing spaces for commas')
if args.remove_spaces:
run_pipe_command("sed -i -e 's/ /_/g' {}".format(args.file))

# run annotation with psi-blast and COG database
timed_message('Running annotation with PSI-BLAST and COG database as reference.')
# run annotation with rps-blast and COG database
timed_message('Running annotation with RPS-BLAST and COG database as reference.')
run_rpsblast(args.file, args.output + '/cdd_aligned.blast', ' '.join(databases),
threads = args.threads, max_target_seqs = args.max_target_seqs)

Expand All @@ -357,7 +392,15 @@ def main():
cogblast = cog2ec(cogblast, table = args.resources_directory + '/cog2ec.tsv',
resources_dir = args.resources_directory)

# write protein COG assignment
# adding protein sequences if requested
if args.output_sequences:
fasta = parse_fasta(args.file)
fasta = pd.DataFrame.from_dict(fasta, orient = 'index')
fasta.columns = ['Sequence']
cogblast = pd.merge(cogblast, fasta, left_on = 'qseqid', right_index = True,
how = 'right')

# write protein to COG assignment
out_format = 'tsv' if args.tsv else 'excel'
write_table(cogblast,
args.output + '/protein2cog',
Expand Down

0 comments on commit a17bbc8

Please sign in to comment.