Added two new parameters

--remove-spaces replaces spaces with underscores to keep the full IDs (BLAST disregards everything after a space). --output-sequences will output protein2cog with a new column, "Sequence", containing the sequences of the input proteins.
iquasere · Jun 3, 2020 · a17bbc8 · a17bbc8
1 parent 4c80f91
commit a17bbc8
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -21,9 +21,10 @@ sudo reCOGnizer/install.bash
 
 reCOGnizer needs an input file, but that is all it needs!
 ```
-usage: python recognizer.py [-h] [-t THREADS] [-o OUTPUT] [-rd RESOURCES_DIRECTORY]
+usage: recognizer.py [-h] [-t THREADS] [-o OUTPUT] [-rd RESOURCES_DIRECTORY]
                      [-db DATABASE] [--custom-database]
-                     [-seqs MAX_TARGET_SEQS] [--tsv] [-v] -f FILE
+                     [-seqs MAX_TARGET_SEQS] [--tsv] [--remove-spaces]
+                     [--output-sequences] [-v] -f FILE
 
 reCOGnizer - a tool for domain based annotation with the COG database
 
@@ -45,6 +46,12 @@ optional arguments:
                         Number of maximum identifications for each protein.
                         Default is 1.
   --tsv                 Tables will be produced in TSV format (and not EXCEL).
+  --remove-spaces       BLAST ignores sequences IDs after the first space.
+                        This option changes all spaces to underscores to keep
+                        the full IDs.
+  --output-sequences    Protein sequences from the FASTA input will be stored
+                        in their own column. This produces considerably larger
+                        files.
   -v, --version         show program's version number and exit
 
 required named arguments:
@@ -69,7 +76,7 @@ Krona plot with the quantification of COGs identified in the simulated dataset u
 
 reCOGnizer already has its own image! To use it, just pull the image and run it!
 ```
-docker pull iquasere/recognizer:1.2.2
+docker pull iquasere/recognizer:1.2.3
 docker run -it -v /absolute/path/to/fasta_folder:/input_folder -v /absolute/path/to/output_folder:/output_folder --rm iquasere/recognizer -f /input_folder/input_file.fasta -o /output_folder [other arguments]
 ```
 

diff --git a/recognizer.py b/recognizer.py
@@ -11,7 +11,7 @@
 from time import gmtime, strftime
 import argparse, sys, os, multiprocessing, glob, subprocess, pathlib
 
-__version__ = '1.2.2'
+__version__ = '1.2.3'
 
 def get_arguments():    
     parser = argparse.ArgumentParser(description="reCOGnizer - a tool for domain based annotation with the COG database",
@@ -35,6 +35,12 @@ def get_arguments():
                         Default is 1.""", default = "1")
     parser.add_argument("--tsv", action = "store_true", default = False,
                         help="Tables will be produced in TSV format (and not EXCEL).")
+    parser.add_argument("--remove-spaces", action = "store_true", default = False,
+                        help="""BLAST ignores sequences IDs after the first space.
+                        This option changes all spaces to underscores to keep the full IDs.""")
+    parser.add_argument("--output-sequences", action = "store_true", default = False,
+                        help="""Protein sequences from the FASTA input will be stored
+                        in their own column. This produces considerably larger files.""")
     parser.add_argument('-v', '--version', action='version', version='reCOGnizer ' + __version__)
 
     requiredNamed = parser.add_argument_group('required named arguments')
@@ -74,6 +80,31 @@ def run_command(bashCommand, print_command = True, stdout = None):
         print(bashCommand)
     subprocess.run(bashCommand.split(), stdout = stdout)
 
+'''
+Input:
+    bashCommand: str - command line handed to the shell as a single string
+    file, mode, sep: accepted but not used by this function
+    print_message: bool - if True, print the command before running it
+Output:
+    the command is executed through the shell; nothing is returned
+'''
+def run_pipe_command(bashCommand, file = '', mode = 'w', sep = ' ', print_message = True):
+    if print_message:
+        print(bashCommand)
+    shell_process = subprocess.Popen(bashCommand, shell = True, stdin = subprocess.PIPE)
+    # communicate() blocks until the shell command has terminated
+    shell_process.communicate()
+
+'''
+Input:
+    file: str - filename of FASTA file to parse
+Output:
+    return dict = {protein ID : protein sequence}
+    The protein ID is the whole header line without the leading '>'.
+    The sequence is the concatenation of every line up to the next header.
+    Lines found before the first header belong to no record and are ignored.
+'''
+def parse_fasta(file):
+    sequences = dict()
+    name = None
+    chunks = []
+    # the context manager guarantees the file handle is closed
+    with open(file) as handle:
+        for line in handle:
+            line = line.rstrip('\n')
+            if line.startswith('>'):
+                # a new header closes the record collected so far
+                if name is not None:
+                    sequences[name] = ''.join(chunks)
+                name = line[1:]
+                chunks = []
+            elif name is not None:
+                chunks.append(line)
+            # any other line precedes the first header and is skipped
+    if name is not None:
+        sequences[name] = ''.join(chunks)
+    return sequences
+
 '''
 Input:
     database_directory: str - directory to store files for database construction
@@ -325,8 +356,7 @@ def main():
             databases = args.database.split(',')
         for database in databases:
             if not validate_database(args.database):
-                print('Database not valid!')
-                exit()
+                exit('Database not valid!')
     else:
         # check if necessary files exist to build database
         download_resources(args.resources_directory) 
@@ -339,9 +369,14 @@ def main():
         # set database(s)
         databases = [pn.split('.pn')[0] for pn in glob.glob('{}/COG_{}_*.pn'.format(
                 args.resources_directory, args.threads))]
+
+    # Replacing spaces with underscores
+    timed_message('Replacing spaces for commas')
+    if args.remove_spaces:
+        run_pipe_command("sed -i -e 's/ /_/g' {}".format(args.file))
 
-    # run annotation with psi-blast and COG database
-    timed_message('Running annotation with PSI-BLAST and COG database as reference.')
+    # run annotation with rps-blast and COG database
+    timed_message('Running annotation with RPS-BLAST and COG database as reference.')
     run_rpsblast(args.file, args.output + '/cdd_aligned.blast', ' '.join(databases),
                  threads = args.threads, max_target_seqs = args.max_target_seqs)
 
@@ -357,7 +392,15 @@ def main():
     cogblast = cog2ec(cogblast, table = args.resources_directory + '/cog2ec.tsv',
                       resources_dir = args.resources_directory)
 
-    # write protein COG assignment
+    # adding protein sequences if requested
+    if args.output_sequences:
+        fasta = parse_fasta(args.file)
+        fasta = pd.DataFrame.from_dict(fasta, orient = 'index')
+        fasta.columns = ['Sequence']
+        cogblast = pd.merge(cogblast, fasta, left_on = 'qseqid', right_index = True, 
+                            how = 'right')
+
+    # write protein to COG assignment
     out_format = 'tsv' if args.tsv else 'excel'
     write_table(cogblast,
                 args.output + '/protein2cog',