Updated README

iquasere · May 5, 2020 · bb7604e · bb7604e
1 parent 325f291
commit bb7604e
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 10 deletions.
diff --git a/Databases/.gitignore b/Databases/.gitignore
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ A tool for domain based annotation with the COG database.
 
 ## Features
 
-reCOGnizer is a user-friendly implementation of protein functional identification using COG database. It builds a split version of the COG database with which RPS-BLAST can run in multithread, significantly increasing the speed of the most time intensive step of protein annotation. After COG assignment to proteins, reCOGnizer makes use of cdd2cog to convert CDD IDs to their respective COGs, before organizing those COGs into a relational table of protein to COG, with the inclusion of the three levels of functional classification from COG.
+reCOGnizer is a user-friendly implementation of protein functional identification using the COG database. It builds a split version of the COG database with which RPS-BLAST can run in multiple threads, significantly increasing the speed of the most time-intensive step of protein annotation. After COG assignment to proteins, reCOGnizer makes use of cdd2cog to convert CDD IDs to their respective COGs, before organizing those COGs into a relational table of protein to COG, with the inclusion of the three levels of functional classification from COG. reCOGnizer further converts assigned COG functions to EC numbers, providing more functional information.
 
 ## Installation
 
@@ -33,8 +33,7 @@ optional arguments:
   -db DATABASE, --database DATABASE
                         Basename of COG database for annotation. If multiple
                         databases, use comma separated list (db1,db2,db3)
-  --database-by-recognizer DATABASE_BY_RECOGNIZER
-                        If inputed database was produced by reCOGnizer
+  --custom-database     If database was NOT produced by reCOGnizer
   -seqs MAX_TARGET_SEQS, --max-target-seqs MAX_TARGET_SEQS
                         Number of maximum identifications for each protein.
                         Default is 1.
@@ -59,6 +58,6 @@ Krona plot with the quantification of COGs identified in the simulated dataset u
 
 reCOGnizer already has its own image! To use it, just pull the image and run it!
 ```
-docker pull iquasere/recognizer:latest
+docker pull iquasere/recognizer:1.2.0
 docker run -it -v /absolute/path/to/fasta_folder:/input_folder -v /absolute/path/to/output_folder:/output_folder --rm iquasere/recognizer -f /input_folder/input_file.fasta -o /output_folder [other arguments]
 ```
diff --git a/install.bash b/install.bash
@@ -9,4 +9,4 @@ conda install -y -c anaconda lxml
 conda install -y -c anaconda openpyxl
 cd reCOGnizer
 git clone https://github.com/marbl/Krona.git
-wget https://github.com/aleimba/bac-genomics-scripts/raw/master/cdd2cog/cdd2cog.pl
+wget https://github.com/aleimba/bac-genomics-scripts/raw/master/cdd2cog/cdd2cog.pl
diff --git a/recognizer.py b/recognizer.py
@@ -78,7 +78,7 @@ def run_command(bashCommand, print_command = True):
 '''
 def download_resources(database_directory):
     if not os.path.isfile('{}/COG0001.smp'.format(database_directory)):
-        print('{}/COG0001.smp not found!'.format(database_directory))
+        print('{}/COG0001.smp not found! Retrieving from cdd.tar.gz...'.format(database_directory))
         if not os.path.isfile('{}/cdd.tar.gz'.format(database_directory)):
             print('{}/cdd.tar.gz not found! Downloading...'.format(database_directory))
             run_command('wget ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdd.tar.gz -P {}'.format(database_directory))
@@ -88,14 +88,14 @@ def download_resources(database_directory):
         subprocess.Popen('tar -xzf cdd.tar.gz --wildcards "COG*.smp"', shell = True).communicate() # I couldn't, for the life of me, put the -C or --directory flags to work. No idea what happened, this just works
         os.chdir(wd)
     if not os.path.isfile('{}/cddid.tbl'.format(database_directory)):
-        print('{}/cddid.tbl not found!'.format(database_directory))
+        print('{}/cddid.tbl not found! Downloading...'.format(database_directory))
         run_command('wget ftp://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/cddid.tbl.gz -P {}'.format(database_directory))
         run_command('gunzip {}/cddid.tbl.gz'.format(database_directory))
     if not os.path.isfile('{}/fun.txt'.format(database_directory)):
-        print('{}/fun.txt not found!'.format(database_directory))
+        print('{}/fun.txt not found! Downloading...'.format(database_directory))
         run_command('wget ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG/fun.txt -P {}'.format(database_directory))
     if not os.path.isfile('{}/whog'.format(database_directory)):
-        print('{}/whog not found!'.format(database_directory))
+        print('{}/whog not found! Downloading...'.format(database_directory))
         run_command('wget ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG/whog -P {}'.format(database_directory))
 
 '''
@@ -276,7 +276,7 @@ def cog2ec(cogblast, table = sys.path[0] + '/Databases/cog2ec.tsv'):
         run_command('python {0}/cog2ec.py -c {0}/eggnog4.protein_id_conversion.tsv -m {0}/NOG.members.tsv > {1}'.format(
                 sys.path[0] + '/Databases', table))
     cog2ec = pd.read_csv(table, sep = '\t', names = ['cog', 'EC number'])
-    return pd.merge(cogblast, cog2ec, on = 'cog')
+    return pd.merge(cogblast, cog2ec, on = 'cog', how = 'left')
 
 
 '''
@@ -358,6 +358,9 @@ def main():
                 args.output + '/protein2cog', 
                 out_format = out_format)
 
+    timed_message('Protein ID to COG and EC number is available at {}.'.format(
+            args.output + '/protein2cog'))
+
     # quantify COG categories
     timed_message('Quantifying COG categories.')
     cog_quantification = cogblast.groupby(['COG general functional category',