Handles multi-categories

Sheet names improved
iquasere · Feb 24, 2021 · 4129953 · 4129953
1 parent ee1e711
commit 4129953
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -19,9 +19,23 @@ git clone https://github.com/iquasere/reCOGnizer.git
 sudo reCOGnizer/install.bash
 ```
 
+### With Bioconda
+
+reCOGnizer can also be installed with Conda! Many thanks to [Devon Ryan](https://github.com/dpryan79) for his precious help!
+
+Install:  ```conda install -c conda-forge -c bioconda recognizer```
+
+Test installation: ```recognizer.py -v```
+
+**Warning:** when reCOGnizer is installed with Conda, it is best to use the `-rd` parameter to store the databases and other resources in a directory of your choice. Doing so prevents reCOGnizer from putting these files in unexpected locations.
+
 ## Usage
 
-reCOGnizer needs an input file, but that is all it needs!
+The simplest way to run reCOGnizer is to just specify the fasta filename and an output directory - though even the output directory is not mandatory. It is recommended that a "resources" directory is specified to store the databases that reCOGnizer requires.
+```
+recognizer.py -f input_file.fasta -o recognizer_output -rd resources_directory
+```
+However, it offers several options for customizing its workflow:
 ```
 usage: recognizer.py [-h] [-t THREADS] [-o OUTPUT] [-rd RESOURCES_DIRECTORY]
                      [-db DATABASE] [--custom-database]
@@ -62,11 +76,6 @@ required named arguments:
   -f FILE, --file FILE  Fasta file with protein sequences for annotation
 ```
 
-The simplest way to run reCOGnizer is to just specify the fasta filename and an output directory - though even the output directory is not mandatory.
-```
-python recognizer.py -f input_file.fasta -o output_folder
-```
-
 ## Outputs
 
 reCOGnizer takes a FASTA file as input and produces two main outputs into the output directory:
@@ -76,19 +85,6 @@ reCOGnizer takes a FASTA file as input and produces two main outputs into the ou
 ![ScreenShot](krona_plot.png)
 Krona plot with the quantification of COGs identified in the simulated dataset used to test [MOSCA](https://github.com/iquasere/MOSCA) and reCOGnizer.
 
-## Docker
-
-reCOGnizer already has its own image! To use it, just pull the image and run it!
-```
-docker pull iquasere/recognizer:1.2.5
-docker run -it -v absolute/path/to/fasta_folder:/input_folder /absolute/path/to/output_folder:/output_folder --rm iquasere/recognizer -f /input_folder/input_file.fasta -o /output_folder [other arguments]
-```
+## Referencing reCOGnizer
 
-## Bioconda
-
-reCOGnizer can also be installed with Conda! Many thanks to [Devon Ryan](https://github.com/dpryan79) for his precious help!
-```
-conda install -c bioconda recognizer
-recognizer.py -v
-```
-**Warning:** running with Conda is better performed using the -rd parameter to store the databases and other resources in a directory of your choice. Doing so will prevent reCOGnizer from putting these files in unexpected locations.
+reCOGnizer has not been published yet. If you use it, please reference the Bioconda package: https://anaconda.org/bioconda/recognizer
diff --git a/meta.yaml b/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "recognizer" %}
-{% set version = "1.4.2" %}
+{% set version = "1.4.3" %}
 {% set sha256 = "aea5ac40e5b58490c913aa0a1f6d205d54e80e11aa692b40d1fc5c59a5dbe978" %}
 
 package:

diff --git a/recognizer.py b/recognizer.py
@@ -16,14 +16,13 @@
 import subprocess
 import sys
 import time
-import datetime
+import numpy as np
+import pandas as pd
 from multiprocessing import Pool
 from time import gmtime, strftime
-
-import pandas as pd
 from progressbar import ProgressBar
 
-__version__ = '1.4.2'
+__version__ = '1.4.3'
 
 
 def get_arguments():
@@ -166,21 +165,27 @@ def run_rpsblast(query, output, reference, threads='0', max_target_seqs='1'):
     subprocess.run(bashCommand)
 
 
-'''
-Handling COG
-'''
-
-
 def parse_cddid(cddid):
     """
     Load a tab-separated, header-less table and keep three of its columns.

     The 1st, 2nd and 4th columns are kept and renamed to 'CDD ID', 'DB ID'
     and 'DB description'. Every value of 'CDD ID' is turned into a string
     prefixed with 'CDD:'.

     :param cddid: path or file-like object handed to ``pandas.read_csv``
     :return: pandas.DataFrame with the three renamed columns
     """
     table = pd.read_csv(cddid, sep='\t', header=None)
     table = table[[0, 1, 3]]
     table.columns = ['CDD ID', 'DB ID', 'DB description']
     table['CDD ID'] = table['CDD ID'].map('CDD:{}'.format)
     return table
 
 
def expand_by_list_column(df, column='COG functional category (letter)'):
    """
    Expand a list-valued column so that every list element gets its own row.

    Each value of the other columns is repeated once per element of the list
    found in ``column`` on the same row. Rows whose list is empty therefore
    disappear from the result. Column order is preserved and the result gets
    a fresh default index. A frame with no rows yields an empty frame with
    the same columns.

    :param df: pandas.DataFrame in which every value of ``column`` is a
        sized iterable (e.g. a list)
    :param column: name of the column to expand
    :return: new pandas.DataFrame with one row per list element
    """
    # Explicit integer dtype: an empty Python list would become a float array,
    # which np.repeat refuses to use as repeat counts.
    lens = np.fromiter((len(item) for item in df[column]), dtype=int, count=len(df))
    expanded = {}
    for col in df.columns:
        if col == column:
            # Flatten in plain Python: np.concatenate raises when there are no rows.
            expanded[col] = [element for item in df[column] for element in item]
        else:
            expanded[col] = np.repeat(df[col].values, lens)
    return pd.DataFrame(expanded)
+
+
 def parse_whog(whog):
     """
     Read the first three columns of a tab-separated, header-less table and
     return one row per (cog, functional category element) pair.

     The columns are named 'cog', 'COG functional category (letter)' and
     'COG protein description'. Each category value is split into a list of
     its elements (presumably the single letters of a string such as "KT";
     confirm against the input file) and the table is then expanded so that
     every element sits on its own row.

     :param whog: path or file-like object handed to ``pandas.read_csv``
     :return: pandas.DataFrame as produced by ``expand_by_list_column``
     """
     category = 'COG functional category (letter)'
     table = pd.read_csv(whog, sep='\t', usecols=[0, 1, 2], header=None, encoding='ISO 8859-1')
     table.columns = ['cog', category, 'COG protein description']
     table[category] = [list(value) for value in table[category]]
     return expand_by_list_column(table, column=category)
 
 
@@ -191,6 +196,8 @@ def parse_kog(kog):
         lines.append([line[0][1], line[1], ' '.join(line[2:])])
     df = pd.DataFrame(lines)
     df.columns = ['KOG functional category (letter)', 'kog', 'KOG protein description']
+    df['KOG functional category (letter)'] = df['KOG functional category (letter)'].apply(lambda x: [i for i in x])
+    df = expand_by_list_column(df, column='KOG functional category (letter)')
     return df
 
 
@@ -294,12 +301,12 @@ def write_table(table, output, out_format='excel', header=True):
 
 
 def multi_sheet_excel(writer, data, sheet_name='Sheet', lines=1000000, index=False):
     """
     Write ``data`` through ``writer``, using several sheets when it has more
     than ``lines`` rows.

     Data that fits in one sheet (``len(data) <= lines``) is written to a
     sheet named ``sheet_name``. Otherwise consecutive chunks of at most
     ``lines`` rows are written to sheets named ``"<sheet_name> (<end>)"``,
     where ``<end>`` is the row position at which that chunk stops.

     :param writer: object forwarded to ``to_excel`` as its first argument
     :param data: table supporting ``len()``, ``iloc`` slicing and ``to_excel``
     :param sheet_name: base name of the sheet(s)
     :param lines: maximum number of rows written to a single sheet
     :param index: forwarded to ``to_excel``
     :return: the ``writer`` that was passed in
     :raises ValueError: if ``lines`` is not positive
     """
     if lines <= 0:
         # A zero step crashes range() and a negative one would silently write nothing.
         raise ValueError('lines must be a positive integer, got {}'.format(lines))
     total = len(data)
     if total <= lines:
         # Exactly ``lines`` rows still fit in one sheet, so keep the plain name.
         data.to_excel(writer, sheet_name='{}'.format(sheet_name), index=index)
         return writer
     for start in range(0, total, lines):
         end = min(start + lines, total)
         data.iloc[start:end].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, end), index=index)
     return writer