Move to 0.5.5.

Fix issue in eggnog and add a check to downloaded proteomes.
AuReMe · Nov 9, 2024 · 4276046 · 4276046
1 parent 0748933
commit 4276046
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+# EsMeCaTa v0.5.5 (2024-11-09)
+
+## Add
+
+* Check that each downloaded proteome gzip file is valid (not corrupted or truncated) and try to redownload it otherwise.
+
+## Fix
+
+* Issue with protein IDs from UniParc during annotation (incorrect split on '|').
+
 # EsMeCaTa v0.5.4 (2024-11-06)
 
 ## Fix

diff --git a/esmecata/__init__.py b/esmecata/__init__.py
@@ -13,4 +13,4 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>
 
-__version__ = '0.5.4'
+__version__ = '0.5.5'
diff --git a/esmecata/core/eggnog.py b/esmecata/core/eggnog.py
@@ -209,7 +209,10 @@ def write_pathologic(base_filename, annotated_proteins, pathologic_output_file,
         element_file.write(';; ' + base_filename + '\n')
         element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n')
         for protein_annots in annotated_proteins:
-            protein = protein_annots[0].split('|')[1]
+            if '|' in protein_annots[0]:
+                protein = protein_annots[0].split('|')[1]
+            else:
+                protein = protein_annots[0]
 
             protein_annot = protein_annots[1]
             element_file.write('ID\t' + protein + '\n')
@@ -246,7 +249,10 @@ def write_annotation_reference(protein_annotations, reference_proteins, annotati
         csvwriter = csv.writer(output_tsv, delimiter='\t')
         csvwriter.writerow(['protein_cluster', 'cluster_members', 'gene_name', 'GO', 'EC', 'KEGG_reaction'])
         for protein_annot_tuples in protein_annotations:
-            protein = protein_annot_tuples[0].split('|')[1]
+            if '|' in protein_annot_tuples[0]:
+                protein = protein_annot_tuples[0].split('|')[1]
+            else:
+                protein = protein_annot_tuples[0]
             protein_annot = protein_annot_tuples[1]
             gene_name = protein_annot['Preferred_name']
             cluster_members = ','.join(reference_proteins[protein])

diff --git a/esmecata/core/proteomes.py b/esmecata/core/proteomes.py
@@ -1549,6 +1549,19 @@ def retrieve_proteomes(input_file, output_folder, busco_percentage_keep=80,
             if os.path.getsize(proteome_file_path) == 0:
                 logger.info('|EsMeCaTa|proteomes| Proteome file %s is still completly empty (0 octet), remove it to avoid issue with mmseqs2.', proteome)
                 os.remove(proteome_file_path)
+        # Try to open it to check the format.
+        chunksize = 1000000
+        with gzip.open(proteome_file_path, 'rb') as f:
+            try:
+                while f.read(chunksize) != b'':
+                    pass
+            except gzip.BadGzipFile:
+                logger.info('|EsMeCaTa|proteomes| Bad format for proteome file %s, try to redownload it.', proteome)
+                download_proteome_file(proteome, proteome_file_path, empty_proteomes, option_bioservices, session, uniprot_sparql_endpoint)
+            except EOFError:
+                logger.info('|EsMeCaTa|proteomes| Bad format for proteome file %s, try to redownload it.', proteome)
+                download_proteome_file(proteome, proteome_file_path, empty_proteomes, option_bioservices, session, uniprot_sparql_endpoint)
+
 
 
     # Download Uniprot metadata and create a json file containing them.