Skip to content

Commit

Permalink
Move to 0.5.5.
Browse files Browse the repository at this point in the history
Fix issue in eggnog and add a check to downloaded proteomes.
  • Loading branch information
ArnaudBelcour committed Nov 9, 2024
1 parent 0748933 commit 4276046
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 3 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

# EsMeCaTa v0.5.5 (2024-11-09)

## Add

* Check that downloaded proteome gzip files are valid (not corrupted or truncated), and try to download them again if they are not.

## Fix

* Issue with protein IDs from UniParc during annotation (incorrect split on '|').

# EsMeCaTa v0.5.4 (2024-11-06)

## Fix
Expand Down
2 changes: 1 addition & 1 deletion esmecata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

__version__ = '0.5.4'
__version__ = '0.5.5'
10 changes: 8 additions & 2 deletions esmecata/core/eggnog.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,10 @@ def write_pathologic(base_filename, annotated_proteins, pathologic_output_file,
element_file.write(';; ' + base_filename + '\n')
element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n')
for protein_annots in annotated_proteins:
protein = protein_annots[0].split('|')[1]
if '|' in protein_annots[0]:
protein = protein_annots[0].split('|')[1]
else:
protein = protein_annots[0]

protein_annot = protein_annots[1]
element_file.write('ID\t' + protein + '\n')
Expand Down Expand Up @@ -246,7 +249,10 @@ def write_annotation_reference(protein_annotations, reference_proteins, annotati
csvwriter = csv.writer(output_tsv, delimiter='\t')
csvwriter.writerow(['protein_cluster', 'cluster_members', 'gene_name', 'GO', 'EC', 'KEGG_reaction'])
for protein_annot_tuples in protein_annotations:
protein = protein_annot_tuples[0].split('|')[1]
if '|' in protein_annot_tuples[0]:
protein = protein_annot_tuples[0].split('|')[1]
else:
protein = protein_annot_tuples[0]
protein_annot = protein_annot_tuples[1]
gene_name = protein_annot['Preferred_name']
cluster_members = ','.join(reference_proteins[protein])
Expand Down
13 changes: 13 additions & 0 deletions esmecata/core/proteomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1549,6 +1549,19 @@ def retrieve_proteomes(input_file, output_folder, busco_percentage_keep=80,
if os.path.getsize(proteome_file_path) == 0:
logger.info('|EsMeCaTa|proteomes| Proteome file %s is still completly empty (0 octet), remove it to avoid issue with mmseqs2.', proteome)
os.remove(proteome_file_path)
# Read the whole gzip file in chunks to check that it is valid (not corrupted or truncated).
chunksize = 1000000
with gzip.open(proteome_file_path, 'rb') as f:
try:
while f.read(chunksize) != b'':
pass
except gzip.BadGzipFile:
logger.info('|EsMeCaTa|proteomes| Bad format for proteome file %s, try to redownload it.', proteome)
download_proteome_file(proteome, proteome_file_path, empty_proteomes, option_bioservices, session, uniprot_sparql_endpoint)
except EOFError:
logger.info('|EsMeCaTa|proteomes| Bad format for proteome file %s, try to redownload it.', proteome)
download_proteome_file(proteome, proteome_file_path, empty_proteomes, option_bioservices, session, uniprot_sparql_endpoint)



# Download Uniprot metadata and create a json file containing them.
Expand Down

0 comments on commit 4276046

Please sign in to comment.