From 5515fdf913a6cb93d4fc6ccf9f1342b142395f36 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 11 Dec 2024 10:02:17 +0100 Subject: [PATCH] remove smiles prediction (#289) we decided on the community meeting to remove SMILES prediction. --- src/nplinker/genomics/aa_pred.py | 362 ------------------ .../genomics/antismash/antismash_loader.py | 8 - src/nplinker/genomics/bgc.py | 26 -- tests/unit/genomics/test_aa_pred.py | 37 -- tests/unit/genomics/test_antismash_loader.py | 1 - 5 files changed, 434 deletions(-) delete mode 100644 src/nplinker/genomics/aa_pred.py delete mode 100644 tests/unit/genomics/test_aa_pred.py diff --git a/src/nplinker/genomics/aa_pred.py b/src/nplinker/genomics/aa_pred.py deleted file mode 100644 index eccd943b2..000000000 --- a/src/nplinker/genomics/aa_pred.py +++ /dev/null @@ -1,362 +0,0 @@ -# CG: This module parse AntiSMASH files to get the predicted monomers of product. -# And it only cares the case when the monomers are amino acids. -# TODO This module is used in bgc.py, referenced in gcf.py and then in misc.py, -# NPLinker core business does not use it at all. - -from __future__ import annotations -import argparse -from Bio import SeqIO - - -AA_CODES = [ - "ala", - "cys", - "asp", - "glu", - "phe", - "gly", - "his", - "ile", - "lys", - "leu", - "met", - "asn", - "pyl", - "pro", - "gln", - "arg", - "ser", - "thr", - "sec", - "val", - "trp", - "tyr", -] - -# AA_CODES_ISOMER contains codes that should map to -# an entry in the AA_CODES list - i.e. isomers, etc. -AA_CODES_ISOMER = {} -for c in AA_CODES: - AA_CODES_ISOMER["d-%s" % c] = c -AA_CODES_ISOMER["b-ala"] = "ala" - - -# CG: create one class for handling antismash output files and records -## since antismash output formats change along with version -## we should delegate all processes in other classes/functions to the methods of this class -## so that one type of data, one class covering all methodes needed -class AntiSmashFile: - def __init__(self, filename): - """Parse product and specificity (predicted monomer) from AntiSMASH file. - - This class is a wrapper for AntiSmash4Record and AntiSmash5Record. - - Deprecated: - AntiSMASH file always contains one sequence record, so users should - directly use AntiSmash5Record or AntiSmash4Record. - - Args: - filename: AntiSMASH file path - """ - self.raw_data = [] - self.filename = filename - - with open(filename) as f: - # antiSMASH 5 vs. 4 output is vastly different - # CG: TODO should use SeqIO.read since antismash file has only one sequence record - for record in SeqIO.parse(f, "gb"): - if "structured_comment" in record.annotations: - as_version = record.annotations["structured_comment"]["antiSMASH-Data"][ - "Version" - ] - as_major_version = as_version.split(".")[0] - if as_major_version == "5": - self.raw_data.append(AntiSmash5Record(record)) - else: - raise ValueError(f"Invalid antiSMASH version: {as_version}") - else: - self.raw_data.append(AntiSmash4Record(record)) - - def get_spec(self): - """Get specificity (predicted monomer).""" - for r in self.raw_data: - yield from r.get_spec() - - @property - def products(self): - """Get list of products.""" - return [x.products for x in self.raw_data] - - def build_prob(self): - [x.build_prob() for x in self.raw_data] - - def get_prob(self, aa): - return [x.get_prob(aa) for x in self.raw_data] - - -class AntiSmash5Record: - # CG: TODO input should also include antismash file - def __init__(self, seq_record): - """Parse product and specificity (predicted monomer of product) from - AntiSMASH v5 data. - - Args: - seq_record: SeqRecord of AntiSMASH - """ - self.raw_data = seq_record - self.description = self.raw_data.description - - # parse "cand_cluster" - clusters = [x for x in self.raw_data.features if x.type == "cand_cluster"] - - products = [] - for prod_list in [x.qualifiers["product"] for x in clusters]: - for prod in prod_list: - products.append(prod) - - smiles = [] - for smiles_list in [x.qualifiers["SMILES"] for x in clusters if "SMILES" in x.qualifiers]: - for s in smiles_list: - smiles.append(s) - - # parse "aSDomain" - asdomains = [x for x in self.raw_data.features if x.type == "aSDomain"] - asdomains_with_predictions = [x for x in asdomains if "specificity" in x.qualifiers] - asdomains_with_predictions_known = [ - x for x in asdomains_with_predictions if "AMP-binding" in x.qualifiers["aSDomain"] - ] - - self.products = products - self.smiles = smiles - self.asdomains_with_predictions_known = asdomains_with_predictions_known - - # parse predicted monomer of product, i.e. "specificity" - self.specificities = [] - specificities_listed = [ - x.qualifiers["specificity"] for x in asdomains_with_predictions_known - ] - for specificities_single_list in specificities_listed: - for specificity in specificities_single_list: - if specificity.startswith("consensus: "): - self.specificities.append(specificity.split()[1]) - - def get_prob(self, aa): - """Get probability of predicted amino acid. - - Args: - aa: amino acid - - Returns: - prediction probability - """ - if aa in self.specificities: - return 1.0 - else: - return 0.0 - - def get_spec(self): - """Get predicted specificities (animo acids).""" - for aa in set(self.specificities): - if aa in AA_CODES: - yield aa - elif aa in AA_CODES_ISOMER: - yield AA_CODES_ISOMER[aa] - - -class AntiSmash4Record: - # CG: this is for antiSMASH version 4.0 results, it can be removed. - - def __init__(self, seq_record): - self.raw_data = seq_record - self.description = self.raw_data.description - - clusters = [x for x in self.raw_data.features if x.type == "cluster"] - - products = [x.qualifiers["product"] for x in clusters] - smiles = [ - [y for y in x.qualifiers["note"] if y.startswith("Monomers prediction")] - for x in clusters - ] - - asdomains = [x for x in self.raw_data.features if x.type == "aSDomain"] - asdomains_with_predictions = [x for x in asdomains if "specificity" in x.qualifiers] - asdomains_with_predictions_known = [ - x for x in asdomains_with_predictions if "AMP-binding" in x.qualifiers["domain"] - ] - - self.products = products - self.smiles = smiles - self.asdomains_with_predictions_known = asdomains_with_predictions_known - - self.build_prob() - - def display(self): - print(self.products) - print(self.smiles) - print(self.asdomains_with_predictions_known) - - print(self.asdomains_with_predictions_known[0].qualifiers["specificity"]) - - def get_probable_aa(self): - aalist = [] - for domain in self.specificities: - for vote in domain: - aalist.extend(vote) - aaset = set(aalist) - if "none" in aaset: - aaset.remove("none") - return aaset - - def get_spec(self): - for domain in self.asdomains_with_predictions_known: - yield process_specificity(domain.qualifiers["specificity"]) - # for x in domain.qualifiers['specificity']: - # if x.startswith('PID to NN:') or x.startswith('SNN score:'): - # continue - # else: - # yield x - - # x_split = x.split(': ') - # if len(x_split) > 1: - # yield x_split[1] - # else: - # x_split = x.split(' ') - # yield x_split[1] - - def build_prob(self): - self.specificities = [ - process_specificity(x.qualifiers["specificity"]) - for x in self.asdomains_with_predictions_known - ] - - def get_prob(self, aa): - # 'none' votes do not affect predictions, because then - # all AAs are equally likely, so we can't use them to exclude. - prob = 1.0 - for domain in self.specificities: - domain_vote_sum = 0 - for votes in domain: - for vote in set(votes): - if vote == "none": - p_vote = 0 - # p_vote = 1.0 / len(AA_CODES) - elif vote == aa: - p_vote = 1.0 / len(set(votes)) - else: - p_vote = 0 - domain_vote_sum += p_vote - domain_prob = domain_vote_sum / len(domain) - prob *= 1 - domain_prob - - return 1 - prob - - -def stachelhaus(prediction_string): - prediction_string = prediction_string.split(":")[-1].strip() - predictions = predict_aa_string(prediction_string) - return predictions - - -def sandpuma(prediction_string): - prediction_string = prediction_string.split(":")[-1].strip() - predictions = predict_aa_string(prediction_string) - return predictions - - -def predicat(prediction_string): - prediction_string = prediction_string.split("-")[-1] - predictions = predict_aa_string(prediction_string) - return predictions - - -def phmm(prediction_string): - prediction_string = prediction_string.split(":")[-1].strip() - predictions = predict_aa_string(prediction_string) - return predictions - - -def nrpspredictor3(prediction_string): - prediction_string = prediction_string.split(":")[-1].strip() - predictions = predict_aa_string(prediction_string) - return predictions - - -def predict_aa_string(prediction_string): - parsed_predictions = [] - for prediction in prediction_string.split("|"): - if prediction in AA_CODES: - parsed_predictions.append(prediction) - elif prediction in AA_CODES_ISOMER: - parsed_predictions.append(AA_CODES_ISOMER[prediction]) - else: - parsed_predictions.append("none") - return parsed_predictions - - -def process_specificity(prediction_list): - votes = [] - for prediction_string in prediction_list: - if prediction_string.startswith("PID to NN:") or prediction_string.startswith("SNN score:"): - continue - if prediction_string.startswith("SANDPUMA"): - prediction = sandpuma(prediction_string) - else: - continue - # Only process SANDPUMA votes for now (to not have to collate the votes) - # if prediction_string.startswith('Stachelhaus code:'): - # prediction = stachelhaus(prediction_string) - # elif prediction_string.startswith('NRPSpredictor3 SVM'): - # prediction = nrpspredictor3(prediction_string) - # elif prediction_string.startswith('pHMM'): - # prediction = phmm(prediction_string) - # elif prediction_string.startswith('PrediCAT'): - # prediction = predicat(prediction_string) - # elif prediction_string.startswith('SANDPUMA'): - # prediction = sandpuma(prediction_string) - # else: - # prediction = [] - votes.append(prediction) - return votes - - -def to_set(nested): - for item in nested: - if type(item) is str: - yield item - else: - yield from to_set(item) - - -def predict_aa(filename): - asr = AntiSmashFile(filename) - for item in asr.raw_data: - # for aa in item.get_probable_aa(): - for aa in AA_CODES: - res = item.get_prob(aa) - yield aa, res - - -def read_aa_losses(filename): - """Read AA losses from data file. (assume fixed structure...).""" - aa_losses = {} - with open(filename) as f: - reader = csv.reader(f, delimiter=",") - next(reader) # skip headers - for line in reader: - if len(line) == 0: - continue - aa_id = line[1] - aa_mono = float(line[4]) - aa_avg = float(line[5]) - aa_losses[aa_id.lower()] = (aa_mono, aa_avg) - - return aa_losses - - -if __name__ == "__main__": - parser = argparse.ArgumentParser("Predict AA specificity for gbk file") - parser.add_argument("file", help=".gbk file with specificity predictions") - args = parser.parse_args() - - for aa, res in predict_aa(args.file): - print(f"{aa},{res}") diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index 8c00c0658..908824a8b 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -142,7 +142,6 @@ def parse_bgc_genbank(file: str | PathLike) -> BGC: bgc.antismash_id = antismash_id bgc.antismash_file = str(file) bgc.antismash_region = features.get("region_number") - bgc.smiles = features.get("smiles") bgc.strain = Strain(fname) return bgc @@ -154,11 +153,4 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict: # biopython assumes region numer is a list, but it's actually an int features["region_number"] = feature.qualifiers.get("region_number")[0] features["product"] = feature.qualifiers.get("product") - if feature.type == "cand_cluster": - smiles = feature.qualifiers.get("SMILES") - # space is not allowed in SMILES spec - # biopython generates space when reading multi-line SMILES from .gbk - if smiles is not None: - smiles = tuple(i.replace(" ", "") for i in smiles) - features["smiles"] = smiles return features diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 8decbb815..e2849a978 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -2,9 +2,7 @@ import logging from typing import TYPE_CHECKING from typing import Any -from deprecated import deprecated from nplinker.strain import Strain -from .aa_pred import predict_aa if TYPE_CHECKING: @@ -45,9 +43,6 @@ class BGC: see [the paper](https://doi.org/10.1186/s40793-018-0318-y). description: Brief description of the BGC. Defaults to None. - smiles: A tuple of SMILES formulas of the BGC's - products. - Defaults to None. antismash_file: The path to the antiSMASH GenBank file. Defaults to None. antismash_id: Identifier of the antiSMASH BGC, referring @@ -82,7 +77,6 @@ def __init__(self, id: str, /, *product_prediction: str): self.mibig_bgc_class: tuple[str] | None = None self.description: str | None = None - self.smiles: tuple[str] | None = None # antismash related attributes self.antismash_file: str | None = None @@ -240,23 +234,3 @@ def _to_string(value: Any) -> str: else: formatted_value = str(value) return formatted_value - - # CG: why not providing whole product but only amino acid as product monomer? - # this property is not used in NPLinker core business. - @property - @deprecated(version="2.0.0", reason="This method will be removed soon") - def aa_predictions(self) -> list: - """Amino acids as predicted monomers of product. - - Returns: - list of dicts with key as amino acid and value as prediction - probability. - """ - # Load aa predictions and cache them - self._aa_predictions = None - if self._aa_predictions is None: - self._aa_predictions = {} - if self.antismash_file is not None: - for p in predict_aa(self.antismash_file): - self._aa_predictions[p[0]] = p[1] - return [self._aa_predictions] diff --git a/tests/unit/genomics/test_aa_pred.py b/tests/unit/genomics/test_aa_pred.py deleted file mode 100644 index e4b1f11fb..000000000 --- a/tests/unit/genomics/test_aa_pred.py +++ /dev/null @@ -1,37 +0,0 @@ -import pytest -from Bio import SeqIO -from nplinker.genomics.aa_pred import AntiSmash5Record -from nplinker.genomics.aa_pred import predict_aa -from .. import DATA_DIR - - -ANTISMASH_FILE = DATA_DIR / "antismash_v5_GCF_000016425.1_NC_009380.1.region017.gbk" - - -def test_predict_aa(): - pred = list(predict_aa(ANTISMASH_FILE)) - print(pred) - assert len(pred) == 22 - assert ("ala", 0.0) in pred - assert ("gly", 1.0) in pred - assert ("val", 1.0) in pred - - -# Test class AntiSmash5Record -@pytest.fixture() -def antismash5_record(): - record = AntiSmash5Record(SeqIO.read(ANTISMASH_FILE, "genbank")) - yield record - - -def test_get_prob(antismash5_record): - assert antismash5_record.get_prob("ala") == 0.0 - assert antismash5_record.get_prob("gly") == 1.0 - assert antismash5_record.get_prob("val") == 1.0 - - -def test_get_spec(antismash5_record): - aa = list(antismash5_record.get_spec()) - assert len(aa) == 2 - assert "gly" in aa - assert "val" in aa diff --git a/tests/unit/genomics/test_antismash_loader.py b/tests/unit/genomics/test_antismash_loader.py index f37c93dd3..238c945c6 100644 --- a/tests/unit/genomics/test_antismash_loader.py +++ b/tests/unit/genomics/test_antismash_loader.py @@ -68,7 +68,6 @@ def test_parse_bgc_genbank(): assert bgc.antismash_id == "NZ_AZWB01000005" assert bgc.antismash_file == gbk_file assert bgc.antismash_region == "1" - assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",) def test_parse_bgc_genbank_error():