From 5515fdf913a6cb93d4fc6ccf9f1342b142395f36 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 11 Dec 2024 10:02:17 +0100
Subject: [PATCH] remove smiles prediction (#289)

we decided on the community meeting to remove SMILES prediction.
---
 src/nplinker/genomics/aa_pred.py              | 362 ------------------
 .../genomics/antismash/antismash_loader.py    |   8 -
 src/nplinker/genomics/bgc.py                  |  26 --
 tests/unit/genomics/test_aa_pred.py           |  37 --
 tests/unit/genomics/test_antismash_loader.py  |   1 -
 5 files changed, 434 deletions(-)
 delete mode 100644 src/nplinker/genomics/aa_pred.py
 delete mode 100644 tests/unit/genomics/test_aa_pred.py

diff --git a/src/nplinker/genomics/aa_pred.py b/src/nplinker/genomics/aa_pred.py
deleted file mode 100644
index eccd943b2..000000000
--- a/src/nplinker/genomics/aa_pred.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# CG: This module parse AntiSMASH files to get the predicted monomers of product.
-# And it only cares the case when the monomers are amino acids.
-# TODO This module is used in bgc.py, referenced in gcf.py and then in misc.py,
-# NPLinker core business does not use it at all.
-
-from __future__ import annotations
-import argparse
-from Bio import SeqIO
-
-
-AA_CODES = [
-    "ala",
-    "cys",
-    "asp",
-    "glu",
-    "phe",
-    "gly",
-    "his",
-    "ile",
-    "lys",
-    "leu",
-    "met",
-    "asn",
-    "pyl",
-    "pro",
-    "gln",
-    "arg",
-    "ser",
-    "thr",
-    "sec",
-    "val",
-    "trp",
-    "tyr",
-]
-
-# AA_CODES_ISOMER contains codes that should map to
-# an entry in the AA_CODES list - i.e. isomers, etc.
-AA_CODES_ISOMER = {}
-for c in AA_CODES:
-    AA_CODES_ISOMER["d-%s" % c] = c
-AA_CODES_ISOMER["b-ala"] = "ala"
-
-
-# CG: create one class for handling antismash output files and records
-## since antismash output formats change along with version
-## we should delegate all processes in other classes/functions to the methods of this class
-## so that one type of data, one class covering all methodes needed
-class AntiSmashFile:
-    def __init__(self, filename):
-        """Parse product and specificity (predicted monomer) from AntiSMASH file.
-
-        This class is a wrapper for AntiSmash4Record and AntiSmash5Record.
-
-        Deprecated:
-            AntiSMASH file always contains one sequence record, so users should
-            directly use AntiSmash5Record or AntiSmash4Record.
-
-        Args:
-            filename: AntiSMASH file path
-        """
-        self.raw_data = []
-        self.filename = filename
-
-        with open(filename) as f:
-            # antiSMASH 5 vs. 4 output is vastly different
-            # CG: TODO should use SeqIO.read since antismash file has only one sequence record
-            for record in SeqIO.parse(f, "gb"):
-                if "structured_comment" in record.annotations:
-                    as_version = record.annotations["structured_comment"]["antiSMASH-Data"][
-                        "Version"
-                    ]
-                    as_major_version = as_version.split(".")[0]
-                    if as_major_version == "5":
-                        self.raw_data.append(AntiSmash5Record(record))
-                    else:
-                        raise ValueError(f"Invalid antiSMASH version: {as_version}")
-                else:
-                    self.raw_data.append(AntiSmash4Record(record))
-
-    def get_spec(self):
-        """Get specificity (predicted monomer)."""
-        for r in self.raw_data:
-            yield from r.get_spec()
-
-    @property
-    def products(self):
-        """Get list of products."""
-        return [x.products for x in self.raw_data]
-
-    def build_prob(self):
-        [x.build_prob() for x in self.raw_data]
-
-    def get_prob(self, aa):
-        return [x.get_prob(aa) for x in self.raw_data]
-
-
-class AntiSmash5Record:
-    # CG: TODO input should also include antismash file
-    def __init__(self, seq_record):
-        """Parse product and specificity (predicted monomer of product) from
-            AntiSMASH v5 data.
-
-        Args:
-            seq_record: SeqRecord of AntiSMASH
-        """
-        self.raw_data = seq_record
-        self.description = self.raw_data.description
-
-        # parse "cand_cluster"
-        clusters = [x for x in self.raw_data.features if x.type == "cand_cluster"]
-
-        products = []
-        for prod_list in [x.qualifiers["product"] for x in clusters]:
-            for prod in prod_list:
-                products.append(prod)
-
-        smiles = []
-        for smiles_list in [x.qualifiers["SMILES"] for x in clusters if "SMILES" in x.qualifiers]:
-            for s in smiles_list:
-                smiles.append(s)
-
-        # parse "aSDomain"
-        asdomains = [x for x in self.raw_data.features if x.type == "aSDomain"]
-        asdomains_with_predictions = [x for x in asdomains if "specificity" in x.qualifiers]
-        asdomains_with_predictions_known = [
-            x for x in asdomains_with_predictions if "AMP-binding" in x.qualifiers["aSDomain"]
-        ]
-
-        self.products = products
-        self.smiles = smiles
-        self.asdomains_with_predictions_known = asdomains_with_predictions_known
-
-        # parse predicted monomer of product, i.e. "specificity"
-        self.specificities = []
-        specificities_listed = [
-            x.qualifiers["specificity"] for x in asdomains_with_predictions_known
-        ]
-        for specificities_single_list in specificities_listed:
-            for specificity in specificities_single_list:
-                if specificity.startswith("consensus: "):
-                    self.specificities.append(specificity.split()[1])
-
-    def get_prob(self, aa):
-        """Get probability of predicted amino acid.
-
-        Args:
-            aa: amino acid
-
-        Returns:
-            prediction probability
-        """
-        if aa in self.specificities:
-            return 1.0
-        else:
-            return 0.0
-
-    def get_spec(self):
-        """Get predicted specificities (animo acids)."""
-        for aa in set(self.specificities):
-            if aa in AA_CODES:
-                yield aa
-            elif aa in AA_CODES_ISOMER:
-                yield AA_CODES_ISOMER[aa]
-
-
-class AntiSmash4Record:
-    # CG: this is for antiSMASH version 4.0 results, it can be removed.
-
-    def __init__(self, seq_record):
-        self.raw_data = seq_record
-        self.description = self.raw_data.description
-
-        clusters = [x for x in self.raw_data.features if x.type == "cluster"]
-
-        products = [x.qualifiers["product"] for x in clusters]
-        smiles = [
-            [y for y in x.qualifiers["note"] if y.startswith("Monomers prediction")]
-            for x in clusters
-        ]
-
-        asdomains = [x for x in self.raw_data.features if x.type == "aSDomain"]
-        asdomains_with_predictions = [x for x in asdomains if "specificity" in x.qualifiers]
-        asdomains_with_predictions_known = [
-            x for x in asdomains_with_predictions if "AMP-binding" in x.qualifiers["domain"]
-        ]
-
-        self.products = products
-        self.smiles = smiles
-        self.asdomains_with_predictions_known = asdomains_with_predictions_known
-
-        self.build_prob()
-
-    def display(self):
-        print(self.products)
-        print(self.smiles)
-        print(self.asdomains_with_predictions_known)
-
-        print(self.asdomains_with_predictions_known[0].qualifiers["specificity"])
-
-    def get_probable_aa(self):
-        aalist = []
-        for domain in self.specificities:
-            for vote in domain:
-                aalist.extend(vote)
-        aaset = set(aalist)
-        if "none" in aaset:
-            aaset.remove("none")
-        return aaset
-
-    def get_spec(self):
-        for domain in self.asdomains_with_predictions_known:
-            yield process_specificity(domain.qualifiers["specificity"])
-            # for x in domain.qualifiers['specificity']:
-            #     if x.startswith('PID to NN:') or x.startswith('SNN score:'):
-            #         continue
-            #     else:
-            #         yield x
-
-            # x_split = x.split(': ')
-            # if len(x_split) > 1:
-            #    yield x_split[1]
-            # else:
-            #    x_split = x.split(' ')
-            #    yield x_split[1]
-
-    def build_prob(self):
-        self.specificities = [
-            process_specificity(x.qualifiers["specificity"])
-            for x in self.asdomains_with_predictions_known
-        ]
-
-    def get_prob(self, aa):
-        # 'none' votes do not affect predictions, because then
-        # all AAs are equally likely, so we can't use them to exclude.
-        prob = 1.0
-        for domain in self.specificities:
-            domain_vote_sum = 0
-            for votes in domain:
-                for vote in set(votes):
-                    if vote == "none":
-                        p_vote = 0
-                    #     p_vote = 1.0 / len(AA_CODES)
-                    elif vote == aa:
-                        p_vote = 1.0 / len(set(votes))
-                    else:
-                        p_vote = 0
-                    domain_vote_sum += p_vote
-            domain_prob = domain_vote_sum / len(domain)
-            prob *= 1 - domain_prob
-
-        return 1 - prob
-
-
-def stachelhaus(prediction_string):
-    prediction_string = prediction_string.split(":")[-1].strip()
-    predictions = predict_aa_string(prediction_string)
-    return predictions
-
-
-def sandpuma(prediction_string):
-    prediction_string = prediction_string.split(":")[-1].strip()
-    predictions = predict_aa_string(prediction_string)
-    return predictions
-
-
-def predicat(prediction_string):
-    prediction_string = prediction_string.split("-")[-1]
-    predictions = predict_aa_string(prediction_string)
-    return predictions
-
-
-def phmm(prediction_string):
-    prediction_string = prediction_string.split(":")[-1].strip()
-    predictions = predict_aa_string(prediction_string)
-    return predictions
-
-
-def nrpspredictor3(prediction_string):
-    prediction_string = prediction_string.split(":")[-1].strip()
-    predictions = predict_aa_string(prediction_string)
-    return predictions
-
-
-def predict_aa_string(prediction_string):
-    parsed_predictions = []
-    for prediction in prediction_string.split("|"):
-        if prediction in AA_CODES:
-            parsed_predictions.append(prediction)
-        elif prediction in AA_CODES_ISOMER:
-            parsed_predictions.append(AA_CODES_ISOMER[prediction])
-        else:
-            parsed_predictions.append("none")
-    return parsed_predictions
-
-
-def process_specificity(prediction_list):
-    votes = []
-    for prediction_string in prediction_list:
-        if prediction_string.startswith("PID to NN:") or prediction_string.startswith("SNN score:"):
-            continue
-        if prediction_string.startswith("SANDPUMA"):
-            prediction = sandpuma(prediction_string)
-        else:
-            continue
-        # Only process SANDPUMA votes for now (to not have to collate the votes)
-        # if prediction_string.startswith('Stachelhaus code:'):
-        #     prediction = stachelhaus(prediction_string)
-        # elif prediction_string.startswith('NRPSpredictor3 SVM'):
-        #     prediction = nrpspredictor3(prediction_string)
-        # elif prediction_string.startswith('pHMM'):
-        #     prediction = phmm(prediction_string)
-        # elif prediction_string.startswith('PrediCAT'):
-        #     prediction = predicat(prediction_string)
-        # elif prediction_string.startswith('SANDPUMA'):
-        #     prediction = sandpuma(prediction_string)
-        # else:
-        #     prediction = []
-        votes.append(prediction)
-    return votes
-
-
-def to_set(nested):
-    for item in nested:
-        if type(item) is str:
-            yield item
-        else:
-            yield from to_set(item)
-
-
-def predict_aa(filename):
-    asr = AntiSmashFile(filename)
-    for item in asr.raw_data:
-        # for aa in item.get_probable_aa():
-        for aa in AA_CODES:
-            res = item.get_prob(aa)
-            yield aa, res
-
-
-def read_aa_losses(filename):
-    """Read AA losses from data file. (assume fixed structure...)."""
-    aa_losses = {}
-    with open(filename) as f:
-        reader = csv.reader(f, delimiter=",")
-        next(reader)  # skip headers
-        for line in reader:
-            if len(line) == 0:
-                continue
-            aa_id = line[1]
-            aa_mono = float(line[4])
-            aa_avg = float(line[5])
-            aa_losses[aa_id.lower()] = (aa_mono, aa_avg)
-
-    return aa_losses
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser("Predict AA specificity for gbk file")
-    parser.add_argument("file", help=".gbk file with specificity predictions")
-    args = parser.parse_args()
-
-    for aa, res in predict_aa(args.file):
-        print(f"{aa},{res}")
diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py
index 8c00c0658..908824a8b 100644
--- a/src/nplinker/genomics/antismash/antismash_loader.py
+++ b/src/nplinker/genomics/antismash/antismash_loader.py
@@ -142,7 +142,6 @@ def parse_bgc_genbank(file: str | PathLike) -> BGC:
     bgc.antismash_id = antismash_id
     bgc.antismash_file = str(file)
     bgc.antismash_region = features.get("region_number")
-    bgc.smiles = features.get("smiles")
     bgc.strain = Strain(fname)
     return bgc
 
@@ -154,11 +153,4 @@ def _parse_antismash_genbank(record: SeqRecord.SeqRecord) -> dict:
             # biopython assumes region numer is a list, but it's actually an int
             features["region_number"] = feature.qualifiers.get("region_number")[0]
             features["product"] = feature.qualifiers.get("product")
-        if feature.type == "cand_cluster":
-            smiles = feature.qualifiers.get("SMILES")
-            # space is not allowed in SMILES spec
-            # biopython generates space when reading multi-line SMILES from .gbk
-            if smiles is not None:
-                smiles = tuple(i.replace(" ", "") for i in smiles)
-            features["smiles"] = smiles
     return features
diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 8decbb815..e2849a978 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -2,9 +2,7 @@
 import logging
 from typing import TYPE_CHECKING
 from typing import Any
-from deprecated import deprecated
 from nplinker.strain import Strain
-from .aa_pred import predict_aa
 
 
 if TYPE_CHECKING:
@@ -45,9 +43,6 @@ class BGC:
             see [the paper](https://doi.org/10.1186/s40793-018-0318-y).
         description: Brief description of the BGC.
             Defaults to None.
-        smiles: A tuple of SMILES formulas of the BGC's
-            products.
-            Defaults to None.
         antismash_file: The path to the antiSMASH GenBank file.
             Defaults to None.
         antismash_id: Identifier of the antiSMASH BGC, referring
@@ -82,7 +77,6 @@ def __init__(self, id: str, /, *product_prediction: str):
 
         self.mibig_bgc_class: tuple[str] | None = None
         self.description: str | None = None
-        self.smiles: tuple[str] | None = None
 
         # antismash related attributes
         self.antismash_file: str | None = None
@@ -240,23 +234,3 @@ def _to_string(value: Any) -> str:
         else:
             formatted_value = str(value)
         return formatted_value
-
-    # CG: why not providing whole product but only amino acid as product monomer?
-    # this property is not used in NPLinker core business.
-    @property
-    @deprecated(version="2.0.0", reason="This method will be removed soon")
-    def aa_predictions(self) -> list:
-        """Amino acids as predicted monomers of product.
-
-        Returns:
-            list of dicts with key as amino acid and value as prediction
-            probability.
-        """
-        # Load aa predictions and cache them
-        self._aa_predictions = None
-        if self._aa_predictions is None:
-            self._aa_predictions = {}
-            if self.antismash_file is not None:
-                for p in predict_aa(self.antismash_file):
-                    self._aa_predictions[p[0]] = p[1]
-        return [self._aa_predictions]
diff --git a/tests/unit/genomics/test_aa_pred.py b/tests/unit/genomics/test_aa_pred.py
deleted file mode 100644
index e4b1f11fb..000000000
--- a/tests/unit/genomics/test_aa_pred.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pytest
-from Bio import SeqIO
-from nplinker.genomics.aa_pred import AntiSmash5Record
-from nplinker.genomics.aa_pred import predict_aa
-from .. import DATA_DIR
-
-
-ANTISMASH_FILE = DATA_DIR / "antismash_v5_GCF_000016425.1_NC_009380.1.region017.gbk"
-
-
-def test_predict_aa():
-    pred = list(predict_aa(ANTISMASH_FILE))
-    print(pred)
-    assert len(pred) == 22
-    assert ("ala", 0.0) in pred
-    assert ("gly", 1.0) in pred
-    assert ("val", 1.0) in pred
-
-
-# Test class AntiSmash5Record
-@pytest.fixture()
-def antismash5_record():
-    record = AntiSmash5Record(SeqIO.read(ANTISMASH_FILE, "genbank"))
-    yield record
-
-
-def test_get_prob(antismash5_record):
-    assert antismash5_record.get_prob("ala") == 0.0
-    assert antismash5_record.get_prob("gly") == 1.0
-    assert antismash5_record.get_prob("val") == 1.0
-
-
-def test_get_spec(antismash5_record):
-    aa = list(antismash5_record.get_spec())
-    assert len(aa) == 2
-    assert "gly" in aa
-    assert "val" in aa
diff --git a/tests/unit/genomics/test_antismash_loader.py b/tests/unit/genomics/test_antismash_loader.py
index f37c93dd3..238c945c6 100644
--- a/tests/unit/genomics/test_antismash_loader.py
+++ b/tests/unit/genomics/test_antismash_loader.py
@@ -68,7 +68,6 @@ def test_parse_bgc_genbank():
     assert bgc.antismash_id == "NZ_AZWB01000005"
     assert bgc.antismash_file == gbk_file
     assert bgc.antismash_region == "1"
-    assert bgc.smiles == ("NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O",)
 
 
 def test_parse_bgc_genbank_error():