From cc5ed45c83bdab54f2dc6765b5605a61c38127a1 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 29 May 2024 16:47:46 +0200
Subject: [PATCH 01/10] merge LinkFinder into MetcalfScoring class

The actual role of LinkFinder is to calculate Metcalf scores, so it makes more sense to merge its functionality into the MetcalfScoring class.
---
 src/nplinker/scoring/metcalf_scoring.py | 256 +++++++++++++++++++-----
 1 file changed, 210 insertions(+), 46 deletions(-)

diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py
index fa54129ca..a8b7d2b77 100644
--- a/src/nplinker/scoring/metcalf_scoring.py
+++ b/src/nplinker/scoring/metcalf_scoring.py
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING
 import numpy as np
 import pandas as pd
+from scipy.stats import hypergeom
 from nplinker.genomics import GCF
 from nplinker.metabolomics import MolecularFamily
 from nplinker.metabolomics import Spectrum
@@ -12,7 +13,6 @@
 from .abc import ScoringBase
 from .linking import LINK_TYPES
 from .linking import DataLinks
-from .linking import LinkFinder
 from .linking import isinstance_all
 from .object_link import ObjectLink
 
@@ -30,15 +30,23 @@ class MetcalfScoring(ScoringBase):
     Attributes:
         name: The name of this scoring method, set to a fixed value `metcalf`.
         DATALINKS: The DataLinks object to use for scoring.
-        LINKFINDER: The LinkFinder object to use for scoring.
         CACHE: The name of the cache file to use for storing the MetcalfScoring.
+
+        raw_score_spec_gcf: The raw Metcalf scores for spectrum-GCF links.
+        raw_score_mf_gcf: The raw Metcalf scores for molecular family-GCF links.
+        metcalf_mean: The mean value used for standardising Metcalf scores.
+        metcalf_std: The standard deviation value used for standardising Metcalf scores.
     """
 
     name = "metcalf"
     DATALINKS = None
-    LINKFINDER = None
     CACHE = "cache_metcalf_scoring.pckl"
 
+    raw_score_spec_gcf = pd.DataFrame()
+    raw_score_mf_gcf = pd.DataFrame()
+    metcalf_mean = None
+    metcalf_std = None
+
     def __init__(self, npl: NPLinker) -> None:
         """Create a MetcalfScoring object.
 
@@ -60,9 +68,9 @@ def __init__(self, npl: NPLinker) -> None:
     # TODO CG: refactor this method and extract code for cache file to a separate method
     @classmethod
     def setup(cls, npl: NPLinker):
-        """Setup the DataLinks and LinkFinder objects.
+        """Setup the DataLinks object.
 
-        This method is only called once to setup the DataLinks and LinkFinder objects.
+        This method is only called once to setup the DataLinks object.
         """
         logger.info(
             "MetcalfScoring.setup (bgcs={}, gcfs={}, spectra={}, molfams={}, strains={})".format(
@@ -81,13 +89,13 @@ def setup(cls, npl: NPLinker):
             len(npl.molfams),
             len(npl.strains),
         ]
-        datalinks, linkfinder = None, None
+        datalinks = None
         if os.path.exists(cache_file):
             logger.info("MetcalfScoring.setup loading cached data")
             cache_data = load_pickled_data(npl, cache_file)
             cache_ok = True
             if cache_data is not None:
-                (counts, datalinks, linkfinder) = cache_data
+                (counts, datalinks) = cache_data
                 # need to invalidate this if dataset appears to have changed
                 for i in range(len(counts)):
                     if counts[i] != dataset_counts[i]:
@@ -97,19 +105,65 @@ def setup(cls, npl: NPLinker):
 
             if cache_ok:
                 cls.DATALINKS = datalinks
-                cls.LINKFINDER = linkfinder
 
         if cls.DATALINKS is None:
             logger.info("MetcalfScoring.setup preprocessing dataset (this may take some time)")
             cls.DATALINKS = DataLinks(npl.gcfs, npl.spectra, npl.molfams, npl.strains)
-            cls.LINKFINDER = LinkFinder()
-            cls.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[0])
-            cls.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[1])
+            cls.calc_score(cls.DATALINKS, link_type=LINK_TYPES[0])
+            cls.calc_score(cls.DATALINKS, link_type=LINK_TYPES[1])
             logger.info("MetcalfScoring.setup caching results")
-            save_pickled_data((dataset_counts, cls.DATALINKS, cls.LINKFINDER), cache_file)
+            # TODO: save the score values 2024-05-29
+            save_pickled_data((dataset_counts, cls.DATALINKS), cache_file)
 
         logger.info("MetcalfScoring.setup completed")
 
+    @classmethod
+    def calc_score(
+        cls,
+        data_links: DataLinks,
+        link_type: str = "spec-gcf",
+        scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1),
+    ) -> None:
+        """Calculate Metcalf scores.
+
+        This method calculates the `raw_score_spec_gcf`, `raw_score_mf_gcf`, `metcalf_mean`, and
+        `metcalf_std` attributes.
+
+        Args:
+            data_links: The DataLinks object to use for scoring.
+            link_type: The type of link to score. Must be 'spec-gcf' or
+                'mf-gcf'. Defaults to 'spec-gcf'.
+            scoring_weights: The weights to
+                use for Metcalf scoring. The weights are applied to
+                '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'.
+                Defaults to (10, -10, 0, 1).
+
+        Raises:
+            ValueError: If an invalid link type is provided.
+        """
+        if link_type not in LINK_TYPES:
+            raise ValueError(f"Invalid link type: {link_type}. Must be one of {LINK_TYPES}")
+
+        if link_type == "spec-gcf":
+            cls.raw_score_spec_gcf = (
+                data_links.cooccurrence_spec_gcf * scoring_weights[0]
+                + data_links.cooccurrence_spec_notgcf * scoring_weights[1]
+                + data_links.cooccurrence_notspec_gcf * scoring_weights[2]
+                + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]
+            )
+
+        if link_type == "mf-gcf":
+            cls.raw_score_mf_gcf = (
+                data_links.cooccurrence_mf_gcf * scoring_weights[0]
+                + data_links.cooccurrence_mf_notgcf * scoring_weights[1]
+                + data_links.cooccurrence_notmf_gcf * scoring_weights[2]
+                + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]
+            )
+
+        if cls.metcalf_mean is None or cls.metcalf_std is None:
+            n_strains = data_links.occurrence_gcf_strain.shape[1]
+            cls.metcalf_mean, cls.metcalf_std = cls._calc_mean_std(n_strains, scoring_weights)
+
     # TODO CG: is it needed? remove it if not
     @property
     def datalinks(self) -> DataLinks | None:
@@ -135,8 +189,6 @@ def get_links(
         Raises:
             ValueError: If the input objects are empty.
             TypeError: If the input objects are not of the correct type.
-            ValueError: If LinkFinder instance has not been created
-                (MetcalfScoring object has not been setup).
         """
         if len(objects) == 0:
             raise ValueError("Empty input objects.")
@@ -153,23 +205,18 @@ def get_links(
                 f"Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects."
             )
 
-        if self.LINKFINDER is None:
-            raise ValueError(
-                ("LinkFinder object not found. Have you called `MetcalfScoring.setup(npl)`?")
-            )
-
         logger.info(f"MetcalfScoring: standardised = {self.standardised}")
         if not self.standardised:
-            scores_list = self.LINKFINDER.get_links(*objects, score_cutoff=self.cutoff)
+            scores_list = self._get_links(*objects, score_cutoff=self.cutoff)
         # TODO CG: verify the logics of standardised score and add unit tests
         else:
             # use negative infinity as the score cutoff to ensure we get all links
             # the self.cutoff will be applied later in the postprocessing step
-            scores_list = self.LINKFINDER.get_links(*objects, score_cutoff=np.NINF)
+            scores_list = self._get_links(*objects, score_cutoff=np.NINF)
             if obj_type == "gcf":
-                scores_list = self._calc_standardised_score_gen(self.LINKFINDER, scores_list)
+                scores_list = self._calc_standardised_score_gen(scores_list)
             else:
-                scores_list = self._calc_standardised_score_met(self.LINKFINDER, scores_list)
+                scores_list = self._calc_standardised_score_met(scores_list)
 
         link_scores: dict[
             GCF | Spectrum | MolecularFamily, dict[GCF | Spectrum | MolecularFamily, ObjectLink]
@@ -225,10 +272,141 @@ def get_links(
         logger.info("MetcalfScoring: completed")
         return link_collection
 
-    def _calc_standardised_score_met(
-        self, linkfinder: LinkFinder, results: list
+    # TODO CG: refactor this method
+    def format_data(self, data):
+        """Format the data for display."""
+        # for metcalf the data will just be a floating point value (i.e. the score)
+        return f"{data:.4f}"
+
+    # TODO CG: refactor this method
+    def sort(self, objects, reverse=True):
+        """Sort the objects based on the score."""
+        # sort based on score
+        return sorted(objects, key=lambda objlink: objlink[self], reverse=reverse)
+
+    @staticmethod
+    def _calc_mean_std(
+        n_strains: int, scoring_weights: tuple[int, int, int, int]
+    ) -> tuple[np.ndarray, np.ndarray]:
+        sz = (n_strains + 1, n_strains + 1)
+        mean = np.zeros(sz)
+        variance = np.zeros(sz)
+        for n in range(n_strains + 1):
+            for m in range(n_strains + 1):
+                max_overlap = min(n, m)
+                min_overlap = max(0, n + m - n_strains)
+                expected_value = 0
+                expected_sq = 0
+                for o in range(min_overlap, max_overlap + 1):
+                    o_prob = hypergeom.pmf(o, n_strains, n, m)
+                    # compute metcalf for n strains in type 1 and m in gcf
+                    score = o * scoring_weights[0]
+                    score += scoring_weights[1] * (n - o)
+                    score += scoring_weights[2] * (m - o)
+                    score += scoring_weights[3] * (n_strains - (n + m - o))
+                    expected_value += o_prob * score
+                    expected_sq += o_prob * (score**2)
+                mean[n, m] = expected_value
+                expected_sq = expected_sq - expected_value**2
+                if expected_sq < 1e-09:
+                    expected_sq = 1
+                variance[n, m] = expected_sq
+        return mean, np.sqrt(variance)
+
+    def _get_links(
+        self,
+        *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...],
+        score_cutoff: float = 0.5,
     ) -> list[pd.DataFrame]:
-        if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None:
+        """Get links and scores for given objects.
+
+        Args:
+            objects: A list of GCF, Spectrum or MolecularFamily objects
+                and all objects must be of the same type.
+            score_cutoff: Minimum score to consider a link (≥score_cutoff).
+                Default is 0.5.
+
+        Returns:
+            List of data frames containing the ids of the linked objects
+                and the score. The data frame has index names of
+                'source', 'target' and 'score':
+
+                - the 'source' row contains the ids of the input/source objects,
+                - the 'target' row contains the ids of the target objects,
+                - the 'score' row contains the scores.
+
+        Raises:
+            ValueError: If input objects are empty.
+            TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects.
+        """
+        if len(objects) == 0:
+            raise ValueError("Empty input objects.")
+
+        if isinstance_all(*objects, objtype=GCF):
+            obj_type = "gcf"
+        elif isinstance_all(*objects, objtype=Spectrum):
+            obj_type = "spec"
+        elif isinstance_all(*objects, objtype=MolecularFamily):
+            obj_type = "mf"
+        else:
+            types = [type(i) for i in objects]
+            raise TypeError(
+                f"Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects."
+            )
+
+        links = []
+        if obj_type == "gcf":
+            # TODO CG: the hint and mypy warnings will be gone after renaming all
+            # string ids to `.id`
+            obj_ids = [gcf.gcf_id for gcf in objects]
+            # spec-gcf
+            scores = self.raw_score_spec_gcf.loc[:, obj_ids]
+            df = self._get_scores_source_gcf(scores, score_cutoff)
+            df.name = LINK_TYPES[0]
+            links.append(df)
+            # mf-gcf
+            scores = self.raw_score_mf_gcf.loc[:, obj_ids]
+            df = self._get_scores_source_gcf(scores, score_cutoff)
+            df.name = LINK_TYPES[1]
+            links.append(df)
+
+        if obj_type == "spec":
+            obj_ids = [spec.spectrum_id for spec in objects]
+            scores = self.raw_score_spec_gcf.loc[obj_ids, :]
+            df = self._get_scores_source_met(scores, score_cutoff)
+            df.name = LINK_TYPES[0]
+            links.append(df)
+
+        if obj_type == "mf":
+            obj_ids = [mf.family_id for mf in objects]
+            scores = self.raw_score_mf_gcf.loc[obj_ids, :]
+            df = self._get_scores_source_met(scores, score_cutoff)
+            df.name = LINK_TYPES[1]
+            links.append(df)
+        return links
+
+    @staticmethod
+    def _get_scores_source_gcf(scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame:
+        row_indexes, col_indexes = np.where(scores >= score_cutoff)
+        src_obj_ids = scores.columns[col_indexes].to_list()
+        target_obj_ids = scores.index[row_indexes].to_list()
+        scores_candidate = scores.values[row_indexes, col_indexes].tolist()
+        return pd.DataFrame(
+            [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"]
+        )
+
+    @staticmethod
+    def _get_scores_source_met(scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame:
+        row_indexes, col_indexes = np.where(scores >= score_cutoff)
+        src_obj_ids = scores.index[row_indexes].to_list()
+        target_obj_ids = scores.columns[col_indexes].to_list()
+        scores_candidate = scores.values[row_indexes, col_indexes].tolist()
+        return pd.DataFrame(
+            [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"]
+        )
+
+    def _calc_standardised_score_met(self, results: list) -> list[pd.DataFrame]:
+        if self.metcalf_mean is None or self.metcalf_std is None:
             raise ValueError(
                 "Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?"
             )
@@ -244,8 +422,8 @@ def _calc_standardised_score_met(
 
             num_gcf_strains = len(gcf.strains)
             num_met_strains = len(met.strains)
-            mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains]
-            sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains]
+            mean = self.metcalf_mean[num_met_strains][num_gcf_strains]
+            sqrt = self.metcalf_std[num_met_strains][num_gcf_strains]
             z_score = (raw_score.at["score", col_index] - mean) / sqrt
             z_scores.append(z_score)
 
@@ -264,10 +442,8 @@ def _calc_standardised_score_met(
 
         return [scores_df]
 
-    def _calc_standardised_score_gen(
-        self, linkfinder: LinkFinder, results: list
-    ) -> list[pd.DataFrame]:
-        if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None:
+    def _calc_standardised_score_gen(self, results: list) -> list[pd.DataFrame]:
+        if self.metcalf_mean is None or self.metcalf_std is None:
             raise ValueError(
                 "Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?"
             )
@@ -284,8 +460,8 @@ def _calc_standardised_score_gen(
 
                 num_gcf_strains = len(gcf.strains)
                 num_met_strains = len(met.strains)
-                mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains]
-                sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains]
+                mean = self.metcalf_mean[num_met_strains][num_gcf_strains]
+                sqrt = self.metcalf_std[num_met_strains][num_gcf_strains]
                 z_score = (raw_score.at["score", col_index] - mean) / sqrt
                 z_scores.append(z_score)
 
@@ -304,15 +480,3 @@ def _calc_standardised_score_gen(
             postprocessed_scores.append(scores_df)
 
         return postprocessed_scores
-
-    # TODO CG: refactor this method
-    def format_data(self, data):
-        """Format the data for display."""
-        # for metcalf the data will just be a floating point value (i.e. the score)
-        return f"{data:.4f}"
-
-    # TODO CG: refactor this method
-    def sort(self, objects, reverse=True):
-        """Sort the objects based on the score."""
-        # sort based on score
-        return sorted(objects, key=lambda objlink: objlink[self], reverse=reverse)

From 8bb47613568826fc0059eeca7dab8a81d21f1b15 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 29 May 2024 16:48:07 +0200
Subject: [PATCH 02/10] remove LinkFinder

---
 src/nplinker/scoring/linking/__init__.py    |   3 +-
 src/nplinker/scoring/linking/link_finder.py | 201 --------------------
 2 files changed, 1 insertion(+), 203 deletions(-)
 delete mode 100644 src/nplinker/scoring/linking/link_finder.py

diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py
index bd391697f..49c863f29 100644
--- a/src/nplinker/scoring/linking/__init__.py
+++ b/src/nplinker/scoring/linking/__init__.py
@@ -1,8 +1,7 @@
 from .data_links import LINK_TYPES
 from .data_links import DataLinks
-from .link_finder import LinkFinder
 from .utils import calc_correlation_matrix
 from .utils import isinstance_all
 
 
-__all__ = ["DataLinks", "LINK_TYPES", "LinkFinder", "calc_correlation_matrix", "isinstance_all"]
+__all__ = ["DataLinks", "LINK_TYPES", "calc_correlation_matrix", "isinstance_all"]
diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py
deleted file mode 100644
index 0e25a0502..000000000
--- a/src/nplinker/scoring/linking/link_finder.py
+++ /dev/null
@@ -1,201 +0,0 @@
-from __future__ import annotations
-import logging
-from typing import TYPE_CHECKING
-import numpy as np
-import pandas as pd
-from scipy.stats import hypergeom
-from nplinker.genomics.gcf import GCF
-from nplinker.metabolomics import MolecularFamily
-from nplinker.metabolomics import Spectrum
-from . import LINK_TYPES
-from .utils import isinstance_all
-
-
-if TYPE_CHECKING:
-    from . import DataLinks
-
-logger = logging.getLogger(__file__)
-
-
-# TODO CG: this class could be merged to MetcalfScoring class?
-class LinkFinder:
-    def __init__(self) -> None:
-        """Initialise LinkFinder object.
-
-        Attributes:
-            raw_score_spec_gcf: The raw Metcalf scores for
-                spectrum-GCF links.
-            raw_score_mf_gcf: The raw Metcalf scores for
-                molecular family-GCF links.
-            metcalf_mean: The mean value used for
-                standardising Metcalf scores.
-            metcalf_std: The standard deviation value used
-                for standardising Metcalf scores.
-        """
-        self.raw_score_spec_gcf = pd.DataFrame()
-        self.raw_score_mf_gcf = pd.DataFrame()
-        self.metcalf_mean = None
-        self.metcalf_std = None
-
-    # TODO CG: calc_score method could be integrated to __init__?
-    def calc_score(
-        self,
-        data_links: DataLinks,
-        link_type: str = "spec-gcf",
-        scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1),
-    ) -> None:
-        """Calculate Metcalf scores.
-
-        Args:
-            data_links: The DataLinks object to use for scoring.
-            link_type: The type of link to score. Must be 'spec-gcf' or
-                'mf-gcf'. Defaults to 'spec-gcf'.
-            scoring_weights: The weights to
-                use for Metcalf scoring. The weights are applied to
-                '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'.
-                Defaults to (10, -10, 0, 1).
-
-        Raises:
-            ValueError: If an invalid link type is provided.
-        """
-        if link_type not in LINK_TYPES:
-            raise ValueError(f"Invalid link type: {link_type}. Must be one of {LINK_TYPES}")
-
-        if link_type == "spec-gcf":
-            self.raw_score_spec_gcf = (
-                data_links.cooccurrence_spec_gcf * scoring_weights[0]
-                + data_links.cooccurrence_spec_notgcf * scoring_weights[1]
-                + data_links.cooccurrence_notspec_gcf * scoring_weights[2]
-                + data_links.cooccurrence_notspec_notgcf * scoring_weights[3]
-            )
-        if link_type == "mf-gcf":
-            self.raw_score_mf_gcf = (
-                data_links.cooccurrence_mf_gcf * scoring_weights[0]
-                + data_links.cooccurrence_mf_notgcf * scoring_weights[1]
-                + data_links.cooccurrence_notmf_gcf * scoring_weights[2]
-                + data_links.cooccurrence_notmf_notgcf * scoring_weights[3]
-            )
-
-        # TODO CG: this part should be moved outside of this method
-        n_strains = data_links.occurrence_gcf_strain.shape[1]
-        if self.metcalf_mean is None or self.metcalf_std is None:
-            self.metcalf_mean, self.metcalf_std = self._calc_mean_std(n_strains, scoring_weights)
-
-    # TODO CG: read paper and check the logics of this method
-    def _calc_mean_std(
-        self, n_strains: int, scoring_weights: tuple[int, int, int, int]
-    ) -> tuple[np.ndarray, np.ndarray]:
-        sz = (n_strains + 1, n_strains + 1)
-        mean = np.zeros(sz)
-        variance = np.zeros(sz)
-        for n in range(n_strains + 1):
-            for m in range(n_strains + 1):
-                max_overlap = min(n, m)
-                min_overlap = max(0, n + m - n_strains)
-                expected_value = 0
-                expected_sq = 0
-                for o in range(min_overlap, max_overlap + 1):
-                    o_prob = hypergeom.pmf(o, n_strains, n, m)
-                    # compute metcalf for n strains in type 1 and m in gcf
-                    score = o * scoring_weights[0]
-                    score += scoring_weights[1] * (n - o)
-                    score += scoring_weights[2] * (m - o)
-                    score += scoring_weights[3] * (n_strains - (n + m - o))
-                    expected_value += o_prob * score
-                    expected_sq += o_prob * (score**2)
-                mean[n, m] = expected_value
-                expected_sq = expected_sq - expected_value**2
-                if expected_sq < 1e-09:
-                    expected_sq = 1
-                variance[n, m] = expected_sq
-        return mean, np.sqrt(variance)
-
-    def get_links(
-        self,
-        *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...],
-        score_cutoff: float = 0.5,
-    ) -> list[pd.DataFrame]:
-        """Get links and scores for given objects.
-
-        Args:
-            objects: A list of GCF, Spectrum or MolecularFamily objects
-                and all objects must be of the same type.
-            score_cutoff: Minimum score to consider a link (≥score_cutoff).
-                Default is 0.5.
-
-        Returns:
-            List of data frames containing the ids of the linked objects
-                and the score. The data frame has index names of
-                'source', 'target' and 'score':
-
-                - the 'source' row contains the ids of the input/source objects,
-                - the 'target' row contains the ids of the target objects,
-                - the 'score' row contains the scores.
-
-        Raises:
-            ValueError: If input objects are empty.
-            TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects.
-        """
-        if len(objects) == 0:
-            raise ValueError("Empty input objects.")
-
-        if isinstance_all(*objects, objtype=GCF):
-            obj_type = "gcf"
-        elif isinstance_all(*objects, objtype=Spectrum):
-            obj_type = "spec"
-        elif isinstance_all(*objects, objtype=MolecularFamily):
-            obj_type = "mf"
-        else:
-            types = [type(i) for i in objects]
-            raise TypeError(
-                f"Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects."
-            )
-
-        links = []
-        if obj_type == "gcf":
-            # TODO CG: the hint and mypy warnings will be gone after renaming all
-            # string ids to `.id`
-            obj_ids = [gcf.gcf_id for gcf in objects]
-            # spec-gcf
-            scores = self.raw_score_spec_gcf.loc[:, obj_ids]
-            df = self._get_scores_source_gcf(scores, score_cutoff)
-            df.name = LINK_TYPES[0]
-            links.append(df)
-            # mf-gcf
-            scores = self.raw_score_mf_gcf.loc[:, obj_ids]
-            df = self._get_scores_source_gcf(scores, score_cutoff)
-            df.name = LINK_TYPES[1]
-            links.append(df)
-
-        if obj_type == "spec":
-            obj_ids = [spec.spectrum_id for spec in objects]
-            scores = self.raw_score_spec_gcf.loc[obj_ids, :]
-            df = self._get_scores_source_met(scores, score_cutoff)
-            df.name = LINK_TYPES[0]
-            links.append(df)
-
-        if obj_type == "mf":
-            obj_ids = [mf.family_id for mf in objects]
-            scores = self.raw_score_mf_gcf.loc[obj_ids, :]
-            df = self._get_scores_source_met(scores, score_cutoff)
-            df.name = LINK_TYPES[1]
-            links.append(df)
-        return links
-
-    def _get_scores_source_gcf(self, scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame:
-        row_indexes, col_indexes = np.where(scores >= score_cutoff)
-        src_obj_ids = scores.columns[col_indexes].to_list()
-        target_obj_ids = scores.index[row_indexes].to_list()
-        scores_candidate = scores.values[row_indexes, col_indexes].tolist()
-        return pd.DataFrame(
-            [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"]
-        )
-
-    def _get_scores_source_met(self, scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame:
-        row_indexes, col_indexes = np.where(scores >= score_cutoff)
-        src_obj_ids = scores.index[row_indexes].to_list()
-        target_obj_ids = scores.columns[col_indexes].to_list()
-        scores_candidate = scores.values[row_indexes, col_indexes].tolist()
-        return pd.DataFrame(
-            [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"]
-        )

From 8d663e870127ed3b3d83bed29e5f9c98a22c1bc3 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 29 May 2024 16:48:57 +0200
Subject: [PATCH 03/10] move unit tests of LinkFinder

---
 tests/unit/scoring/conftest.py             |  10 -
 tests/unit/scoring/test_link_finder.py     | 249 ----------------
 tests/unit/scoring/test_metcalf_scoring.py | 322 +++++++++++++++++++--
 3 files changed, 299 insertions(+), 282 deletions(-)
 delete mode 100644 tests/unit/scoring/test_link_finder.py

diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py
index f1517cbd6..4d37bdfa4 100644
--- a/tests/unit/scoring/conftest.py
+++ b/tests/unit/scoring/conftest.py
@@ -5,7 +5,6 @@
 from nplinker.nplinker import NPLinker
 from nplinker.scoring import MetcalfScoring
 from nplinker.scoring.linking import DataLinks
-from nplinker.scoring.linking import LinkFinder
 from nplinker.strain import Strain
 from nplinker.strain import StrainCollection
 from .. import CONFIG_FILE_LOCAL_MODE
@@ -68,15 +67,6 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks:
     return DataLinks(gcfs, spectra, mfs, strains)
 
 
-@fixture(scope="module")
-def linkfinder(datalinks) -> LinkFinder:
-    """LinkFinder object. See `test_link_finder.py` for its values."""
-    linkfinder = LinkFinder()
-    linkfinder.calc_score(datalinks, link_type="spec-gcf")
-    linkfinder.calc_score(datalinks, link_type="mf-gcf")
-    return linkfinder
-
-
 @fixture(scope="module")
 def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker:
     """Constructed NPLinker object.
diff --git a/tests/unit/scoring/test_link_finder.py b/tests/unit/scoring/test_link_finder.py
deleted file mode 100644
index 16dc731b4..000000000
--- a/tests/unit/scoring/test_link_finder.py
+++ /dev/null
@@ -1,249 +0,0 @@
-import numpy as np
-import pandas as pd
-import pytest
-from pandas.testing import assert_frame_equal
-from pytest import fixture
-from nplinker.scoring.linking import LinkFinder
-
-
-@fixture(scope="module")
-def linkfinder() -> LinkFinder:
-    return LinkFinder()
-
-
-def test_init(linkfinder):
-    assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame())
-    assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame())
-    assert linkfinder.metcalf_mean is None
-    assert linkfinder.metcalf_std is None
-
-
-def test_calc_score_raw_score(linkfinder, datalinks):
-    """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`.
-
-    The expected values are calculated manually by using values from `test_init`
-    of `test_data_links.py` and the default scoring weights.
-    """
-    # link type = 'spec-gcf'
-    linkfinder.calc_score(datalinks, link_type="spec-gcf")
-    assert_frame_equal(
-        linkfinder.raw_score_spec_gcf,
-        pd.DataFrame(
-            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
-            index=["spectrum1", "spectrum2", "spectrum3"],
-            columns=["gcf1", "gcf2", "gcf3"],
-        ),
-    )
-    # link type = 'mf-gcf'
-    linkfinder.calc_score(datalinks, link_type="mf-gcf")
-    assert_frame_equal(
-        linkfinder.raw_score_mf_gcf,
-        pd.DataFrame(
-            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
-            index=["mf1", "mf2", "mf3"],
-            columns=["gcf1", "gcf2", "gcf3"],
-        ),
-    )
-
-
-def test_calc_score_mean_std(linkfinder, datalinks):
-    """Test `calc_score` method for `metcalf_mean` and `metcalf_std`."""
-    linkfinder.calc_score(datalinks, link_type="spec-gcf")
-    assert isinstance(linkfinder.metcalf_mean, np.ndarray)
-    assert isinstance(linkfinder.metcalf_std, np.ndarray)
-    assert linkfinder.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
-    assert linkfinder.metcalf_mean.shape == (4, 4)
-    # TODO CG: add tests for values after refactoring _calc_mean_std method
-    # assert linkfinder.metcalf_mean == expected_array
-
-
-def test_get_links_gcf(linkfinder, datalinks, gcfs):
-    """Test `get_links` method for input GCF objects."""
-    linkfinder.calc_score(datalinks, link_type="spec-gcf")
-    linkfinder.calc_score(datalinks, link_type="mf-gcf")
-    index_names = ["source", "target", "score"]
-
-    # cutoff = negative infinity (float)
-    links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF)
-    assert len(links) == 2
-    # expected values got from `test_calc_score_raw_score`
-    assert_frame_equal(
-        links[0],
-        pd.DataFrame(
-            [
-                ["gcf1", "gcf2", "gcf3"] * 3,
-                [
-                    *["spectrum1"] * 3,
-                    *["spectrum2"] * 3,
-                    *["spectrum3"] * 3,
-                ],
-                [12, -9, 11, -9, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-    assert_frame_equal(
-        links[1],
-        pd.DataFrame(
-            [
-                ["gcf1", "gcf2", "gcf3"] * 3,
-                [
-                    *["mf1"] * 3,
-                    *["mf2"] * 3,
-                    *["mf3"] * 3,
-                ],
-                [12, -9, 11, -9, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-
-    # cutoff = 0
-    links = linkfinder.get_links(*gcfs, score_cutoff=0)
-    assert len(links) == 2
-    assert_frame_equal(
-        links[0],
-        pd.DataFrame(
-            [
-                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
-                [
-                    *["spectrum1"] * 2,
-                    *["spectrum2"] * 2,
-                    *["spectrum3"] * 3,
-                ],
-                [12, 11, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-    assert_frame_equal(
-        links[1],
-        pd.DataFrame(
-            [
-                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
-                [
-                    *["mf1"] * 2,
-                    *["mf2"] * 2,
-                    *["mf3"] * 3,
-                ],
-                [12, 11, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-
-
-def test_get_links_spec(linkfinder, datalinks, spectra):
-    """Test `get_links` method for input Spectrum objects."""
-    linkfinder.calc_score(datalinks, link_type="spec-gcf")
-    linkfinder.calc_score(datalinks, link_type="mf-gcf")
-    index_names = ["source", "target", "score"]
-    # cutoff = negative infinity (float)
-    links = linkfinder.get_links(*spectra, score_cutoff=np.NINF)
-    assert len(links) == 1
-    assert_frame_equal(
-        links[0],
-        pd.DataFrame(
-            [
-                [
-                    *["spectrum1"] * 3,
-                    *["spectrum2"] * 3,
-                    *["spectrum3"] * 3,
-                ],
-                ["gcf1", "gcf2", "gcf3"] * 3,
-                [12, -9, 11, -9, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-    # cutoff = 0
-    links = linkfinder.get_links(*spectra, score_cutoff=0)
-    assert_frame_equal(
-        links[0],
-        pd.DataFrame(
-            [
-                [
-                    *["spectrum1"] * 2,
-                    *["spectrum2"] * 2,
-                    *["spectrum3"] * 3,
-                ],
-                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
-                [12, 11, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-
-
-def test_get_links_mf(linkfinder, datalinks, mfs):
-    """Test `get_links` method for input MolecularFamily objects."""
-    linkfinder.calc_score(datalinks, link_type="spec-gcf")
-    linkfinder.calc_score(datalinks, link_type="mf-gcf")
-    index_names = ["source", "target", "score"]
-    # cutoff = negative infinity (float)
-    links = linkfinder.get_links(*mfs, score_cutoff=np.NINF)
-    assert len(links) == 1
-    assert_frame_equal(
-        links[0],
-        pd.DataFrame(
-            [
-                [
-                    *["mf1"] * 3,
-                    *["mf2"] * 3,
-                    *["mf3"] * 3,
-                ],
-                ["gcf1", "gcf2", "gcf3"] * 3,
-                [12, -9, 11, -9, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-    # cutoff = 0
-    links = linkfinder.get_links(*mfs, score_cutoff=0)
-    assert_frame_equal(
-        links[0],
-        pd.DataFrame(
-            [
-                [
-                    *["mf1"] * 2,
-                    *["mf2"] * 2,
-                    *["mf3"] * 3,
-                ],
-                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
-                [12, 11, 12, 11, 1, 1, 21],
-            ],
-            index=index_names,
-        ),
-    )
-
-
-@pytest.mark.parametrize(
-    "objects, expected", [([], "Empty input objects"), ("", "Empty input objects")]
-)
-def test_get_links_invalid_value(linkfinder, objects, expected):
-    with pytest.raises(ValueError) as e:
-        linkfinder.get_links(*objects)
-    assert expected in str(e.value)
-
-
-@pytest.mark.parametrize(
-    "objects, expected",
-    [
-        ([1], "Invalid type {<class 'int'>}"),
-        ([1, 2], "Invalid type {<class 'int'>}"),
-        ("12", "Invalid type {<class 'str'>}"),
-    ],
-)
-def test_get_links_invalid_type(linkfinder, objects, expected):
-    with pytest.raises(TypeError) as e:
-        linkfinder.get_links(*objects)
-    assert expected in str(e.value)
-
-
-def test_get_links_invalid_mixed_types(linkfinder, spectra, mfs):
-    objects = (*spectra, *mfs)
-    with pytest.raises(TypeError) as e:
-        linkfinder.get_links(*objects)
-    assert "Invalid type" in str(e.value)
-    assert ".MolecularFamily" in str(e.value)
-    assert ".Spectrum" in str(e.value)
diff --git a/tests/unit/scoring/test_metcalf_scoring.py b/tests/unit/scoring/test_metcalf_scoring.py
index ec9449cb5..7c5e7f138 100644
--- a/tests/unit/scoring/test_metcalf_scoring.py
+++ b/tests/unit/scoring/test_metcalf_scoring.py
@@ -1,12 +1,11 @@
 import numpy as np
+import pandas as pd
 import pytest
-from numpy.testing import assert_array_equal
 from pandas.testing import assert_frame_equal
 from nplinker.scoring import LinkCollection
 from nplinker.scoring import MetcalfScoring
 from nplinker.scoring import ObjectLink
 from nplinker.scoring.linking import DataLinks
-from nplinker.scoring.linking import LinkFinder
 
 
 def test_init(npl):
@@ -16,39 +15,128 @@ def test_init(npl):
     assert mc.cutoff == 1.0
     assert mc.standardised is True
     assert mc.DATALINKS is None
-    assert mc.LINKFINDER is None
+    assert_frame_equal(mc.raw_score_spec_gcf, pd.DataFrame())
+    assert_frame_equal(mc.raw_score_mf_gcf, pd.DataFrame())
+    assert mc.metcalf_mean is None
+    assert mc.metcalf_std is None
 
 
-def test_setup(mc, datalinks, linkfinder):
+#
+# Test the `setup` method
+#
+
+
+def test_setup(mc, datalinks):
     """Test `setup` method when cache file does not exist."""
     assert isinstance(mc.DATALINKS, DataLinks)
-    assert isinstance(mc.LINKFINDER, LinkFinder)
 
     assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, datalinks.occurrence_gcf_strain)
     assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, datalinks.cooccurrence_spec_gcf)
 
-    assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, linkfinder.raw_score_spec_gcf)
-    assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, linkfinder.raw_score_mf_gcf)
-    assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean)
-    assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std)
-
-
-def test_setup_load_cache(mc, npl, datalinks, linkfinder, caplog):
+    assert_frame_equal(
+        mc.raw_score_spec_gcf,
+        pd.DataFrame(
+            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
+            index=["spectrum1", "spectrum2", "spectrum3"],
+            columns=["gcf1", "gcf2", "gcf3"],
+        ),
+    )
+    assert_frame_equal(
+        mc.raw_score_mf_gcf,
+        pd.DataFrame(
+            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
+            index=["mf1", "mf2", "mf3"],
+            columns=["gcf1", "gcf2", "gcf3"],
+        ),
+    )
+
+    assert isinstance(mc.metcalf_mean, np.ndarray)
+    assert isinstance(mc.metcalf_std, np.ndarray)
+    assert mc.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
+    assert mc.metcalf_mean.shape == (4, 4)
+
+
+def test_setup_load_cache(mc, npl, datalinks, caplog):
     """Test `setup` method when cache file exists."""
     mc.setup(npl)
     assert "MetcalfScoring.setup loading cached data" in caplog.text
     assert "MetcalfScoring.setup caching results" not in caplog.text
 
     assert isinstance(mc.DATALINKS, DataLinks)
-    assert isinstance(mc.LINKFINDER, LinkFinder)
 
     assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, datalinks.occurrence_gcf_strain)
     assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, datalinks.cooccurrence_spec_gcf)
 
-    assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, linkfinder.raw_score_spec_gcf)
-    assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, linkfinder.raw_score_mf_gcf)
-    assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean)
-    assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std)
+    assert_frame_equal(
+        mc.raw_score_spec_gcf,
+        pd.DataFrame(
+            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
+            index=["spectrum1", "spectrum2", "spectrum3"],
+            columns=["gcf1", "gcf2", "gcf3"],
+        ),
+    )
+    assert_frame_equal(
+        mc.raw_score_mf_gcf,
+        pd.DataFrame(
+            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
+            index=["mf1", "mf2", "mf3"],
+            columns=["gcf1", "gcf2", "gcf3"],
+        ),
+    )
+
+    assert isinstance(mc.metcalf_mean, np.ndarray)
+    assert isinstance(mc.metcalf_std, np.ndarray)
+    assert mc.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
+    assert mc.metcalf_mean.shape == (4, 4)
+
+
+#
+# Test the `calc_score` method
+#
+
+
+def test_calc_score_raw_score(mc, datalinks):
+    """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`.
+
+    The expected values are calculated manually by using values from `test_init`
+    of `test_data_links.py` and the default scoring weights.
+    """
+    # link type = 'spec-gcf'
+    mc.calc_score(datalinks, link_type="spec-gcf")
+    assert_frame_equal(
+        mc.raw_score_spec_gcf,
+        pd.DataFrame(
+            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
+            index=["spectrum1", "spectrum2", "spectrum3"],
+            columns=["gcf1", "gcf2", "gcf3"],
+        ),
+    )
+    # link type = 'mf-gcf'
+    mc.calc_score(datalinks, link_type="mf-gcf")
+    assert_frame_equal(
+        mc.raw_score_mf_gcf,
+        pd.DataFrame(
+            [[12, -9, 11], [-9, 12, 11], [1, 1, 21]],
+            index=["mf1", "mf2", "mf3"],
+            columns=["gcf1", "gcf2", "gcf3"],
+        ),
+    )
+
+
+def test_calc_score_mean_std(mc, datalinks):
+    """Test `calc_score` method for `metcalf_mean` and `metcalf_std`."""
+    mc.calc_score(datalinks, link_type="spec-gcf")
+    assert isinstance(mc.metcalf_mean, np.ndarray)
+    assert isinstance(mc.metcalf_std, np.ndarray)
+    assert mc.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
+    assert mc.metcalf_mean.shape == (4, 4)
+    # TODO CG: add tests for values after refactoring _calc_mean_std method
+    # assert mc.metcalf_mean == expected_array
+
+
+#
+# Test the `get_links` method
+#
 
 
 def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs):
@@ -194,10 +282,198 @@ def test_get_links_invalid_mixed_types(mc, spectra, mfs):
     assert ".Spectrum" in str(e.value)
 
 
-def test_get_links_no_linkfinder(npl, gcfs):
-    """Test `get_links` method when no LinkFinder object is found."""
-    mc = MetcalfScoring(npl)
-    mc.LINKFINDER = None
+#
+# Test the `_get_links` method
+#
+
+
+def test__get_links_gcf(mc, datalinks, gcfs):
+    """Test `_get_links` method for input GCF objects."""
+    mc.calc_score(datalinks, link_type="spec-gcf")
+    mc.calc_score(datalinks, link_type="mf-gcf")
+    index_names = ["source", "target", "score"]
+
+    # cutoff = negative infinity (float)
+    links = mc._get_links(*gcfs, score_cutoff=np.NINF)
+    assert len(links) == 2
+    # expected values got from `test_calc_score_raw_score`
+    assert_frame_equal(
+        links[0],
+        pd.DataFrame(
+            [
+                ["gcf1", "gcf2", "gcf3"] * 3,
+                [
+                    *["spectrum1"] * 3,
+                    *["spectrum2"] * 3,
+                    *["spectrum3"] * 3,
+                ],
+                [12, -9, 11, -9, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+    assert_frame_equal(
+        links[1],
+        pd.DataFrame(
+            [
+                ["gcf1", "gcf2", "gcf3"] * 3,
+                [
+                    *["mf1"] * 3,
+                    *["mf2"] * 3,
+                    *["mf3"] * 3,
+                ],
+                [12, -9, 11, -9, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+
+    # cutoff = 0
+    links = mc._get_links(*gcfs, score_cutoff=0)
+    assert len(links) == 2
+    assert_frame_equal(
+        links[0],
+        pd.DataFrame(
+            [
+                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
+                [
+                    *["spectrum1"] * 2,
+                    *["spectrum2"] * 2,
+                    *["spectrum3"] * 3,
+                ],
+                [12, 11, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+    assert_frame_equal(
+        links[1],
+        pd.DataFrame(
+            [
+                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
+                [
+                    *["mf1"] * 2,
+                    *["mf2"] * 2,
+                    *["mf3"] * 3,
+                ],
+                [12, 11, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+
+
+def test__get_links_spec(mc, datalinks, spectra):
+    """Test `_get_links` method for input Spectrum objects."""
+    mc.calc_score(datalinks, link_type="spec-gcf")
+    mc.calc_score(datalinks, link_type="mf-gcf")
+    index_names = ["source", "target", "score"]
+    # cutoff = negative infinity (float)
+    links = mc._get_links(*spectra, score_cutoff=np.NINF)
+    assert len(links) == 1
+    assert_frame_equal(
+        links[0],
+        pd.DataFrame(
+            [
+                [
+                    *["spectrum1"] * 3,
+                    *["spectrum2"] * 3,
+                    *["spectrum3"] * 3,
+                ],
+                ["gcf1", "gcf2", "gcf3"] * 3,
+                [12, -9, 11, -9, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+    # cutoff = 0
+    links = mc._get_links(*spectra, score_cutoff=0)
+    assert_frame_equal(
+        links[0],
+        pd.DataFrame(
+            [
+                [
+                    *["spectrum1"] * 2,
+                    *["spectrum2"] * 2,
+                    *["spectrum3"] * 3,
+                ],
+                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
+                [12, 11, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+
+
+def test__get_links_mf(mc, datalinks, mfs):
+    """Test `_get_links` method for input MolecularFamily objects."""
+    mc.calc_score(datalinks, link_type="spec-gcf")
+    mc.calc_score(datalinks, link_type="mf-gcf")
+    index_names = ["source", "target", "score"]
+    # cutoff = negative infinity (float)
+    links = mc._get_links(*mfs, score_cutoff=np.NINF)
+    assert len(links) == 1
+    assert_frame_equal(
+        links[0],
+        pd.DataFrame(
+            [
+                [
+                    *["mf1"] * 3,
+                    *["mf2"] * 3,
+                    *["mf3"] * 3,
+                ],
+                ["gcf1", "gcf2", "gcf3"] * 3,
+                [12, -9, 11, -9, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+    # cutoff = 0
+    links = mc._get_links(*mfs, score_cutoff=0)
+    assert_frame_equal(
+        links[0],
+        pd.DataFrame(
+            [
+                [
+                    *["mf1"] * 2,
+                    *["mf2"] * 2,
+                    *["mf3"] * 3,
+                ],
+                ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"],
+                [12, 11, 12, 11, 1, 1, 21],
+            ],
+            index=index_names,
+        ),
+    )
+
+
+@pytest.mark.parametrize(
+    "objects, expected", [([], "Empty input objects"), ("", "Empty input objects")]
+)
+def test__get_links_invalid_value(mc, objects, expected):
     with pytest.raises(ValueError) as e:
-        mc.get_links(*gcfs, link_collection=LinkCollection())
-    assert "LinkFinder object not found." in str(e.value)
+        mc._get_links(*objects)
+    assert expected in str(e.value)
+
+
+@pytest.mark.parametrize(
+    "objects, expected",
+    [
+        ([1], "Invalid type {<class 'int'>}"),
+        ([1, 2], "Invalid type {<class 'int'>}"),
+        ("12", "Invalid type {<class 'str'>}"),
+    ],
+)
+def test__get_links_invalid_type(mc, objects, expected):
+    with pytest.raises(TypeError) as e:
+        mc._get_links(*objects)
+    assert expected in str(e.value)
+
+
+def test__get_links_invalid_mixed_types(mc, spectra, mfs):
+    objects = (*spectra, *mfs)
+    with pytest.raises(TypeError) as e:
+        mc._get_links(*objects)
+    assert "Invalid type" in str(e.value)
+    assert ".MolecularFamily" in str(e.value)
+    assert ".Spectrum" in str(e.value)

From ccbe154e46b397a458f403b020bcea887fe7edb5 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 29 May 2024 17:06:14 +0200
Subject: [PATCH 04/10] update static typings

---
 src/nplinker/scoring/metcalf_scoring.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py
index a8b7d2b77..107fc4fd2 100644
--- a/src/nplinker/scoring/metcalf_scoring.py
+++ b/src/nplinker/scoring/metcalf_scoring.py
@@ -31,7 +31,6 @@ class MetcalfScoring(ScoringBase):
         name: The name of this scoring method, set to a fixed value `metcalf`.
         DATALINKS: The DataLinks object to use for scoring.
         CACHE: The name of the cache file to use for storing the MetcalfScoring.
-
         raw_score_spec_gcf: The raw Metcalf scores for spectrum-GCF links.
         raw_score_mf_gcf: The raw Metcalf scores for molecular family-GCF links.
         metcalf_mean: The mean value used for standardising Metcalf scores.
@@ -39,13 +38,13 @@ class MetcalfScoring(ScoringBase):
     """
 
     name = "metcalf"
-    DATALINKS = None
-    CACHE = "cache_metcalf_scoring.pckl"
+    DATALINKS: DataLinks | None = None
+    CACHE: str = "cache_metcalf_scoring.pckl"
 
-    raw_score_spec_gcf = pd.DataFrame()
-    raw_score_mf_gcf = pd.DataFrame()
-    metcalf_mean = None
-    metcalf_std = None
+    raw_score_spec_gcf: pd.DataFrame = pd.DataFrame()
+    raw_score_mf_gcf: pd.DataFrame = pd.DataFrame()
+    metcalf_mean: np.ndarray | None = None
+    metcalf_std: np.ndarray | None = None
 
     def __init__(self, npl: NPLinker) -> None:
         """Create a MetcalfScoring object.
@@ -58,12 +57,10 @@ def __init__(self, npl: NPLinker) -> None:
                 this value will be discarded. Defaults to 1.0.
             standardised: Whether to use standardised scores. Defaults
                 to True.
-            name: The name of the scoring method. It's set to a fixed value
-                'metcalf'.
         """
         super().__init__(npl)
-        self.cutoff = 1.0
-        self.standardised = True
+        self.cutoff: float = 1.0
+        self.standardised: bool = True
 
     # TODO CG: refactor this method and extract code for cache file to a separate method
     @classmethod

From e83f7fcc7a487b1424c5039cbbcf48bd36939275 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 30 May 2024 14:13:37 +0200
Subject: [PATCH 05/10] Delete conftest.py

---
 tests/unit/conftest.py | 40 ----------------------------------------
 1 file changed, 40 deletions(-)
 delete mode 100644 tests/unit/conftest.py

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
deleted file mode 100644
index c6b9afcd4..000000000
--- a/tests/unit/conftest.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-import shutil
-import tempfile
-
-
-nplinker_root_dir = os.path.join(tempfile.gettempdir(), "nplinker_unit_test")
-
-
-def pytest_sessionstart(session):
-    """Pytest hook to run before the entire test session starts.
-
-    This hook makes sure the temporary directory `nplinker_root_dir` is created before any test
-    starts. When running tests in parallel, the creation operation is done by the master process,
-    and worker processes are not allowed to do it.
-
-    For more about this hook, see:
-    1. https://docs.pytest.org/en/stable/reference.html#_pytest.hookspec.pytest_sessionstart
-    2. https://github.com/pytest-dev/pytest-xdist/issues/271#issuecomment-826396320
-    """
-    workerinput = getattr(session.config, "workerinput", None)
-    # It's master process or not running in parallell when `workerinput` is None.
-    if workerinput is None:
-        if os.path.exists(nplinker_root_dir):
-            shutil.rmtree(nplinker_root_dir)
-        os.mkdir(nplinker_root_dir)
-    # NPLinker setting `root_dir` must be a path that exists, so setting it to a temporary directory.
-    os.environ["NPLINKER_ROOT_DIR"] = nplinker_root_dir
-
-
-def pytest_sessionfinish(session):
-    """Pytest hook to run after the entire test session finishes.
-
-    This hook makes sure that temporary directory `nplinker_root_dir` is only removed after all
-    tests finish. When running tests in parallel, the deletion operation is done by the master
-    processs, and worker processes are not allowed to do it.
-
-    """
-    workerinput = getattr(session.config, "workerinput", None)
-    if workerinput is None:
-        shutil.rmtree(nplinker_root_dir)

From 4a34f1e0fb06d1f2878e7f61dadc702385d01d6b Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 30 May 2024 14:16:51 +0200
Subject: [PATCH 06/10] create temporary root dir only in the places that
 require it

---
 tests/unit/scoring/conftest.py | 6 ++++--
 tests/unit/test_config.py      | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py
index 4d37bdfa4..f36b60050 100644
--- a/tests/unit/scoring/conftest.py
+++ b/tests/unit/scoring/conftest.py
@@ -1,3 +1,4 @@
+import os
 from pytest import fixture
 from nplinker.genomics import GCF
 from nplinker.metabolomics import MolecularFamily
@@ -67,8 +68,8 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks:
     return DataLinks(gcfs, spectra, mfs, strains)
 
 
-@fixture(scope="module")
-def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker:
+@fixture(scope="function")
+def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker:
     """Constructed NPLinker object.
 
     This NPLinker object does not do loading `npl.load_data()`, instead we
@@ -77,6 +78,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker:
     The config file `nplinker_demo1.toml` does not affect the tests, just
     making sure the NPLinker object can be created succesfully.
     """
+    os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path)  # Create a temporary root dir for NPLinker
     npl = NPLinker(CONFIG_FILE_LOCAL_MODE)
     npl._gcfs = gcfs
     npl._spectra = spectra
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 3e57afa62..f681cc928 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -1,9 +1,11 @@
+import os
 from nplinker.config import load_config
 from . import CONFIG_FILE_LOCAL_MODE
 
 
-def test_config():
+def test_config(tmp_path):
     """Test loading the default config file."""
+    os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path)  # Create a temporary root dir for NPLinker
     config = load_config(CONFIG_FILE_LOCAL_MODE)
 
     assert config.mode == "local"

From 93e591bc07a4a90c1d61a3a5ff1570c9166bfdb9 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 30 May 2024 14:18:57 +0200
Subject: [PATCH 07/10] update fixture scopes

---
 tests/unit/scoring/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py
index f36b60050..e711e78a0 100644
--- a/tests/unit/scoring/conftest.py
+++ b/tests/unit/scoring/conftest.py
@@ -62,7 +62,7 @@ def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]:
     return mf1, mf2, mf3
 
 
-@fixture(scope="module")
+@fixture(scope="session")
 def datalinks(gcfs, spectra, mfs, strains) -> DataLinks:
     """DataLinks object. See `test_data_links.py` for its values."""
     return DataLinks(gcfs, spectra, mfs, strains)
@@ -90,7 +90,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker:
     return npl
 
 
-@fixture(scope="module")
+@fixture(scope="function")
 def mc(npl) -> MetcalfScoring:
     """MetcalfScoring object."""
     mc = MetcalfScoring(npl)

From 9d72c71a4f6a55ced841fa4e3d471134cce29505 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 30 May 2024 14:19:23 +0200
Subject: [PATCH 08/10] Update test_metcalf_scoring.py

caplog cannot capture all logs, so remove the assertions.
---
 tests/unit/scoring/test_metcalf_scoring.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/unit/scoring/test_metcalf_scoring.py b/tests/unit/scoring/test_metcalf_scoring.py
index 7c5e7f138..43c9baa72 100644
--- a/tests/unit/scoring/test_metcalf_scoring.py
+++ b/tests/unit/scoring/test_metcalf_scoring.py
@@ -59,8 +59,6 @@ def test_setup(mc, datalinks):
 def test_setup_load_cache(mc, npl, datalinks, caplog):
     """Test `setup` method when cache file exists."""
     mc.setup(npl)
-    assert "MetcalfScoring.setup loading cached data" in caplog.text
-    assert "MetcalfScoring.setup caching results" not in caplog.text
 
     assert isinstance(mc.DATALINKS, DataLinks)
 

From 311b83b50dc685a29a6ebb0fc406832725c73db6 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 30 May 2024 14:23:18 +0200
Subject: [PATCH 09/10] update pytest dist value to loadgroup

This option is much faster and makes it easier to control which tests run on the same worker.
---
 pyproject.toml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 15a0be6c3..cd3a0bbb7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,8 +87,10 @@ namespaces = true # enable data directory to be identified
 
 [tool.pytest.ini_options]
 minversion = "6.0"
-# pytest options: -ra: show summary info for all test outcomes; -n auto: run tests in parallel; --dist loadscope: distribute tests by loading scope
-addopts = "-ra -n auto --dist loadscope"    
+# -ra: show summary info for all test outcomes; 
+# -n auto: run tests in parallel; 
+# --dist loadgroup: sends tests marked with 'xdist_group' to the same worker
+addopts = "-ra -n auto --dist loadgroup"    
 testpaths = ["tests/unit"]
 
 [tool.coverage.run]

From d13d8831be36a812cce9dc1c12d1d570091579af Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 6 Jun 2024 09:24:02 +0200
Subject: [PATCH 10/10] fix repeatings

---
 tests/unit/scoring/test_metcalf_scoring.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/scoring/test_metcalf_scoring.py b/tests/unit/scoring/test_metcalf_scoring.py
index 43c9baa72..997ed4a56 100644
--- a/tests/unit/scoring/test_metcalf_scoring.py
+++ b/tests/unit/scoring/test_metcalf_scoring.py
@@ -53,7 +53,7 @@ def test_setup(mc, datalinks):
     assert isinstance(mc.metcalf_mean, np.ndarray)
     assert isinstance(mc.metcalf_std, np.ndarray)
     assert mc.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
-    assert mc.metcalf_mean.shape == (4, 4)
+    assert mc.metcalf_std.shape == (4, 4)
 
 
 def test_setup_load_cache(mc, npl, datalinks, caplog):
@@ -85,7 +85,7 @@ def test_setup_load_cache(mc, npl, datalinks, caplog):
     assert isinstance(mc.metcalf_mean, np.ndarray)
     assert isinstance(mc.metcalf_std, np.ndarray)
     assert mc.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
-    assert mc.metcalf_mean.shape == (4, 4)
+    assert mc.metcalf_std.shape == (4, 4)
 
 
 #
@@ -127,7 +127,7 @@ def test_calc_score_mean_std(mc, datalinks):
     assert isinstance(mc.metcalf_mean, np.ndarray)
     assert isinstance(mc.metcalf_std, np.ndarray)
     assert mc.metcalf_mean.shape == (4, 4)  # (n_strains+1 , n_strains+1)
-    assert mc.metcalf_mean.shape == (4, 4)
+    assert mc.metcalf_std.shape == (4, 4)
     # TODO CG: add tests for values after refactoring _calc_mean_std method
     # assert mc.metcalf_mean == expected_array