From cc5ed45c83bdab54f2dc6765b5605a61c38127a1 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 29 May 2024 16:47:46 +0200 Subject: [PATCH 01/10] merge LinkFinder to MetcalfScoring class The actual role of LinkFinder is to calculate metcalf score, so it makes more sense to merge its functions to MetcalfScoring class --- src/nplinker/scoring/metcalf_scoring.py | 256 +++++++++++++++++++----- 1 file changed, 210 insertions(+), 46 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index fa54129ca..a8b7d2b77 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING import numpy as np import pandas as pd +from scipy.stats import hypergeom from nplinker.genomics import GCF from nplinker.metabolomics import MolecularFamily from nplinker.metabolomics import Spectrum @@ -12,7 +13,6 @@ from .abc import ScoringBase from .linking import LINK_TYPES from .linking import DataLinks -from .linking import LinkFinder from .linking import isinstance_all from .object_link import ObjectLink @@ -30,15 +30,23 @@ class MetcalfScoring(ScoringBase): Attributes: name: The name of this scoring method, set to a fixed value `metcalf`. DATALINKS: The DataLinks object to use for scoring. - LINKFINDER: The LinkFinder object to use for scoring. CACHE: The name of the cache file to use for storing the MetcalfScoring. + + raw_score_spec_gcf: The raw Metcalf scores for spectrum-GCF links. + raw_score_mf_gcf: The raw Metcalf scores for molecular family-GCF links. + metcalf_mean: The mean value used for standardising Metcalf scores. + metcalf_std: The standard deviation value used for standardising Metcalf scores. 
""" name = "metcalf" DATALINKS = None - LINKFINDER = None CACHE = "cache_metcalf_scoring.pckl" + raw_score_spec_gcf = pd.DataFrame() + raw_score_mf_gcf = pd.DataFrame() + metcalf_mean = None + metcalf_std = None + def __init__(self, npl: NPLinker) -> None: """Create a MetcalfScoring object. @@ -60,9 +68,9 @@ def __init__(self, npl: NPLinker) -> None: # TODO CG: refactor this method and extract code for cache file to a separate method @classmethod def setup(cls, npl: NPLinker): - """Setup the DataLinks and LinkFinder objects. + """Set up the DataLinks object. - This method is only called once to setup the DataLinks and LinkFinder objects. + This method is only called once to set up the DataLinks object. """ logger.info( "MetcalfScoring.setup (bgcs={}, gcfs={}, spectra={}, molfams={}, strains={})".format( @@ -81,13 +89,13 @@ def setup(cls, npl: NPLinker): len(npl.molfams), len(npl.strains), ] - datalinks, linkfinder = None, None + datalinks = None if os.path.exists(cache_file): logger.info("MetcalfScoring.setup loading cached data") cache_data = load_pickled_data(npl, cache_file) cache_ok = True if cache_data is not None: - (counts, datalinks, linkfinder) = cache_data + (counts, datalinks) = cache_data # need to invalidate this if dataset appears to have changed for i in range(len(counts)): if counts[i] != dataset_counts[i]: @@ -97,19 +105,65 @@ def setup(cls, npl: NPLinker): if cache_ok: cls.DATALINKS = datalinks - cls.LINKFINDER = linkfinder if cls.DATALINKS is None: logger.info("MetcalfScoring.setup preprocessing dataset (this may take some time)") cls.DATALINKS = DataLinks(npl.gcfs, npl.spectra, npl.molfams, npl.strains) - cls.LINKFINDER = LinkFinder() - cls.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[0]) - cls.LINKFINDER.calc_score(MetcalfScoring.DATALINKS, link_type=LINK_TYPES[1]) + cls.calc_score(cls.DATALINKS, link_type=LINK_TYPES[0]) + cls.calc_score(cls.DATALINKS, link_type=LINK_TYPES[1]) logger.info("MetcalfScoring.setup caching
results") - save_pickled_data((dataset_counts, cls.DATALINKS, cls.LINKFINDER), cache_file) + # TODO: save the score values 2024-05-29 + save_pickled_data((dataset_counts, cls.DATALINKS), cache_file) logger.info("MetcalfScoring.setup completed") + @classmethod + def calc_score( + cls, + data_links: DataLinks, + link_type: str = "spec-gcf", + scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1), + ) -> None: + """Calculate Metcalf scores. + + This method calculates the `raw_score_spec_gcf`, `raw_score_mf_gcf`, `metcalf_mean`, and + `metcalf_std` attributes. + + Args: + data_links: The DataLinks object to use for scoring. + link_type: The type of link to score. Must be 'spec-gcf' or + 'mf-gcf'. Defaults to 'spec-gcf'. + scoring_weights: The weights to + use for Metcalf scoring. The weights are applied to + '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. + Defaults to (10, -10, 0, 1). + + Raises: + ValueError: If an invalid link type is provided. + """ + if link_type not in LINK_TYPES: + raise ValueError(f"Invalid link type: {link_type}. Must be one of {LINK_TYPES}") + + if link_type == "spec-gcf": + cls.raw_score_spec_gcf = ( + data_links.cooccurrence_spec_gcf * scoring_weights[0] + + data_links.cooccurrence_spec_notgcf * scoring_weights[1] + + data_links.cooccurrence_notspec_gcf * scoring_weights[2] + + data_links.cooccurrence_notspec_notgcf * scoring_weights[3] + ) + + if link_type == "mf-gcf": + cls.raw_score_mf_gcf = ( + data_links.cooccurrence_mf_gcf * scoring_weights[0] + + data_links.cooccurrence_mf_notgcf * scoring_weights[1] + + data_links.cooccurrence_notmf_gcf * scoring_weights[2] + + data_links.cooccurrence_notmf_notgcf * scoring_weights[3] + ) + + if cls.metcalf_mean is None or cls.metcalf_std is None: + n_strains = data_links.occurrence_gcf_strain.shape[1] + cls.metcalf_mean, cls.metcalf_std = cls._calc_mean_std(n_strains, scoring_weights) + # TODO CG: is it needed? 
remove it if not @property def datalinks(self) -> DataLinks | None: @@ -135,8 +189,6 @@ def get_links( Raises: ValueError: If the input objects are empty. TypeError: If the input objects are not of the correct type. - ValueError: If LinkFinder instance has not been created - (MetcalfScoring object has not been setup). """ if len(objects) == 0: raise ValueError("Empty input objects.") @@ -153,23 +205,18 @@ def get_links( f"Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects." ) - if self.LINKFINDER is None: - raise ValueError( - ("LinkFinder object not found. Have you called `MetcalfScoring.setup(npl)`?") - ) - logger.info(f"MetcalfScoring: standardised = {self.standardised}") if not self.standardised: - scores_list = self.LINKFINDER.get_links(*objects, score_cutoff=self.cutoff) + scores_list = self._get_links(*objects, score_cutoff=self.cutoff) # TODO CG: verify the logics of standardised score and add unit tests else: # use negative infinity as the score cutoff to ensure we get all links # the self.cutoff will be applied later in the postprocessing step - scores_list = self.LINKFINDER.get_links(*objects, score_cutoff=np.NINF) + scores_list = self._get_links(*objects, score_cutoff=np.NINF) if obj_type == "gcf": - scores_list = self._calc_standardised_score_gen(self.LINKFINDER, scores_list) + scores_list = self._calc_standardised_score_gen(scores_list) else: - scores_list = self._calc_standardised_score_met(self.LINKFINDER, scores_list) + scores_list = self._calc_standardised_score_met(scores_list) link_scores: dict[ GCF | Spectrum | MolecularFamily, dict[GCF | Spectrum | MolecularFamily, ObjectLink] @@ -225,10 +272,141 @@ def get_links( logger.info("MetcalfScoring: completed") return link_collection - def _calc_standardised_score_met( - self, linkfinder: LinkFinder, results: list + # TODO CG: refactor this method + def format_data(self, data): + """Format the data for display.""" + # for metcalf the data will just be a floating 
point value (i.e. the score) + return f"{data:.4f}" + + # TODO CG: refactor this method + def sort(self, objects, reverse=True): + """Sort the objects based on the score.""" + # sort based on score + return sorted(objects, key=lambda objlink: objlink[self], reverse=reverse) + + @staticmethod + def _calc_mean_std( + n_strains: int, scoring_weights: tuple[int, int, int, int] + ) -> tuple[np.ndarray, np.ndarray]: + sz = (n_strains + 1, n_strains + 1) + mean = np.zeros(sz) + variance = np.zeros(sz) + for n in range(n_strains + 1): + for m in range(n_strains + 1): + max_overlap = min(n, m) + min_overlap = max(0, n + m - n_strains) + expected_value = 0 + expected_sq = 0 + for o in range(min_overlap, max_overlap + 1): + o_prob = hypergeom.pmf(o, n_strains, n, m) + # compute metcalf for n strains in type 1 and m in gcf + score = o * scoring_weights[0] + score += scoring_weights[1] * (n - o) + score += scoring_weights[2] * (m - o) + score += scoring_weights[3] * (n_strains - (n + m - o)) + expected_value += o_prob * score + expected_sq += o_prob * (score**2) + mean[n, m] = expected_value + expected_sq = expected_sq - expected_value**2 + if expected_sq < 1e-09: + expected_sq = 1 + variance[n, m] = expected_sq + return mean, np.sqrt(variance) + + def _get_links( + self, + *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...], + score_cutoff: float = 0.5, ) -> list[pd.DataFrame]: - if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + """Get links and scores for given objects. + + Args: + objects: A list of GCF, Spectrum or MolecularFamily objects + and all objects must be of the same type. + score_cutoff: Minimum score to consider a link (≥score_cutoff). + Default is 0.5. + + Returns: + List of data frames containing the ids of the linked objects + and the score. 
The data frame has index names of + 'source', 'target' and 'score': + + - the 'source' row contains the ids of the input/source objects, + - the 'target' row contains the ids of the target objects, + - the 'score' row contains the scores. + + Raises: + ValueError: If input objects are empty. + TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. + """ + if len(objects) == 0: + raise ValueError("Empty input objects.") + + if isinstance_all(*objects, objtype=GCF): + obj_type = "gcf" + elif isinstance_all(*objects, objtype=Spectrum): + obj_type = "spec" + elif isinstance_all(*objects, objtype=MolecularFamily): + obj_type = "mf" + else: + types = [type(i) for i in objects] + raise TypeError( + f"Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects." + ) + + links = [] + if obj_type == "gcf": + # TODO CG: the hint and mypy warnings will be gone after renaming all + # string ids to `.id` + obj_ids = [gcf.gcf_id for gcf in objects] + # spec-gcf + scores = self.raw_score_spec_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + # mf-gcf + scores = self.raw_score_mf_gcf.loc[:, obj_ids] + df = self._get_scores_source_gcf(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) + + if obj_type == "spec": + obj_ids = [spec.spectrum_id for spec in objects] + scores = self.raw_score_spec_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[0] + links.append(df) + + if obj_type == "mf": + obj_ids = [mf.family_id for mf in objects] + scores = self.raw_score_mf_gcf.loc[obj_ids, :] + df = self._get_scores_source_met(scores, score_cutoff) + df.name = LINK_TYPES[1] + links.append(df) + return links + + @staticmethod + def _get_scores_source_gcf(scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = 
scores.columns[col_indexes].to_list() + target_obj_ids = scores.index[row_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame( + [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"] + ) + + @staticmethod + def _get_scores_source_met(scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame: + row_indexes, col_indexes = np.where(scores >= score_cutoff) + src_obj_ids = scores.index[row_indexes].to_list() + target_obj_ids = scores.columns[col_indexes].to_list() + scores_candidate = scores.values[row_indexes, col_indexes].tolist() + return pd.DataFrame( + [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"] + ) + + def _calc_standardised_score_met(self, results: list) -> list[pd.DataFrame]: + if self.metcalf_mean is None or self.metcalf_std is None: raise ValueError( "Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?" ) @@ -244,8 +422,8 @@ def _calc_standardised_score_met( num_gcf_strains = len(gcf.strains) num_met_strains = len(met.strains) - mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains] - sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + mean = self.metcalf_mean[num_met_strains][num_gcf_strains] + sqrt = self.metcalf_std[num_met_strains][num_gcf_strains] z_score = (raw_score.at["score", col_index] - mean) / sqrt z_scores.append(z_score) @@ -264,10 +442,8 @@ def _calc_standardised_score_met( return [scores_df] - def _calc_standardised_score_gen( - self, linkfinder: LinkFinder, results: list - ) -> list[pd.DataFrame]: - if linkfinder.metcalf_mean is None or linkfinder.metcalf_std is None: + def _calc_standardised_score_gen(self, results: list) -> list[pd.DataFrame]: + if self.metcalf_mean is None or self.metcalf_std is None: raise ValueError( "Metcalf mean and std not found. Have you called `MetcalfScoring.setup(npl)`?" 
) @@ -284,8 +460,8 @@ def _calc_standardised_score_gen( num_gcf_strains = len(gcf.strains) num_met_strains = len(met.strains) - mean = linkfinder.metcalf_mean[num_met_strains][num_gcf_strains] - sqrt = linkfinder.metcalf_std[num_met_strains][num_gcf_strains] + mean = self.metcalf_mean[num_met_strains][num_gcf_strains] + sqrt = self.metcalf_std[num_met_strains][num_gcf_strains] z_score = (raw_score.at["score", col_index] - mean) / sqrt z_scores.append(z_score) @@ -304,15 +480,3 @@ def _calc_standardised_score_gen( postprocessed_scores.append(scores_df) return postprocessed_scores - - # TODO CG: refactor this method - def format_data(self, data): - """Format the data for display.""" - # for metcalf the data will just be a floating point value (i.e. the score) - return f"{data:.4f}" - - # TODO CG: refactor this method - def sort(self, objects, reverse=True): - """Sort the objects based on the score.""" - # sort based on score - return sorted(objects, key=lambda objlink: objlink[self], reverse=reverse) From 8bb47613568826fc0059eeca7dab8a81d21f1b15 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 29 May 2024 16:48:07 +0200 Subject: [PATCH 02/10] remove LinkFinder --- src/nplinker/scoring/linking/__init__.py | 3 +- src/nplinker/scoring/linking/link_finder.py | 201 -------------------- 2 files changed, 1 insertion(+), 203 deletions(-) delete mode 100644 src/nplinker/scoring/linking/link_finder.py diff --git a/src/nplinker/scoring/linking/__init__.py b/src/nplinker/scoring/linking/__init__.py index bd391697f..49c863f29 100644 --- a/src/nplinker/scoring/linking/__init__.py +++ b/src/nplinker/scoring/linking/__init__.py @@ -1,8 +1,7 @@ from .data_links import LINK_TYPES from .data_links import DataLinks -from .link_finder import LinkFinder from .utils import calc_correlation_matrix from .utils import isinstance_all -__all__ = ["DataLinks", "LINK_TYPES", "LinkFinder", "calc_correlation_matrix", "isinstance_all"] +__all__ = ["DataLinks", "LINK_TYPES", 
"calc_correlation_matrix", "isinstance_all"] diff --git a/src/nplinker/scoring/linking/link_finder.py b/src/nplinker/scoring/linking/link_finder.py deleted file mode 100644 index 0e25a0502..000000000 --- a/src/nplinker/scoring/linking/link_finder.py +++ /dev/null @@ -1,201 +0,0 @@ -from __future__ import annotations -import logging -from typing import TYPE_CHECKING -import numpy as np -import pandas as pd -from scipy.stats import hypergeom -from nplinker.genomics.gcf import GCF -from nplinker.metabolomics import MolecularFamily -from nplinker.metabolomics import Spectrum -from . import LINK_TYPES -from .utils import isinstance_all - - -if TYPE_CHECKING: - from . import DataLinks - -logger = logging.getLogger(__file__) - - -# TODO CG: this class could be merged to MetcalfScoring class? -class LinkFinder: - def __init__(self) -> None: - """Initialise LinkFinder object. - - Attributes: - raw_score_spec_gcf: The raw Metcalf scores for - spectrum-GCF links. - raw_score_mf_gcf: The raw Metcalf scores for - molecular family-GCF links. - metcalf_mean: The mean value used for - standardising Metcalf scores. - metcalf_std: The standard deviation value used - for standardising Metcalf scores. - """ - self.raw_score_spec_gcf = pd.DataFrame() - self.raw_score_mf_gcf = pd.DataFrame() - self.metcalf_mean = None - self.metcalf_std = None - - # TODO CG: calc_score method could be integrated to __init__? - def calc_score( - self, - data_links: DataLinks, - link_type: str = "spec-gcf", - scoring_weights: tuple[int, int, int, int] = (10, -10, 0, 1), - ) -> None: - """Calculate Metcalf scores. - - Args: - data_links: The DataLinks object to use for scoring. - link_type: The type of link to score. Must be 'spec-gcf' or - 'mf-gcf'. Defaults to 'spec-gcf'. - scoring_weights: The weights to - use for Metcalf scoring. The weights are applied to - '(met_gcf, met_not_gcf, gcf_not_met, not_met_not_gcf)'. - Defaults to (10, -10, 0, 1). 
- - Raises: - ValueError: If an invalid link type is provided. - """ - if link_type not in LINK_TYPES: - raise ValueError(f"Invalid link type: {link_type}. Must be one of {LINK_TYPES}") - - if link_type == "spec-gcf": - self.raw_score_spec_gcf = ( - data_links.cooccurrence_spec_gcf * scoring_weights[0] - + data_links.cooccurrence_spec_notgcf * scoring_weights[1] - + data_links.cooccurrence_notspec_gcf * scoring_weights[2] - + data_links.cooccurrence_notspec_notgcf * scoring_weights[3] - ) - if link_type == "mf-gcf": - self.raw_score_mf_gcf = ( - data_links.cooccurrence_mf_gcf * scoring_weights[0] - + data_links.cooccurrence_mf_notgcf * scoring_weights[1] - + data_links.cooccurrence_notmf_gcf * scoring_weights[2] - + data_links.cooccurrence_notmf_notgcf * scoring_weights[3] - ) - - # TODO CG: this part should be moved outside of this method - n_strains = data_links.occurrence_gcf_strain.shape[1] - if self.metcalf_mean is None or self.metcalf_std is None: - self.metcalf_mean, self.metcalf_std = self._calc_mean_std(n_strains, scoring_weights) - - # TODO CG: read paper and check the logics of this method - def _calc_mean_std( - self, n_strains: int, scoring_weights: tuple[int, int, int, int] - ) -> tuple[np.ndarray, np.ndarray]: - sz = (n_strains + 1, n_strains + 1) - mean = np.zeros(sz) - variance = np.zeros(sz) - for n in range(n_strains + 1): - for m in range(n_strains + 1): - max_overlap = min(n, m) - min_overlap = max(0, n + m - n_strains) - expected_value = 0 - expected_sq = 0 - for o in range(min_overlap, max_overlap + 1): - o_prob = hypergeom.pmf(o, n_strains, n, m) - # compute metcalf for n strains in type 1 and m in gcf - score = o * scoring_weights[0] - score += scoring_weights[1] * (n - o) - score += scoring_weights[2] * (m - o) - score += scoring_weights[3] * (n_strains - (n + m - o)) - expected_value += o_prob * score - expected_sq += o_prob * (score**2) - mean[n, m] = expected_value - expected_sq = expected_sq - expected_value**2 - if expected_sq < 
1e-09: - expected_sq = 1 - variance[n, m] = expected_sq - return mean, np.sqrt(variance) - - def get_links( - self, - *objects: tuple[GCF, ...] | tuple[Spectrum, ...] | tuple[MolecularFamily, ...], - score_cutoff: float = 0.5, - ) -> list[pd.DataFrame]: - """Get links and scores for given objects. - - Args: - objects: A list of GCF, Spectrum or MolecularFamily objects - and all objects must be of the same type. - score_cutoff: Minimum score to consider a link (≥score_cutoff). - Default is 0.5. - - Returns: - List of data frames containing the ids of the linked objects - and the score. The data frame has index names of - 'source', 'target' and 'score': - - - the 'source' row contains the ids of the input/source objects, - - the 'target' row contains the ids of the target objects, - - the 'score' row contains the scores. - - Raises: - ValueError: If input objects are empty. - TypeError: If input objects are not GCF, Spectrum or MolecularFamily objects. - """ - if len(objects) == 0: - raise ValueError("Empty input objects.") - - if isinstance_all(*objects, objtype=GCF): - obj_type = "gcf" - elif isinstance_all(*objects, objtype=Spectrum): - obj_type = "spec" - elif isinstance_all(*objects, objtype=MolecularFamily): - obj_type = "mf" - else: - types = [type(i) for i in objects] - raise TypeError( - f"Invalid type {set(types)}. Input objects must be GCF, Spectrum or MolecularFamily objects." 
- ) - - links = [] - if obj_type == "gcf": - # TODO CG: the hint and mypy warnings will be gone after renaming all - # string ids to `.id` - obj_ids = [gcf.gcf_id for gcf in objects] - # spec-gcf - scores = self.raw_score_spec_gcf.loc[:, obj_ids] - df = self._get_scores_source_gcf(scores, score_cutoff) - df.name = LINK_TYPES[0] - links.append(df) - # mf-gcf - scores = self.raw_score_mf_gcf.loc[:, obj_ids] - df = self._get_scores_source_gcf(scores, score_cutoff) - df.name = LINK_TYPES[1] - links.append(df) - - if obj_type == "spec": - obj_ids = [spec.spectrum_id for spec in objects] - scores = self.raw_score_spec_gcf.loc[obj_ids, :] - df = self._get_scores_source_met(scores, score_cutoff) - df.name = LINK_TYPES[0] - links.append(df) - - if obj_type == "mf": - obj_ids = [mf.family_id for mf in objects] - scores = self.raw_score_mf_gcf.loc[obj_ids, :] - df = self._get_scores_source_met(scores, score_cutoff) - df.name = LINK_TYPES[1] - links.append(df) - return links - - def _get_scores_source_gcf(self, scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame: - row_indexes, col_indexes = np.where(scores >= score_cutoff) - src_obj_ids = scores.columns[col_indexes].to_list() - target_obj_ids = scores.index[row_indexes].to_list() - scores_candidate = scores.values[row_indexes, col_indexes].tolist() - return pd.DataFrame( - [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"] - ) - - def _get_scores_source_met(self, scores: pd.DataFrame, score_cutoff: float) -> pd.DataFrame: - row_indexes, col_indexes = np.where(scores >= score_cutoff) - src_obj_ids = scores.index[row_indexes].to_list() - target_obj_ids = scores.columns[col_indexes].to_list() - scores_candidate = scores.values[row_indexes, col_indexes].tolist() - return pd.DataFrame( - [src_obj_ids, target_obj_ids, scores_candidate], index=["source", "target", "score"] - ) From 8d663e870127ed3b3d83bed29e5f9c98a22c1bc3 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 29 May 2024 
16:48:57 +0200 Subject: [PATCH 03/10] move unit tests of LinkFinder --- tests/unit/scoring/conftest.py | 10 - tests/unit/scoring/test_link_finder.py | 249 ---------------- tests/unit/scoring/test_metcalf_scoring.py | 322 +++++++++++++++++++-- 3 files changed, 299 insertions(+), 282 deletions(-) delete mode 100644 tests/unit/scoring/test_link_finder.py diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py index f1517cbd6..4d37bdfa4 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -5,7 +5,6 @@ from nplinker.nplinker import NPLinker from nplinker.scoring import MetcalfScoring from nplinker.scoring.linking import DataLinks -from nplinker.scoring.linking import LinkFinder from nplinker.strain import Strain from nplinker.strain import StrainCollection from .. import CONFIG_FILE_LOCAL_MODE @@ -68,15 +67,6 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: return DataLinks(gcfs, spectra, mfs, strains) -@fixture(scope="module") -def linkfinder(datalinks) -> LinkFinder: - """LinkFinder object. See `test_link_finder.py` for its values.""" - linkfinder = LinkFinder() - linkfinder.calc_score(datalinks, link_type="spec-gcf") - linkfinder.calc_score(datalinks, link_type="mf-gcf") - return linkfinder - - @fixture(scope="module") def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: """Constructed NPLinker object. 
diff --git a/tests/unit/scoring/test_link_finder.py b/tests/unit/scoring/test_link_finder.py deleted file mode 100644 index 16dc731b4..000000000 --- a/tests/unit/scoring/test_link_finder.py +++ /dev/null @@ -1,249 +0,0 @@ -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal -from pytest import fixture -from nplinker.scoring.linking import LinkFinder - - -@fixture(scope="module") -def linkfinder() -> LinkFinder: - return LinkFinder() - - -def test_init(linkfinder): - assert_frame_equal(linkfinder.raw_score_spec_gcf, pd.DataFrame()) - assert_frame_equal(linkfinder.raw_score_mf_gcf, pd.DataFrame()) - assert linkfinder.metcalf_mean is None - assert linkfinder.metcalf_std is None - - -def test_calc_score_raw_score(linkfinder, datalinks): - """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. - - The expected values are calculated manually by using values from `test_init` - of `test_data_links.py` and the default scoring weights. 
- """ - # link type = 'spec-gcf' - linkfinder.calc_score(datalinks, link_type="spec-gcf") - assert_frame_equal( - linkfinder.raw_score_spec_gcf, - pd.DataFrame( - [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], - index=["spectrum1", "spectrum2", "spectrum3"], - columns=["gcf1", "gcf2", "gcf3"], - ), - ) - # link type = 'mf-gcf' - linkfinder.calc_score(datalinks, link_type="mf-gcf") - assert_frame_equal( - linkfinder.raw_score_mf_gcf, - pd.DataFrame( - [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], - index=["mf1", "mf2", "mf3"], - columns=["gcf1", "gcf2", "gcf3"], - ), - ) - - -def test_calc_score_mean_std(linkfinder, datalinks): - """Test `calc_score` method for `metcalf_mean` and `metcalf_std`.""" - linkfinder.calc_score(datalinks, link_type="spec-gcf") - assert isinstance(linkfinder.metcalf_mean, np.ndarray) - assert isinstance(linkfinder.metcalf_std, np.ndarray) - assert linkfinder.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) - assert linkfinder.metcalf_mean.shape == (4, 4) - # TODO CG: add tests for values after refactoring _calc_mean_std method - # assert linkfinder.metcalf_mean == expected_array - - -def test_get_links_gcf(linkfinder, datalinks, gcfs): - """Test `get_links` method for input GCF objects.""" - linkfinder.calc_score(datalinks, link_type="spec-gcf") - linkfinder.calc_score(datalinks, link_type="mf-gcf") - index_names = ["source", "target", "score"] - - # cutoff = negative infinity (float) - links = linkfinder.get_links(*gcfs, score_cutoff=np.NINF) - assert len(links) == 2 - # expected values got from `test_calc_score_raw_score` - assert_frame_equal( - links[0], - pd.DataFrame( - [ - ["gcf1", "gcf2", "gcf3"] * 3, - [ - *["spectrum1"] * 3, - *["spectrum2"] * 3, - *["spectrum3"] * 3, - ], - [12, -9, 11, -9, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - assert_frame_equal( - links[1], - pd.DataFrame( - [ - ["gcf1", "gcf2", "gcf3"] * 3, - [ - *["mf1"] * 3, - *["mf2"] * 3, - *["mf3"] * 3, - ], - [12, -9, 11, -9, 12, 11, 1, 1, 21], - ], - 
index=index_names, - ), - ) - - # cutoff = 0 - links = linkfinder.get_links(*gcfs, score_cutoff=0) - assert len(links) == 2 - assert_frame_equal( - links[0], - pd.DataFrame( - [ - ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], - [ - *["spectrum1"] * 2, - *["spectrum2"] * 2, - *["spectrum3"] * 3, - ], - [12, 11, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - assert_frame_equal( - links[1], - pd.DataFrame( - [ - ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], - [ - *["mf1"] * 2, - *["mf2"] * 2, - *["mf3"] * 3, - ], - [12, 11, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - - -def test_get_links_spec(linkfinder, datalinks, spectra): - """Test `get_links` method for input Spectrum objects.""" - linkfinder.calc_score(datalinks, link_type="spec-gcf") - linkfinder.calc_score(datalinks, link_type="mf-gcf") - index_names = ["source", "target", "score"] - # cutoff = negative infinity (float) - links = linkfinder.get_links(*spectra, score_cutoff=np.NINF) - assert len(links) == 1 - assert_frame_equal( - links[0], - pd.DataFrame( - [ - [ - *["spectrum1"] * 3, - *["spectrum2"] * 3, - *["spectrum3"] * 3, - ], - ["gcf1", "gcf2", "gcf3"] * 3, - [12, -9, 11, -9, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - # cutoff = 0 - links = linkfinder.get_links(*spectra, score_cutoff=0) - assert_frame_equal( - links[0], - pd.DataFrame( - [ - [ - *["spectrum1"] * 2, - *["spectrum2"] * 2, - *["spectrum3"] * 3, - ], - ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], - [12, 11, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - - -def test_get_links_mf(linkfinder, datalinks, mfs): - """Test `get_links` method for input MolecularFamily objects.""" - linkfinder.calc_score(datalinks, link_type="spec-gcf") - linkfinder.calc_score(datalinks, link_type="mf-gcf") - index_names = ["source", "target", "score"] - # cutoff = negative infinity (float) - links = linkfinder.get_links(*mfs, score_cutoff=np.NINF) - assert len(links) == 1 - 
assert_frame_equal( - links[0], - pd.DataFrame( - [ - [ - *["mf1"] * 3, - *["mf2"] * 3, - *["mf3"] * 3, - ], - ["gcf1", "gcf2", "gcf3"] * 3, - [12, -9, 11, -9, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - # cutoff = 0 - links = linkfinder.get_links(*mfs, score_cutoff=0) - assert_frame_equal( - links[0], - pd.DataFrame( - [ - [ - *["mf1"] * 2, - *["mf2"] * 2, - *["mf3"] * 3, - ], - ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], - [12, 11, 12, 11, 1, 1, 21], - ], - index=index_names, - ), - ) - - -@pytest.mark.parametrize( - "objects, expected", [([], "Empty input objects"), ("", "Empty input objects")] -) -def test_get_links_invalid_value(linkfinder, objects, expected): - with pytest.raises(ValueError) as e: - linkfinder.get_links(*objects) - assert expected in str(e.value) - - -@pytest.mark.parametrize( - "objects, expected", - [ - ([1], "Invalid type {}"), - ([1, 2], "Invalid type {}"), - ("12", "Invalid type {}"), - ], -) -def test_get_links_invalid_type(linkfinder, objects, expected): - with pytest.raises(TypeError) as e: - linkfinder.get_links(*objects) - assert expected in str(e.value) - - -def test_get_links_invalid_mixed_types(linkfinder, spectra, mfs): - objects = (*spectra, *mfs) - with pytest.raises(TypeError) as e: - linkfinder.get_links(*objects) - assert "Invalid type" in str(e.value) - assert ".MolecularFamily" in str(e.value) - assert ".Spectrum" in str(e.value) diff --git a/tests/unit/scoring/test_metcalf_scoring.py b/tests/unit/scoring/test_metcalf_scoring.py index ec9449cb5..7c5e7f138 100644 --- a/tests/unit/scoring/test_metcalf_scoring.py +++ b/tests/unit/scoring/test_metcalf_scoring.py @@ -1,12 +1,11 @@ import numpy as np +import pandas as pd import pytest -from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal from nplinker.scoring import LinkCollection from nplinker.scoring import MetcalfScoring from nplinker.scoring import ObjectLink from nplinker.scoring.linking import DataLinks -from 
nplinker.scoring.linking import LinkFinder def test_init(npl): @@ -16,39 +15,128 @@ def test_init(npl): assert mc.cutoff == 1.0 assert mc.standardised is True assert mc.DATALINKS is None - assert mc.LINKFINDER is None + assert_frame_equal(mc.raw_score_spec_gcf, pd.DataFrame()) + assert_frame_equal(mc.raw_score_mf_gcf, pd.DataFrame()) + assert mc.metcalf_mean is None + assert mc.metcalf_std is None -def test_setup(mc, datalinks, linkfinder): +# +# Test the `setup` method +# + + +def test_setup(mc, datalinks): """Test `setup` method when cache file does not exist.""" assert isinstance(mc.DATALINKS, DataLinks) - assert isinstance(mc.LINKFINDER, LinkFinder) assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, datalinks.occurrence_gcf_strain) assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, datalinks.cooccurrence_spec_gcf) - assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, linkfinder.raw_score_spec_gcf) - assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, linkfinder.raw_score_mf_gcf) - assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) - assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) - - -def test_setup_load_cache(mc, npl, datalinks, linkfinder, caplog): + assert_frame_equal( + mc.raw_score_spec_gcf, + pd.DataFrame( + [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=["spectrum1", "spectrum2", "spectrum3"], + columns=["gcf1", "gcf2", "gcf3"], + ), + ) + assert_frame_equal( + mc.raw_score_mf_gcf, + pd.DataFrame( + [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=["mf1", "mf2", "mf3"], + columns=["gcf1", "gcf2", "gcf3"], + ), + ) + + assert isinstance(mc.metcalf_mean, np.ndarray) + assert isinstance(mc.metcalf_std, np.ndarray) + assert mc.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) + assert mc.metcalf_mean.shape == (4, 4) + + +def test_setup_load_cache(mc, npl, datalinks, caplog): """Test `setup` method when cache file exists.""" mc.setup(npl) assert "MetcalfScoring.setup loading cached data" in 
caplog.text assert "MetcalfScoring.setup caching results" not in caplog.text assert isinstance(mc.DATALINKS, DataLinks) - assert isinstance(mc.LINKFINDER, LinkFinder) assert_frame_equal(mc.DATALINKS.occurrence_gcf_strain, datalinks.occurrence_gcf_strain) assert_frame_equal(mc.DATALINKS.cooccurrence_spec_gcf, datalinks.cooccurrence_spec_gcf) - assert_frame_equal(mc.LINKFINDER.raw_score_spec_gcf, linkfinder.raw_score_spec_gcf) - assert_frame_equal(mc.LINKFINDER.raw_score_mf_gcf, linkfinder.raw_score_mf_gcf) - assert_array_equal(mc.LINKFINDER.metcalf_mean, linkfinder.metcalf_mean) - assert_array_equal(mc.LINKFINDER.metcalf_std, linkfinder.metcalf_std) + assert_frame_equal( + mc.raw_score_spec_gcf, + pd.DataFrame( + [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=["spectrum1", "spectrum2", "spectrum3"], + columns=["gcf1", "gcf2", "gcf3"], + ), + ) + assert_frame_equal( + mc.raw_score_mf_gcf, + pd.DataFrame( + [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=["mf1", "mf2", "mf3"], + columns=["gcf1", "gcf2", "gcf3"], + ), + ) + + assert isinstance(mc.metcalf_mean, np.ndarray) + assert isinstance(mc.metcalf_std, np.ndarray) + assert mc.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) + assert mc.metcalf_mean.shape == (4, 4) + + +# +# Test the `calc_score` method +# + + +def test_calc_score_raw_score(mc, datalinks): + """Test `calc_score` method for `raw_score_spec_gcf` and `raw_score_mf_gcf`. + + The expected values are calculated manually by using values from `test_init` + of `test_data_links.py` and the default scoring weights. 
+ """ + # link type = 'spec-gcf' + mc.calc_score(datalinks, link_type="spec-gcf") + assert_frame_equal( + mc.raw_score_spec_gcf, + pd.DataFrame( + [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=["spectrum1", "spectrum2", "spectrum3"], + columns=["gcf1", "gcf2", "gcf3"], + ), + ) + # link type = 'mf-gcf' + mc.calc_score(datalinks, link_type="mf-gcf") + assert_frame_equal( + mc.raw_score_mf_gcf, + pd.DataFrame( + [[12, -9, 11], [-9, 12, 11], [1, 1, 21]], + index=["mf1", "mf2", "mf3"], + columns=["gcf1", "gcf2", "gcf3"], + ), + ) + + +def test_calc_score_mean_std(mc, datalinks): + """Test `calc_score` method for `metcalf_mean` and `metcalf_std`.""" + mc.calc_score(datalinks, link_type="spec-gcf") + assert isinstance(mc.metcalf_mean, np.ndarray) + assert isinstance(mc.metcalf_std, np.ndarray) + assert mc.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) + assert mc.metcalf_mean.shape == (4, 4) + # TODO CG: add tests for values after refactoring _calc_mean_std method + # assert mc.metcalf_mean == expected_array + + +# +# Test the `get_links` method +# def test_get_links_gcf_standardised_false(mc, gcfs, spectra, mfs): @@ -194,10 +282,198 @@ def test_get_links_invalid_mixed_types(mc, spectra, mfs): assert ".Spectrum" in str(e.value) -def test_get_links_no_linkfinder(npl, gcfs): - """Test `get_links` method when no LinkFinder object is found.""" - mc = MetcalfScoring(npl) - mc.LINKFINDER = None +# +# Test the `_get_links` method +# + + +def test__get_links_gcf(mc, datalinks, gcfs): + """Test `get_links` method for input GCF objects.""" + mc.calc_score(datalinks, link_type="spec-gcf") + mc.calc_score(datalinks, link_type="mf-gcf") + index_names = ["source", "target", "score"] + + # cutoff = negative infinity (float) + links = mc._get_links(*gcfs, score_cutoff=np.NINF) + assert len(links) == 2 + # expected values got from `test_calc_score_raw_score` + assert_frame_equal( + links[0], + pd.DataFrame( + [ + ["gcf1", "gcf2", "gcf3"] * 3, + [ + *["spectrum1"] * 3, + 
*["spectrum2"] * 3, + *["spectrum3"] * 3, + ], + [12, -9, 11, -9, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + assert_frame_equal( + links[1], + pd.DataFrame( + [ + ["gcf1", "gcf2", "gcf3"] * 3, + [ + *["mf1"] * 3, + *["mf2"] * 3, + *["mf3"] * 3, + ], + [12, -9, 11, -9, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + + # cutoff = 0 + links = mc._get_links(*gcfs, score_cutoff=0) + assert len(links) == 2 + assert_frame_equal( + links[0], + pd.DataFrame( + [ + ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], + [ + *["spectrum1"] * 2, + *["spectrum2"] * 2, + *["spectrum3"] * 3, + ], + [12, 11, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + assert_frame_equal( + links[1], + pd.DataFrame( + [ + ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], + [ + *["mf1"] * 2, + *["mf2"] * 2, + *["mf3"] * 3, + ], + [12, 11, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + + +def test__get_links_spec(mc, datalinks, spectra): + """Test `get_links` method for input Spectrum objects.""" + mc.calc_score(datalinks, link_type="spec-gcf") + mc.calc_score(datalinks, link_type="mf-gcf") + index_names = ["source", "target", "score"] + # cutoff = negative infinity (float) + links = mc._get_links(*spectra, score_cutoff=np.NINF) + assert len(links) == 1 + assert_frame_equal( + links[0], + pd.DataFrame( + [ + [ + *["spectrum1"] * 3, + *["spectrum2"] * 3, + *["spectrum3"] * 3, + ], + ["gcf1", "gcf2", "gcf3"] * 3, + [12, -9, 11, -9, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + # cutoff = 0 + links = mc._get_links(*spectra, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame( + [ + [ + *["spectrum1"] * 2, + *["spectrum2"] * 2, + *["spectrum3"] * 3, + ], + ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], + [12, 11, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + + +def test__get_links_mf(mc, datalinks, mfs): + """Test `get_links` method for input MolecularFamily objects.""" + mc.calc_score(datalinks, 
link_type="spec-gcf") + mc.calc_score(datalinks, link_type="mf-gcf") + index_names = ["source", "target", "score"] + # cutoff = negative infinity (float) + links = mc._get_links(*mfs, score_cutoff=np.NINF) + assert len(links) == 1 + assert_frame_equal( + links[0], + pd.DataFrame( + [ + [ + *["mf1"] * 3, + *["mf2"] * 3, + *["mf3"] * 3, + ], + ["gcf1", "gcf2", "gcf3"] * 3, + [12, -9, 11, -9, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + # cutoff = 0 + links = mc._get_links(*mfs, score_cutoff=0) + assert_frame_equal( + links[0], + pd.DataFrame( + [ + [ + *["mf1"] * 2, + *["mf2"] * 2, + *["mf3"] * 3, + ], + ["gcf1", "gcf3", "gcf2", "gcf3", "gcf1", "gcf2", "gcf3"], + [12, 11, 12, 11, 1, 1, 21], + ], + index=index_names, + ), + ) + + +@pytest.mark.parametrize( + "objects, expected", [([], "Empty input objects"), ("", "Empty input objects")] +) +def test_get_links_invalid_value(mc, objects, expected): with pytest.raises(ValueError) as e: - mc.get_links(*gcfs, link_collection=LinkCollection()) - assert "LinkFinder object not found." 
in str(e.value) + mc._get_links(*objects) + assert expected in str(e.value) + + +@pytest.mark.parametrize( + "objects, expected", + [ + ([1], "Invalid type {}"), + ([1, 2], "Invalid type {}"), + ("12", "Invalid type {}"), + ], +) +def test__get_links_invalid_type(mc, objects, expected): + with pytest.raises(TypeError) as e: + mc._get_links(*objects) + assert expected in str(e.value) + + +def test__get_links_invalid_mixed_types(mc, spectra, mfs): + objects = (*spectra, *mfs) + with pytest.raises(TypeError) as e: + mc._get_links(*objects) + assert "Invalid type" in str(e.value) + assert ".MolecularFamily" in str(e.value) + assert ".Spectrum" in str(e.value) From ccbe154e46b397a458f403b020bcea887fe7edb5 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 29 May 2024 17:06:14 +0200 Subject: [PATCH 04/10] update static typings --- src/nplinker/scoring/metcalf_scoring.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index a8b7d2b77..107fc4fd2 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -31,7 +31,6 @@ class MetcalfScoring(ScoringBase): name: The name of this scoring method, set to a fixed value `metcalf`. DATALINKS: The DataLinks object to use for scoring. CACHE: The name of the cache file to use for storing the MetcalfScoring. - raw_score_spec_gcf: The raw Metcalf scores for spectrum-GCF links. raw_score_mf_gcf: The raw Metcalf scores for molecular family-GCF links. metcalf_mean: The mean value used for standardising Metcalf scores. 
@@ -39,13 +38,13 @@ class MetcalfScoring(ScoringBase): """ name = "metcalf" - DATALINKS = None - CACHE = "cache_metcalf_scoring.pckl" + DATALINKS: datalinks | None = None + CACHE: str = "cache_metcalf_scoring.pckl" - raw_score_spec_gcf = pd.DataFrame() - raw_score_mf_gcf = pd.DataFrame() - metcalf_mean = None - metcalf_std = None + raw_score_spec_gcf: pd.DataFrame = pd.DataFrame() + raw_score_mf_gcf: pd.DataFrame = pd.DataFrame() + metcalf_mean: np.ndarray | None = None + metcalf_std: np.ndarray = None def __init__(self, npl: NPLinker) -> None: """Create a MetcalfScoring object. @@ -58,12 +57,10 @@ def __init__(self, npl: NPLinker) -> None: this value will be discarded. Defaults to 1.0. standardised: Whether to use standardised scores. Defaults to True. - name: The name of the scoring method. It's set to a fixed value - 'metcalf'. """ super().__init__(npl) - self.cutoff = 1.0 - self.standardised = True + self.cutoff: float = 1.0 + self.standardised: bool = True # TODO CG: refactor this method and extract code for cache file to a separate method @classmethod From e83f7fcc7a487b1424c5039cbbcf48bd36939275 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 30 May 2024 14:13:37 +0200 Subject: [PATCH 05/10] Delete conftest.py --- tests/unit/conftest.py | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 tests/unit/conftest.py diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py deleted file mode 100644 index c6b9afcd4..000000000 --- a/tests/unit/conftest.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import shutil -import tempfile - - -nplinker_root_dir = os.path.join(tempfile.gettempdir(), "nplinker_unit_test") - - -def pytest_sessionstart(session): - """Pytest hook to run before the entire test session starts. - - This hook makes sure the temporary directory `nplinker_root_dir` is created before any test - starts. 
When running tests in parallel, the creation operation is done by the master process, - and worker processes are not allowed to do it. - - For more about this hook, see: - 1. https://docs.pytest.org/en/stable/reference.html#_pytest.hookspec.pytest_sessionstart - 2. https://github.com/pytest-dev/pytest-xdist/issues/271#issuecomment-826396320 - """ - workerinput = getattr(session.config, "workerinput", None) - # It's master process or not running in parallell when `workerinput` is None. - if workerinput is None: - if os.path.exists(nplinker_root_dir): - shutil.rmtree(nplinker_root_dir) - os.mkdir(nplinker_root_dir) - # NPLinker setting `root_dir` must be a path that exists, so setting it to a temporary directory. - os.environ["NPLINKER_ROOT_DIR"] = nplinker_root_dir - - -def pytest_sessionfinish(session): - """Pytest hook to run after the entire test session finishes. - - This hook makes sure that temporary directory `nplinker_root_dir` is only removed after all - tests finish. When running tests in parallel, the deletion operation is done by the master - processs, and worker processes are not allowed to do it. 
- - """ - workerinput = getattr(session.config, "workerinput", None) - if workerinput is None: - shutil.rmtree(nplinker_root_dir) From 4a34f1e0fb06d1f2878e7f61dadc702385d01d6b Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 30 May 2024 14:16:51 +0200 Subject: [PATCH 06/10] create temporary root dir only in the places that require it --- tests/unit/scoring/conftest.py | 6 ++++-- tests/unit/test_config.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py index 4d37bdfa4..f36b60050 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -1,3 +1,4 @@ +import os from pytest import fixture from nplinker.genomics import GCF from nplinker.metabolomics import MolecularFamily @@ -67,8 +68,8 @@ def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: return DataLinks(gcfs, spectra, mfs, strains) -@fixture(scope="module") -def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: +@fixture(scope="function") +def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker: """Constructed NPLinker object. This NPLinker object does not do loading `npl.load_data()`, instead we @@ -77,6 +78,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: The config file `nplinker_demo1.toml` does not affect the tests, just making sure the NPLinker object can be created succesfully. """ + os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a temporary root dir for NPLinker npl = NPLinker(CONFIG_FILE_LOCAL_MODE) npl._gcfs = gcfs npl._spectra = spectra diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 3e57afa62..f681cc928 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -1,9 +1,11 @@ +import os from nplinker.config import load_config from . 
import CONFIG_FILE_LOCAL_MODE -def test_config(): +def test_config(tmp_path): """Test loading the default config file.""" + os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a temporary root dir for NPLinker config = load_config(CONFIG_FILE_LOCAL_MODE) assert config.mode == "local" From 93e591bc07a4a90c1d61a3a5ff1570c9166bfdb9 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 30 May 2024 14:18:57 +0200 Subject: [PATCH 07/10] update fixture scopes --- tests/unit/scoring/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py index f36b60050..e711e78a0 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -62,7 +62,7 @@ def mfs(spectra) -> tuple[MolecularFamily, MolecularFamily, MolecularFamily]: return mf1, mf2, mf3 -@fixture(scope="module") +@fixture(scope="session") def datalinks(gcfs, spectra, mfs, strains) -> DataLinks: """DataLinks object. See `test_data_links.py` for its values.""" return DataLinks(gcfs, spectra, mfs, strains) @@ -90,7 +90,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker: return npl -@fixture(scope="module") +@fixture(scope="function") def mc(npl) -> MetcalfScoring: """MetcalfScoring object.""" mc = MetcalfScoring(npl) From 9d72c71a4f6a55ced841fa4e3d471134cce29505 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 30 May 2024 14:19:23 +0200 Subject: [PATCH 08/10] Update test_metcalf_scoring.py caplog cannot capture all logs, so remove the assertions. 
--- tests/unit/scoring/test_metcalf_scoring.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/scoring/test_metcalf_scoring.py b/tests/unit/scoring/test_metcalf_scoring.py index 7c5e7f138..43c9baa72 100644 --- a/tests/unit/scoring/test_metcalf_scoring.py +++ b/tests/unit/scoring/test_metcalf_scoring.py @@ -59,8 +59,6 @@ def test_setup(mc, datalinks): def test_setup_load_cache(mc, npl, datalinks, caplog): """Test `setup` method when cache file exists.""" mc.setup(npl) - assert "MetcalfScoring.setup loading cached data" in caplog.text - assert "MetcalfScoring.setup caching results" not in caplog.text assert isinstance(mc.DATALINKS, DataLinks) From 311b83b50dc685a29a6ebb0fc406832725c73db6 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 30 May 2024 14:23:18 +0200 Subject: [PATCH 09/10] update pytest dist value to loadgroup This option is much faster and makes it easier to control which tests are sent to the same worker. --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 15a0be6c3..cd3a0bbb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,8 +87,10 @@ namespaces = true # enable data directory to be identified [tool.pytest.ini_options] minversion = "6.0" -# pytest options: -ra: show summary info for all test outcomes; -n auto: run tests in parallel; --dist loadscope: distribute tests by loading scope -addopts = "-ra -n auto --dist loadscope" +# -ra: show summary info for all test outcomes; +# -n auto: run tests in parallel; +# --dist loadgroup: sends tests marked with 'xdist_group' to the same worker +addopts = "-ra -n auto --dist loadgroup" testpaths = ["tests/unit"] [tool.coverage.run] From d13d8831be36a812cce9dc1c12d1d570091579af Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 6 Jun 2024 09:24:02 +0200 Subject: [PATCH 10/10] fix repeatings --- tests/unit/scoring/test_metcalf_scoring.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tests/unit/scoring/test_metcalf_scoring.py b/tests/unit/scoring/test_metcalf_scoring.py index 43c9baa72..997ed4a56 100644 --- a/tests/unit/scoring/test_metcalf_scoring.py +++ b/tests/unit/scoring/test_metcalf_scoring.py @@ -53,7 +53,7 @@ def test_setup(mc, datalinks): assert isinstance(mc.metcalf_mean, np.ndarray) assert isinstance(mc.metcalf_std, np.ndarray) assert mc.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) - assert mc.metcalf_mean.shape == (4, 4) + assert mc.metcalf_std.shape == (4, 4) def test_setup_load_cache(mc, npl, datalinks, caplog): @@ -85,7 +85,7 @@ def test_setup_load_cache(mc, npl, datalinks, caplog): assert isinstance(mc.metcalf_mean, np.ndarray) assert isinstance(mc.metcalf_std, np.ndarray) assert mc.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) - assert mc.metcalf_mean.shape == (4, 4) + assert mc.metcalf_std.shape == (4, 4) # @@ -127,7 +127,7 @@ def test_calc_score_mean_std(mc, datalinks): assert isinstance(mc.metcalf_mean, np.ndarray) assert isinstance(mc.metcalf_std, np.ndarray) assert mc.metcalf_mean.shape == (4, 4) # (n_strains+1 , n_strains+1) - assert mc.metcalf_mean.shape == (4, 4) + assert mc.metcalf_std.shape == (4, 4) # TODO CG: add tests for values after refactoring _calc_mean_std method # assert mc.metcalf_mean == expected_array