diff --git a/anvio/__init__.py b/anvio/__init__.py index 334722e311..4e9e19503c 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -3168,6 +3168,22 @@ def TABULATE(table, header, numalign="right", max_width=0): 'action': 'store_true', 'help': "Use this flag to skip using BRITE hierarchies, which we don't recommend but let you do anyways."} ), + 'skip-binary-relations': ( + ['--skip-binary-relations'], + {'default': False, + 'action': 'store_true', + 'help': "Use this flag to skip setting up KEGG binary relation files, which we don't " + "recommend, since they are necessary for running `anvi-reaction-network`, but " + "let you do anyways."} + ), + 'skip-map-images': ( + ['--skip-map-images'], + {'default': False, + 'action': 'store_true', + 'help': "Use this flag to skip setting up KEGG pathway map image files, which we don't " + "recommend, since they are used in visualizing pathway membership, but let you " + "do anyways."} + ), 'heuristic-e-value': ( ['-E', '--heuristic-e-value'], {'default': 1.0e-5, diff --git a/anvio/biochemistry/__init__.py b/anvio/biochemistry/__init__.py deleted file mode 100644 index 2b324eb805..0000000000 --- a/anvio/biochemistry/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -# pylint: disable=line-too-long -"""This package annotates protein orthologs with reference data and outputs metabolic models.""" diff --git a/anvio/biochemistry/metabolicmodel.py b/anvio/biochemistry/metabolicmodel.py deleted file mode 100644 index b8941e1498..0000000000 --- a/anvio/biochemistry/metabolicmodel.py +++ /dev/null @@ -1,830 +0,0 @@ -# -*- coding: utf-8 -# pylint: disable=line-too-long -"""Metabolic model generation tools.""" - -from __future__ import annotations - -import os -import json -import pandas as pd - -from argparse import Namespace -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Set, Tuple - -import anvio.terminal as terminal -import anvio.biochemistry.refdbs as refdbs -import 
anvio.biochemistry.protein as protein - -from anvio.errors import ConfigError -from anvio.terminal import Run, Progress -from anvio.ccollections import Collections -from anvio.filesnpaths import is_output_file_writable -from anvio.dbops import ContigsSuperclass, PanSuperclass -from anvio import TABULATE, QUIET, __version__ as VERSION -from anvio.utils import is_contigs_db, is_genome_storage, is_pan_db - - -__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" -__credits__ = [] -__license__ = "GPL 3.0" -__version__ = VERSION -__maintainer__ = "Samuel Miller" -__email__ = "samuelmiller10@gmail.com" -__status__ = "Development" - - -run_quiet = terminal.Run(verbose=False) - -class COBRApyJSONStructure: - """COBRApy JSON input file structure.""" - - @staticmethod - def get() -> Dict[str, Any]: - """JSON format.""" - return { - 'metabolites': [], - 'reactions': [], - 'genes': [], - 'id': '', - 'compartments': { - 'c': 'cytosol', - 'e': 'extracellular space' - }, - 'version': '1' - } - - @staticmethod - def get_metabolite_entry() -> Dict[str, Any]: - """Format of each object in the JSON 'metabolites' array.""" - return { - 'id': '', - 'name': '', - 'compartment': '', - 'charge': 0, # placeholder: uncharged - 'formula': '', - 'notes': {}, - 'annotation': {} - } - - @staticmethod - def get_reaction_entry() -> Dict[str, Any]: - """Format of each reaction object in the JSON 'reactions' array.""" - return { - 'id': '', - 'name': '', - 'metabolites': {}, - 'lower_bound': -1000.0, # placeholder: reversible reaction - 'upper_bound': 1000.0, - 'gene_reaction_rule': '', - 'subsystem': '', - 'notes': {}, - 'annotation': {} - } - - @staticmethod - def get_gene_entry() -> Dict[str, Any]: - """Format of each object in the JSON 'genes' array.""" - return { - 'id': '', - 'name': '', - 'notes': {}, - 'annotation': {} - } - - @staticmethod - def get_ecoli_objective() -> Dict[str, Any]: - """Biomass objective from JSON 'reactions' array in the COBRApy example file, - 
'e_coli_core.json'. BiGG metabolite IDs have been replaced with KBase/ModelSEED compound - IDs.""" - return { - 'id': 'BIOMASS_Ecoli_core_w_GAM', - 'name': 'Biomass Objective Function with GAM', - 'metabolites': { - 'cpd00169_c': -1.496, - 'cpd00022_c': -3.7478, - 'cpd00008_c': 59.81, - 'cpd00024_c': 4.1182, - 'cpd00002_c': -59.81, - 'cpd00010_c': 3.7478, - 'cpd00236_c': -0.361, - 'cpd00072_c': -0.0709, - 'cpd00102_c': -0.129, - 'cpd00079_c': -0.205, - 'cpd00053_c': -0.2557, - 'cpd00023_c': -4.9414, - 'cpd00001_c': -59.81, - 'cpd00067_c': 59.81, - 'cpd00003_c': -3.547, - 'cpd00004_c': 3.547, - 'cpd00006_c': 13.0279, - 'cpd00005_c': -13.0279, - 'cpd00032_c': -1.7867, - 'cpd00061_c': -0.5191, - 'cpd00009_c': 59.81, - 'cpd00020_c': -2.8328, - 'cpd00101_c': -0.8977 - }, - 'lower_bound': 0.0, - 'upper_bound': 1000.0, - 'gene_reaction_rule': '', - 'objective_coefficient': 1.0, - 'subsystem': 'Biomass and maintenance functions', - 'notes': { - 'original_bigg_ids': [ - 'Biomass_Ecoli_core_w_GAM' - ], - 'original_metabolite_names': { - '3pg_c': -1.496, - 'accoa_c': -3.7478, - 'adp_c': 59.81, - 'akg_c': 4.1182, - 'atp_c': -59.81, - 'coa_c': 3.7478, - 'e4p_c': -0.361, - 'f6p_c': -0.0709, - 'g3p_c': -0.129, - 'g6p_c': -0.205, - 'gln__L_c': -0.2557, - 'glu__L_c': -4.9414, - 'h2o_c': -59.81, - 'h_c': 59.81, - 'nad_c': -3.547, - 'nadh_c': 3.547, - 'nadp_c': 13.0279, - 'nadph_c': -13.0279, - 'oaa_c': -1.7867, - 'pep_c': -0.5191, - 'pi_c': 59.81, - 'pyr_c': -2.8328, - 'r5p_c': -0.8977 - } - }, - 'annotation': { - 'bigg.reaction': [ - 'BIOMASS_Ecoli_core_w_GAM' - ], - 'metanetx.reaction': [ - 'MNXR96280' - ], - 'sbo': 'SBO:0000629' - } - } - -class ModelInput(ABC): - """Metabolic model input data.""" - def set_protein_annotation_source( - self, - source: str = 'KEGG', - cross_references: Tuple[str] = ('ModelSEED', ), - db_superdir: str = refdbs.ProteinReferenceDatabase.default_superdir - ) -> None: - """ - Set the source of protein annotations used in the model and prepare for 
analysis of - orthologs. - - Parameters - ========== - source : str - The source of protein ortholog annotations. For now, only KEGG orthologs can be - processed. Default, 'KEGG'. - cross_references : tuple - Protein reference databases with which orthologs are cross-referenced. For now, - 'ModelSEED' must be supplied as the sole cross-reference with source 'KEGG'. Default, - ('ModelSEED', ) - db_superdir : str - The directory containing protein reference database subdirectories, e.g., 'kegg', - 'modelseed'. The source and cross-references correspond to these databases. - """ - if source != 'KEGG': - raise ConfigError( - "For now, the only protein annotation source that can be processed is 'KEGG', not " - f"'{source}'." - ) - if source == 'KEGG': - if cross_references != ('ModelSEED', ): - raise ConfigError( - "For now, the 'ModelSEED' database is the sole required cross-reference for " - "source, 'KEGG', and not " - f"'{', '.join(r for r in cross_references if r != 'ModelSEED')}'." - ) - self.protein_annotation_source = source - self.cross_references = cross_references - for db in (self.protein_annotation_source, ) + self.cross_references: - db: str - if not os.path.isdir(os.path.join(db_superdir, db.lower())): - raise ConfigError( - f"'{db}' database files should be located in a directory named '{db.lower()}' " - ) - if self.protein_annotation_source == 'KEGG': - self.source_db = refdbs.KEGGDatabase(db_superdir=db_superdir) - self.source_db.load() - cross_ref_dbs = [] - for r in self.cross_references: - if r == 'ModelSEED': - cross_ref_db = refdbs.ModelSEEDDatabase(db_superdir=db_superdir) - cross_ref_db.load() - cross_ref_dbs.append(cross_ref_db) - self.cross_ref_dbs = tuple(sorted(cross_ref_dbs, key=lambda db: type(db).__name__)) - - def write_cobrapy_json(self, path: str, indent: int = 2) -> None: - """ - Write a COBRApy JSON file from the input data. - - Parameters - ========== - path : str - Output file path. 
- indent : int - Number of spaces of indentation per nesting level in JSON file. Default, 2. - """ - is_output_file_writable(path) - if self.protein_annotation_source == 'KEGG' and self.cross_references == ('ModelSEED', ): - # When obtaining ortholog data from KEGG cross-referenced with ModelSEED, setting - # reaction lookup tables indexed by KEGG ID and EC number greatly speeds up the process. - modelseed_db: refdbs.ModelSEEDDatabase = self.cross_ref_dbs[0] - modelseed_db._set_reaction_lookup_table('KEGG') - modelseed_db._set_reaction_lookup_table('ec_numbers') - cobrapy_dict = self._get_cobrapy_json_dict() - self._find_missing_objective_metabolites( - cobrapy_dict, remove=self.remove_missing_objective_metabolites - ) - with open(path, 'w') as f: - json.dump(cobrapy_dict, f, indent=indent) - run: Run = self.run - run.info("Metabolic model file", path, nl_before=1) - - @abstractmethod - def _get_cobrapy_json_dict(self) -> Dict[str, Any]: - """Get a dictionary representation of a COBRApy JSON file from the input data.""" - raise NotImplementedError - - @abstractmethod - def _get_cobrapy_gene_dict(self) -> Dict[str, Any]: - """Get a dictionary representation of a gene object in a COBRApy JSON file genes array.""" - raise NotImplementedError - - @abstractmethod - def _get_cobrapy_reaction_dict(self) -> Tuple[Dict[str, Any], bool]: - """ - Get a dictionary representation of a reaction object in a COBRApy JSON file reactions array. - - Returns - ======= - Dict[str, Any] - Reaction entry - bool - True if the reaction had already been recorded in the COBRApy JSON dictionary - """ - raise NotImplementedError - - @abstractmethod - def _get_cobrapy_metabolite_dict(self) -> Tuple[Dict[str, Any], bool]: - """ - Get a dictionary representation of a metabolite object in a COBRApy JSON file metabolites - array. 
- - Returns - ======= - Dict[str, Any] - Metabolite entry - bool - True if the metabolite had already been recorded in the COBRApy JSON dictionary - """ - raise NotImplementedError - - def _find_missing_objective_metabolites( - self, - cobrapy_dict: Dict[str, Any], - remove: bool = False - ) -> None: - """ - Find metabolites in the biomass objective function that are not produced or consumed by - enzymes in the model. - - Parameters - ========== - cobrapy_dict : Dict - Dictionary representation of a COBRApy JSON input file - remove : bool - If True (default False), remove metabolites from the biomass objective function that are - not produced or consumed by enzymes in the model. - """ - objective: Dict[str, Any] = cobrapy_dict['reactions'][0] - try: - objective_metabolites: Dict[str, float] = objective['metabolites'] - except KeyError: - raise ConfigError( - "The objective function of the metabolic model does not have a 'metabolites' " - "section characteristic of an expected biomass objective function." - ) - reaction_metabolites: List[Dict[str, Any]] = cobrapy_dict['metabolites'] - reaction_metabolite_ids = [m['id'] for m in reaction_metabolites] - missing_objective_metabolite_data = [] - for metabolite_id, stoichiometric_coefficient in objective_metabolites.items(): - if metabolite_id in reaction_metabolite_ids: - continue - missing_objective_metabolite_data.append((metabolite_id, stoichiometric_coefficient)) - if not missing_objective_metabolite_data: - return - if remove: - for metabolite_id, stoichiometric_coefficient in missing_objective_metabolite_data: - objective_metabolites.pop(metabolite_id) - run: Run = self.run - if remove: - run.warning( - f"Missing metabolite data removed from '{objective['name']}'", nl_after=0 - ) - else: - run.warning( - f"Metabolites in '{objective['name']}' missing from the model", nl_after=0 - ) - if QUIET: - return objective - # Print removed metabolites. 
- TABULATE( - pd.DataFrame( - missing_objective_metabolite_data, - index=range(1, len(missing_objective_metabolite_data) + 1) - ), - ('Metabolite', 'Stoichiometric coefficient') - ) - return objective - -class Pangenome(ModelInput): - """ - Metabolic model input data from a pangenome. - - Parameters - ========== - genomes_storage_path : str - Anvi'o database of each genome in the pangenome. - pan_db_path : str - Anvi'o database of pangenome gene clusters. - collection_name : str - With bin ID, use to select gene clusters in the pangenome. Default, None. - bin_id : str - With collection name, use to select gene clusters in the pangenome. Default, None. - protein_annotation_source : str - The source of protein ortholog annotations. For now, only KEGG orthologs can be processed. - Default, 'KEGG'. - cross_references : tuple - Protein reference databases with which orthologs are cross-referenced. For now, 'ModelSEED' - must be supplied as the sole cross-reference with source 'KEGG'. Default, ('ModelSEED', ) - db_superdir : str - The directory containing protein reference database subdirectories, e.g., 'kegg', - 'modelseed'. The source and cross-references correspond to these databases. - discard_ties : bool - If multiple protein annotations are most frequent among genes in a cluster, then do not - assign an annotation to the cluster itself when this argument is True. By default, this - argument is False, so one of the most frequent annotations would be arbitrarily chosen. - consensus_threshold : float - Without this argument (default None), the protein annotation most frequent among genes - in a cluster is assigned to the cluster itself. With this argument (a value on [0, 1]), - at least this proportion of genes in the cluster must have the most frequent annotation - for the cluster to be annotated. 
- remove_missing_objective_metabolites : bool - Metabolites in the biomass objective function are removed if they are not recorded as being - produced or consumed by enzymes in the model. - """ - def __init__( - self, - genomes_storage_path: str, - pan_db_path: str, - collection_name: str = None, - bin_id: str = None, - protein_annotation_source: str = 'KEGG', - cross_references: Tuple[str] = ('ModelSEED', ), - db_superdir: str = refdbs.ProteinReferenceDatabase.default_superdir, - discard_ties: bool = False, - consensus_threshold: float = None, - remove_missing_objective_metabolites: bool = False, - run: Run = Run(), - progress: Progress = Progress() - ) -> None: - self.genomes_storage_path = genomes_storage_path - is_genome_storage(self.genomes_storage_path) - self.pan_db_path = pan_db_path - is_pan_db(self.pan_db_path) - self.collection_name = collection_name - self.bin_id = bin_id - self._init_bin() - self.set_protein_annotation_source( - source=protein_annotation_source, - cross_references=cross_references, - db_superdir=db_superdir - ) - self.discard_ties = discard_ties - self.consensus_threshold = consensus_threshold - self.remove_missing_objective_metabolites = remove_missing_objective_metabolites - self.run = run - self.progress = progress - self._init_pan_super() - - def _init_bin(self) -> None: - """Select a bin of gene clusters for consideration.""" - if self.collection_name or self.bin_id: - self.collections = Collections(r=run_quiet) - self.collections.populate_collections_dict(self.pan_db_path) - self.collections.is_bin_in_collection(self.collection_name, self.bin_id) - self.select_gene_cluster_ids = set( - self.collections.get_collection_dict(self.collection_name)[self.bin_id] - ) - else: - self.collections = None - self.select_gene_cluster_ids = set() - - def _init_pan_super(self) -> None: - """Set up the pangenomic data.""" - args = Namespace() - args.pan_db = self.pan_db_path - args.genomes_storage = self.genomes_storage_path - args.discard_ties = 
self.discard_ties - args.consensus_threshold = self.consensus_threshold - self.pan_super = PanSuperclass(args, r=run_quiet) - self.pan_super.init_gene_clusters(gene_cluster_ids_to_focus=self.select_gene_cluster_ids) - self.pan_super.init_gene_clusters_functions() - self.pan_super.init_gene_clusters_functions_summary_dict() - - def _get_cobrapy_json_dict(self) -> Dict[str, Any]: - cobrapy_dict = COBRApyJSONStructure.get() - self.cobrapy_reactions: List[Dict] = cobrapy_dict['reactions'] - self.cobrapy_reactions.append(COBRApyJSONStructure.get_ecoli_objective()) - self.cobrapy_metabolites: List[Dict] = cobrapy_dict['metabolites'] - self.recorded_reactions: Dict[str, Dict] = {} - self.recorded_metabolites: Dict[str, Dict] = {} - self.progress.new("Analyzing gene clusters") - num_analyzed = 0 - total = len(self.pan_super.gene_clusters) - for gene_cluster_id, genome_gcids in self.pan_super.gene_clusters.items(): - self.progress.update(f"{num_analyzed} / {total}") - num_analyzed += 1 - genome_gcids: Dict[str, List[str]] - # Find consensus orthologs across genes in the cluster. - orthologs_data = self.pan_super.gene_clusters_functions_summary_dict[gene_cluster_id] - if self.protein_annotation_source == 'KEGG': - anvio_source = 'KOfam' - else: - anvio_source = self.protein_annotation_source - source_data = orthologs_data[anvio_source] - if source_data['accession'] is None: - # No ortholog from the source was assigned to the cluster. - continue - # Generate an ortholog object containing protein reference data. - entry = { - 'gene_cluster_id': gene_cluster_id, - 'source': self.protein_annotation_source, - 'accession': source_data['accession'], - 'function': source_data['function'], - 'evalue': 0.0 # arbitrary, since annotation in majority of genes is used - } - if self.protein_annotation_source == 'KEGG': - ortholog = protein.AnvioKOAnnotation(entry) - else: - raise ConfigError( - f"The protein annotation source, '{self.protein_annotation_source}', is not " - "recognized." 
- ) - reactions = ortholog.get_reactions(self.source_db, cross_reference_dbs=self.cross_ref_dbs) - if not reactions: - # No reference reaction data could be assigned to the ortholog. - continue - cobrapy_gene_dict = self._get_cobrapy_gene_dict(gene_cluster_id, genome_gcids, reactions) - cobrapy_genes: List = cobrapy_dict['genes'] - cobrapy_genes.append(cobrapy_gene_dict) - self.progress.end() - # Delete convenience attributes. - delattr(self, 'cobrapy_reactions') - delattr(self, 'cobrapy_metabolites') - delattr(self, 'recorded_reactions') - delattr(self, 'recorded_metabolites') - return cobrapy_dict - - def _get_cobrapy_gene_dict( - self, - gene_cluster_id: str, - genome_gcids: Dict[str, List[str]], - reactions: List[protein.Reaction] - ) -> Dict[str, Any]: - cobrapy_gene_dict = COBRApyJSONStructure.get_gene_entry() - cobrapy_gene_dict['id'] = gene_cluster_id - for reaction in reactions: - cobrapy_reaction_dict, already_recorded_reaction = self._get_cobrapy_reaction_dict( - reaction, genome_gcids - ) - if not already_recorded_reaction: - self.cobrapy_reactions.append(cobrapy_reaction_dict) - self.recorded_reactions[reaction.id] = cobrapy_reaction_dict - return cobrapy_gene_dict - - def _get_cobrapy_reaction_dict( - self, - reaction: protein.Reaction, - genome_gcids: Dict[str, List[str]] - ) -> Tuple[Dict[str, Any], bool]: - # Find the genomes encoding the reaction. - genome_ids = set() - for genome_id, gcids in genome_gcids.items(): - if gcids: - genome_ids.add(genome_id) - try: - # There is already a JSON object for the reaction. - cobrapy_reaction_dict = self.recorded_reactions[reaction.id] - already_recorded_reaction = True - except KeyError: - cobrapy_reaction_dict = COBRApyJSONStructure.get_reaction_entry() - already_recorded_reaction = False - # List the genomes encoding the reaction in the JSON reaction object. 
- cobrapy_reaction_notes = cobrapy_reaction_dict['notes'] - if already_recorded_reaction: - recorded_genomes: List = cobrapy_reaction_notes['genomes'] - cobrapy_reaction_notes['genomes'] = sorted( - genome_ids.union(set(recorded_genomes)) - ) - return cobrapy_reaction_dict, already_recorded_reaction - cobrapy_reaction_dict['id'] = reaction.id - try: - modelseed_name = reaction.reference_ids['ModelSEED_Name'][0] - except KeyError: - modelseed_name = '' - cobrapy_reaction_dict['name'] = modelseed_name - reversibility = reaction.reversibility - if not reversibility: - cobrapy_reaction_dict['lower_bound'] = 0.0 - cobrapy_reaction_notes['genomes'] = sorted(genome_ids) - for chemical, coefficient, compartment in zip( - reaction.chemicals, reaction.coefficients, reaction.compartments - ): - cobrapy_metabolite_dict, already_recorded_metabolite = self._get_cobrapy_metabolite_dict( - chemical, coefficient, compartment, reversibility, genome_ids - ) - metabolite_id = cobrapy_metabolite_dict['id'] - cobrapy_reaction_dict['metabolites'][metabolite_id] = float(coefficient) - if not already_recorded_metabolite: - self.cobrapy_metabolites.append(cobrapy_metabolite_dict) - self.recorded_metabolites[metabolite_id] = cobrapy_metabolite_dict - cobrapy_reaction_annotation = cobrapy_reaction_dict['annotation'] - for reference, json_key in [ - ('BiGG', 'bigg.reaction'), - ('EC', 'ec-code'), - ('KEGG', 'kegg.reaction'), - ('MetaCyc', 'metacyc.reaction'), - ('ModelSEED_Alternate_Name', 'modelseed-name') - ]: - try: - ids = reaction.reference_ids[reference] - except KeyError: - continue - cobrapy_reaction_annotation[json_key] = ids - return cobrapy_reaction_dict, already_recorded_reaction - - def _get_cobrapy_metabolite_dict( - self, - chemical: protein.Chemical, - coefficient: float, - compartment: str, - reversibility: bool, - genome_ids: Set[str] - ) -> Tuple[Dict[str, Any], bool]: - metabolite_id = f'{chemical.id}_{compartment}' - try: - cobrapy_metabolite_dict = 
self.recorded_metabolites[metabolite_id] - already_recorded_metabolite = True - except KeyError: - cobrapy_metabolite_dict = COBRApyJSONStructure.get_metabolite_entry() - already_recorded_metabolite = False - cobrapy_metabolite_notes = cobrapy_metabolite_dict['notes'] - if already_recorded_metabolite: - if reversibility: - recorded_consuming_genome_ids: List = cobrapy_metabolite_notes['consuming_genomes'] - recorded_producing_genome_ids: List = cobrapy_metabolite_notes['producing_genomes'] - cobrapy_metabolite_notes['consuming_genomes'] = sorted( - genome_ids.union(set(recorded_consuming_genome_ids)) - ) - cobrapy_metabolite_notes['producing_genomes'] = sorted( - genome_ids.union(set(recorded_producing_genome_ids)) - ) - elif coefficient < 0: - recorded_consuming_genome_ids: List = cobrapy_metabolite_notes['consuming_genomes'] - cobrapy_metabolite_notes['consuming_genomes'] = sorted( - genome_ids.union(set(recorded_consuming_genome_ids)) - ) - elif coefficient > 0: - recorded_producing_genome_ids: List = cobrapy_metabolite_notes['producing_genomes'] - cobrapy_metabolite_notes['producing_genomes'] = sorted( - genome_ids.union(set(recorded_producing_genome_ids)) - ) - return cobrapy_metabolite_dict, already_recorded_metabolite - cobrapy_metabolite_dict['id'] = metabolite_id - try: - modelseed_name = chemical.reference_ids['ModelSEED_Name'][0] - except KeyError: - modelseed_name = '' - cobrapy_metabolite_dict['name'] = modelseed_name - cobrapy_metabolite_dict['compartment'] = compartment - cobrapy_metabolite_dict['charge'] = chemical.charge if chemical.charge else 0 - cobrapy_metabolite_dict['formula'] = chemical.formula if chemical.formula else "" - if reversibility: - cobrapy_metabolite_notes['consuming_genomes'] = sorted(genome_ids) - cobrapy_metabolite_notes['producing_genomes'] = sorted(genome_ids) - elif coefficient < 0: - cobrapy_metabolite_notes['consuming_genomes'] = sorted(genome_ids) - cobrapy_metabolite_notes['producing_genomes'] = [] - elif coefficient > 
0: - cobrapy_metabolite_notes['consuming_genomes'] = [] - cobrapy_metabolite_notes['producing_genomes'] = sorted(genome_ids) - cobrapy_metabolite_annotation = cobrapy_metabolite_dict['annotation'] - for reference, json_key in [ - ('BiGG', 'bigg.metabolite'), - ('KEGG', 'kegg.compound'), - ('InChIKey', 'inchi_key'), - ('MetaCyc', 'metacyc.compound'), - ('ModelSEED_Alternate_Name', 'modelseed-name') - ]: - try: - ids = chemical.reference_ids[reference] - except KeyError: - continue - cobrapy_metabolite_annotation[json_key] = ids - return cobrapy_metabolite_dict, already_recorded_metabolite - -class ExternalGenome(ModelInput): - """ - Metabolic model input data from an external genome. - - Parameters - ========== - contigs_db_path : str - Stores data on the genome. - protein_annotation_source : str - The source of protein ortholog annotations. For now, only KEGG orthologs can be processed. - Default, 'KEGG'. - remove_missing_objective_metabolites : bool - Metabolites in the biomass objective function are removed if they are not recorded as being - produced or consumed by enzymes in the model. 
- """ - def __init__( - self, - contigs_db_path: str, - protein_annotation_source: str = 'KEGG', - db_superdir: str = refdbs.ProteinReferenceDatabase.default_superdir, - remove_missing_objective_metabolites: bool = False, - run: Run = Run(), - progress: Progress = Progress() - ) -> None: - self.contigs_db_path = contigs_db_path - is_contigs_db(self.contigs_db_path) - self.remove_missing_objective_metabolites = remove_missing_objective_metabolites - self.run = run - self.progress = progress - self.set_protein_annotation_source(source=protein_annotation_source, db_superdir=db_superdir) - self._init_contigs_super() - - def _init_contigs_super(self) -> None: - args = Namespace() - args.contigs_db = self.contigs_db_path - self.contigs_super = ContigsSuperclass(args, r=run_quiet) - if self.protein_annotation_source == 'KEGG': - anvio_source = 'KOfam' - else: - anvio_source = self.protein_annotation_source - self.contigs_super.init_functions(requested_sources=[anvio_source]) - - def _get_cobrapy_json_dict(self) -> Dict[str, Any]: - cobrapy_dict = COBRApyJSONStructure.get() - self.cobrapy_reactions: List[Dict] = cobrapy_dict['reactions'] - self.cobrapy_reactions.append(COBRApyJSONStructure.get_ecoli_objective()) - self.cobrapy_metabolites: List[Dict] = cobrapy_dict['metabolites'] - self.recorded_reactions: Dict[str, Dict] = {} - self.recorded_metabolites: Dict[str, Dict] = {} - self.progress.new("Analyzing genes") - num_analyzed = 0 - total = len(self.contigs_super.gene_function_calls_dict) - for gcid, gene_dict in self.contigs_super.gene_function_calls_dict.items(): - self.progress.update(f"{num_analyzed} / {total}") - num_analyzed += 1 - if self.protein_annotation_source == 'KEGG': - anvio_source = 'KOfam' - else: - anvio_source = self.protein_annotation_source - source_data = gene_dict[anvio_source] - if not source_data: - # No ortholog was assigned to the gene. - continue - # Generate an ortholog object containing protein reference data. 
- entry = { - 'gene_callers_id': gcid, - 'source': self.protein_annotation_source, - 'accession': source_data[0], - 'function': source_data[1], - 'evalue': source_data[2] - } - if self.protein_annotation_source == 'KEGG': - ortholog = protein.AnvioKOAnnotation(entry) - else: - raise ConfigError( - f"The protein annotation source, '{self.protein_annotation_source}', is not " - "recognized." - ) - reactions = ortholog.get_reactions(self.source_db, cross_reference_dbs=self.cross_ref_dbs) - if not reactions: - # No reference reaction data could be assigned to the ortholog. - continue - cobrapy_gene_dict = self._get_cobrapy_gene_dict(gcid, reactions) - cobrapy_genes: List = cobrapy_dict['genes'] - cobrapy_genes.append(cobrapy_gene_dict) - self.progress.end() - # Delete convenience attributes. - delattr(self, 'cobrapy_reactions') - delattr(self, 'cobrapy_metabolites') - delattr(self, 'recorded_reactions') - delattr(self, 'recorded_metabolites') - return cobrapy_dict - - def _get_cobrapy_gene_dict(self, gcid: int, reactions: List[protein.Reaction]) -> Dict[str, Any]: - cobrapy_gene_dict = COBRApyJSONStructure.get_gene_entry() - cobrapy_gene_dict['id'] = gcid - for reaction in reactions: - cobrapy_reaction_dict, already_recorded_reaction = self._get_cobrapy_reaction_dict(reaction) - if not already_recorded_reaction: - self.cobrapy_reactions.append(cobrapy_reaction_dict) - self.recorded_reactions[reaction.id] = cobrapy_reaction_dict - return cobrapy_gene_dict - - def _get_cobrapy_reaction_dict(self, reaction: protein.Reaction) -> Tuple[Dict[str, Any], bool]: - try: - # There is already a JSON object for the reaction. - cobrapy_reaction_dict = self.recorded_reactions[reaction.id] - return cobrapy_reaction_dict, True - except KeyError: - cobrapy_reaction_dict = COBRApyJSONStructure.get_reaction_entry() - # List the genomes encoding the reaction in the JSON reaction object. 
- cobrapy_reaction_dict['id'] = reaction.id - try: - modelseed_name = reaction.reference_ids['ModelSEED_Name'][0] - except KeyError: - modelseed_name = '' - cobrapy_reaction_dict['name'] = modelseed_name - reversibility = reaction.reversibility - if not reversibility: - cobrapy_reaction_dict['lower_bound'] = 0.0 - for chemical, coefficient, compartment in zip( - reaction.chemicals, reaction.coefficients, reaction.compartments - ): - cobrapy_metabolite_dict, already_recorded_metabolite = self._get_cobrapy_metabolite_dict( - chemical, compartment - ) - metabolite_id = cobrapy_metabolite_dict['id'] - cobrapy_reaction_dict['metabolites'][metabolite_id] = float(coefficient) - if not already_recorded_metabolite: - self.cobrapy_metabolites.append(cobrapy_metabolite_dict) - self.recorded_metabolites[metabolite_id] = cobrapy_metabolite_dict - cobrapy_reaction_annotation = cobrapy_reaction_dict['annotation'] - for reference, json_key in [ - ('BiGG', 'bigg.reaction'), - ('EC', 'ec-code'), - ('KEGG', 'kegg.reaction'), - ('MetaCyc', 'metacyc.reaction'), - ('ModelSEED_Alternate_Name', 'modelseed-name') - ]: - try: - ids = reaction.reference_ids[reference] - except KeyError: - continue - cobrapy_reaction_annotation[json_key] = ids - return cobrapy_reaction_dict, False - - def _get_cobrapy_metabolite_dict( - self, - chemical: protein.Chemical, - compartment: str - ) -> Tuple[Dict[str, Any], bool]: - metabolite_id = f'{chemical.id}_{compartment}' - try: - cobrapy_metabolite_dict = self.recorded_metabolites[metabolite_id] - return cobrapy_metabolite_dict, True - except KeyError: - cobrapy_metabolite_dict = COBRApyJSONStructure.get_metabolite_entry() - cobrapy_metabolite_dict['id'] = metabolite_id - try: - modelseed_name = chemical.reference_ids['ModelSEED_Name'][0] - except KeyError: - modelseed_name = '' - cobrapy_metabolite_dict['name'] = modelseed_name - cobrapy_metabolite_dict['compartment'] = compartment - cobrapy_metabolite_dict['charge'] = chemical.charge if chemical.charge 
else 0 - cobrapy_metabolite_dict['formula'] = chemical.formula if chemical.formula else "" - cobrapy_metabolite_annotation = cobrapy_metabolite_dict['annotation'] - for reference, json_key in [ - ('BiGG', 'bigg.metabolite'), - ('KEGG', 'kegg.compound'), - ('InChIKey', 'inchi_key'), - ('MetaCyc', 'metacyc.compound'), - ('ModelSEED_Alternate_Name', 'modelseed-name') - ]: - try: - ids = chemical.reference_ids[reference] - except KeyError: - continue - cobrapy_metabolite_annotation[json_key] = ids - return cobrapy_metabolite_dict, False diff --git a/anvio/biochemistry/protein.py b/anvio/biochemistry/protein.py deleted file mode 100644 index 7cca98fb13..0000000000 --- a/anvio/biochemistry/protein.py +++ /dev/null @@ -1,229 +0,0 @@ -# -*- coding: utf-8 -# pylint: disable=line-too-long -"""Representation of gene protein homologs.""" - -from __future__ import annotations - -import pandas as pd - -from typing import Dict, List, Tuple - -import anvio.terminal as terminal -import anvio.biochemistry.refdbs as refdbs - -from anvio.errors import ConfigError -from anvio import __version__ as VERSION - - -__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" -__credits__ = [] -__license__ = "GPL 3.0" -__version__ = VERSION -__maintainer__ = "Samuel Miller" -__email__ = "samuelmiller10@gmail.com" -__status__ = "Development" - - -run_quiet = terminal.Run(verbose=False) - -class Chemical: - """A chemical.""" - def __init__(self) -> None: - # The chemical may have one or more IDs (values) per reference database (key). - self.reference_ids: Dict[str, List[str]] = {} - self.charge: int = None - self.formula: str = None - self.smiles_string: str = None - - @property - def id(self) -> str: - """The first recorded reference database/ID. 
If none is recorded, return None.""" - try: - return next(iter(self.reference_ids.values()))[0] - except StopIteration: - return None - -class Reaction: - """A chemical reaction.""" - def __init__(self) -> None: - # The reaction may have one or more IDs (values) per reference database (key). - self.reference_ids: Dict[str, List[str]] = {} - self.chemicals: List[Chemical] = [] - self.coefficients: List[float] = [] - self.compartments: List[str] = [] - self.reversibility: bool = None - - @property - def id(self) -> str: - """The first recorded reference database/ID. If none is recorded, return None.""" - try: - return next(iter(self.reference_ids.values()))[0] - except StopIteration: - return None - -class OrthologAnnotation: - """Data regarding a group of protein functional orthologs.""" - def __init__(self) -> None: - self.source: str = None # e.g., 'KEGG' or 'Pfam' databases - self.accession: str = None # e.g., KEGG ortholog 'K00001' - -class AnvioOrthologAnnotation(OrthologAnnotation): - """ - An ortholog annotation of a gene/gene cluster as stored in an anvi'o database. - - Parameters - ========== - entry : Dict - This dictionary contains information on an ortholog annotation. For a gene, this - corresponds to a row of the gene functions table of a contigs database. - """ - def __init__(self, entry: Dict) -> None: - if 'gene_callers_id' in entry: - self.id = int(entry['gene_callers_id']) - else: - self.id = entry['gene_cluster_id'] - source = entry['source'] - if source == 'KEGG' or source == 'KOfam': - # In anvi'o databases, the source of KEGG orthologs is recorded as 'KOfam'. This - # suggests that KOs represented here must derived from a KOfam HMM, which need not be - # the case. 
- self.source = 'KEGG' - else: - self.source = source # e.g., 'Pfam' - self.accession = entry['accession'] # e.g., KEGG ortholog 'K00001' - self.function = entry['function'] # description of gene function - self.e_value = float(entry['evalue']) # annotation confidence score - - def __str__(self) -> str: - if isinstance(self.id, int): - return f"Gene '{self.id}' {self.source} ortholog accession, '{self.accession}'" - else: - return f"Gene cluster '{self.id}' {self.source} ortholog accession, '{self.accession}'" - -class AnvioKOAnnotation(AnvioOrthologAnnotation): - """A KEGG ortholog (KO) of a gene as stored in an anvi'o contigs database.""" - def __init__(self, entry: Dict) -> None: - super().__init__(entry) - - def get_reactions( - self, - kegg_db: refdbs.KEGGDatabase, - cross_reference_dbs: Tuple[refdbs.ProteinReferenceDatabase] = None - ) -> List[Reaction]: - """ - Get reaction data for the ortholog. - - Parameters - ========== - kegg_db : anvio.biochemistry.refdbs.KEGGDatabase - KEGG reference database. - cross_reference_dbs : tuple - Protein reference databases ('anvio.biochemistry.refdbs.ProteinReferenceDatabase') - with which KOs are cross-referenced. For now, a ModelSEED database must be supplied as - the sole cross-reference database. - - Returns - ======= - List[Reaction] - """ - for db in (kegg_db, ) + cross_reference_dbs: - db._check_reference_database_initialization() - cross_ref_db_names = tuple(db.db_name for db in cross_reference_dbs) - if cross_ref_db_names == ('modelseed', ): - reactions = self._get_reactions_from_kegg_and_modelseed( - kegg_db, cross_reference_dbs[0] - ) - else: - raise ConfigError( - "For now, a ModelSEED database must be supplied as the sole cross-reference " - "database." 
- ) - return reactions - - def _get_reactions_from_kegg_and_modelseed( - self, - kegg_db: refdbs.KEGGDatabase, - modelseed_db: refdbs.ModelSEEDDatabase - ) -> List[Reaction]: - """ - Get reaction data for the ortholog from KEGG and ModelSEED reference databases. - - A KO may be associated with KEGG Reactions and EC numbers. This method first considers KEGG - Reactions and, absent these, EC numbers. Reaction IDs and EC numbers are cross-referenced to - the ModelSEED database. Reaction data is not returned for Reaction IDs and EC numbers that - are not found in the ModelSEED database! - - Parameters - ========== - kegg_db : anvio.biochemistry.refdbs.KEGGDatabase - KEGG reference database. - modelseed_db : anvio.biochemistry.refdbs.ModelSEEDDatabase - ModelSEED reference database. The ModelSEED Biochemistry Database has harmonized - reaction and compound data with KEGG and other databases. - - Returns - ======= - List[Reaction] - """ - # Retrieve or generate ModelSEED reactions tables for looking up reactions by KEGG Reaction - # ID and EC number, respectively. - if hasattr(modelseed_db, 'reaction_lookup_tables'): - lookup_dict_existed = True - if ( - 'KEGG' in modelseed_db.reaction_lookup_tables and - 'ec_numbers' in modelseed_db.reaction_lookup_tables - ): - lookup_tables_existed = True - else: - lookup_tables_existed = False - else: - lookup_dict_existed = False - lookup_tables_existed = False - if not lookup_tables_existed: - modelseed_db._check_reference_database_initialization() - modelseed_db._set_reaction_lookup_table('KEGG') - modelseed_db._set_reaction_lookup_table('ec_numbers') - kegg_reactions_table = modelseed_db.reaction_lookup_tables['KEGG'] - ec_numbers_reactions_table = modelseed_db.reaction_lookup_tables['ec_numbers'] - # Restore the passed ModelSEED database object to its original state by removing KEGG or EC - # lookup tables added by the current method. 
- if not lookup_dict_existed: - delattr(modelseed_db, 'reaction_lookup_tables') - elif not lookup_tables_existed: - modelseed_db.reaction_lookup_tables.pop('KEGG') - modelseed_db.reaction_lookup_tables.pop('ec_numbers') - # Use any KEGG Reaction IDs associated with the KO to find cross-referenced ModelSEED - # reactions. - kegg_series = kegg_db.ko_data.loc[self.accession] - reactions = [] - reaction_accessions = kegg_series.loc['reactions'] - if pd.notna(reaction_accessions): - reaction_accessions: str - for reaction_data in kegg_reactions_table[ - kegg_reactions_table['KEGG'].isin(reaction_accessions.split()) - ].to_dict(orient='index').values(): - reaction = modelseed_db.get_reaction(reaction_data) - if reaction is None: - continue - reactions.append(reaction) - if reactions: - return reactions - # Reaching this point, either no KEGG Reaction IDs were associated with the KO, or Reaction - # IDs for the KO were not cross-referenced with any ModelSEED reactions. Next use any EC - # numbers associated with the KO to find cross-referenced ModelSEED reactions. KO Reactions - # are preferred as a more precise representation of the reactions that can be catalyzed than - # EC numbers. - ec_numbers = kegg_series.loc['ec_numbers'] - if pd.notna(ec_numbers): - ec_numbers: str - for reaction_data in ec_numbers_reactions_table[ - ec_numbers_reactions_table['ec_numbers'].isin(ec_numbers.split()) - ].to_dict(orient='index').values(): - reaction = modelseed_db.get_reaction(reaction_data) - if reaction is None: - continue - reactions.append(reaction) - if reactions: - return reactions - # No reaction data could be recovered for the KO. 
- return reactions diff --git a/anvio/biochemistry/reactionnetwork.py b/anvio/biochemistry/reactionnetwork.py deleted file mode 100644 index d67a88b0e6..0000000000 --- a/anvio/biochemistry/reactionnetwork.py +++ /dev/null @@ -1,8032 +0,0 @@ -# -*- coding: utf-8 -# pylint: disable=line-too-long -"""Generate, manipulate, and export metabolic reaction networks from gene annotations.""" - -from __future__ import annotations - -import os -import re -import glob -import json -import math -import time -import random -import shutil -import hashlib -import tarfile -import zipfile -import argparse -import fractions -import functools -import collections -import numpy as np -import pandas as pd -import multiprocessing as mp - -from copy import deepcopy -from argparse import Namespace -from dataclasses import dataclass, field -from typing import Any, Dict, List, Set, Tuple, Union, Iterable - -import anvio.utils as utils -import anvio.dbinfo as dbinfo -import anvio.tables as tables -import anvio.terminal as terminal -import anvio.filesnpaths as filesnpaths - -from anvio.errors import ConfigError -from anvio import DEBUG, __file__ as ANVIO_PATH, __version__ as VERSION -from anvio.dbops import ( - ContigsDatabase, - ProfileDatabase, - PanDatabase, - ContigsSuperclass, - PanSuperclass -) - - -__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" -__credits__ = [] -__license__ = "GPL 3.0" -__version__ = VERSION -__maintainer__ = "Samuel Miller" -__email__ = "samuelmiller10@gmail.com" -__status__ = "Development" - - -run_quiet = terminal.Run(verbose=False) - -# Network statistics are stored in a dictionary of dictionaries. Keys in the outer dictionary are -# "classes" of network statistics. Keys in the inner dictionary are statistics themselves. 
-GenomicNetworkStats = Dict[str, Dict[str, Any]] -PangenomicNetworkStats = Dict[str, Dict[str, Any]] - -RANDOM_SEED = 1066 - - -@dataclass -class ModelSEEDCompound: - """ - Representation of a chemical (a compound, element, or ions thereof) or a class of chemicals - (either abstract, like 'Cofactors' and 'Biomass', or defined, like 'Carboxylic acid' and - 'Polynucleotides'), with properties given by the ModelSEED Biochemistry database. - - Attributes - ========== - modelseed_id : str, None - The ModelSEED compound ID, formatted 'cpdXXXXX', where each X is a digit, e.g., 'cpd00001'. - - modelseed_name : str, None - Name of the ModelSEED compound, e.g., 'cpd00001' has the name, 'H2O'. When absent in the - database, assumes a value of None. - - kegg_aliases : Tuple[str], None - The KEGG COMPOUND IDs that are known to possibly alias the ModelSEED compound, according to - the ModelSEED database, e.g., 'cpd00001' has the aliases, ('C00001', 'C01328'). A KEGG - COMPOUND ID is formatted 'CXXXXX', where each X is a digit, e.g., 'C00001'. - - charge : int, None - The electrical charge of the ModelSEED compound, e.g., 'cpd00001' has charge 0. ModelSEED - compounds without a formula have a nominal charge of 10000000 in the database. - - formula : str, None - The formula of the ModelSEED compound, e.g., 'cpd00001' has the formula, 'H2O'. When absent - in the database, assumes a value of None. - - abundances : Dict[str, float], dict() - Abundance profile data (from metabolomics, for instance) with each key being a sample name - and each value being the abundance of the ModelSEED compound in that sample. - """ - modelseed_id: str = None - modelseed_name: str = None - kegg_aliases: Tuple[str] = None - charge: int = None - formula: str = None - abundances: Dict[str, float] = field(default_factory=dict) - -@dataclass -class ModelSEEDReaction: - """ - Representation of a reaction, with properties given by the ModelSEED Biochemistry database. 
- - Attributes - ========== - modelseed_id : str, None - The ModelSEED reaction ID, formatted 'rxnXXXXX', where each X is a digit, e.g., - 'rxn00001'. - - modelseed_name : str, None - Name of the reaction, e.g., 'rxn00001' has the name, 'diphosphate phosphohydrolase'. When - absent in the database, assumes a value of None. - - kegg_aliases : Tuple[str], None - The KEGG REACTION IDs that are known to possibly alias the ModelSEED reaction, according to - the ModelSEED database, e.g., 'rxn00001' has the aliases, ('R00004'). A KEGG REACTION ID is - formatted 'RXXXXX', where each X is a digit, e.g., 'R00001'. - - ec_number_aliases : Tuple[str], None - The EC numbers that are known to possibly alias the ModelSEED reaction, according to the - ModelSEED database, e.g., 'rxn00001' has the aliases, ('3.6.1.1'). - - compounds : Tuple[ModelSEEDCompound], None - ModelSEED compound IDs of reactants and products involved in the reaction, e.g., 'rxn00001' - involves the compounds, ('cpd00001', 'cpd00012', 'cpd00009', 'cpd00067'). A compound ID is - formatted 'cpdXXXXX', where each X is a digit, e.g., 'cpd00001'. Each compound item has a - corresponding stoichiometric reaction coefficient in the attribute, 'coefficients', and a - corresponding cellular compartment in the attribute, 'compartments'. - - coefficients : Tuple[int], None - Integer stoichiometric reaction coefficients of reactants and products, with negative - coefficients indicating reactants and positive coefficients indicating products, e.g., - 'rxn00001' has the coefficients, (-1, -1, 2, 1). Each coefficient item has a corresponding - ModelSEED compound ID in the attribute, 'compounds', and a corresponding cellular - compartment in the attribute, 'compartments'. - - compartments : Tuple[str], None - Cellular compartments of reactants and products, with valid values being 'c' for 'cytosolic' - and 'e' for 'extracellular', e.g., 'rxn00001' involves the compartments, ('c', 'c', 'c', - 'c'). 
Each compartment item has a corresponding ModelSEED compound ID in the attribute, - 'compounds', and a corresponding stoichiometric reaction coefficient in the attribute, - 'coefficients'. - - reversibility : bool, None - Reaction reversibility, with True indicating the reaction is reversible and False indicating - the reaction is irreversible given the equation encoded in the attributes, 'compounds', - 'coefficients', and 'compartments'. For example, 'rxn00001' has a value of False. - """ - modelseed_id: str = None - modelseed_name: str = None - kegg_aliases: Tuple[str] = None - ec_number_aliases: Tuple[str] = None - compounds: Tuple[ModelSEEDCompound] = None - coefficients: Tuple[int] = None - compartments: Tuple[str] = None - reversibility: bool = None - -@dataclass -class KO: - """ - Representation of a KEGG Ortholog (KO). - - Attributes - ========== - id : str, None - KEGG ORTHOLOGY ID in the format, 'KXXXXX', where X is a digit, e.g., 'K00001'. - - name : str, None - Name of the KO, e.g., 'K00001' has the name, 'alcohol dehydrogenase [EC:1.1.1.1]'. - - reactions : Dict[str, ModelSEEDReaction], dict() - ModelSEED reactions associated with the KO via KO KEGG reaction and EC number annotations. - Keys are ModelSEED reaction IDs and values are 'ModelSEEDReaction' objects. A ModelSEED - reaction ID is formatted 'rxnXXXXX', where each X is a digit, e.g., 'rxn00001'. - - kegg_reaction_aliases : Dict[str, List[str]], dict() - KEGG reaction annotations of the KO that alias ModelSEED reactions. A KEGG REACTION ID is - formatted 'RXXXXX', where each X is a digit, e.g., 'R00001'. For example, KO 'K00003' has - two KEGG reaction annotations, both of which are associated with ModelSEED reactions via the - ModelSEED database: {'R01773': ['rxn01301', 'rxn27933'], 'R01775': ['rxn01302', - 'rxn27932']}. 
Note that a ModelSEED reaction may have more KEGG reaction aliases than those - annotating the KO: all known KEGG reaction aliases of the ModelSEED reaction in the - ModelSEED database are recorded in the 'kegg_aliases' attribute of a 'ModelSEEDReaction' - object. - - ec_number_aliases : Dict[str, List[str]], dict() - EC number annotations of the KO that alias ModelSEED reactions. For example, KO 'K00003' has - one EC number annotation, which is associated with ModelSEED reactions via the ModelSEED - database: {'1.1.1.3': ['rxn01301', 'rxn01302', 'rxn19904', 'rxn27931', 'rxn27932', - 'rxn27933', 'rxn33957']}. Note that a ModelSEED reaction may have more EC number aliases - than those annotating the KO: all known EC number aliases of the ModelSEED reaction in the - ModelSEED database are recorded in the 'ec_number_aliases' attribute of a - 'ModelSEEDReaction' object. - """ - id: str = None - name: str = None - reactions: Dict[str, ModelSEEDReaction] = field(default_factory=dict) - kegg_reaction_aliases: Dict[str, List[str]] = field(default_factory=dict) - ec_number_aliases: Dict[str, List[str]] = field(default_factory=dict) - -@dataclass -class Gene: - """ - Representation of a gene. - - Attributes - ========== - gcid : int, None - The gene callers ID, or unique anvi'o identifier, of the gene: a non-negative integer. - - kos : Dict[str, KO], dict() - KEGG Orthologs (KOs) annotating the gene. Keys are KO IDs, which are formatted as 'KXXXXX', - where each X is a digit, e.g., 'K00001'. Values are 'KO' objects. - - e_values : Dict[str, float], dict() - E-values express the strength of KO-gene associations. Keys are KO IDs; values are - non-negative numbers. - - protein : Protein, None - This object is used for storing abundance data on the protein expressed by the gene (from - proteomics, for instance). 
- """ - gcid: int = None - kos: Dict[str, KO] = field(default_factory=dict) - e_values: Dict[str, float] = field(default_factory=dict) - protein: Protein = None - -@dataclass -class Protein: - """ - This object stores protein abundance data (from proteomics, for instance). - - Attributes - ========== - id : int, None - The unique anvi'o ID for the protein: a non-negative integer. - - genes : Dict[int, Gene], dict() - Genes that can express the protein. Keys are gene callers IDs; values are 'Gene' objects. - - abundances : Dict[str, float], dict() - Protein abundance profile data with each key being a sample name and each value being the - abundance of the protein expressed by the gene in that sample. - """ - id: int = None - genes: Dict[int, Gene] = field(default_factory=dict) - abundances: Dict[str, float] = field(default_factory=dict) - -@dataclass -class GeneCluster: - """ - Representation of a gene cluster. - - Attributes - ========== - gene_cluster_id : int, None - The unique anvi'o ID for the gene cluster: a non-negative integer. - - genomes : List[str], [] - The names of the genomes contributing the genes in the cluster. - - ko : KO, None - The consensus KO among the genes in the cluster. (Consensus KOs can be found from a - pangenome by the anvi'o method, 'dbops.PanSuperclass.get_gene_cluster_function_summary'.) - Note that the individual gene KO annotations underlying the consensus annotation are not - tracked. 
- """ - gene_cluster_id: int = None - genomes: List[str] = field(default_factory=list) - ko: KO = None - -class Bin: - """Representation of a bin of genes or gene clusters.""" - pass - -class GeneBin(Bin): - """Representation of a bin of genes.""" - def __init__(self) -> None: - self.genes: List[Gene] = [] - -class GeneClusterBin(Bin): - """Representation of a bin of gene clusters.""" - def __init__(self) -> None: - self.gene_clusters: List[GeneCluster] = [] - -class BinCollection: - """Representation of a collection of bins.""" - def __init__(self) -> None: - self.bins: List[Bin] = [] - -class ReactionNetwork: - """ - A reaction network predicted from KEGG KO and ModelSEED annotations. - - A reaction network need not be fully connected: it is not guaranteed that there exists a path - through the network from one arbitrary reaction to another. - - Attributes - ========== - kos : Dict[str, KO], dict() - This dictionary maps the IDs of KOs in the network to object representations of the KOs. - - reactions : Dict[str, ModelSEEDReaction], dict() - This maps the IDs of ModelSEED reactions in the network to object representations of the - reactions. - - metabolites : Dict[str, ModelSEEDCompound], dict() - This maps the IDs of ModelSEED metabolites in the network to object representations of the - metabolites. - - kegg_modelseed_aliases : Dict[str, List[str]], dict() - This maps KEGG REACTION IDs associated with KOs in the network to ModelSEED reactions - aliased by the KEGG reaction. KO-associated KEGG reactions that do not alias ModelSEED - reactions are not included. - - ec_number_modelseed_aliases : Dict[str, List[str]], dict() - This maps EC numbers associated with KOs in the network to ModelSEED reactions aliased by - the EC number. KO-associated EC numbers that do not alias ModelSEED reactions are not - included. 
- - modelseed_kegg_aliases : Dict[str, List[str]], dict() - This maps the IDs of ModelSEED reactions in the network to lists of KEGG REACTION IDs that - are associated with KOs in the network and alias the ModelSEED reaction. - - modelseed_ec_number_aliases : Dict[str, List[str]], dict() - This maps the IDs of ModelSEED reactions in the network to lists of EC numbers that are - associated with KOs in the network and alias the ModelSEED reaction. - - run : anvio.terminal.Run, anvio.terminal.Run() - This object prints run information to the terminal. This attribute is assigned the argument - of the same name upon initialization. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object prints transient progress information to the terminal. This attribute is - assigned the argument of the same name upon initialization. - - verbose : bool, True - Report more information to the terminal if True. - """ - def __init__( - self, - run: terminal.Run = terminal.Run(), - progress: terminal.Progress = terminal.Progress(), - verbose: bool = True - ) -> None: - """ - Parameters - ========== - run : anvio.terminal.Run, anvio.terminal.Run() - This object sets the 'run' attribute, which prints run information to the terminal. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object sets the 'progress' attribute, which prints transient progress information - to the terminal. - - verbose : bool, True - This sets the 'verbose' attribute, causing more information to be reported to the - terminal if True. - - Returns - ======= - None - """ - self.kos: Dict[str, KO] = {} - self.reactions: Dict[str, ModelSEEDReaction] = {} - self.metabolites: Dict[str, ModelSEEDCompound] = {} - # The following dictionaries map reaction aliases in the network: as in, not all known - # aliases, but only those sourced from KOs and contributing ModelSEEDReaction objects. 
- self.kegg_modelseed_aliases: Dict[str, List[str]] = {} - self.ec_number_modelseed_aliases: Dict[str, List[str]] = {} - self.modelseed_kegg_aliases: Dict[str, List[str]] = {} - self.modelseed_ec_number_aliases: Dict[str, List[str]] = {} - - self.run = run - self.progress = progress - self.verbose = verbose - - def _copy(self, copied_network: Union[GenomicNetwork, PangenomicNetwork]) -> None: - """ - In copying a reaction network, copy the attributes of the network besides genes - (GenomicNetwork) or gene clusters (PangenomicNetwork) and protein abundances (which can only - be stored in a GenomicNetwork). - - Parameters - ========== - copied_network : Union[GenomicNetwork, PangenomicNetwork] - The network copy under construction. - """ - for modelseed_id, metabolite in self.metabolites.items(): - copied_metabolite = ModelSEEDCompound() - copied_metabolite.modelseed_id = modelseed_id - copied_metabolite.modelseed_name = metabolite.modelseed_name - copied_metabolite.kegg_aliases = metabolite.kegg_aliases - copied_metabolite.charge = metabolite.charge - copied_metabolite.formula = metabolite.formula - abundances: Dict[str, float] = getattr(metabolite, 'abundances', None) - if abundances is None: - # Metabolite sample abundances cannot be recorded in a pangenomic network. 
- continue - copied_metabolite.abundances = abundances.copy() - - copied_network.metabolites[modelseed_id] = copied_metabolite - - for modelseed_id, reaction in self.reactions.items(): - copied_reaction = ModelSEEDReaction() - copied_reaction.modelseed_id = modelseed_id - copied_reaction.modelseed_name = reaction.modelseed_name - copied_reaction.kegg_aliases = reaction.kegg_aliases - copied_reaction.ec_number_aliases = reaction.ec_number_aliases - metabolites = [] - for metabolite in reaction.compounds: - metabolites.append(copied_network.metabolites[metabolite.modelseed_id]) - copied_reaction.compounds = tuple(metabolites) - copied_reaction.coefficients = reaction.coefficients - copied_reaction.compartments = reaction.compartments - copied_reaction.reversibility = reaction.reversibility - - copied_network.reactions[modelseed_id] = copied_reaction - - for ko_id, ko in self.kos.items(): - copied_ko = KO() - copied_ko.id = ko_id - copied_ko.name = ko.name - for modelseed_id in ko.reactions: - copied_ko.reactions[modelseed_id] = copied_network.reactions[modelseed_id] - for modelseed_id, kegg_ids in ko.kegg_reaction_aliases.items(): - copied_ko.kegg_reaction_aliases[modelseed_id] = kegg_ids.copy() - for modelseed_id, ec_numbers in ko.ec_number_aliases.items(): - copied_ko.ec_number_aliases[modelseed_id] = ec_numbers.copy() - - copied_network.kos[ko_id] = copied_ko - - for kegg_id, modelseed_ids in self.kegg_modelseed_aliases.items(): - copied_network.kegg_modelseed_aliases[kegg_id] = modelseed_ids.copy() - - for ec_number, modelseed_ids in self.ec_number_modelseed_aliases.items(): - copied_network.ec_number_modelseed_aliases[ec_number] = modelseed_ids.copy() - - for modelseed_id, kegg_ids in self.modelseed_kegg_aliases.items(): - copied_network.modelseed_kegg_aliases[modelseed_id] = kegg_ids.copy() - - for modelseed_id, ec_numbers in self.modelseed_ec_number_aliases.items(): - copied_network.modelseed_ec_number_aliases[modelseed_id] = ec_numbers.copy() - - def 
remove_missing_objective_metabolites(self, objective_dict: Dict) -> None: - """ - Remove metabolites from a biomass objective dictionary that are not produced or consumed by - any reactions in the network. - - Parameters - ========== - objective_dict : dict - Biomass objective in COBRApy JSON format, like that returned by the method, - 'JSONStructure.get_e_coli_core_objective'. - - Returns - ======= - None - """ - objective_metabolites: Dict = objective_dict['metabolites'] - missing_metabolite_ids = [] - if 'original_metabolite_ids' in objective_dict['notes']: - # The E. coli objective had metabolite BiGG IDs, which were replaced with KEGG COMPOUND - # IDs, and the original BiGG IDs were recorded in the 'notes' section of the objective. - missing_original_metabolite_ids = [] - objective_original_metabolites: Dict = objective_dict['notes'][ - 'original_metabolite_ids' - ] - for metabolite_id, original_metabolite_id in zip( - objective_metabolites, objective_original_metabolites - ): - if metabolite_id[:-2] not in self.metabolites: - # The metabolite (removing localization substring) is not in the network. - missing_metabolite_ids.append(metabolite_id) - missing_original_metabolite_ids.append(original_metabolite_id) - for original_metabolite_id in missing_original_metabolite_ids: - objective_original_metabolites.pop(original_metabolite_id) - else: - for metabolite_id in objective_metabolites: - if metabolite_id[:-2] not in self.metabolites: - # The metabolite (removing localization substring) is not in the network. 
- missing_metabolite_ids.append(metabolite_id) - for metabolite_id in missing_metabolite_ids: - objective_metabolites.pop(metabolite_id) - - if not self.verbose: - return - - if 'original_metabolite_ids' in objective_dict['notes']: - id_string = "" - for original_id, modelseed_id in zip( - missing_original_metabolite_ids, missing_metabolite_ids - ): - id_string += f"{original_id} ({modelseed_id}), " - id_string = id_string[:-2] - self.run.info_single( - f"""\ - The following metabolites were removed from the biomass objective, with the original - IDs aliasing the ModelSEED compound IDs in parentheses: {id_string}\ - """ - ) - else: - self.run.info_single( - f"""\ - The following metabolites, given by their ModelSEED compound IDs, were removed from - the biomass objective: {', '.join(missing_metabolite_ids)}\ - """ - ) - - def _merge_network( - self, - network: Union[GenomicNetwork, PangenomicNetwork], - merged_network: Union[GenomicNetwork, PangenomicNetwork] - ) -> None: - """ - In merging reaction networks, merge the attributes of the network besides genes - (GenomicNetwork) or gene clusters (PangenomicNetwork) and protein abundances (which can only - be stored in a GenomicNetwork). - - Parameters - ========== - network : Union[GenomicNetwork, PangenomicNetwork] - The other reaction network being merged. - - merged_network : Union[GenomicNetwork, PangenomicNetwork] - The merged reaction network under construction. - - Returns - ======= - None - """ - if isinstance(network, GenomicNetwork): - assert isinstance(merged_network, GenomicNetwork) - else: - assert isinstance(merged_network, PangenomicNetwork) - - # Add metabolites to the merged network, starting with metabolites in the first network and - # continuing with metabolites exclusive to the second network. Assume objects representing - # the same metabolites in both networks properly have identical attributes. 
- for metabolite_id, first_metabolite in self.metabolites.items(): - merged_metabolite = ModelSEEDCompound() - merged_metabolite.modelseed_id = metabolite_id - merged_metabolite.modelseed_name = first_metabolite.modelseed_name - merged_metabolite.kegg_aliases = first_metabolite.kegg_aliases - merged_metabolite.charge = first_metabolite.charge - merged_metabolite.formula = first_metabolite.formula - abundances: Dict[str, float] = getattr(first_metabolite, 'abundances', None) - if abundances is None: - continue - merged_metabolite.abundances = abundances.copy() - - merged_network.metabolites[metabolite_id] = merged_metabolite - - for metabolite_id in set(network.metabolites).difference(self.metabolites): - second_metabolite = network.metabolites[metabolite_id] - - merged_metabolite = ModelSEEDCompound() - merged_metabolite.modelseed_id = metabolite_id - merged_metabolite.modelseed_name = second_metabolite.modelseed_name - merged_metabolite.kegg_aliases = second_metabolite.kegg_aliases - merged_metabolite.charge = second_metabolite.charge - merged_metabolite.formula = second_metabolite.formula - abundances: Dict[str, float] = getattr(second_metabolite, 'abundances', None) - if abundances is None: - continue - merged_metabolite.abundances = abundances.copy() - - merged_network.metabolites[metabolite_id] = merged_metabolite - - # Add reactions to the merged network, starting with reactions in the first network and - # continuing with reactions exclusive to the second network. Assume objects representing the - # same reactions in both networks properly have identical attributes. - - # Determine network attributes mapping reaction aliases. 
- kegg_modelseed_aliases: Dict[str, List[str]] = {} - ec_number_modelseed_aliases: Dict[str, List[str]] = {} - - for reaction_id, first_reaction in self.reactions.items(): - merged_reaction = ModelSEEDReaction() - merged_reaction.modelseed_id = reaction_id - merged_reaction.modelseed_name = first_reaction.modelseed_name - merged_reaction.kegg_aliases = first_reaction.kegg_aliases - merged_reaction.ec_number_aliases = first_reaction.ec_number_aliases - metabolites = [] - for metabolite in first_reaction.compounds: - metabolites.append(merged_network.metabolites[metabolite.modelseed_id]) - merged_reaction.compounds = tuple(metabolites) - merged_reaction.coefficients = first_reaction.coefficients - merged_reaction.compartments = first_reaction.compartments - merged_reaction.reversibility = first_reaction.reversibility - - merged_network.reactions[reaction_id] = merged_reaction - - try: - merged_network.modelseed_kegg_aliases[reaction_id] += list( - first_reaction.kegg_aliases - ) - except KeyError: - merged_network.modelseed_kegg_aliases[reaction_id] = list( - first_reaction.kegg_aliases - ) - - try: - merged_network.modelseed_ec_number_aliases[reaction_id] += list( - first_reaction.ec_number_aliases - ) - except KeyError: - merged_network.modelseed_ec_number_aliases[reaction_id] = list( - first_reaction.ec_number_aliases - ) - - for kegg_id in first_reaction.kegg_aliases: - try: - kegg_modelseed_aliases[kegg_id].append(reaction_id) - except KeyError: - kegg_modelseed_aliases[kegg_id] = [reaction_id] - - for ec_number in first_reaction.ec_number_aliases: - try: - ec_number_modelseed_aliases[ec_number].append(reaction_id) - except KeyError: - ec_number_modelseed_aliases[ec_number] = [reaction_id] - - for reaction_id in set(network.reactions).difference(self.reactions): - second_reaction = network.reactions[reaction_id] - - merged_reaction = ModelSEEDReaction() - merged_reaction.modelseed_id = reaction_id - merged_reaction.modelseed_name = second_reaction.modelseed_name 
- merged_reaction.kegg_aliases = second_reaction.kegg_aliases - merged_reaction.ec_number_aliases = second_reaction.ec_number_aliases - metabolites = [] - for metabolite in second_reaction.compounds: - metabolites.append(merged_network.metabolites[metabolite.modelseed_id]) - merged_reaction.compounds = tuple(metabolites) - merged_reaction.coefficients = second_reaction.coefficients - merged_reaction.compartments = second_reaction.compartments - merged_reaction.reversibility = second_reaction.reversibility - - merged_network.reactions[reaction_id] = merged_reaction - - try: - merged_network.modelseed_kegg_aliases[reaction_id] += list( - second_reaction.kegg_aliases - ) - except KeyError: - merged_network.modelseed_kegg_aliases[reaction_id] = list( - second_reaction.kegg_aliases - ) - - try: - merged_network.modelseed_ec_number_aliases[reaction_id] += list( - second_reaction.ec_number_aliases - ) - except KeyError: - merged_network.modelseed_ec_number_aliases[reaction_id] = list( - second_reaction.ec_number_aliases - ) - - for kegg_id in second_reaction.kegg_aliases: - try: - kegg_modelseed_aliases[kegg_id].append(reaction_id) - except KeyError: - kegg_modelseed_aliases[kegg_id] = [reaction_id] - - for ec_number in second_reaction.ec_number_aliases: - try: - ec_number_modelseed_aliases[ec_number].append(reaction_id) - except KeyError: - ec_number_modelseed_aliases[ec_number] = [reaction_id] - - if merged_network.kegg_modelseed_aliases: - for kegg_id, modelseed_ids in kegg_modelseed_aliases.items(): - try: - merged_network.kegg_modelseed_aliases[kegg_id] += modelseed_ids - except KeyError: - merged_network.kegg_modelseed_aliases[kegg_id] = modelseed_ids - else: - merged_network.kegg_modelseed_aliases = kegg_modelseed_aliases - - if merged_network.ec_number_modelseed_aliases: - for ec_number, modelseed_ids in ec_number_modelseed_aliases.items(): - try: - merged_network.ec_number_modelseed_aliases[ec_number] += modelseed_ids - except KeyError: - 
merged_network.ec_number_modelseed_aliases[ec_number] = modelseed_ids - else: - merged_network.ec_number_modelseed_aliases = ec_number_modelseed_aliases - - # Add KOs to the merged network, first adding KOs present in both source networks, and then - # adding KOs present exclusively in each source network. - first_ko_ids = set(self.kos) - second_ko_ids = set(network.kos) - - for ko_id in first_ko_ids.intersection(second_ko_ids): - first_ko = self.kos[ko_id] - second_ko = network.kos[ko_id] - - # The new object representing the KO in the merged network should have all reaction - # annotations from both source KO objects, as these objects can have different reaction - # references. - merged_ko = KO() - merged_ko.id = ko_id - merged_ko.name = first_ko.name - reaction_ids = set(first_ko.reactions).union(set(second_ko.reactions)) - merged_ko.reactions = { - reaction_id: merged_network.reactions[reaction_id] for reaction_id in reaction_ids - } - for reaction_id in reaction_ids: - try: - merged_ko.kegg_reaction_aliases[reaction_id] = first_ko.kegg_reaction_aliases[ - reaction_id - ] - except KeyError: - # The reaction has no KO KEGG REACTION aliases. - pass - try: - merged_ko.ec_number_aliases[reaction_id] = first_ko.ec_number_aliases[ - reaction_id - ] - except KeyError: - # The reaction has no KO KEGG REACTION aliases. 
def _get_common_overview_statistics(
    self,
    stats: Union[GenomicNetworkStats, PangenomicNetworkStats]
) -> None:
    """
    Calculate overview statistics that are found the same way for both genomic and pangenomic
    networks.

    Three groups of statistics are added to 'stats', keyed by 'Reactions and KO sources',
    'Reaction alias sources', and 'Reaction and metabolite properties'. Statistics are inserted
    in the order in which they are meant to be reported.

    An empty network is tolerated: mean, standard deviation, and maximum statistics are reported
    as zero when there is nothing to summarize.

    Parameters
    ==========
    stats : Union[GenomicNetworkStats, PangenomicNetworkStats]
        Network statistics are stored in a dictionary of dictionaries. Keys in the outer
        dictionary are "classes" of network statistics. Keys in the inner dictionary are
        the names of the statistics themselves. The dictionary is modified in place.

    Returns
    =======
    None
    """
    def summarize(counts):
        # Return the rounded mean, rounded standard deviation, and maximum of the counts.
        # Guard against empty input: 'max' would raise a ValueError and numpy would return NaN
        # with a RuntimeWarning.
        if not counts:
            return 0.0, 0.0, 0
        return round(np.mean(counts), 1), round(np.std(counts), 1), max(counts)

    self.progress.new("Counting reactions and KO sources")
    self.progress.update("...")
    stats['Reactions and KO sources'] = stats_group = {}

    stats_group['Reactions in network'] = len(self.reactions)
    mean, stdev, maximum = summarize([len(ko.reactions) for ko in self.kos.values()])
    stats_group['Mean reactions per KO'] = mean
    stats_group['Stdev reactions per KO'] = stdev
    stats_group['Max reactions per KO'] = maximum

    self.progress.end()

    self.progress.new("Counting reactions from each alias source")
    self.progress.update("...")
    stats['Reaction alias sources'] = stats_group = {}

    # Only ModelSEED reactions with at least one alias of the given type count as "aliased".
    kegg_aliased_reaction_ids = {
        modelseed_reaction_id
        for modelseed_reaction_id, kegg_reaction_ids in self.modelseed_kegg_aliases.items()
        if len(kegg_reaction_ids) > 0
    }
    ec_number_aliased_reaction_ids = {
        modelseed_reaction_id
        for modelseed_reaction_id, ec_numbers in self.modelseed_ec_number_aliases.items()
        if len(ec_numbers) > 0
    }
    kegg_reaction_source_count = len(kegg_aliased_reaction_ids)
    ec_number_source_count = len(ec_number_aliased_reaction_ids)
    both_source_count = len(kegg_aliased_reaction_ids & ec_number_aliased_reaction_ids)
    stats_group['Reactions aliased by KEGG reaction'] = kegg_reaction_source_count
    stats_group['Reactions aliased by EC number'] = ec_number_source_count
    stats_group['Rxns aliased by both KEGG rxn & EC number'] = both_source_count
    stats_group['Reactions aliased only by KEGG reaction'] = (
        kegg_reaction_source_count - both_source_count
    )
    stats_group['Reactions aliased only by EC number'] = (
        ec_number_source_count - both_source_count
    )

    stats_group['KEGG reactions contributing to network'] = len(self.kegg_modelseed_aliases)
    mean, stdev, maximum = summarize(
        [len(reaction_ids) for reaction_ids in self.kegg_modelseed_aliases.values()]
    )
    stats_group['Mean reactions per KEGG reaction'] = mean
    stats_group['Stdev reactions per KEGG reaction'] = stdev
    stats_group['Max reactions per KEGG reaction'] = maximum

    stats_group['EC numbers contributing to network'] = len(self.ec_number_modelseed_aliases)
    mean, stdev, maximum = summarize(
        [len(reaction_ids) for reaction_ids in self.ec_number_modelseed_aliases.values()]
    )
    stats_group['Mean reactions per EC number'] = mean
    stats_group['Stdev reactions per EC number'] = stdev
    stats_group['Max reactions per EC number'] = maximum

    self.progress.end()

    self.progress.new("Counting reactions and metabolites by property")
    self.progress.update("...")
    stats['Reaction and metabolite properties'] = stats_group = {}

    reversible_count = 0
    irreversible_count = 0
    cytoplasmic_compound_ids = set()
    extracellular_compound_ids = set()
    consumed_compound_ids = set()
    produced_compound_ids = set()
    # Number of distinct reactions in which each compound participates.
    compound_reaction_counts = collections.Counter()
    for reaction in self.reactions.values():
        if reaction.reversibility:
            reversible_count += 1
        else:
            irreversible_count += 1

        # A compound can occur more than once in a reaction (e.g., on both sides of a transport
        # reaction), so collect the distinct compounds before counting the reaction for each.
        reaction_compound_ids = set()
        for compartment, coefficient, compound in zip(
            reaction.compartments, reaction.coefficients, reaction.compounds
        ):
            compound_id = compound.modelseed_id
            reaction_compound_ids.add(compound_id)

            if compartment == 'c':
                cytoplasmic_compound_ids.add(compound_id)
            else:
                extracellular_compound_ids.add(compound_id)

            if reaction.reversibility:
                # Either side of a reversible reaction can be consumed or produced.
                consumed_compound_ids.add(compound_id)
                produced_compound_ids.add(compound_id)
            elif coefficient < 0:
                consumed_compound_ids.add(compound_id)
            else:
                produced_compound_ids.add(compound_id)
        compound_reaction_counts.update(reaction_compound_ids)

    stats_group['Reversible reactions'] = reversible_count
    stats_group['Irreversible reactions'] = irreversible_count

    stats_group['Metabolites in network'] = len(self.metabolites)
    stats_group['Cytoplasmic metabolites'] = len(cytoplasmic_compound_ids)
    stats_group['Extracellular metabolites'] = len(extracellular_compound_ids)
    stats_group['Exclusively cytoplasmic metabolites'] = len(
        cytoplasmic_compound_ids - extracellular_compound_ids
    )
    stats_group['Exclusively extracellular metabolites'] = len(
        extracellular_compound_ids - cytoplasmic_compound_ids
    )
    stats_group['Cytoplasmic/extracellular metabolites'] = len(
        cytoplasmic_compound_ids & extracellular_compound_ids
    )

    stats_group['Consumed metabolites'] = len(consumed_compound_ids)
    stats_group['Produced metabolites'] = len(produced_compound_ids)
    stats_group['Both consumed & produced metabolites'] = len(
        consumed_compound_ids & produced_compound_ids
    )
    stats_group['Exclusively consumed metabolites'] = len(
        consumed_compound_ids - produced_compound_ids
    )
    stats_group['Exclusively produced metabolites'] = len(
        produced_compound_ids - consumed_compound_ids
    )

    # Tally how many metabolites participate in exactly 1, exactly 2, and 3 or more reactions.
    metabolite_reaction_counts = collections.Counter(compound_reaction_counts.values())
    stats_group['Metabolites consumed or produced by 1 rxn'] = metabolite_reaction_counts[1]
    stats_group['Metabolites consumed or produced by 2 rxns'] = metabolite_reaction_counts[2]
    stats_group['Metabolites consumed or produced by 3+ rxns'] = sum(
        metabolite_count
        for reaction_count, metabolite_count in metabolite_reaction_counts.items()
        if reaction_count >= 3
    )

    self.progress.end()
def _print_common_overview_statistics(
    self,
    stats: Union[GenomicNetworkStats, PangenomicNetworkStats]
) -> None:
    """
    Report to the terminal the overview statistics shared by genomic and pangenomic networks.

    Statistics are printed under five headings, in a fixed order, followed by a blank line.

    Parameters
    ==========
    stats : Union[GenomicNetworkStats, PangenomicNetworkStats]
        Dictionary of dictionaries of network statistics. Outer keys name groups of statistics
        and inner keys name the statistics. The groups 'Reactions and KO sources', 'Reaction
        alias sources', and 'Reaction and metabolite properties' must be present.

    Returns
    =======
    None
    """
    # Each of these sections draws on its own group of statistics: (heading, heading options,
    # group name, statistic names).
    group_sections = (
        (
            "ModelSEED reactions in network and KO sources",
            {},
            'Reactions and KO sources',
            (
                'Reactions in network',
                'Mean reactions per KO',
                'Stdev reactions per KO',
                'Max reactions per KO'
            )
        ),
        (
            "Reaction alias source comparison",
            {'nl_before': 1},
            'Reaction alias sources',
            (
                'Reactions aliased by KEGG reaction',
                'Reactions aliased by EC number',
                'Rxns aliased by both KEGG rxn & EC number',
                'Reactions aliased only by KEGG reaction',
                'Reactions aliased only by EC number',
                'KEGG reactions contributing to network',
                'Mean reactions per KEGG reaction',
                'Stdev reactions per KEGG reaction',
                'Max reactions per KEGG reaction',
                'EC numbers contributing to network',
                'Mean reactions per EC number',
                'Stdev reactions per EC number',
                'Max reactions per EC number'
            )
        )
    )
    for heading, heading_options, group_name, stat_names in group_sections:
        self.run.info_single(heading, **heading_options)
        group = stats[group_name]
        for stat_name in stat_names:
            self.run.info(stat_name, group[stat_name])

    # The remaining sections all draw on the same group of statistics: (heading, statistic
    # names).
    property_group = stats['Reaction and metabolite properties']
    property_sections = (
        (
            "Reaction reversibility",
            (
                'Reversible reactions',
                'Irreversible reactions'
            )
        ),
        (
            "Metabolites and localization",
            (
                'Metabolites in network',
                'Cytoplasmic metabolites',
                'Extracellular metabolites',
                'Exclusively cytoplasmic metabolites',
                'Exclusively extracellular metabolites',
                'Cytoplasmic/extracellular metabolites'
            )
        ),
        (
            "Metabolite consumption and production",
            (
                'Consumed metabolites',
                'Produced metabolites',
                'Both consumed & produced metabolites',
                'Exclusively consumed metabolites',
                'Exclusively produced metabolites',
                'Metabolites consumed or produced by 1 rxn',
                'Metabolites consumed or produced by 2 rxns',
                'Metabolites consumed or produced by 3+ rxns'
            )
        )
    )
    for heading, stat_names in property_sections:
        self.run.info_single(heading, nl_before=1)
        for stat_name in stat_names:
            self.run.info(stat_name, property_group[stat_name])
    print()
def write_overview_statistics(
    self,
    stats_file: str,
    stats: Union[GenomicNetworkStats, PangenomicNetworkStats] = None
) -> None:
    """
    Write overview statistics for the metabolic network to a tab-delimited file.

    The table has one row per statistic and three columns: 'Group', 'Statistic', and 'Value'.

    Parameters
    ==========
    stats_file : str
        Path to the output tab-delimited file.

    stats : Union[GenomicNetworkStats, PangenomicNetworkStats], None
        Previously calculated network statistics to write. If None (or empty), statistics are
        calculated with the 'get_overview_statistics' method, which subclasses must provide.

    Returns
    =======
    None
    """
    # Fall back on freshly calculated statistics when none were provided.
    stats = stats or self.get_overview_statistics()

    filesnpaths.is_output_file_writable(stats_file)

    # Flatten the nested dictionary into rows of group name, statistic name, and value.
    rows = [
        [group_name, statistic_name, statistic_value]
        for group_name, group in stats.items()
        for statistic_name, statistic_value in group.items()
    ]
    stats_table = pd.DataFrame(rows, columns=['Group', 'Statistic', 'Value'])
    stats_table.to_csv(stats_file, sep='\t', index=False)

    self.run.info("Metabolic network statistics output file", stats_file)
- stats = self.get_overview_statistics() - - filesnpaths.is_output_file_writable(stats_file) - - table = [] - for stats_group_name, stats_group in stats.items(): - for stat_name, stat_value in stats_group.items(): - table.append([stats_group_name, stat_name, stat_value]) - pd.DataFrame(table, columns=['Group', 'Statistic', 'Value']).to_csv( - stats_file, sep='\t', index=False - ) - - self.run.info("Metabolic network statistics output file", stats_file) - -class GenomicNetwork(ReactionNetwork): - """ - A reaction network predicted from KEGG Ortholog annotations of genes and ModelSEED data. - - Attributes - ========== - kos : Dict[str, KO], dict() - This dictionary maps the IDs of KOs in the network to object representations of the KOs. - - reactions : Dict[str, ModelSEEDReaction], dict() - This maps the IDs of ModelSEED reactions in the network to object representations of the - reactions. - - metabolites : Dict[str, ModelSEEDCompound], dict() - This maps the IDs of ModelSEED metabolites in the network to object representations of the - metabolites. - - kegg_modelseed_aliases : Dict[str, List[str]], dict() - This maps KEGG REACTION IDs associated with KOs in the network to ModelSEED reactions - aliased by the KEGG reaction. KO-associated KEGG reactions that do not alias ModelSEED - reactions are not included. - - ec_number_modelseed_aliases : Dict[str, List[str]], dict() - This maps EC numbers associated with KOs in the network to ModelSEED reactions aliased by - the EC number. KO-associated EC numbers that do not alias ModelSEED reactions are not - included. - - modelseed_kegg_aliases : Dict[str, List[str]], dict() - This maps the IDs of ModelSEED reactions in the network to lists of KEGG REACTION IDs that - are associated with KOs in the network and alias the ModelSEED reaction. 
def copy(self) -> GenomicNetwork:
    """
    Create a deep copy of the reaction network.

    The items shared by all types of reaction network are copied by the '_copy' method. Genes
    and proteins are then recreated as new objects that reference items of the copied network
    rather than items of the source network.

    Returns
    =======
    GenomicNetwork
        Deep copy of the reaction network.
    """
    copied_network = GenomicNetwork()

    self._copy(copied_network)

    for gcid, gene in self.genes.items():
        copied_gene = Gene()
        copied_gene.gcid = gcid
        for ko_id, ko in gene.kos.items():
            # Reference the KO object of the copied network, so that the copy does not share KO
            # objects with the source network. NOTE(review): this assumes '_copy' populated
            # 'copied_network.kos'; fall back on the source object if the KO is absent.
            copied_gene.kos[ko_id] = copied_network.kos.get(ko_id, ko)
        for ko_id, e_value in gene.e_values.items():
            copied_gene.e_values[ko_id] = e_value

        copied_network.genes[gcid] = copied_gene

    if self.proteins:
        for protein_id, protein in self.proteins.items():
            copied_protein = Protein()
            copied_protein.id = protein_id
            for gcid in protein.genes:
                # Link the copied protein and the copied gene to each other.
                copied_protein.genes[gcid] = copied_gene = copied_network.genes[gcid]
                copied_gene.protein = copied_protein
            copied_protein.abundances = protein.abundances.copy()

            copied_network.proteins[protein_id] = copied_protein

    return copied_network
def remove_metabolites_without_formula(self, output_path: str = None) -> None:
    """
    Remove metabolites without a formula in the ModelSEED database from the network.

    Other items can be removed from the network by association: reactions that involve a
    formulaless metabolite; other metabolites with formulas that are exclusive to such
    reactions; KOs predicted to exclusively catalyze such reactions; and genes exclusively
    annotated with such KOs. Removed metabolites with a formula are reported alongside
    formulaless metabolites to the optional output table of removed metabolites.

    Parameters
    ==========
    output_path : str, None
        If not None, write four tab-delimited files of metabolites, reactions, KEGG Orthologs,
        and genes removed from the network to file locations based on the provided path. For
        example, if the argument, 'removed.tsv', is provided, then the following files will be
        written: 'removed-metabolites.tsv', 'removed-reactions.tsv', 'removed-kos.tsv', and
        'removed-genes.tsv'.

    Returns
    =======
    None
    """
    if self.verbose:
        self.progress.new("Removing metabolites without a formula in the network")
        self.progress.update("...")

    if output_path:
        # Check that all output files can be written before changing the network.
        path_basename, path_extension = os.path.splitext(output_path)
        metabolite_path = f"{path_basename}-metabolites{path_extension}"
        reaction_path = f"{path_basename}-reactions{path_extension}"
        ko_path = f"{path_basename}-kos{path_extension}"
        gene_path = f"{path_basename}-genes{path_extension}"
        for path in (metabolite_path, reaction_path, ko_path, gene_path):
            filesnpaths.is_output_file_writable(path)

    # ModelSEED compounds without a formula have a formula value of None in the network object.
    metabolites_to_remove = [
        modelseed_compound_id
        for modelseed_compound_id, metabolite in self.metabolites.items()
        if metabolite.formula is None
    ]
    removed = self.purge_metabolites(metabolites_to_remove)

    if self.verbose:
        self.progress.end()
        self.run.info("Removed metabolites", len(removed['metabolite']))
        self.run.info("Removed reactions", len(removed['reaction']))
        self.run.info("Removed KOs", len(removed['ko']))
        self.run.info("Removed genes", len(removed['gene']))

    if not output_path:
        return

    if self.verbose:
        self.progress.new("Writing output files of removed network items")
        self.progress.update("...")

    # Record the reactions removed as a consequence of involving formulaless metabolites, and
    # record the formulaless metabolites involved in removed reactions.
    formulaless_ids = set(metabolites_to_remove)
    metabolite_removed_reactions: Dict[str, List[str]] = {}
    reaction_removed_metabolites: Dict[str, List[str]] = {}
    for reaction in removed['reaction']:
        reaction_removed_metabolites[reaction.modelseed_id] = metabolite_ids = []
        for metabolite in reaction.compounds:
            if metabolite.modelseed_id in formulaless_ids:
                metabolite_removed_reactions.setdefault(metabolite.modelseed_id, []).append(
                    reaction.modelseed_id
                )
                metabolite_ids.append(metabolite.modelseed_id)

    metabolite_table = []
    for metabolite in removed['metabolite']:
        # A metabolite with a formula can be removed as a consequence of all the reactions
        # involving it being removed due to them containing formulaless metabolites. Such a
        # metabolite did not cause any reactions to be removed, so its last column is empty.
        removed_reaction_ids = metabolite_removed_reactions.get(metabolite.modelseed_id, [])
        metabolite_table.append([
            metabolite.modelseed_id,
            metabolite.modelseed_name,
            metabolite.formula,
            # The set accounts for the theoretical possibility that a compound is present on
            # both sides of the reaction equation and thus the reaction is recorded multiple
            # times.
            ", ".join(sorted(set(removed_reaction_ids)))
        ])

    reaction_table = []
    for reaction in removed['reaction']:
        reaction_table.append([
            reaction.modelseed_id,
            reaction.modelseed_name,
            # The set accounts for the theoretical possibility that a compound is present on
            # both sides of the reaction equation and thus is recorded multiple times. Sorting
            # makes the output reproducible between runs.
            ", ".join(sorted(set(reaction_removed_metabolites[reaction.modelseed_id]))),
            ", ".join([metabolite.modelseed_id for metabolite in reaction.compounds]),
            get_chemical_equation(reaction)
        ])

    ko_table = [[ko.id, ko.name, ", ".join(ko.reactions)] for ko in removed['ko']]

    gene_table = [[gene.gcid, ", ".join(gene.kos)] for gene in removed['gene']]

    pd.DataFrame(
        metabolite_table,
        columns=[
            "ModelSEED compound ID",
            "ModelSEED compound name",
            "Formula",
            "Removed reaction ModelSEED IDs"
        ]
    ).to_csv(metabolite_path, sep='\t', index=False)
    pd.DataFrame(
        reaction_table,
        columns=[
            "ModelSEED reaction ID",
            "ModelSEED reaction name",
            "Removed ModelSEED compound IDs",
            "Reaction ModelSEED compound IDs",
            "Equation"
        ]
    ).to_csv(reaction_path, sep='\t', index=False)
    pd.DataFrame(
        ko_table,
        columns=[
            "KO ID",
            "KO name",
            "KO ModelSEED reaction IDs"
        ]
    ).to_csv(ko_path, sep='\t', index=False)
    pd.DataFrame(
        gene_table,
        columns=[
            "Gene callers ID",
            "KO IDs"
        ]
    ).to_csv(gene_path, sep='\t', index=False)

    if self.verbose:
        self.progress.end()
        self.run.info("Table of removed metabolites", metabolite_path)
        self.run.info("Table of removed reactions", reaction_path)
        self.run.info("Table of removed KOs", ko_path)
        self.run.info("Table of removed genes", gene_path)
def purge_metabolites(self, metabolites_to_remove: Iterable[str]) -> Dict[str, List]:
    """
    Remove any trace of the given metabolites from the network.

    Reactions involving the metabolite are also purged from the network. KOs that were only
    associated with removed reactions are purged; genes that were only associated with removed
    KOs are purged.

    Removal of reactions involving the metabolite can also result in other metabolites being
    removed from the network, those that exclusively participate in these reactions.

    Parameters
    ==========
    metabolites_to_remove : Iterable[str]
        Metabolites to remove by ModelSEED compound ID. Any iterable is accepted, including
        one-shot iterators. IDs that are not in the network are ignored.

    Returns
    =======
    dict
        This dictionary contains data removed from the network. It always has the keys,
        'metabolite', 'reaction', 'kegg_reaction', 'ec_number', 'ko', and 'gene', each mapping
        to a list.

        The 'metabolite' list contains the removed metabolite objects: first the requested
        metabolites, then any other metabolites reported as removed by 'purge_reactions'. The
        other lists are those returned by 'purge_reactions'.

        If none of the requested metabolites is in the network, all of the lists are empty. If
        no remaining reaction involves the requested metabolites, as when this method is called
        from 'purge_reactions' after the reactions were removed, then only the 'metabolite'
        list is populated.
    """
    # Materialize the requested IDs, since they are needed more than once. The order of the
    # request is preserved for the returned list, and the set gives fast membership tests.
    requested_ids = list(dict.fromkeys(metabolites_to_remove))
    requested_id_set = set(requested_ids)

    removed_metabolites: List[ModelSEEDCompound] = []
    for modelseed_compound_id in requested_ids:
        try:
            removed_metabolites.append(self.metabolites.pop(modelseed_compound_id))
        except KeyError:
            # This can occur for two reasons. First, the requested metabolite may not be in the
            # network.

            # Second, this can occur when removing other "unintended" metabolites from the
            # network. 'purge_metabolites' was first called with metabolites of interest, then
            # 'purge_reactions' was called from within the method to remove reactions involving
            # the metabolites of interest, and then 'purge_metabolites' was called again from
            # within 'purge_reactions' to remove other metabolites exclusively found in the
            # removed reactions. In this last call of 'purge_metabolites', the requested
            # metabolites also include the metabolites of interest that were already removed
            # from 'self.metabolites' in the original 'purge_metabolites' call.
            pass

    if not removed_metabolites:
        return {
            'metabolite': [],
            'reaction': [],
            'kegg_reaction': [],
            'ec_number': [],
            'ko': [],
            'gene': []
        }

    # Find the reactions that involve any of the requested metabolites.
    reactions_to_remove = [
        modelseed_reaction_id
        for modelseed_reaction_id, reaction in self.reactions.items()
        if any(compound.modelseed_id in requested_id_set for compound in reaction.compounds)
    ]

    removed = {'metabolite': removed_metabolites}
    if reactions_to_remove:
        removed_cascading_up = self.purge_reactions(reactions_to_remove)
        # There may be other metabolites exclusively involved in the removed reactions; these
        # metabolites were therefore also removed.
        removed['metabolite'] = removed_metabolites + removed_cascading_up.pop('metabolite')
    else:
        # This method must have been called from the method, 'purge_reactions', because the
        # reactions containing the metabolites were already removed from the network.
        removed_cascading_up = {
            'reaction': [],
            'kegg_reaction': [],
            'ec_number': [],
            'ko': [],
            'gene': []
        }
    removed.update(removed_cascading_up)
    return removed
- pass - if not removed_metabolites: - return { - 'metabolite': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene': [] - } - - reactions_to_remove = [] - for modelseed_reaction_id, reaction in self.reactions.items(): - for compound in reaction.compounds: - if compound.modelseed_id in metabolites_to_remove: - reactions_to_remove.append(modelseed_reaction_id) - break - - removed = {'metabolite': removed_metabolites} - if reactions_to_remove: - removed_cascading_up = self.purge_reactions(reactions_to_remove) - # There may be other metabolites exclusively involved in the removed reactions; these - # metabolites were therefore also removed. - removed['metabolite'] = removed_metabolites + removed_cascading_up.pop('metabolite') - else: - # This method must have been called from the method, 'purge_reactions', because the - # reactions containing the metabolites were already removed from the network. - removed_cascading_up = { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene': [] - } - removed.update(removed_cascading_up) - return removed - - def purge_reactions(self, reactions_to_remove: Iterable[str]) -> Dict[str, List]: - """ - Remove any trace of the given reactions from the network. - - Metabolites that exclusively participate in removed reactions are purged. KOs that were only - associated with removed reactions are purged; genes that were only associated with removed - KOs are purged. - - Parameters - ========== - reactions_to_remove : Iterable[str] - Reactions to remove by ModelSEED reaction ID. - - Returns - ======= - dict - This dictionary contains data removed from the network. 
- - If this method is NOT called from the method, 'purge_metabolites', or the method, - 'purge_kos', then the dictionary will look like the following: - { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'ko': [], - 'gene': [] - } - - If this method is called from the method, 'purge_metabolites', then the dictionary will - look like the following: - { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'ko': [], - 'gene': [] - } - - If this method is called from the method, 'purge_kos', then the dictionary will look - like the following: - { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [ Dict[str, List]: - """ - Remove any trace of the given KOs from the network. - - Reactions and metabolites that were only associated with removed KOs are purged. Genes that - were only associated with removed KOs are purged. - - Parameters - ========== - kos_to_remove : Iterable[str] - KOs to remove by ID. - - Returns - ======= - dict - This dictionary contains data removed from the network. - - If this method is NOT called from the method, 'purge_reactions', or the method, - 'purge_genes', then the dictionary will look like the following: - { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'gene': [] - } - - If this method is called from the method, 'purge_reactions', then the dictionary will - look like the following: - { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'gene': [] - } - - If this method is called from the method, 'purge_genes', then the dictionary will - look like the following: - { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [ Dict[str, List]: - """ - Remove any trace of the given genes from the network. - - KOs, reactions, and metabolites that were only associated with removed genes are purged. 
- - Parameters - ========== - genes_to_remove : Iterable[str] - Genes to remove by gene callers ID. - - Returns - ======= - dict - This dictionary contains data removed from the network. - - If this method is NOT called from the method, 'purge_kos', then the dictionary will - look like the following: - { - 'gene': [], - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [] - } - - If this method is called from the method, 'purge_kos', then the dictionary will look - like the following: - { - 'gene': [], - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [] - } - """ - removed_genes: List[Gene] = [] - for gcid in genes_to_remove: - try: - removed_genes.append(self.genes.pop(gcid)) - except KeyError: - # This occurs if the gene in 'genes_to_remove' is not in the network. - pass - - if not removed_genes: - return { - 'metabolite': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene': [] - } - - kos_to_remove: List[str] = [] - for gene in removed_genes: - for ko_id in gene.kos: - kos_to_remove.append(ko_id) - kos_to_remove = list(set(kos_to_remove)) - for gene in self.genes.values(): - kos_to_spare: List[str] = [] - for ko_id in gene.kos: - if ko_id in kos_to_remove: - # The KO is associated with a retained gene, so do not remove the KO. - kos_to_spare.append(ko_id) - for ko_id in kos_to_spare: - kos_to_remove.remove(ko_id) - if kos_to_remove: - removed_cascading_down = self.purge_kos(kos_to_remove) - removed_cascading_down.pop('gene') - else: - # This method must have been called from the method, 'purge_kos', because the KOs that - # are only associated with the removed genes were already removed from the network. 
def subset_network(
    self,
    kegg_modules_to_subset: Iterable[str] = None,
    brite_categories_to_subset: Iterable[str] = None,
    genes_to_subset: Iterable[int] = None,
    kos_to_subset: Iterable[str] = None,
    reactions_to_subset: Iterable[str] = None,
    metabolites_to_subset: Iterable[str] = None
) -> GenomicNetwork:
    """
    Subset a smaller network from the metabolic network.

    Each type of request that is not empty is subsetted by its own method, in the order of the
    parameters below, and the resulting networks are merged one after another with
    'merge_network'.

    If requested KEGG modules, BRITE categories, genes, KOs, reactions, or metabolites are not
    present in the network, no error is raised.

    Subsetted items are not represented by the same objects as in the source network, i.e., new
    gene, KO, reaction, and metabolite objects are created and added to the subsetted network.

    Network items (i.e., genes, KOs, reactions, and metabolites) that reference requested items
    (e.g., genes in the network referencing requested KOs; KOs in the network referencing
    requested reactions) are added to the subsetted network. A gene added to the subsetted
    network due to references to requested KOs will be missing references to any other
    "unrequested" KOs annotating the gene in the source network. Likewise, genes and KOs that
    are added to the subsetted network due to references to requested reactions will be missing
    references to any other unrequested reactions. In other words, certain KO and reaction
    annotations can be selected to the exclusion of others, e.g., a KO encoding two reactions
    can be "redefined" or "pruned" to encode one requested reaction in the subsetted network; a
    KO encoding multiple reactions can be pruned to encode only those reactions involving
    requested metabolites.

    Parameters
    ==========
    kegg_modules_to_subset : Iterable[str], None
        KEGG module IDs to subset.

    brite_categories_to_subset : Iterable[str], None
        BRITE categories to subset.

    genes_to_subset : Iterable[int], None
        Gene callers IDs to subset.

    kos_to_subset : Iterable[str], None
        KO IDs to subset.

    reactions_to_subset : Iterable[str], None
        ModelSEED reaction IDs to subset.

    metabolites_to_subset : Iterable[str], None
        ModelSEED metabolite IDs to subset.

    Returns
    =======
    GenomicNetwork
        New subsetted reaction network. None is returned if nothing was requested.
    """
    # Pair each subsetting method with the items requested of it, in the order of processing.
    requests = (
        (self._subset_network_by_modules, kegg_modules_to_subset),
        (self._subset_network_by_brite, brite_categories_to_subset),
        (self._subset_network_by_genes, genes_to_subset),
        (self._subset_network_by_kos, kos_to_subset),
        (self._subset_network_by_reactions, reactions_to_subset),
        (self._subset_network_by_metabolites, metabolites_to_subset)
    )

    subnetwork = None
    for subset_method, requested_items in requests:
        if requested_items:
            next_subnetwork = subset_method(requested_items)
            # The first subsetted network is the starting point; each later one is merged into
            # the running result.
            if subnetwork is None:
                subnetwork = next_subnetwork
            else:
                subnetwork = subnetwork.merge_network(next_subnetwork)

    return subnetwork
- - If the 'verbose' attribute of the source 'GenomicNetwork' object is True, then report to the - terminal the identities of requested KEGG modules, BRITE categories, genes, KOs, reactions, - and metabolites that are not present in the network. - - Parameters - ========== - kegg_modules_to_subset : Iterable[str], None - KEGG module IDs to subset. - - brite_categories_to_subset : Iterable[str], None - BRITE categories to subset. - - genes_to_subset : Iterable[int], None - Gene callers IDs to subset. - - kos_to_subset : Iterable[str], None - KO IDs to subset. - - reactions_to_subset : Iterable[str], None - ModelSEED reaction IDs to subset. - - metabolites_to_subset : Iterable[str], None - ModelSEED metabolite IDs to subset. - - Returns - ======= - GenomicNetwork - New subsetted reaction network. - """ - # Sequentially subset the network for each type of request. Upon generating two subsetted - # networks from two types of request, merge the networks into a single subsetted network; - # repeat. - first_subnetwork = None - for items_to_subset, subset_network_method in ( - (kegg_modules_to_subset, self._subset_network_by_modules), - (brite_categories_to_subset, self._subset_network_by_brite), - (genes_to_subset, self._subset_network_by_genes), - (kos_to_subset, self._subset_network_by_kos), - (reactions_to_subset, self._subset_network_by_reactions), - (metabolites_to_subset, self._subset_network_by_metabolites) - ): - if not items_to_subset: - continue - - second_subnetwork = subset_network_method(items_to_subset) - - if first_subnetwork is None: - first_subnetwork = second_subnetwork - else: - first_subnetwork = first_subnetwork.merge_network(second_subnetwork) - - return first_subnetwork - - def _subset_network_by_modules(self, kegg_modules: Iterable[str]) -> GenomicNetwork: - """ - Subset the network by KOs in requested KEGG modules. - - Parameters - ========== - kegg_modules : Iterable[str] - KEGG modules (of KOs) to subset by ID. 
- - Returns - ======= - GenomicNetwork - New subsetted reaction network. - """ - pass - - def _subset_network_by_brite(self, brite_categories: Iterable[str]) -> GenomicNetwork: - """ - Subset the network by KOs in requested KEGG BRITE hierarchy categories. - - Parameters - ========== - brite_categories : Iterable[str] - KEGG BRITE hierarchy categories (of KOs) to subset. - - Returns - ======= - GenomicNetwork - New subsetted reaction network. - """ - pass - - def _subset_network_by_genes(self, gcids: Iterable[int]) -> GenomicNetwork: - """ - Subset the network by genes with requested gene callers IDs. - - Parameters - ========== - gcids : Iterable[int] - Genes to subset by gene callers ID. - - Returns - ======= - GenomicNetwork - New subsetted reaction network. - """ - subnetwork = GenomicNetwork() - - for gcid in gcids: - try: - gene = self.genes[gcid] - except KeyError: - # This occurs if the requested gene callers ID is not in the source network. - continue - - subsetted_gene = Gene() - subsetted_gene.gcid = gcid - subsetted_gene.e_values = gene.e_values.copy() - - # Add KOs annotating the gene to the subsetted network as new objects, and then - # reference these objects in the gene object. - ko_ids = list(gene.kos) - subnetwork = self._subset_network_by_kos(ko_ids, subnetwork=subnetwork) - subsetted_gene.kos = {ko_id: subnetwork.kos[ko_id] for ko_id in ko_ids} - - subnetwork.genes[gcid] = subsetted_gene - self._subset_proteins(subnetwork) - - return subnetwork - - def _subset_proteins(self, subnetwork: GenomicNetwork) -> None: - """ - Add protein abundance data to the subsetted network. - - Parameters - ========== - subnetwork : GenomicNetwork - The subsetted reaction network under construction. - - Returns - ======= - None - """ - if not self.proteins: - # Protein abundance profile data is not present in the source network. - return subnetwork - - # Parse each protein with abundance data. 
- for protein_id, protein in self.proteins.items(): - subsetted_gcids: List[int] = [] - for gcid in protein.genes: - if gcid in subnetwork.genes: - # A subsetted gene encodes the protein. - subsetted_gcids.append(gcid) - if not subsetted_gcids: - # No genes expressing the protein were subsetted, so the protein data is not added. - continue - - subsetted_protein = Protein() - subsetted_protein.id = protein_id - subsetted_protein.abundances = protein.abundances.copy() - for gcid in subsetted_gcids: - subsetted_gene = subnetwork.genes[gcid] - subsetted_gene.protein = subsetted_protein - subsetted_protein.genes[gcid] = subsetted_gene - - subnetwork.proteins[protein_id] = subsetted_protein - - def _subset_network_by_kos( - self, - ko_ids: Iterable[str], - subnetwork: GenomicNetwork = None - ) -> GenomicNetwork: - """ - Subset the network by KOs with requested KO IDs. - - Parameters - ========== - ko_ids : Iterable[str] - KOs to subset by KO ID. - - subnetwork : GenomicNetwork, None - This network under construction is provided when the KOs being added to the network - annotate already subsetted genes. - - Returns - ======= - GenomicNetwork - If a 'subnetwork' argument is provided, then that network is returned after - modification. Otherwise, a new subsetted reaction network is returned. - """ - if subnetwork is None: - subnetwork = GenomicNetwork() - # Signify that genes annotated by subsetted KOs are to be added to the network. - subset_referencing_genes = True - else: - assert isinstance(subnetwork, GenomicNetwork) - # Signify that the KOs being added to the network annotate subsetted genes that were - # already added to the network. - subset_referencing_genes = False - - for ko_id in ko_ids: - try: - ko = self.kos[ko_id] - except KeyError: - # This occurs if the requested KO ID is not in the source network. 
- continue - - subsetted_ko = KO() - subsetted_ko.id = ko.id - subsetted_ko.name = ko.name - subsetted_ko.kegg_reaction_aliases = deepcopy(ko.kegg_reaction_aliases) - subsetted_ko.ec_number_aliases = deepcopy(ko.ec_number_aliases) - - # Add reactions annotating the KO to the subsetted network as new objects, and then - # reference these objects in the KO object. - reaction_ids = [reaction_id for reaction_id in ko.reactions] - subnetwork = self._subset_network_by_reactions(reaction_ids, subnetwork=subnetwork) - subsetted_ko.reactions = { - reaction_id: subnetwork.reactions[reaction_id] for reaction_id in reaction_ids - } - - subnetwork.kos[ko_id] = subsetted_ko - - if subset_referencing_genes: - # Add genes that are annotated by the subsetted KOs to the network. - self._subset_genes_via_kos(subnetwork) - - return subnetwork - - def _subset_network_by_reactions( - self, - reaction_ids: Iterable[str], - subnetwork: GenomicNetwork = None - ) -> GenomicNetwork: - """ - Subset the network by reactions with ModelSEED reaction IDs. - - Parameters - ========== - reaction_ids : Iterable[str] - Reactions to subset by ModelSEED reaction ID. - - subnetwork : GenomicNetwork, None - This network under construction is provided when the reactions being added to the - network annotate already subsetted KOs. - - Returns - ======= - GenomicNetwork - If a 'subnetwork' argument is provided, then that network is returned after - modification. Otherwise, a new subsetted reaction network is returned. - """ - if subnetwork is None: - subnetwork = GenomicNetwork() - # Signify that KOs annotated by subsetted reactions are to be added to the network. - subset_referencing_kos = True - else: - assert isinstance(subnetwork, GenomicNetwork) - # Signify that the reactions being added to the network annotate subsetted KOs that were - # already added to the network. - subset_referencing_kos = False - - # Copy the network attributes mapping reaction aliases. 
- kegg_modelseed_aliases: Dict[str, List[str]] = {} - ec_number_modelseed_aliases: Dict[str, List[str]] = {} - - for reaction_id in reaction_ids: - try: - reaction = self.reactions[reaction_id] - except KeyError: - # This occurs if the requested reaction is not in the source network. - continue - - # Copy the reaction object, including referenced metabolite objects, from the source - # network. - subsetted_reaction: ModelSEEDReaction = deepcopy(reaction) - subnetwork.reactions[reaction_id] = subsetted_reaction - # Record the metabolites involved in the reaction, and add them to the network. - for metabolite in subsetted_reaction.compounds: - compound_id = metabolite.modelseed_id - subnetwork.metabolites[compound_id] = metabolite - - try: - subnetwork.modelseed_kegg_aliases[reaction_id] += list(reaction.kegg_aliases) - except KeyError: - subnetwork.modelseed_kegg_aliases[reaction_id] = list(reaction.kegg_aliases) - - try: - subnetwork.modelseed_ec_number_aliases[reaction_id] += list( - reaction.ec_number_aliases - ) - except KeyError: - subnetwork.modelseed_ec_number_aliases[reaction_id] = list( - reaction.ec_number_aliases - ) - - for kegg_id in reaction.kegg_aliases: - try: - kegg_modelseed_aliases[kegg_id].append(reaction_id) - except KeyError: - kegg_modelseed_aliases[kegg_id] = [reaction_id] - - for ec_number in reaction.ec_number_aliases: - try: - ec_number_modelseed_aliases[ec_number].append(reaction_id) - except KeyError: - ec_number_modelseed_aliases[ec_number] = [reaction_id] - - if subnetwork.kegg_modelseed_aliases: - for kegg_id, modelseed_ids in kegg_modelseed_aliases.items(): - try: - subnetwork.kegg_modelseed_aliases[kegg_id] += modelseed_ids - except KeyError: - subnetwork.kegg_modelseed_aliases[kegg_id] = modelseed_ids - else: - subnetwork.kegg_modelseed_aliases = kegg_modelseed_aliases - - if subnetwork.ec_number_modelseed_aliases: - for ec_number, modelseed_ids in ec_number_modelseed_aliases.items(): - try: - 
subnetwork.ec_number_modelseed_aliases[ec_number] += modelseed_ids - except KeyError: - subnetwork.ec_number_modelseed_aliases[ec_number] = modelseed_ids - else: - subnetwork.ec_number_modelseed_aliases = ec_number_modelseed_aliases - - if subset_referencing_kos: - # Add KOs that are annotated by the subsetted reactions to the network. - self._subset_kos_via_reactions(subnetwork) - - return subnetwork - - def _subset_genes_via_kos(self, subnetwork: GenomicNetwork) -> None: - """ - Add genes that are annotated with subsetted KOs to the subsetted network. - - These gene objects only reference subsetted KOs and not other KOs that also annotate the - gene but are not subsetted. - - Parameters - ========== - subnetwork : GenomicNetwork - The subsetted reaction network under construction. - - Returns - ======= - None - """ - subsetted_ko_ids = list(subnetwork.kos) - for gcid, gene in self.genes.items(): - # Check all genes in the source network for subsetted KOs. - subsetted_gene = None - for ko_id in gene.kos: - if ko_id not in subsetted_ko_ids: - # The gene is not annotated by the subsetted KO. - continue - - if not subsetted_gene: - # Create a new gene object for the subsetted gene. The gene object would already - # have been created had another subsetted KO been among the KOs annotating the - # gene. - subsetted_gene = Gene() - subsetted_gene.gcid = gcid - subsetted_gene.kos[ko_id] = subnetwork.kos[ko_id] - subsetted_gene.e_values[ko_id] = gene.e_values[ko_id] - - if subsetted_gene: - subnetwork.genes[gcid] = subsetted_gene - - def _subset_kos_via_reactions(self, subnetwork: GenomicNetwork) -> None: - """ - Add KOs that are annotated with subsetted reactions to the subsetted network. - - Then add genes that are annotated with these added KOs to the subsetted network. - - Parameters - ========== - subnetwork : GenomicNetwork - The subsetted reaction network under construction. 
- - Returns - ======= - None - """ - subsetted_reaction_ids = list(subnetwork.reactions) - for ko_id, ko in self.kos.items(): - # Check all KOs in the source network for subsetted reactions. - subsetted_ko = None - for reaction_id in ko.reactions: - if reaction_id not in subsetted_reaction_ids: - # The KO is not annotated by the subsetted reaction. - continue - - if not subsetted_ko: - # Create a new KO object for the subsetted KO. The subsetted KO object would - # already have been created had another subsetted reaction been among the - # reactions annotating the KO. - subsetted_ko = KO() - subsetted_ko.id = ko_id - subsetted_ko.name = ko.name - subsetted_ko.reactions[reaction_id] = subnetwork.reactions[reaction_id] - subsetted_ko.kegg_reaction_aliases = deepcopy(ko.kegg_reaction_aliases) - subsetted_ko.ec_number_aliases = deepcopy(ko.ec_number_aliases) - - if subsetted_ko: - subnetwork.kos[ko_id] = subsetted_ko - - # Add genes that are annotated with the added KOs to the subsetted network. - self._subset_genes_via_kos(subnetwork) - - def _subset_network_by_metabolites(self, compound_ids: Iterable[str]) -> GenomicNetwork: - """ - Subset the network by metabolites with ModelSEED compound IDs. - - Parameters - ========== - compound_ids : Iterable[str] - Metabolites to subset by ModelSEED compound ID. - - Returns - ======= - GenomicNetwork - New subsetted reaction network. - """ - subnetwork = GenomicNetwork() - - for reaction_id, reaction in self.reactions.items(): - # Check all reactions in the source network for subsetted metabolites. - for metabolite in reaction.compounds: - if metabolite.modelseed_id in compound_ids: - break - else: - # The reaction does not involve any of the requested metabolites. - continue - - # Copy the reaction object, including referenced metabolite objects, from the source - # network. 
- subsetted_reaction: ModelSEEDReaction = deepcopy(reaction) - subnetwork.reactions[reaction_id] = subsetted_reaction - - # Add the metabolites involved in the reaction to the subsetted network. (There can be - # unavoidable redundancy here in readding previously encountered metabolites.) - for subsetted_metabolite in subsetted_reaction.compounds: - subnetwork.metabolites[subsetted_metabolite.modelseed_id] = subsetted_metabolite - - # Add KOs that are annotated with the added reactions to the subsetted network, and then add - # genes annotated with the added KOs to the subsetted network. - self._subset_kos_via_reactions(subnetwork) - - return subnetwork - - def merge_network(self, network: GenomicNetwork) -> GenomicNetwork: - """ - Merge the genomic reaction network with another genomic reaction network. - - Each network can contain different genes, KOs, and reactions/metabolites. Merging - nonredundantly incorporates all of this data as new objects in the new network. - - Objects representing genes or KOs in both networks can have different sets of references: - genes can be annotated by different KOs, and KOs can be annotated by different reactions. - - Otherwise, object attributes should be consistent between the networks. For instance, the - same ModelSEED reactions and metabolites in both networks should have identical attributes. - If applicable, both networks should have been annotated with the same protein and metabolite - abundance data. - - The purpose of this method is to combine different, but potentially overlapping, subnetworks - from the same pangenome. - - Parameters - ========== - network : GenomicNetwork - The other genomic reaction network being merged. - - Returns - ======= - GenomicNetwork - The merged genomic reaction network. 
- """ - assert not ( - (self.proteins is None and network.proteins is not None) and - (self.proteins is not None and network.proteins is None) - ) - - merged_network = GenomicNetwork() - - self._merge_network(network, merged_network) - - # Add genes to the merged network, first adding genes present in both source networks, and - # then adding genes present exclusively in each source network. - first_gcids = set(self.genes) - second_gcids = set(network.genes) - - for gcid in first_gcids.intersection(second_gcids): - first_gene = self.genes[gcid] - second_gene = network.genes[gcid] - - # The new object representing the gene in the merged network should have all KO - # annotations from each source gene object, as these objects can have different KO - # references. - merged_gene = Gene() - merged_gene.gcid = gcid - ko_ids = set(first_gene.kos).union(set(second_gene.kos)) - for ko_id in ko_ids: - merged_gene.kos[ko_id] = merged_network.kos[ko_id] - first_ko_ids = set(first_gene.kos) - second_ko_ids = set(second_gene.kos).difference(set(first_gene.kos)) - for ko_id in first_ko_ids: - merged_gene.e_values[ko_id] = first_gene.e_values[ko_id] - for ko_id in second_ko_ids: - merged_gene.e_values[ko_id] = second_gene.e_values[ko_id] - - merged_network.genes[gcid] = merged_gene - - for gcid in first_gcids.difference(second_gcids): - first_gene = self.genes[gcid] - - gene = Gene() - gene.gcid = gcid - gene.kos = {ko_id: merged_network.kos[ko_id] for ko_id in first_gene.kos} - gene.e_values = first_gene.e_values.copy() - - merged_network.genes[gcid] = gene - - for gcid in second_gcids.difference(first_gcids): - second_gene = network.genes[gcid] - - gene = Gene() - gene.gcid = gcid - gene.kos = {ko_id: merged_network.kos[ko_id] for ko_id in second_gene.kos} - gene.e_values = second_gene.e_values.copy() - - merged_network.genes[gcid] = gene - - if not self.proteins and not network.proteins: - # No protein abundance data is present and needs to be added to the merged network. 
- return merged_network - - # Add protein abundance data to the merged network, first adding proteins annotating genes - # in both source networks, and then adding proteins annotating genes exclusively in each - # source network. - first_protein_ids = set(self.proteins) - second_protein_ids = set(network.proteins) - - # Assume that each source network was annotated with the same protein annotation data, so - # that the same gene in each network should have the same protein abundance profile. - for protein_id in first_protein_ids.intersection(second_protein_ids): - first_protein = self.proteins[protein_id] - second_protein = network.proteins[protein_id] - - merged_protein = Protein() - merged_protein.id = protein_id - for gcid in first_protein.genes: - merged_protein.genes[gcid] = merged_network.genes[gcid] - for gcid in set(second_protein.genes).difference(set(first_protein.genes)): - merged_protein.genes[gcid] = merged_network.genes[gcid] - merged_protein.abundances = first_protein.abundances.copy() - - merged_network.proteins[protein_id] = merged_protein - - for protein_id in first_protein_ids.difference(second_protein_ids): - first_protein = self.proteins[protein_id] - - protein = Protein() - protein.id = protein_id - protein.genes = {gcid: merged_network.genes[gcid] for gcid in first_protein.genes} - protein.abundances = first_protein.abundances.copy() - - merged_network.proteins[protein_id] = protein - - for protein_id in second_protein_ids.difference(first_protein_ids): - second_protein = network.proteins[protein_id] - - protein = Protein() - protein.id = protein_id - protein.genes = {gcid: merged_network.genes[gcid] for gcid in second_protein.genes} - protein.abundances = second_protein.abundances.copy() - - merged_network.proteins[protein_id] = protein - - return merged_network - - def get_overview_statistics( - self, - precomputed_counts: Dict[str, int] = None - ) -> GenomicNetworkStats: - """ - Calculate overview statistics for the genomic metabolic 
network. - - Parameters - ========== - precomputed_counts : Dict[str, int], None - To spare additional computations that involve loading and parsing the contigs database, - this dictionary can contain two pieces of precomputed data: the value for the key, - 'total_genes', should be the number of genes in the genome; the value for the key, - 'genes_assigned_kos', should be the number of genes in the genome assigned KOs; the - value for the key, 'kos_assigned_genes', should be the number of unique KOs assigned to - genes in the genome. - - Returns - ======= - GenomicNetworkStats - Network statistics are stored in a dictionary of dictionaries. Keys in the outer - dictionary are "classes" of network statistics. Keys in the inner dictionary are - statistics themselves. - """ - if ( - precomputed_counts is not None and - sorted(precomputed_counts) != [ - 'genes_assigned_kos', 'kos_assigned_genes', 'total_genes' - ] - ): - raise ConfigError( - "The 'precomputed_counts' argument must be a dictionary only containing the keys, " - "'total_genes', 'genes_assigned_kos', and 'kos_assigned_genes'." 
- ) - - stats: GenomicNetworkStats = {} - - self.progress.new("Counting genes and KOs") - self.progress.update("...") - stats['Gene and KO counts'] = stats_group = {} - - if precomputed_counts: - assert ( - type(precomputed_counts['total_genes']) is int and - precomputed_counts['total_genes'] >= 0 - ) - gene_count = precomputed_counts['total_genes'] - assert ( - type(precomputed_counts['genes_assigned_kos']) is int and - precomputed_counts['genes_assigned_kos'] >= 0 - ) - ko_annotated_gene_count = precomputed_counts['genes_assigned_kos'] - assert ( - type(precomputed_counts['kos_assigned_genes']) is int and - precomputed_counts['kos_assigned_genes'] >= 0 - ) - annotating_ko_count = precomputed_counts['kos_assigned_genes'] - else: - if self.contigs_db_source_path: - cdb = ContigsDatabase(self.contigs_db_source_path) - gene_count = cdb.db.get_row_counts_from_table('genes_in_contigs') - gene_ko_id_table = cdb.db.get_table_as_dataframe( - 'gene_functions', - where_clause='source = "KOfam"', - columns_of_interest=['gene_callers_id', 'source'] - ) - ko_annotated_gene_count = gene_ko_id_table['gene_callers_id'].nunique() - annotating_ko_count = gene_ko_id_table['KOfam'].nunique() - cdb.disconnect() - else: - gene_count = None - ko_annotated_gene_count = None - annotating_ko_count = None - - if gene_count is not None: - stats_group['Total gene calls in genome'] = gene_count - if ko_annotated_gene_count is not None: - stats_group['Genes annotated with protein KOs'] = ko_annotated_gene_count - stats_group['Genes in network'] = len(self.genes) - if annotating_ko_count is not None: - stats_group['Protein KOs annotating genes'] = annotating_ko_count - stats_group['KOs in network'] = len(self.kos) - self.progress.end() - - self._get_common_overview_statistics(stats) - - if precomputed_counts: - return stats - - if not self.contigs_db_source_path: - self.run.info_single( - f"""\ - Since the genomic network was not associated with a contigs database, the following - statistics 
could not be calculated and were not reported to the output file: - 'Total gene calls in genome', 'Genes annotated with protein KOs', and 'Protein KOs - annotating genes'.\ - """ - ) - - return stats - - def print_overview_statistics(self, stats: GenomicNetworkStats = None) -> None: - """ - Print overview statistics for the genomic metabolic network. - - Parameters - ========== - stats : GenomicNetworkStats, None - With the default value of None, network statistics will be calculated and printed. - Alternatively, provided network statistics will be printed without calculating anew. - - Returns - ======= - None - """ - if not stats: - stats = self.get_overview_statistics() - - self.run.info_single("METABOLIC REACTION NETWORK STATISTICS", mc='green', nl_after=1) - - self.run.info_single("Gene calls and KEGG Ortholog (KO) annotations") - stats_group = stats['Gene and KO counts'] - self.run.info("Total gene calls in genome", stats_group['Total gene calls in genome']) - self.run.info( - "Genes annotated with protein KOs", stats_group['Genes annotated with protein KOs'] - ) - self.run.info("Genes in network", stats_group['Genes in network']) - self.run.info("Protein KOs annotating genes", stats_group['Protein KOs annotating genes']) - self.run.info("KOs in network", stats_group['KOs in network'], nl_after=1) - - self._print_common_overview_statistics(stats) - - def export_json( - self, - path: str, - overwrite: bool = False, - objective: str = None, - remove_missing_objective_metabolites: bool = False, - # record_bins: Tuple[str] = ('gene', ), - indent: int = 2, - progress: terminal.Progress = terminal.Progress() - ) -> None: - """ - Export the network to a metabolic model file in JSON format. - - All information from the network is included in the JSON so that the file can by imported by - anvi'o as a GenomicNetwork object containing the same information. 
- - Parameters - ========== - path : str - output JSON file path - - overwrite : bool, False - Overwrite the JSON file if it already exists. - - objective : str, None - An objective to use in the model, stored as the first entry in the JSON 'reactions' - array. Currently, the only valid options are None and 'e_coli_core'. - - None means that no objective is added to the JSON, meaning that FBA cannot be performed - on the model. - - 'e_coli_core' is the biomass objective from the COBRApy example JSON file of E. coli - "core" metabolism, 'e_coli_core.json'. - - remove_missing_objective_metabolites : bool, False - If True, remove metabolites from the JSON objective that are not produced or consumed in - the reaction network. FBA fails with metabolites outside the network. - - record_bins : tuple, ('gene', ) - Record bin membership in JSON entries, if a collection of bins is present in the - reaction network. By default, bin membership is only recorded for genes with the - argument, ('gene', ). 'reaction' and 'metabolite' can also be provided in a tuple - argument (e.g., ('reaction', ) or ('metabolite', 'reaction', 'gene')) to likewise record - in which bins the reaction and metabolite entries occur. To not record bins at all, pass - either an empty tuple or None. - - indent : int, 2 - spaces of indentation per nesting level in JSON file - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object prints transient progress information to the terminal. 
- """ - progress.new("Constructing JSON") - progress.update("Setting up") - filesnpaths.is_output_file_writable(path, ok_if_exists=overwrite) - json_dict = JSONStructure.get() - json_genes: List[Dict] = json_dict['genes'] - json_reactions: List[Dict] = json_dict['reactions'] - json_metabolites: List[Dict] = json_dict['metabolites'] - if objective == 'e_coli_core': - objective_dict = JSONStructure.get_e_coli_core_objective() - if remove_missing_objective_metabolites: - self.remove_missing_objective_metabolites(objective_dict) - json_reactions.append(objective_dict) - elif objective != None: - raise ConfigError(f"Anvi'o does not recognize an objective with the name, '{objective}'.") - - progress.update("Genes") - reaction_genes: Dict[str, List[str]] = {} - reaction_kos: Dict[str, List[KO]] = {} - for gcid, gene in self.genes.items(): - gene_entry = JSONStructure.get_gene_entry() - json_genes.append(gene_entry) - gcid_str = str(gcid) - gene_entry['id'] = gcid_str - # Record KO IDs and annotation e-values in the annotation section of the gene entry. 
- annotation = gene_entry['annotation'] - annotation['ko'] = annotation_kos = {} - for ko_id, ko in gene.kos.items(): - annotation_kos[ko_id] = str(gene.e_values[ko_id]) - for modelseed_reaction_id in ko.reactions: - try: - reaction_genes[modelseed_reaction_id].append(gcid_str) - except KeyError: - reaction_genes[modelseed_reaction_id] = [gcid_str] - try: - reaction_kos[modelseed_reaction_id].append(ko) - except KeyError: - reaction_kos[modelseed_reaction_id] = [ko] - - progress.update("Reactions") - compound_compartments: Dict[str, Set[str]] = {} - for modelseed_reaction_id, reaction in self.reactions.items(): - reaction_entry = JSONStructure.get_reaction_entry() - json_reactions.append(reaction_entry) - reaction_entry['id'] = modelseed_reaction_id - reaction_entry['name'] = reaction.modelseed_name - metabolites = reaction_entry['metabolites'] - for compound, compartment, coefficient in zip(reaction.compounds, reaction.compartments, reaction.coefficients): - modelseed_compound_id = compound.modelseed_id - metabolites[f"{modelseed_compound_id}_{compartment}"] = coefficient - try: - compound_compartments[modelseed_compound_id].add(compartment) - except KeyError: - compound_compartments[modelseed_compound_id] = set(compartment) - if not reaction.reversibility: - # By default, the reaction entry was set up to be reversible; here make it irreversible. - reaction_entry['lower_bound'] = 0.0 - reaction_entry['gene_reaction_rule'] = " or ".join([gcid for gcid in reaction_genes[modelseed_reaction_id]]) - notes = reaction_entry['notes'] - # Record gene KO annotations which aliased the reaction via KEGG REACTION or EC number. 
- notes['ko'] = ko_notes = {} - ko_kegg_aliases = [] - ko_ec_number_aliases = [] - for ko in reaction_kos[modelseed_reaction_id]: - try: - kegg_aliases = ko.kegg_reaction_aliases[modelseed_reaction_id] - except KeyError: - kegg_aliases = [] - try: - ec_number_aliases = ko.ec_number_aliases[modelseed_reaction_id] - except KeyError: - ec_number_aliases = [] - ko_notes[ko.id] = {'kegg.reaction': kegg_aliases, 'ec-code': ec_number_aliases} - ko_kegg_aliases += kegg_aliases - ko_ec_number_aliases += ec_number_aliases - ko_kegg_aliases = set(ko_kegg_aliases) - ko_ec_number_aliases = set(ko_ec_number_aliases) - # Record other KEGG REACTION or EC number aliases of the reaction in the ModelSEED - # database that did not happen to be associated with KO annotations. - notes['other_aliases'] = { - 'kegg.reaction': list(set(reaction.kegg_aliases).difference(ko_kegg_aliases)), - 'ec-code': list(set(reaction.ec_number_aliases).difference(ko_ec_number_aliases)) - } - - progress.update("Metabolites") - for modelseed_compound_id, metabolite in self.metabolites.items(): - modelseed_compound_name = metabolite.modelseed_name - charge = metabolite.charge - formula = metabolite.formula - kegg_compound_aliases = list(metabolite.kegg_aliases) - for compartment in compound_compartments[modelseed_compound_id]: - metabolite_entry = JSONStructure.get_metabolite_entry() - json_metabolites.append(metabolite_entry) - metabolite_entry['id'] = f"{modelseed_compound_id}_{compartment}" - metabolite_entry['name'] = modelseed_compound_name - metabolite_entry['compartment'] = compartment - # Compounds without a formula have a nominal charge of 10000000 in the ModelSEED - # compounds database, which is replaced by None in the reaction network and 0 in the JSON. 
- metabolite_entry['charge'] = charge if charge is not None else 0 - metabolite_entry['formula'] = formula if formula is not None else "" - metabolite_entry['annotation']['kegg.compound'] = kegg_compound_aliases - - progress.update("Saving") - with open(path, 'w') as f: - json.dump(json_dict, f, indent=indent) - progress.end() - -class PangenomicNetwork(ReactionNetwork): - """ - A reaction network predicted from KEGG KO and ModelSEED annotations of pangenomic gene clusters. - - Attributes - ========== - kos : Dict[str, KO], dict() - This dictionary maps the IDs of KOs in the network to object representations of the KOs. - - reactions : Dict[str, ModelSEEDReaction], dict() - This maps the IDs of ModelSEED reactions in the network to object representations of the - reactions. - - metabolites : Dict[str, ModelSEEDCompound], dict() - This maps the IDs of ModelSEED metabolites in the network to object representations of the - metabolites. - - kegg_modelseed_aliases : Dict[str, List[str]], dict() - This maps KEGG REACTION IDs associated with KOs in the network to ModelSEED reactions - aliased by the KEGG reaction. KO-associated KEGG reactions that do not alias ModelSEED - reactions are not included. - - ec_number_modelseed_aliases : Dict[str, List[str]], dict() - This maps EC numbers associated with KOs in the network to ModelSEED reactions aliased by - the EC number. KO-associated EC numbers that do not alias ModelSEED reactions are not - included. - - modelseed_kegg_aliases : Dict[str, List[str]], dict() - This maps the IDs of ModelSEED reactions in the network to lists of KEGG REACTION IDs that - are associated with KOs in the network and alias the ModelSEED reaction. - - modelseed_ec_number_aliases : Dict[str, List[str]], dict() - This maps the IDs of ModelSEED reactions in the network to lists of EC numbers that are - associated with KOs in the network and alias the ModelSEED reaction. 
- - pan_db_source_path : str, None - Path to the pan database from which the network was built. - - genomes_storage_db_source_path : str, None - Path to the genomes storage database from which the network was built. - - consensus_threshold : float, None - A parameter used in the selection of the gene cluster consensus KOs from which the network - was built. - - discard_ties : bool, None - A parameter used in the selection of the gene cluster consensus KOs from which the network - was built. - - consistent_annotations : bool, None - A loaded network may be based on a set of gene KO annotations in the genomes storage - database that has since changed, in which case this attribute would be False. - - gene_clusters : Dict[str, GeneCluster], dict() - This maps the IDs of gene clusters in the network to object representations of the clusters. - - bins : Dict[str, GeneClusterBin], dict() - - collection : BinCollection, None - """ - def __init__( - self, - run: terminal.Run = terminal.Run(), - progress: terminal.Progress = terminal.Progress(), - verbose: bool = True - ) -> None: - """ - Parameters - ========== - run : anvio.terminal.Run, anvio.terminal.Run() - This object sets the 'run' attribute, which prints run information to the terminal. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object sets the 'progress' attribute, which prints transient progress information - to the terminal. - - verbose : bool, True - This sets the 'verbose' attribute, causing more information to be reported to the - terminal if True. 
- - Returns - ======= - None - """ - super().__init__(run=run, progress=progress, verbose=verbose) - self.pan_db_source_path: str = None - self.genomes_storage_db_source_path: str = None - self.consensus_threshold: float = None - self.discard_ties: bool = None - self.consistent_annotations: bool = None - self.gene_clusters: Dict[str, GeneCluster] = {} - self.bins: Dict[str, GeneClusterBin] = {} - self.collection: BinCollection = None - - def copy(self) -> PangenomicNetwork: - """ - Create a deep copy of the reaction network. - - Returns - ======= - PangenomicNetwork - Deep copy of the reaction network. - """ - copied_network = PangenomicNetwork() - - self._copy(copied_network) - - for cluster_id, cluster in self.gene_clusters.items(): - copied_cluster = GeneCluster() - copied_cluster.gene_cluster_id = cluster_id - copied_cluster.genomes = cluster.genomes.copy() - copied_cluster.ko = copied_network.kos[cluster.ko.id] - - copied_network.gene_clusters[cluster_id] = copied_cluster - - return copied_network - - def remove_metabolites_without_formula(self, output_path: str = None) -> None: - """ - Remove metabolites without a formula in the ModelSEED database from the network. - - Other items can be removed from the network by association: reactions that involve a - formulaless metabolite; other metabolites with formulas that are exclusive to such - reactions; KOs predicted to exclusively catalyze such reactions; and gene clusters annotated - with such KOs. Removed metabolites with a formula are reported alongside formulaless - metabolites to the output table of removed metabolites. - - output_path : str, None - If not None, write four tab-delimited files of metabolites, reactions, KEGG Orthologs, - and gene clusters removed from the network to file locations based on the provided path. 
- For example, if the argument, 'removed.tsv', is provided, then the following files will - be written: 'removed-metabolites.tsv', 'removed-reactions.tsv', 'removed-kos.tsv', and - 'removed-gene-clusters.tsv'. - """ - if output_path: - path_basename, path_extension = os.path.splitext(output_path) - metabolite_path = f"{path_basename}-metabolites{path_extension}" - reaction_path = f"{path_basename}-reactions{path_extension}" - ko_path = f"{path_basename}-kos{path_extension}" - gene_cluster_path = f"{path_basename}-gene-clusters{path_extension}" - for path in (metabolite_path, reaction_path, ko_path, gene_cluster_path): - filesnpaths.is_output_file_writable(path) - - metabolites_to_remove = [] - for modelseed_compound_id, metabolite in self.metabolites.items(): - # ModelSEED compounds without a formula have a formula value of None in the network - # object. - if metabolite.formula is None: - metabolites_to_remove.append(modelseed_compound_id) - removed = self.purge_metabolites(metabolites_to_remove) - - if self.verbose: - self.run.info("Removed metabolites", len(removed['metabolite'])) - self.run.info("Removed reactions", len(removed['reaction'])) - self.run.info("Removed KOs", len(removed['ko'])) - self.run.info("Removed gene clusters", len(removed['gene_cluster'])) - - if not output_path: - return - - # Record the reactions removed as a consequence of involving formulaless metabolites, and - # record the formulaless metabolites involved in removed reactions. 
- metabolite_removed_reactions: Dict[str, List[str]] = {} - reaction_removed_metabolites: Dict[str, List[str]] = {} - for reaction in removed['reaction']: - reaction: ModelSEEDReaction - reaction_removed_metabolites[reaction.modelseed_id] = metabolite_ids = [] - for metabolite in reaction.compounds: - if metabolite.modelseed_id in metabolites_to_remove: - try: - metabolite_removed_reactions[metabolite.modelseed_id].append( - reaction.modelseed_id - ) - except KeyError: - metabolite_removed_reactions[metabolite.modelseed_id] = [ - reaction.modelseed_id - ] - metabolite_ids.append(metabolite.modelseed_id) - - metabolite_table = [] - for metabolite in removed['metabolite']: - metabolite: ModelSEEDCompound - row = [] - row.append(metabolite.modelseed_id) - row.append(metabolite.modelseed_name) - row.append(metabolite.formula) - try: - # The metabolite did not have a formula. - removed_reaction_ids = metabolite_removed_reactions[metabolite.modelseed_id] - except KeyError: - # The metabolite had a formula but was removed as a consequence of all the reactions - # involving the metabolite being removed due to them containing formulaless - # metabolites: the metabolite did not cause any reactions to be removed. - row.append("") - continue - # The set accounts for the theoretical possibility that a compound is present on both - # sides of the reaction equation and thus the reaction is recorded multiple times. - row.append(", ".join(sorted(set(removed_reaction_ids)))) - - reaction_table = [] - for reaction in removed['reaction']: - reaction: ModelSEEDReaction - row = [] - row.append(reaction.modelseed_id) - row.append(reaction.modelseed_name) - # The set accounts for the theoretical possibility that a compound is present on both - # sides of the reaction equation and thus is recorded multiple times. 
- row.append( - ", ".join(set(reaction_removed_metabolites[reaction.modelseed_id])) - ) - row.append(", ".join([metabolite.modelseed_id for metabolite in reaction.compounds])) - row.append(get_chemical_equation(reaction)) - reaction_table.append(row) - - ko_table = [] - for ko in removed['ko']: - ko: KO - row = [] - row.append(ko.id) - row.append(ko.name) - row.append(", ".join(ko.reactions)) - ko_table.append(row) - - gene_cluster_table = [] - for cluster in removed['gene_cluster']: - cluster: GeneCluster - row = [] - row.append(cluster.gene_cluster_id) - row.append(cluster.ko.id) - row.append(", ".join(cluster.genomes)) - gene_cluster_table.append(row) - - pd.DataFrame( - metabolite_table, - columns=[ - "ModelSEED compound ID", - "ModelSEED compound name", - "Formula", - "Removed reaction ModelSEED IDs" - ] - ).to_csv(metabolite_path, sep='\t', index=False) - pd.DataFrame( - reaction_table, - columns=[ - "ModelSEED reaction ID", - "ModelSEED reaction name", - "Removed ModelSEED compound IDs", - "Reaction ModelSEED compound IDs", - "Equation" - ] - ).to_csv(reaction_path, sep='\t', index=False) - pd.DataFrame( - ko_table, - columns=[ - "KO ID", - "KO name", - "KO ModelSEED reaction IDs" - ] - ).to_csv(ko_path, sep='\t', index=False) - pd.DataFrame( - gene_cluster_table, - columns=[ - "Gene cluster ID", - "KO ID", - "Gene cluster genomes" - ] - ).to_csv(gene_cluster_path, sep='\t', index=False) - - if self.verbose: - self.run.info("Table of removed metabolites", metabolite_path) - self.run.info("Table of removed reactions", reaction_path) - self.run.info("Table of removed KOs", ko_path) - self.run.info("Table of removed gene clusters", gene_cluster_path) - - def purge_metabolites(self, metabolites_to_remove: Iterable[str]) -> Dict[str, List]: - """ - Remove any trace of the given metabolites from the network. - - Reactions involving the metabolite are also purged from the network. 
KOs that were only - associated with removed reactions are purged; gene clusters that were only associated with - removed KOs are purged. - - Removal of reactions involving the metabolite can also result in other metabolites being - being removed from the network, those that exclusively participate in these reactions. - - Parameters - ========== - metabolites_to_remove : Iterable[str] - ModelSEED compound IDs identifying metabolites to remove. - - Returns - ======= - dict - This dictionary contains data removed from the network. - - If this method is NOT called from the method, 'purge_reactions', then the dictionary - will look like the following: - { - 'metabolite': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene_cluster': [] - } - - If this method is called from the method, 'purge_reactions', then the dictionary will - only contain one significant entry: - { - 'metabolite': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene_cluster': [] - } - """ - removed_metabolites: List[ModelSEEDCompound] = [] - for compound_id in metabolites_to_remove: - try: - removed_metabolites.append(self.metabolites.pop(compound_id)) - except KeyError: - # This can occur for two reasons. First, the metabolite from 'metabolites_to_remove' - # could not be in the network. - - # Second, this can occur when removing other "unintended" metabolites from the - # network. 'purge_metabolites' was first called with metabolites of interest, then - # 'purge_reactions' was called from within the method the remove reactions involving - # the metabolites of interest, and then 'purge_metabolites' was called again from - # within 'purge_reactions' to remove other metabolites exclusively found in the - # removed reactions. In this last call of 'purge_metabolites', the - # 'metabolites_to_remove' also include the metabolites of interest that were already - # removed from 'self.metabolites' in the original 'purge_metabolites' call. 
This - # KeyError occurs when trying to remove those already-removed metabolites. - pass - if not removed_metabolites: - return { - 'metabolite': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene_cluster': [] - } - - reactions_to_remove = [] - for modelseed_reaction_id, reaction in self.reactions.items(): - for compound in reaction.compounds: - if compound.modelseed_id in metabolites_to_remove: - reactions_to_remove.append(modelseed_reaction_id) - break - - removed = {'metabolite': removed_metabolites} - if reactions_to_remove: - removed_cascading_up = self.purge_reactions(reactions_to_remove) - # There may be other metabolites exclusively involved in the removed reactions; these - # metabolites were therefore also removed. - removed['metabolite'] = removed_metabolites + removed_cascading_up.pop('metabolite') - else: - # This method must have been called from the method, 'purge_reactions', because the - # reactions containing the metabolites were already removed from the network. - removed_cascading_up = { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene_cluster': [] - } - removed.update(removed_cascading_up) - return removed - - def purge_reactions(self, reactions_to_remove: Iterable[str]) -> Dict[str, List]: - """ - Remove any trace of the given reactions from the network. - - Metabolites that exclusively participate in removed reactions are purged. KOs that were only - associated with removed reactions are purged; gene clusters that were only associated with - removed KOs are purged. - - Parameters - ========== - reactions_to_remove : Iterable[str] - ModelSEED reaction IDs identifying reactions to remove. - - Returns - ======= - dict - This dictionary contains data removed from the network. 
- - If this method is NOT called from the method, 'purge_metabolites', or the method, - 'purge_kos', then the dictionary will look like the following: - { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'ko': [], - 'gene_cluster': [] - } - - If this method is called from the method, 'purge_metabolites', then the dictionary will - look like the following: - { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'ko': [], - 'gene_cluster': [] - } - - If this method is called from the method, 'purge_kos', then the dictionary will look - like the following: - { - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [ Dict[str, List]: - """ - Remove any trace of the given KOs from the network. - - Reactions and metabolites that were only associated with removed KOs are purged. Genes that - were only associated with removed KOs are purged. - - Parameters - ========== - kos_to_remove : Iterable[str] - KO IDs identifying KOs to remove. - - Returns - ======= - dict - This dictionary contains data removed from the network. - - If this method is NOT called from the method, 'purge_reactions', or the method, - 'purge_gene_clusters', then the dictionary will look like the following: - { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'gene_cluster': [] - } - - If this method is called from the method, 'purge_reactions', then the dictionary will - look like the following: - { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [], - 'gene_cluster': [] - } - - If this method is called from the method, 'purge_gene_clusters', then the dictionary - will look like the following: - { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [ Dict[str, List]: - """ - Remove any trace of the given gene clusters from the network. 
- - KOs, reactions, and metabolites that were only associated with removed gene clusters are - purged. - - Parameters - ========== - gene_clusters_to_remove : Iterable[str] - Gene cluster IDs identifying clusters to remove. - - Returns - ======= - dict - This dictionary contains data removed from the network. - - If this method is NOT called from the method, 'purge_kos', then the dictionary will - look like the following: - { - 'gene_cluster': [], - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [] - } - - If this method is called from the method, 'purge_kos', then the dictionary will look - like the following: - { - 'gene_cluster': [], - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [] - } - """ - removed_gene_clusters: List[GeneCluster] = [] - for gene_cluster_id in gene_clusters_to_remove: - try: - removed_gene_clusters.append(self.gene_clusters.pop(gene_cluster_id)) - except KeyError: - # This occurs if the cluster in 'gene_clusters_to_remove' is not in the network. - pass - - if not removed_gene_clusters: - return { - 'metabolite': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'ko': [], - 'gene_cluster': [] - } - - kos_to_remove: List[str] = [] - for cluster in removed_gene_clusters: - kos_to_remove.append(cluster.ko.id) - kos_to_remove = list(set(kos_to_remove)) - for gene_cluster in self.gene_clusters.values(): - kos_to_spare: List[str] = [] - if gene_cluster.ko.id in kos_to_remove: - # The KO is associated with a retained gene cluster, so do not remove the KO. 
- kos_to_spare.append(gene_cluster.ko.id) - for ko_id in kos_to_spare: - kos_to_remove.remove(ko_id) - if kos_to_remove: - removed_cascading_down = self.purge_kos(kos_to_remove) - removed_cascading_down.pop('gene_cluster') - else: - # This method must have been called from the method, 'purge_kos', because the KOs that - # are only associated with the removed gene clusters were already removed from the - # network. - removed_cascading_down = { - 'ko': [], - 'reaction': [], - 'kegg_reaction': [], - 'ec_number': [], - 'metabolite': [] - } - - # TODO: remove gene clusters from self.bins - - removed = {'gene_cluster': removed_gene_clusters} - removed.update(removed_cascading_down) - return removed - - def subset_network( - self, - kegg_modules_to_subset: Iterable[str] = None, - brite_categories_to_subset: Iterable[str] = None, - gene_clusters_to_subset: Iterable[int] = None, - kos_to_subset: Iterable[str] = None, - reactions_to_subset: Iterable[str] = None, - metabolites_to_subset: Iterable[str] = None - ) -> PangenomicNetwork: - """ - Subset a smaller network from the metabolic network. - - If requested KEGG modules, BRITE categories, gene clusters, KOs, reactions, or metabolites - are not present in the network, no error is raised. - - Subsetted items are not represented by the same objects as in the source network, i.e., new - gene cluster, KO, reaction, and metabolite objects are created and added to the subsetted - network. - - Network items (i.e., gene clusters, KOs, reactions, and metabolites) that reference - requested items (e.g., gene clusters in the network referencing requested KOs; KOs in the - network referencing requested reactions) are added to the subsetted network. KOs (and by - extension, gene clusters referencing KOs) that are added to the subsetted network due to - references to requested reactions will be missing references to any other unrequested - reactions. 
In other words, certain reaction annotations can be selected to the exclusion of - others, e.g., a KO encoding two reactions can be "redefined" or "pruned" to encode one - requested reaction in the subsetted network; a KO encoding multiple reactions can be pruned - to encode only those reactions involving requested metabolites. - - If the 'verbose' attribute of the source 'PangenomicNetwork' object is True, then report to - the terminal the identities of requested KEGG modules, BRITE categories, gene clusters, KOs, - reactions, and metabolites that are not present in the network. - - Parameters - ========== - kegg_modules_to_subset : List[str], None - KEGG modules (of KOs) to subset by ID. - - brite_categories_to_subset : List[str], None - KEGG BRITE hierarchy categories (of KOs) to subset. - - gene_clusters_to_subset : List[int], None - Gene clusters to subset by ID. - - kos_to_subset : List[str], None - KOs to subset by ID. - - reactions_to_subset : List[str], None - ModelSEED reactions to subset by ID. - - metabolites_to_subset : List[str], None - ModelSEED metabolites to subset by ID. - - Returns - ======= - PangenomicNetwork - New subsetted reaction network. - """ - # Sequentially subset the network for each type of request. Upon generating two subsetted - # networks from two types of request, merge the networks into a single subsetted network; - # repeat. 
- first_subnetwork = None - for items_to_subset, subset_network_method in ( - (kegg_modules_to_subset, self._subset_network_by_modules), - (brite_categories_to_subset, self._subset_network_by_brite), - (gene_clusters_to_subset, self._subset_network_by_gene_clusters), - (kos_to_subset, self._subset_network_by_kos), - (reactions_to_subset, self._subset_network_by_reactions), - (metabolites_to_subset, self._subset_network_by_metabolites) - ): - if not items_to_subset: - continue - - second_subnetwork = subset_network_method(items_to_subset) - - if first_subnetwork is None: - first_subnetwork = second_subnetwork - else: - first_subnetwork = first_subnetwork.merge_network(second_subnetwork) - - return first_subnetwork - - def _subset_network_by_modules(self, kegg_modules: Iterable[str]) -> PangenomicNetwork: - """ - Subset the network by KOs in requested KEGG modules. - - Parameters - ========== - kegg_modules : Iterable[str] - KEGG modules (of KOs) to subset by ID. - - Returns - ======= - PangenomicNetwork - New subsetted reaction network. - """ - pass - - def _subset_network_by_brite(self, brite_categories: Iterable[str]) -> PangenomicNetwork: - """ - Subset the network by KOs in requested KEGG BRITE hierarchy categories. - - Parameters - ========== - brite_categories : Iterable[str] - KEGG BRITE hierarchy categories (of KOs) to subset. - - Returns - ======= - PangenomicNetwork - New subsetted reaction network. - """ - pass - - def _subset_network_by_gene_clusters( - self, - gene_cluster_ids: Iterable[int] - ) -> PangenomicNetwork: - """ - Subset the network by gene clusters with requested IDs. - - Parameters - ========== - gene_cluster_ids : Iterable[int] - Gene clusters to subset by ID. - - Returns - ======= - PangenomicNetwork - New subsetted reaction network. 
- """ - subnetwork = PangenomicNetwork() - - for gene_cluster_id in gene_cluster_ids: - try: - cluster = self.gene_clusters[gene_cluster_id] - except KeyError: - # This occurs if the requested gene cluster ID is not in the source network. - continue - - subsetted_cluster = GeneCluster() - subsetted_cluster.gene_cluster_id = gene_cluster_id - subsetted_cluster.genomes = cluster.genomes.copy() - - # Add KOs annotating the gene cluster to the subsetted network as new objects, and then - # reference these objects in the cluster object. - ko_id = cluster.ko.id - subnetwork = self._subset_network_by_kos([ko_id], subnetwork=subnetwork) - subsetted_cluster.ko = subnetwork.kos[ko_id] - - subnetwork.gene_clusters[gene_cluster_id] = subsetted_cluster - - return subnetwork - - def _subset_network_by_kos( - self, - ko_ids: Iterable[str], - subnetwork: PangenomicNetwork = None - ) -> PangenomicNetwork: - """ - Subset the network by KOs with requested KO IDs. - - Parameters - ========== - ko_ids : Iterable[str] - KOs to subset by ID. - - subnetwork : PangenomicNetwork, None - This network under construction is provided when the KOs being added to the network - annotate already subsetted gene clusters. - - Returns - ======= - PangenomicNetwork - If a 'subnetwork' argument is provided, then that network is returned after - modification. Otherwise, a new subsetted reaction network is returned. - """ - if subnetwork is None: - subnetwork = PangenomicNetwork() - # Signify that gene clusters annotated by subsetted KOs are to be added to the network. - subset_referencing_gene_clusters = True - else: - assert isinstance(subnetwork, PangenomicNetwork) - # Signify that the KOs being added to the network annotate subsetted gene clusters that - # were already added to the network. - subset_referencing_gene_clusters = False - - for ko_id in ko_ids: - try: - ko = self.kos[ko_id] - except KeyError: - # This occurs if the requested KO ID is not in the source network. 
- continue - - subsetted_ko = KO() - subsetted_ko.id = ko.id - subsetted_ko.name = ko.name - subsetted_ko.kegg_reaction_aliases = deepcopy(ko.kegg_reaction_aliases) - subsetted_ko.ec_number_aliases = deepcopy(ko.ec_number_aliases) - - # Add reactions annotating the KO to the subsetted network as new objects, and then - # reference these objects in the KO object. - reaction_ids = [reaction_id for reaction_id in ko.reactions] - subnetwork = self._subset_network_by_reactions(reaction_ids, subnetwork=subnetwork) - subsetted_ko.reactions = { - reaction_id: subnetwork.reactions[reaction_id] for reaction_id in reaction_ids - } - - subnetwork.kos[ko_id] = subsetted_ko - - if subset_referencing_gene_clusters: - # Add gene clusters that are annotated by the subsetted KOs to the network. - self._subset_gene_clusters_via_kos(subnetwork) - - return subnetwork - - def _subset_network_by_reactions( - self, - reaction_ids: Iterable[str], - subnetwork: PangenomicNetwork = None - ) -> PangenomicNetwork: - """ - Subset the network by reactions with ModelSEED reaction IDs. - - Parameters - ========== - reaction_ids : Iterable[str] - Reactions to subset by ModelSEED reaction ID. - - subnetwork : PangenomicNetwork, None - This network under construction is provided when the reactions being added to the - network annotate already subsetted KOs. - - Returns - ======= - PangenomicNetwork - If a 'subnetwork' argument is provided, then that network is returned after - modification. Otherwise, a new subsetted reaction network is returned. - """ - if subnetwork is None: - subnetwork = PangenomicNetwork() - # Signify that KOs annotated by subsetted reactions are to be added to the network. - subset_referencing_kos = True - else: - assert isinstance(subnetwork, PangenomicNetwork) - # Signify that the reactions being added to the network annotate subsetted KOs that were - # already added to the network. - subset_referencing_kos = False - - # Copy the network attributes mapping reaction aliases. 
- kegg_modelseed_aliases: Dict[str, List[str]] = {} - ec_number_modelseed_aliases: Dict[str, List[str]] = {} - - for reaction_id in reaction_ids: - try: - reaction = self.reactions[reaction_id] - except KeyError: - # This occurs if the requested reaction is not in the source network. - continue - - # Copy the reaction object, including referenced metabolite objects, from the source - # network. - subsetted_reaction: ModelSEEDReaction = deepcopy(reaction) - subnetwork.reactions[reaction_id] = subsetted_reaction - # Record the metabolites involved in the reaction, and add them to the network. - for metabolite in subsetted_reaction.compounds: - compound_id = metabolite.modelseed_id - subnetwork.metabolites[compound_id] = metabolite - - try: - subnetwork.modelseed_kegg_aliases[reaction_id] += list(reaction.kegg_aliases) - except KeyError: - subnetwork.modelseed_kegg_aliases[reaction_id] = list(reaction.kegg_aliases) - - try: - subnetwork.modelseed_ec_number_aliases[reaction_id] += list( - reaction.ec_number_aliases - ) - except KeyError: - subnetwork.modelseed_ec_number_aliases[reaction_id] = list( - reaction.ec_number_aliases - ) - - for kegg_id in reaction.kegg_aliases: - try: - kegg_modelseed_aliases[kegg_id].append(reaction_id) - except KeyError: - kegg_modelseed_aliases[kegg_id] = [reaction_id] - - for ec_number in reaction.ec_number_aliases: - try: - ec_number_modelseed_aliases[ec_number].append(reaction_id) - except KeyError: - ec_number_modelseed_aliases[ec_number] = [reaction_id] - - if subnetwork.kegg_modelseed_aliases: - for kegg_id, modelseed_ids in kegg_modelseed_aliases.items(): - try: - subnetwork.kegg_modelseed_aliases[kegg_id] += modelseed_ids - except KeyError: - subnetwork.kegg_modelseed_aliases[kegg_id] = modelseed_ids - else: - subnetwork.kegg_modelseed_aliases = kegg_modelseed_aliases - - if subnetwork.ec_number_modelseed_aliases: - for ec_number, modelseed_ids in ec_number_modelseed_aliases.items(): - try: - 
subnetwork.ec_number_modelseed_aliases[ec_number] += modelseed_ids - except KeyError: - subnetwork.ec_number_modelseed_aliases[ec_number] = modelseed_ids - else: - subnetwork.ec_number_modelseed_aliases = ec_number_modelseed_aliases - - if subset_referencing_kos: - # Add KOs that are annotated by the subsetted reactions to the network. - self._subset_kos_via_reactions(subnetwork) - - return subnetwork - - def _subset_gene_clusters_via_kos(self, subnetwork: PangenomicNetwork) -> None: - """ - Add gene clusters that are annotated with subsetted KOs to the subsetted network. - - Parameters - ========== - subnetwork : PangenomicNetwork - The subsetted reaction network under construction. - - Returns - ======= - None - """ - subsetted_ko_ids = list(subnetwork.kos) - for gene_cluster_id, cluster in self.gene_clusters.items(): - # Check all gene clusters in the source network for subsetted KOs. - if cluster.ko.id in subsetted_ko_ids: - # Create a new gene cluster object for the subsetted cluster. - subsetted_cluster = GeneCluster() - subsetted_cluster.gene_cluster_id = gene_cluster_id - subsetted_cluster.genomes = cluster.genomes.copy() - subsetted_cluster.ko = subnetwork.kos[cluster.ko.id] - subnetwork.gene_clusters[gene_cluster_id] = subsetted_cluster - - def _subset_kos_via_reactions(self, subnetwork: PangenomicNetwork) -> None: - """ - Add KOs that are annotated with subsetted reactions to the subsetted network. - - Then add gene clusters that are annotated with these added KOs to the subsetted network. - - Parameters - ========== - subnetwork : PangenomicNetwork - The subsetted reaction network under construction. - - Returns - ======= - None - """ - subsetted_reaction_ids = list(subnetwork.reactions) - for ko_id, ko in self.kos.items(): - # Check all KOs in the source network for subsetted reactions. - subsetted_ko = None - for reaction_id in ko.reactions: - if reaction_id not in subsetted_reaction_ids: - # The KO is not annotated by the subsetted reaction. 
- continue - - if not subsetted_ko: - # Create a new KO object for the subsetted KO. The subsetted KO object would - # already have been created had another subsetted reaction been among the - # reactions annotating the KO. - subsetted_ko = KO() - subsetted_ko.id = ko_id - subsetted_ko.name = ko.name - subsetted_ko.reactions[reaction_id] = subnetwork.reactions[reaction_id] - subsetted_ko.kegg_reaction_aliases = deepcopy(ko.kegg_reaction_aliases) - subsetted_ko.ec_number_aliases = deepcopy(ko.ec_number_aliases) - - if subsetted_ko: - subnetwork.kos[ko_id] = subsetted_ko - - # Add gene clusters that are annotated with the added KOs to the subsetted network. - self._subset_gene_clusters_via_kos(subnetwork) - - def _subset_network_by_metabolites(self, compound_ids: List[str]) -> PangenomicNetwork: - """ - Subset the network by metabolites with ModelSEED compound IDs. - - Parameters - ========== - compound_ids : List[str] - List of metabolites to subset by ModelSEED compound ID. - - Returns - ======= - PangenomicNetwork - New subsetted reaction network. - """ - subnetwork = PangenomicNetwork() - - for reaction_id, reaction in self.reactions.items(): - # Check all reactions in the source network for subsetted metabolites. - for metabolite in reaction.compounds: - if metabolite.modelseed_id in compound_ids: - break - else: - # The reaction does not involve any of the requested metabolites. - continue - - # Copy the reaction object, including referenced metabolite objects, from the source - # network. - subsetted_reaction: ModelSEEDReaction = deepcopy(reaction) - subnetwork.reactions[reaction_id] = subsetted_reaction - - # Add the metabolites involved in the reaction to the subsetted network. (There can be - # unavoidable redundancy here in readding previously encountered metabolites.) 
- for subsetted_metabolite in subsetted_reaction.compounds: - subnetwork.metabolites[subsetted_metabolite.modelseed_id] = subsetted_metabolite - - # Add KOs that are annotated with the added reactions to the subsetted network, and then add - # gene_clusters annotated with the added KOs to the subsetted network. - self._subset_kos_via_reactions(subnetwork) - - return subnetwork - - def merge_network(self, network: PangenomicNetwork) -> PangenomicNetwork: - """ - Merge the pangenomic reaction network with another pangenomic reaction network. - - Each network can contain different gene clusters, KOs, and reactions/metabolites. Merging - nonredundantly incorporates all of this data as new objects in the new network. - - Objects representing KOs in both networks can have different sets of references: KOs can be - annotated by different reactions. - - However, the same gene cluster in each network should have the same consensus KO annotation: - the method of annotation should have been the same in each network. This reflects the - purpose of the method, which is to combine different, but potentially overlapping, - subnetworks from the same pangenome. - - ModelSEED reactions and metabolites in both networks should have identical attributes. - - Parameters - ========== - network : PangenomicNetwork - The other pangenomic reaction network being merged. - - Returns - ======= - PangenomicNetwork - The merged pangenomic reaction network. - """ - merged_network = PangenomicNetwork() - - self._merge_network(network, merged_network) - - # Add gene clusters to the merged network, first adding clusters present in both source - # networks, and then adding clusters present exclusively in each source network. 
- first_gene_cluster_ids = set(self.gene_clusters) - second_gene_cluster_ids = set(network.gene_clusters) - - for gene_cluster_id in first_gene_cluster_ids.intersection(second_gene_cluster_ids): - first_cluster = self.gene_clusters[gene_cluster_id] - second_cluster = network.gene_clusters[gene_cluster_id] - - # The new object representing the gene cluster in the merged network should have all KO - # annotations from each source cluster object, as these objects can have different KO - # references. - merged_cluster = GeneCluster() - merged_cluster.gene_cluster_id = gene_cluster_id - assert first_cluster.genomes == second_cluster.genomes - merged_cluster.genomes = first_cluster.genomes.copy() - assert first_cluster.ko.id == second_cluster.ko.id - merged_cluster.ko = merged_network.kos[first_cluster.ko.id] - - merged_network.gene_clusters[gene_cluster_id] = merged_cluster - - for gene_cluster_id in first_gene_cluster_ids.difference(second_gene_cluster_ids): - first_cluster = self.gene_clusters[gene_cluster_id] - - cluster = GeneCluster() - cluster.gene_cluster_id = gene_cluster_id - cluster.genomes = first_cluster.genomes.copy() - cluster.ko = merged_network.kos[first_cluster.ko.id] - - merged_network.gene_clusters[gene_cluster_id] = cluster - - for gene_cluster_id in second_gene_cluster_ids.difference(first_gene_cluster_ids): - second_cluster = network.gene_clusters[gene_cluster_id] - - cluster = GeneCluster() - cluster.gene_cluster_id = gene_cluster_id - cluster.genomes = second_cluster.genomes.copy() - cluster.ko = merged_network.kos[second_cluster.ko.id] - - merged_network.gene_clusters[gene_cluster_id] = cluster - - return merged_network - - def get_overview_statistics( - self, - precomputed_counts: Dict[str, int] = None - ) -> PangenomicNetworkStats: - """ - Calculate overview statistics for the pangenomic metabolic network. 
- - Parameters - ========== - precomputed_counts : Dict[str, int], None - To spare additional computations that involve loading and parsing databases, this - dictionary can contain three pieces of precomputed data: the value for the key, - 'total_gene_clusters', should be the number of gene clusters in the pangenome; the value - for the key, 'gene_clusters_assigned_ko', should be the number of gene clusters in the - pangenome assigned a consensus KO (or None if 'self.consistent_annotations' is False); - the value for the key, 'kos_assigned_gene_clusters', should be the number of consensus - KOs assigned to gene clusters in the pangenome (or None if 'self.consistent_annotations' - is False). - - Returns - ======= - PangenomicNetworkStats - Network statistics are stored in a dictionary of dictionaries. Keys in the outer - dictionary are "classes" of network statistics. Keys in the inner dictionary are - statistics themselves. - """ - if ( - precomputed_counts is not None and - sorted(precomputed_counts) != [ - 'gene_clusters_assigned_ko', 'kos_assigned_gene_clusters', 'total_gene_clusters' - ] - ): - raise ConfigError( - "The 'precomputed_counts' argument must be a dictionary only containing the keys, " - "'total_gene_clusters', 'gene_clusters_assigned_ko', and " - "'kos_assigned_gene_clusters'." 
- ) - - stats: PangenomicNetworkStats = {} - - self.progress.new("Counting gene clusters and KOs") - self.progress.update("...") - stats['Gene cluster and KO counts'] = stats_group = {} - - if precomputed_counts: - assert ( - type(precomputed_counts['total_gene_clusters']) is int and - precomputed_counts['total_gene_clusters'] >= 0 - ) - gene_cluster_count = precomputed_counts['total_gene_clusters'] - assert ( - precomputed_counts['gene_clusters_assigned_ko'] is None or - ( - type(precomputed_counts['gene_clusters_assigned_ko']) is int and - precomputed_counts['gene_clusters_assigned_ko'] >= 0 - ) - ) - ko_annotated_gene_cluster_count = precomputed_counts['gene_clusters_assigned_ko'] - assert ( - precomputed_counts['kos_assigned_gene_clusters'] is None or - ( - type(precomputed_counts['kos_assigned_gene_clusters']) is int and - precomputed_counts['kos_assigned_gene_clusters'] >= 0 - ) - ) - annotating_ko_count = precomputed_counts['kos_assigned_gene_clusters'] - assert not ( - (ko_annotated_gene_cluster_count is None and annotating_ko_count is not None) or - (ko_annotated_gene_cluster_count is not None and annotating_ko_count is None) - ) - else: - # One database cannot be available without the other. 
- assert not ( - ( - self.pan_db_source_path is None and - self.genomes_storage_db_source_path is not None - ) or - ( - self.pan_db_source_path is not None and - self.genomes_storage_db_source_path is None - ) - ) - - if self.pan_db_source_path and self.genomes_storage_db_source_path: - pdb = PanDatabase(self.pan_db_source_path) - gene_cluster_count = pdb.meta['num_gene_clusters'] - pdb.disconnect() - else: - gene_cluster_count = None - - if ( - self.pan_db_source_path and - self.genomes_storage_db_source_path and - self.consistent_annotations is False - ): - args = argparse.Namespace() - args.genomes_storage = self.genomes_storage_db_source_path - args.consensus_threshold = self.consensus_threshold - args.discard_ties = self.discard_ties - pan_super = PanSuperclass(args, r=run_quiet) - pan_super.init_gene_clusters() - pan_super.init_gene_clusters_functions() - pan_super.init_gene_clusters_functions_summary_dict() - gene_clusters_functions_summary_dict: Dict = ( - pan_super.gene_clusters_functions_summary_dict - ) - ko_annotated_gene_cluster_count = 0 - ko_ids = [] - for gene_cluster_functions_data in gene_clusters_functions_summary_dict.values(): - gene_cluster_ko_data = gene_cluster_functions_data['KOfam'] - if gene_cluster_ko_data != {'function': None, 'accession': None}: - # A KO was assigned to the cluster. 
- ko_annotated_gene_cluster_count += 1 - ko_ids.append(gene_cluster_ko_data['accession']) - annotating_ko_count = len(set(ko_ids)) - else: - ko_annotated_gene_cluster_count = None - annotating_ko_count = None - - if gene_cluster_count is not None: - stats_group['Total gene clusters in pangenome'] = gene_cluster_count - if ko_annotated_gene_cluster_count is not None: - stats_group['Gene clusters assigned protein KO'] = ko_annotated_gene_cluster_count - stats_group['Gene clusters in network'] = len(self.gene_clusters) - if annotating_ko_count is not None: - stats_group['Protein KOs assigned to gene clusters'] = annotating_ko_count - stats_group['KOs in network'] = len(self.kos) - self.progress.end() - - self._get_common_overview_statistics(stats) - - if precomputed_counts: - return stats - - if not (self.pan_db_source_path and self.genomes_storage_db_source_path): - self.run.info_single( - f"""\ - Since the pangenomic network was not associated with a pan database and genomes - storage database, the following statistics could not be calculated and were not - reported to the output file: 'Total gene clusters in pangenome', 'Gene clusters - assigned protein KOs', and 'Protein KOs assigned to gene clusters'.\ - """ - ) - elif self.consistent_annotations is False: - self.run.info_single( - f"""\ - The network attribute, 'consistent_annotations', is False, which indicates that the - reaction network stored in the pan database was made from a different set of KO gene - annotations than is currently in the genomes storage database. Therefore, the - following statistics were not calculated and reported to the output file to avoid - potential inaccuracies: 'Gene clusters assigned protein KO' and 'Protein KOs - assigned to gene clusters'.\ - """ - ) - - return stats - - def print_overview_statistics(self, stats: GenomicNetworkStats = None) -> None: - """ - Print overview statistics for the genomic metabolic network. 
- - Parameters - ========== - stats : GenomicNetworkStats, None - With the default value of None, network statistics will be calculated and printed. - Alternatively, provided network statistics will be printed without calculating anew. - - Returns - ======= - None - """ - if not stats: - stats = self.get_overview_statistics() - - self.run.info_single("METABOLIC REACTION NETWORK STATISTICS", mc='green', nl_after=1) - - self.run.info_single("Gene clusters and KEGG Ortholog (KO) annotations") - stats_group = stats['Gene cluster and KO counts'] - self.run.info( - "Total gene clusters in pangenome", stats_group['Total gene clusters in pangenome'] - ) - self.run.info( - "Gene clusters annotated with protein KO", - stats_group['Gene clusters assigned protein KO'] - ) - self.run.info("Gene clusters in network", stats_group['Gene clusters in network']) - self.run.info( - "Protein KOs assigned to gene clusters", - stats_group['Protein KOs assigned to gene clusters'] - ) - self.run.info("KOs in network", stats_group['KOs in network'], nl_after=1) - - self._print_common_overview_statistics(stats) - - def export_json( - self, - path: str, - overwrite: bool = False, - objective: str = None, - remove_missing_objective_metabolites: bool = False, - record_genomes: Tuple[str] = ('gene', 'reaction'), - # record_bins: Tuple[str] = ('gene', 'reaction'), - indent: int = 2, - progress: terminal.Progress = terminal.Progress() - ) -> None: - """ - Export the network to a metabolic model file in JSON format. Entries in the "gene" section - of this file represent gene clusters. - - All information from the network is included in the JSON so that the file can by imported by - anvi'o as a PangenomicNetwork object containing the same information. - - Parameters - ========== - path : str - output JSON file path - - overwrite : bool, False - Overwrite the JSON file if it already exists. 
- - objective : str, None - An objective to use in the model, stored as the first entry in the JSON 'reactions' - array. Currently, the only valid options are None and 'e_coli_core'. - - None means that no objective is added to the JSON, meaning that FBA cannot be performed - on the model. - - 'e_coli_core' is the biomass objective from the COBRApy example JSON file of E. coli - "core" metabolism, 'e_coli_core.json'. - - remove_missing_objective_metabolites : bool, False - If True, remove metabolites from the JSON objective that are not produced or consumed in - the reaction network. FBA fails with metabolites outside the network. - - record_genomes : tuple, ('gene cluster', 'reaction') - Record the genome membership of gene clusters in JSON entries. By default, genome names - are recorded for gene clusters and reactions with the argument, ('gene cluster', - 'reaction'). To not record genomes at all, pass either an empty tuple or None. The - following valid strings can be provided in a tuple in any combination: 'gene cluster', - 'reaction', and 'metabolite'. 'reaction' and 'metabolite' record the genomes predicted - to encode enzymes associated with reactions and metabolites, respectively. 
- - indent : int, 2 - spaces of indentation per nesting level in JSON file - - progress : terminal.Progress, terminal.Progress() - """ - if record_genomes is None: - record_genomes = () - valid_items = ('gene cluster', 'reaction', 'metabolite') - invalid_items = [] - for item in record_genomes: - if item not in valid_items: - invalid_items.append(item) - if invalid_items: - raise ConfigError( - f"The following items in the 'record_genomes' argument are invalid: {', '.join(invalid_items)}" - ) - - progress.new("Constructing JSON") - progress.update("Setting up") - filesnpaths.is_output_file_writable(path, ok_if_exists=overwrite) - json_dict = JSONStructure.get() - json_gene_clusters: List[Dict] = json_dict['genes'] - json_reactions: List[Dict] = json_dict['reactions'] - json_metabolites: List[Dict] = json_dict['metabolites'] - if objective == 'e_coli_core': - objective_dict = JSONStructure.get_e_coli_core_objective() - if remove_missing_objective_metabolites: - self.remove_missing_objective_metabolites(objective_dict) - json_reactions.append(objective_dict) - elif objective != None: - raise ConfigError(f"Anvi'o does not recognize an objective with the name, '{objective}'.") - - progress.update("Gene clusters") - reaction_gene_clusters: Dict[str, List[str]] = {} - reaction_kos: Dict[str, List[KO]] = {} - # The following two dictionaries are only needed for recording the occurrence of reactions - # and metabolites in genomes. - reaction_genomes: Dict[str, List[str]] = {} - metabolite_genomes: Dict[str, List[str]] = {} - for cluster_id, gene_cluster in self.gene_clusters.items(): - gene_cluster_entry = JSONStructure.get_gene_entry() - json_gene_clusters.append(gene_cluster_entry) - cluster_id_str = str(cluster_id) - gene_cluster_entry['id'] = cluster_id_str - # Record KO IDs in the annotation section of the gene cluster entry. 
In a JSON file produced - # from a 'GenomicNetwork', KO IDs are paired with their gene annotation e-values, which - # can't be done with consensus KOs for gene clusters. Therefore, where the e-value would - # be, put an empty string. - annotation = gene_cluster_entry['annotation'] - annotation['ko'] = annotation_kos = {} - ko = gene_cluster.ko - annotation_kos[ko.id] = "" - for modelseed_reaction_id in ko.reactions: - try: - reaction_gene_clusters[modelseed_reaction_id].append(cluster_id_str) - except KeyError: - reaction_gene_clusters[modelseed_reaction_id] = [cluster_id_str] - try: - reaction_kos[modelseed_reaction_id].append(ko) - except KeyError: - reaction_kos[modelseed_reaction_id] = [ko] - if not record_genomes: - continue - genome_names = gene_cluster.genomes - if 'gene cluster' in record_genomes: - # Record the names of the genomes contributing to the gene cluster in the notes section - # of the gene cluster entry. - gene_cluster_entry['notes']['genomes'] = genome_names - if 'reaction' in record_genomes: - for modelseed_reaction_id in ko.reactions: - try: - reaction_genomes[modelseed_reaction_id] += genome_names - except KeyError: - reaction_genomes[modelseed_reaction_id] = genome_names - if 'metabolite' in record_genomes: - for reaction in ko.reactions.values(): - for compartment, metabolite in zip(reaction.compartments, reaction.compounds): - entry_id = f"{metabolite.modelseed_id}_{compartment}" - try: - metabolite_genomes[entry_id] += genome_names - except KeyError: - metabolite_genomes[entry_id] = genome_names - - progress.update("Reactions") - compound_compartments: Dict[str, Set[str]] = {} - for modelseed_reaction_id, reaction in self.reactions.items(): - reaction_entry = JSONStructure.get_reaction_entry() - json_reactions.append(reaction_entry) - reaction_entry['id'] = modelseed_reaction_id - reaction_entry['name'] = reaction.modelseed_name - metabolites = reaction_entry['metabolites'] - for compound, compartment, coefficient in 
zip(reaction.compounds, reaction.compartments, reaction.coefficients): - modelseed_compound_id = compound.modelseed_id - metabolites[f"{modelseed_compound_id}_{compartment}"] = coefficient - try: - compound_compartments[modelseed_compound_id].add(compartment) - except KeyError: - compound_compartments[modelseed_compound_id] = set(compartment) - if not reaction.reversibility: - # By default, the reaction entry was set up to be reversible; here make it irreversible. - reaction_entry['lower_bound'] = 0.0 - reaction_entry['gene_reaction_rule'] = " or ".join([gcid for gcid in reaction_gene_clusters[modelseed_reaction_id]]) - notes = reaction_entry['notes'] - # Record gene KO annotations which aliased the reaction via KEGG REACTION or EC number. - notes['ko'] = ko_notes = {} - ko_kegg_aliases = [] - ko_ec_number_aliases = [] - for ko in reaction_kos[modelseed_reaction_id]: - try: - kegg_aliases = ko.kegg_reaction_aliases[modelseed_reaction_id] - except KeyError: - kegg_aliases = [] - try: - ec_number_aliases = ko.ec_number_aliases[modelseed_reaction_id] - except KeyError: - ec_number_aliases = [] - ko_notes[ko.id] = {'kegg.reaction': kegg_aliases, 'ec-code': ec_number_aliases} - ko_kegg_aliases += kegg_aliases - ko_ec_number_aliases += ec_number_aliases - ko_kegg_aliases = set(ko_kegg_aliases) - ko_ec_number_aliases = set(ko_ec_number_aliases) - # Record other KEGG REACTION or EC number aliases of the reaction in the ModelSEED - # database that did not happen to be associated with KO annotations. 
- notes['other_aliases'] = { - 'kegg.reaction': list(set(reaction.kegg_aliases).difference(ko_kegg_aliases)), - 'ec-code': list(set(reaction.ec_number_aliases).difference(ko_ec_number_aliases)) - } - if 'reaction' not in record_genomes: - continue - notes['genomes'] = sorted(set(reaction_genomes[modelseed_reaction_id])) - - progress.update("Metabolites") - for modelseed_compound_id, metabolite in self.metabolites.items(): - modelseed_compound_name = metabolite.modelseed_name - charge = metabolite.charge - formula = metabolite.formula - kegg_compound_aliases = list(metabolite.kegg_aliases) - for compartment in compound_compartments[modelseed_compound_id]: - metabolite_entry = JSONStructure.get_metabolite_entry() - json_metabolites.append(metabolite_entry) - entry_id = f"{modelseed_compound_id}_{compartment}" - metabolite_entry['id'] = entry_id - metabolite_entry['name'] = modelseed_compound_name - metabolite_entry['compartment'] = compartment - # Compounds without a formula have a nominal charge of 10000000 in the ModelSEED - # compounds database, which is replaced by None in the reaction network and 0 in the JSON. 
- metabolite_entry['charge'] = charge if charge is not None else 0 - metabolite_entry['formula'] = formula if formula is not None else "" - metabolite_entry['annotation']['kegg.compound'] = kegg_compound_aliases - if 'metabolite' not in record_genomes: - continue - notes['genomes'] = sorted(set(metabolite_genomes[entry_id])) - - progress.update("Saving") - with open(path, 'w') as f: - json.dump(json_dict, f, indent=indent) - progress.end() - -class JSONStructure: - """JSON structure of metabolic model file.""" - def get() -> Dict: - """Top-level file framework.""" - return { - 'metabolites': [], - 'reactions': [], - 'genes': [], - 'id': '', - 'compartments': { - 'c': 'cytosol', - 'e': 'extracellular space' - }, - 'version': '1' - } - - def get_metabolite_entry() -> Dict: - """"Format of each object in the 'metabolites' array.""" - return { - 'id': '', - 'name': '', - 'compartment': '', - 'charge': 0, # placeholder: uncharged - 'formula': '', - 'notes': {}, - 'annotation': {} - } - - def get_reaction_entry() -> Dict: - """Format of each object in the 'reactions' array.""" - return { - 'id': '', - 'name': '', - 'metabolites': {}, - # By default, make the reaction perfectly reversible. - 'lower_bound': -1000.0, - 'upper_bound': 1000.0, - 'gene_reaction_rule': '', - 'subsystem': '', - 'notes': {}, - 'annotation': {} - } - - def get_gene_entry() -> Dict: - """Format of each object in the 'genes' array.""" - return { - 'id': '', - 'name': '', - 'notes': {}, - 'annotation': {} - } - - def get_e_coli_core_objective() -> Dict: - """ - Biomass objective from the 'reactions' array in the COBRApy example JSON file, - 'e_coli_core.json', with KBase/ModelSEED compound IDs replacing BiGG metabolite IDs. 
- """ - return { - 'id': 'BIOMASS_Ecoli_core_w_GAM', - 'name': 'Biomass Objective Function with GAM', - 'metabolites': { - 'cpd00169_c': -1.496, - 'cpd00022_c': -3.7478, - 'cpd00008_c': 59.81, - 'cpd00024_c': 4.1182, - 'cpd00002_c': -59.81, - 'cpd00010_c': 3.7478, - 'cpd00236_c': -0.361, - 'cpd00072_c': -0.0709, - 'cpd00102_c': -0.129, - 'cpd00079_c': -0.205, - 'cpd00053_c': -0.2557, - 'cpd00023_c': -4.9414, - 'cpd00001_c': -59.81, - 'cpd00067_c': 59.81, - 'cpd00003_c': -3.547, - 'cpd00004_c': 3.547, - 'cpd00006_c': 13.0279, - 'cpd00005_c': -13.0279, - 'cpd00032_c': -1.7867, - 'cpd00061_c': -0.5191, - 'cpd00009_c': 59.81, - 'cpd00020_c': -2.8328, - 'cpd00101_c': -0.8977 - }, - 'lower_bound': 0.0, - 'upper_bound': 1000.0, - 'gene_reaction_rule': '', - 'objective_coefficient': 1.0, - 'subsystem': 'Biomass and maintenance functions', - 'notes': { - 'original_bigg_ids': [ - 'Biomass_Ecoli_core_w_GAM' - ], - 'original_metabolite_ids': { - '3pg_c': -1.496, - 'accoa_c': -3.7478, - 'adp_c': 59.81, - 'akg_c': 4.1182, - 'atp_c': -59.81, - 'coa_c': 3.7478, - 'e4p_c': -0.361, - 'f6p_c': -0.0709, - 'g3p_c': -0.129, - 'g6p_c': -0.205, - 'gln__L_c': -0.2557, - 'glu__L_c': -4.9414, - 'h2o_c': -59.81, - 'h_c': 59.81, - 'nad_c': -3.547, - 'nadh_c': 3.547, - 'nadp_c': 13.0279, - 'nadph_c': -13.0279, - 'oaa_c': -1.7867, - 'pep_c': -0.5191, - 'pi_c': 59.81, - 'pyr_c': -2.8328, - 'r5p_c': -0.8977 - } - }, - 'annotation': { - 'bigg.reaction': [ - 'BIOMASS_Ecoli_core_w_GAM' - ], - 'metanetx.reaction': [ - 'MNXR96280' - ], - 'sbo': 'SBO:0000629' - } - } - -class KODatabase: - """ - Representation of the KEGG KO database used in the construction of reaction networks. - - Unless an alternative directory is provided, the database is downloaded and set up in a - default anvi'o data directory, and loaded from this directory in network construction. 
- """ - default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/misc/KEGG/KO_REACTION_NETWORK') - expected_files = ['ko_info.txt', 'ko_data.tsv'] - - def __init__(self, ko_dir: str = None) -> None: - """ - Load the table derived from downloaded KEGG KO entry files that relates KOs to KEGG - reactions and EC numbers. - - Parameters - ========== - ko_dir : str, None - The directory containing reference KEGG Orthology (KO) tables set up by anvi'o. The - default argument of None expects KO data to be set up in the default anvi'o directory - used by the program `anvi-setup-kegg-data`. - """ - if ko_dir: - if not os.path.isdir(ko_dir): - raise ConfigError(f"There is no such directory, '{ko_dir}'.") - else: - ko_dir = self.default_dir - - for expected_file in self.expected_files: - if not os.path.isfile(os.path.join(ko_dir, expected_file)): - raise ConfigError(f"No required file named '{expected_file}' was found in the KO directory, '{ko_dir}'.") - - f = open(os.path.join(ko_dir, 'ko_info.txt')) - f.readline() - self.release = ' '.join(f.readline().strip().split()[1:]) - f.close() - - self.ko_table = pd.read_csv(os.path.join(ko_dir, 'ko_data.tsv'), sep='\t', header=0, index_col=0, low_memory=False) - - def set_up( - num_threads: int = 1, - dir: str = None, - reset: bool = False, - run: terminal.Run = terminal.Run(), - progress: terminal.Progress = terminal.Progress() - ) -> None: - """ - Download KEGG KO entry files and parse these files to construct a tab-delimited file - relating KOs to KEGG reactions and EC numbers. - - Parameters - ========== - num_threads : int, 1 - Number of threads to use in parallelizing the download of KO files. - - dir : str, None - Directory in which to create a subdirectory called `KO_REACTION_NETWORK`, - in which files are downloaded and set up. This argument overrides - the default directory. - - reset : bool, False - If True, remove any existing 'KO_REACTION_NETWORK' database directory and the files - therein. 
If False, an exception is raised if there are files in this directory. - - run : anvio.terminal.Run, anvio.terminal.Run() - This object prints run information to the terminal. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - """ - run.info_single("Info from Reaction Network Download") # delineates output specific to this data from `anvi-setup-kegg-data` - if dir: - if os.path.isdir(dir): - ko_dir = os.path.join(dir, 'KO_REACTION_NETWORK') - else: - raise ConfigError(f"There is no such directory, '{dir}'. You should create it " - "first if you want to use it.") - else: - ko_dir = KODatabase.default_dir - parent_dir = os.path.dirname(ko_dir) - if not os.path.exists(parent_dir): - os.makedirs(parent_dir) - if os.path.exists(ko_dir): - if reset: - shutil.rmtree(ko_dir) - else: - raise ConfigError( - f"The KO database directory, '{ko_dir}', already exists. 'reset' can be used " - "to remove the database at this location and set it up again." - ) - os.makedirs(ko_dir) - - if num_threads == 1: - run.warning( - "Only 1 thread will be used to download KO files. It is advisable to set a higher " - "number of threads to download faster." - ) - assert type(num_threads) is int and num_threads > 0 - - # Download a file for each entry in a KEGG database. - download_root = 'https://rest.kegg.jp/' - while True: - # Break out of this loop upon confirming that the KEGG release didn't change in the - # middle of downloading KO files. - progress.new(f"Downloading KEGG KO files") - # Get the database version before download. - progress.update("Database info") - info_before_path = os.path.join(ko_dir, 'ko_info_before.txt') - utils.download_file(f'{download_root}info/ko', info_before_path) - f = open(info_before_path) - f.readline() - release_before = ' '.join(f.readline().strip().split()[1:]) - f.close() - - # Get a list of all KO IDs. 
- progress.update("KO list") - list_path = os.path.join(ko_dir, 'ko_list.txt') - utils.download_file(f'{download_root}list/ko', list_path) - ko_ids = [] - f = open(list_path) - for line in f: - line.split()[0] - ko_ids.append(line.split('\t')[0]) - f.close() - - # Download KO entry files. - manager = mp.Manager() - input_queue = manager.Queue() - output_queue = manager.Queue() - for ko_id in ko_ids: - input_queue.put((f'{download_root}get/{ko_id}', os.path.join(ko_dir, f'{ko_id}.txt'))) - workers: List[mp.Process] = [] - for _ in range(num_threads): - worker = mp.Process(target=_download_worker, args=(input_queue, output_queue)) - workers.append(worker) - worker.start() - downloaded_count = 0 - undownloaded_count = 0 - total = len(ko_ids) - undownloaded = [] - while downloaded_count + undownloaded_count < total: - output = output_queue.get() - if output is True: - downloaded_count += 1 - progress.update(f"{downloaded_count} / {total} KO files") - else: - undownloaded_count += 1 - undownloaded.append(os.path.splitext(os.path.basename(output))[0]) - for worker in workers: - worker.terminate() - if undownloaded: - raise ConfigError( - "Unfortunately, files for the following KOs failed to download despite multiple attempts, " - f"and so the database needs to be set up again: {', '.join(undownloaded)}" - ) - - # Get the database version after download. - progress.update("Database info (again)") - info_after_path = os.path.join(ko_dir, 'ko_info.txt') - utils.download_file(f'{download_root}info/ko', info_after_path) - f = open(info_after_path) - f.readline() - release_after = ' '.join(f.readline().strip().split()[1:]) - f.close() - - # Check that the database had the same version before and after download. - progress.end() - if release_before == release_after: - # Retain one of the info files and delete the other. - info_path = info_after_path - os.remove(info_before_path) - break - else: - run.warning( - "It's your lucky day! 
The version of KEGG appears to have changed from " - f"'{release_before}' to '{release_after}' while anvi'o was downloading files " - "from the KO database. Anvi'o will now attempt to redownload all of the files. " - ) - run.info(f"Total number of KOs/entry files", total) - run.info("KEGG KO database version", release_after) - run.info("KEGG KO list", list_path) - run.info("KEGG KO info", info_path) - - progress.new("Processing KEGG KO database") - # Make a tab-delimited file relating KO IDs and names to KEGG reactions and EC numbers. - kos_data = {} - paths = glob.glob(os.path.join(ko_dir, 'K*.txt')) - for num_processed, path in enumerate(paths): - progress.update(f"{num_processed} / {total} KO files") - # Parse the KO file. - ko_data = {} - section = None - # Unfortunately, a non-unicode character can crop up. - f = open(path, 'rb') - for line in f.read().decode(errors='replace').split('\n'): - if line[0] == ' ': - pass - else: - section = line.split()[0] - if section == 'NAME': - # The name value follows 'NAME' at the beginning of the line. - ko_data['name'] = line[4:].strip() - # EC numbers associated with the KO are recorded at the end of the name value. - ec_string = re.search('\[EC:.*\]', line) - if ec_string: - ko_data['ec_numbers'] = ec_string[0][4:-1] - elif section == 'DBLINKS': - # There is a row for each linked database in this section. There can be a row - # for KEGG REACTION database entries. The first line of the section starts with - # 'DBLINKS' and is followed by a value for a linked database. Values from the - # linked database are separated by ': ' from the name of the database, e.g., - # 'RN: R00001'. - split_line = line.split() - try: - rn_index = split_line.index('RN:') - except ValueError: - continue - ko_data['reactions'] = ' '.join(split_line[rn_index + 1:]) - elif section == 'GENES': - # This is the section after DBLINKS. 
- break - f.close() - ko_id = os.path.splitext(os.path.basename(path))[0] - kos_data[ko_id] = ko_data - progress.update("Making a table mapping KOs to KEGG reactions and EC numbers") - columns = {h: [] for h in ['name', 'reactions', 'ec_numbers']} - for ko_data in kos_data.values(): - for h, column in columns.items(): - try: - value = ko_data[h] - except KeyError: - value = None - column.append(value) - table: pd.DataFrame = pd.DataFrame.from_dict(columns) - table.index = kos_data - table = table.sort_index() - table_path = os.path.join(ko_dir, 'ko_data.tsv') - table.to_csv(table_path, sep='\t') - progress.end() - run.info("Table of select KEGG KO data", table_path) - - # Tarball the KO entry files. - progress.new("Compressing downloaded KEGG KO entry files") - progress.update("...") - ko_entries_dir = os.path.join(ko_dir, 'ko_entries') - os.mkdir(ko_entries_dir) - for path in paths: - shutil.move(path, ko_entries_dir) - tar_path = os.path.join(ko_dir, 'ko_entries.tar.gz') - with tarfile.open(tar_path, mode='w:gz') as tar: - tar.add(ko_entries_dir, arcname='.') - progress.end() - shutil.rmtree(ko_entries_dir) - run.info("Archived KEGG KO entry files", tar_path) - -class ModelSEEDDatabase: - """ - The ModelSEED Biochemistry database set up by anvi'o. - - By default, the database is loaded from a default directory of ModelSEED files unless an - alternative directory is provided. - """ - default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/misc/MODELSEED') - - # Compounds are identified as cytosolic or extracellular in ModelSEED reactions. - compartment_ids = {0: 'c', 1: 'e'} - - def __init__(self, modelseed_dir: str = None) -> None: - """ - Load and set up reorganized tables of reactions and compounds from the ModelSEED directory. - - Parameters - ========== - modelseed_dir : str, None - Directory of ModelSEED files to use instead of the default. 
- """ - if modelseed_dir: - if not os.path.isdir(modelseed_dir): - raise ConfigError(f"There is no such directory, '{modelseed_dir}'.") - else: - modelseed_dir = self.default_dir - sha_path = os.path.join(modelseed_dir, 'sha.txt') - if not os.path.isfile(sha_path): - raise ConfigError( - f"No required file named 'sha.txt' was found in the ModelSEED directory, '{modelseed_dir}'." - ) - reactions_path = os.path.join(modelseed_dir, 'reactions.tsv') - if not os.path.isfile(reactions_path): - raise ConfigError( - f"No required file named 'reactions.tsv' was found in the ModelSEED directory, '{modelseed_dir}'." - ) - compounds_path = os.path.join(modelseed_dir, 'compounds.tsv') - if not os.path.isfile(compounds_path): - raise ConfigError( - f"No required file named 'compounds.tsv' was found in the ModelSEED directory, '{modelseed_dir}'." - ) - - with open(sha_path) as f: - self.sha = f.read().strip() - reactions_table = pd.read_csv(reactions_path, sep='\t', header=0, low_memory=False) - self.compounds_table: pd.DataFrame = pd.read_csv(compounds_path, sep='\t', header=0, index_col='id', low_memory=False) - - # Facilitate lookup of reaction data by KEGG REACTION ID via a reorganized reactions table. - # Remove reactions without KEGG aliases. - reactions_table_without_na = reactions_table.dropna(subset=['KEGG']) - expanded = [] - ko_id_col = [] - for ko_ids, row in zip( - reactions_table_without_na['KEGG'], - reactions_table_without_na.itertuples(index=False) - ): - ko_ids: str - # A ModelSEED reaction can have multiple KEGG aliases. - for ko_id in ko_ids.split('; '): - ko_id_col.append(ko_id) - expanded.append(row) - kegg_reactions_table = pd.DataFrame(expanded) - kegg_reactions_table['KEGG_REACTION_ID'] = ko_id_col - self.kegg_reactions_table = kegg_reactions_table - - # Facilitate lookup of reaction data by EC number via a reorganized reactions table. - # Remove reactions without EC number aliases. 
- reactions_table_without_na = reactions_table.dropna(subset=['ec_numbers']) - expanded = [] - ec_number_col = [] - for ec_numbers, row in zip( - reactions_table_without_na['ec_numbers'], - reactions_table_without_na.itertuples(index=False) - ): - ec_numbers: str - # A ModelSEED reaction can have multiple EC number aliases. - for ec_number in ec_numbers.split('|'): - ec_number_col.append(ec_number) - expanded.append(row) - ec_reactions_table = pd.DataFrame(expanded) - ec_reactions_table['EC_number'] = ec_number_col - self.ec_reactions_table = ec_reactions_table - - def set_up( - dir: str = None, - reset: bool = False, - run: terminal.Run = terminal.Run(), - progress: terminal.Progress = terminal.Progress() - ) -> None: - """ - Download the ModelSEED Biochemistry database, which consists of two tables of reaction and - metabolite data, and reorganize the tables. - - Parameters - ========== - dir : str, None - Directory in which to create a new subdirectory called 'MODELSEED', in which files are - downloaded and set up. This argument overrides the default directory. - - reset : bool, False - If True, remove any existing 'MODELSEED' database directory and the files therein. If - False, an exception is raised if there are files in this directory. - - run : anvio.terminal.Run, anvio.terminal.Run() - This object prints run information to the terminal. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object prints transient progress information to the terminal. 
- """ - if dir: - if os.path.isdir(dir): - modelseed_dir = os.path.join(dir, 'MODELSEED') - else: - raise ConfigError(f"There is no such directory, '{dir}'.") - else: - modelseed_dir = ModelSEEDDatabase.default_dir - parent_dir = os.path.dirname(modelseed_dir) - if not os.path.exists(parent_dir): - os.mkdir(parent_dir) - if os.path.exists(modelseed_dir): - if reset: - shutil.rmtree(modelseed_dir) - else: - raise ConfigError( - f"The ModelSEED database directory, '{modelseed_dir}', already exists. 'reset' " - "can be used to remove the database at this location and set it up again." - ) - os.mkdir(modelseed_dir) - - def download(url, path): - max_num_tries = 100 - wait_secs = 10.0 - num_tries = 0 - while True: - try: - utils.download_file(url, path, progress=progress) - break - except ConnectionResetError: - num_tries += 1 - if num_tries > max_num_tries: - raise ConnectionResetError( - f"The connection was reset by the peer more than {max_num_tries} times, " - "the maximum number of attempts. Try setting up the ModelSEED database again." - ) - time.sleep(wait_secs) - # The commit SHA taken from the following file is stored in a text file to track the version - # of the ModelSEED database. 
- json_url = 'https://api.github.com/repos/ModelSEED/ModelSEEDDatabase/commits' - json_path = os.path.join(modelseed_dir, 'commits.json') - download(json_url, json_path) - with open(json_path) as f: - sha = json.load(f)[0]['sha'] - zip_url = f'https://github.com/ModelSEED/ModelSEEDDatabase/archive/{sha}.zip' - zip_path = os.path.join(modelseed_dir, f'ModelSEEDDatabase-{sha}.zip') - download(zip_url, zip_path) - - progress.new("Setting up ModelSEED files") - progress.update("Extracting") - with zipfile.ZipFile(zip_path, 'r') as f: - f.extractall(modelseed_dir) - reactions_path = os.path.join(modelseed_dir, f'ModelSEEDDatabase-{sha}', 'Biochemistry', 'reactions.tsv') - compounds_path = os.path.join(modelseed_dir, f'ModelSEEDDatabase-{sha}', 'Biochemistry', 'compounds.tsv') - shutil.move(reactions_path, modelseed_dir) - shutil.move(compounds_path, modelseed_dir) - reactions_path = os.path.join(modelseed_dir, 'reactions.tsv') - compounds_path = os.path.join(modelseed_dir, 'compounds.tsv') - sha_path = os.path.join(modelseed_dir, 'sha.txt') - with open(sha_path, 'w') as f: - f.write(sha) - os.remove(json_path) - os.remove(zip_path) - shutil.rmtree(os.path.join(modelseed_dir, f'ModelSEEDDatabase-{sha}')) - - progress.update("Loading") - reactions_table = pd.read_csv(reactions_path, sep='\t', header=0, low_memory=False) - compounds_table = pd.read_csv(compounds_path, sep='\t', header=0, low_memory=False) - - progress.update("Reorganizing tables") - # Reorganize the downloaded tables, storing in the same locations. The tables each have a - # column of aliases, or IDs for the same reaction or compound from various databases. Split - # these IDs into separate columns added to the end of the table, dropping the alias column. 
- def expand_aliases(table: pd.DataFrame) -> pd.DataFrame: - new_rows = [] - for aliases in table.aliases: - aliases: str - new_row = {} - if pd.isna(aliases): - new_rows.append(new_row) - continue - split_aliases = aliases.split('|') - for alias in split_aliases: - sep_index = alias.index(': ') - alias_key = alias[: sep_index] - alias_value = alias[sep_index + 2:].lstrip() - new_row[alias_key] = alias_value - new_rows.append(new_row) - alias_df = pd.DataFrame(new_rows) - alias_df.fillna('') - new_table = pd.concat([table.drop('aliases', axis=1), alias_df], axis=1) - return new_table - reactions_table = expand_aliases(reactions_table) - compounds_table = expand_aliases(compounds_table) - - progress.update("Saving reorganized tables") - reactions_table.to_csv(reactions_path, sep='\t', index=None) - compounds_table.to_csv(compounds_path, sep='\t', index=None) - progress.end() - - run.info("ModelSEED database version (git commit hash)", sha) - run.info("Reorganized ModelSEED reactions table", reactions_path) - run.info("Reorganized ModelSEED compounds table", compounds_path) - -class Constructor: - """Make, store, and load metabolic reaction networks.""" - def __init__( - self, - ko_dir: str = None, - modelseed_dir: str = None, - run: terminal.Run = terminal.Run(), - progress: terminal.Progress = terminal.Progress() - ) -> None: - """ - Parameters - ========== - ko_dir : str, None - The directory containing reference KEGG Orthology (KO) tables set up by anvi'o. The - default argument of None expects KO data to be set up in the default anvi'o directory - used by the program `anvi-setup-kegg-data`. - - modelseed_dir : str, None - The directory containing reference ModelSEED Biochemistry tables set up by anvi'o. The - default argument of None expects ModelSEED data to be set up in the default anvi'o - directory used by the program `anvi-setup-modelseed-database`. - - run : anvio.terminal.Run, anvio.terminal.Run() - This object prints run information to the terminal. 
- - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object prints transient progress information to the terminal. - """ - self.ko_dir = ko_dir - self.modelseed_dir = modelseed_dir - self.run = run - self.progress = progress - - def load_network( - self, - contigs_db: str = None, - pan_db: str = None, - genomes_storage_db: str = None, - check_gene_annotations: bool = True, - load_protein_abundances: bool = False, - load_metabolite_abundances: bool = False, - profile_db: str = None, - quiet: bool = False, - stats_file: str = None - ) -> ReactionNetwork: - """ - Load a reaction network stored in a database as a reaction network object. - - Parameters - ========== - contigs_db : str, None - Path to a contigs database in which a reaction network is stored. - - pan_db : str, None - Path to a pan database in which a reaction network is stored. 'genomes_storage_db' is - also required. - - genomes_storage_db : str, None - Path to a genomes storage database in which KO annotations are stored. 'pan_db' is also - required. - - check_gene_annotations : bool, True - If True, as by default, check that the stored reaction network was made from the set of - gene KO annotations that is currently stored. An exception is raised if this is not the - case. If False, allow the stored reaction network to have been made from a different set - of gene KO annotations than is currently stored. This can result in different KOs in the - returned ReactionNetwork than in the original network that was stored. - - load_protein_abundances : bool, False - If loading the network from a contigs database, also load abundance measurements of - proteins that can be expressed by genes in the network. 'profile_db' is also required, - as abundance profile data is stored there. - - load_metabolite_abundances : bool, False - If loading the network from a contigs database, also load stored abundance measurements - of metabolites found in the network. 
'profile_db' is also required, as abundance profile - data is stored there. - - profile_db : str, None - If loading protein or metabolite abundance data, this database is required, as abundance - profile data is stored there. - - quiet : bool, False - Print network overview statistics to the terminal if False. - - stats_file : str, None - Write network overview statistics to a tab-delimited file at this output path. - - Returns - ======= - ReactionNetwork - Reaction network loaded from the input database. - """ - # Check that the reaction network stored in a database is derived from the current gene KO - # annotations in the database. - if contigs_db: - network = self.load_contigs_database_network( - contigs_db, - check_gene_annotations=check_gene_annotations, - load_protein_abundances=load_protein_abundances, - load_metabolite_abundances=load_metabolite_abundances, - profile_db=profile_db, - quiet=quiet, - stats_file=stats_file - ) - elif genomes_storage_db or pan_db: - network = self.load_pan_database_network( - genomes_storage_db=genomes_storage_db, - pan_db=pan_db, - check_gene_annotations=check_gene_annotations, - quiet=quiet, - stats_file=stats_file - ) - else: - raise ConfigError( - "A reaction network must be loaded from a database source. " - "Either a contigs database or a genomes storage database and pan database are required." - ) - return network - - def load_contigs_database_network( - self, - contigs_db: str, - check_gene_annotations: bool = True, - load_protein_abundances: bool = False, - load_metabolite_abundances: bool = False, - profile_db: str = None, - quiet: bool = False, - stats_file: str = None - ) -> GenomicNetwork: - """ - Load reaction network data stored in a contigs database as a reaction network object. - - Parameters - ========== - contigs_db : str - Path to a contigs database in which a reaction network is stored. 
- - check_gene_annotations : bool, True - If True, as by default, check that the reaction network stored in the contigs database - was made from the same set of gene KO annotations as currently in the database, and - throw an error if this is not the case. If False, allow the stored reaction network to - have been made from a different set of gene KO annotations than is currently stored in - the database. This can result in different KO assignments to genes in the returned - GenomicNetwork than in the original network that was stored. - - load_protein_abundances : bool, False - Load stored abundance measurements of proteins that can be expressed by genes in the - network. 'profile_db' is also required, as abundance profile data is stored there. - - load_metabolite_abundances : bool, False - Load stored abundance measurements of metabolites found in the network. 'profile_db' is - also required, as abundance profile data is stored there. - - profile_db : str, None - If loading protein or metabolite abundance data, this database is required, as abundance - profile data is stored there. - - quiet : bool, False - Print network overview statistics to the terminal if False. - - stats_file : str, None - Write network overview statistics to a tab-delimited file at this output path. - - Returns - ======= - GenomicNetwork - Reaction network loaded from the contigs database. - """ - # Preemptively check the statistics file path. - if stats_file is not None: - filesnpaths.is_output_file_writable(stats_file) - - # Load the contigs database. - utils.is_contigs_db(contigs_db) - args = argparse.Namespace() - args.contigs_db = contigs_db - contigs_super = ContigsSuperclass(args, r=run_quiet) - contigs_super.init_functions(requested_sources=['KOfam']) - - # Check that the network stored in the contigs database was made from the same set of KO - # gene annotations as currently in the database. 
- stored_hash = contigs_super.a_meta['reaction_network_ko_annotations_hash'] - current_hash = self.hash_contigs_db_ko_annotations(contigs_super.gene_function_calls_dict) - if stored_hash != current_hash: - if check_gene_annotations: - raise ConfigError( - f"""\ - The reaction network stored in the contigs database was made from a different - set of KEGG KO gene annotations than is currently in the database. There are two - solutions to this problem. First, 'anvi-reaction-network' can be run again to - overwrite the existing network stored in the database with a new network from - the new KO gene annotations. Second, 'check_gene_annotations' can be made False - rather than True, allowing the stored network to have been made from a different - set of KO gene annotations than is currently stored in the database. This can - result in different genes being associated with KOs in the returned - GenomicNetwork than in the original network that was stored. The available - version of the KO database that has been set up by anvi'o is used to fill in - data for KOs in the network that are not current gene annotations.\ - """ - ) - self.run.warning( - f"""\ - The reaction network stored in the contigs database was made from a different set of - KEGG KO gene annotations than is currently in the database. This will be ignored - since 'check_gene_annotations' is False. This can result in different genes being - associated with KOs in the returned GenomicNetwork than in the original network that - was stored.\ - """ - ) - - network = GenomicNetwork(run=self.run, progress=self.progress) - network.contigs_db_source_path = os.path.abspath(contigs_db) - - cdb = ContigsDatabase(contigs_db) - - # Make objects representing all genes with KO annotations in the contigs database, including - # genes that are not in the network, which are later removed from the network. 
- functions_table = cdb.db.get_table_as_dataframe( - 'gene_functions', where_clause='source = "KOfam"' - ) - for gcid, ko_id, ko_name, e_value in zip( - functions_table['gene_callers_id'], - functions_table['accession'], - functions_table['function'], - functions_table['e_value'] - ): - try: - # This is not the first annotation involving the gene, so an object for it already - # exists. - gene = network.genes[gcid] - except KeyError: - gene = Gene() - gene.gcid = gcid - network.genes[gcid] = gene - try: - # This is not the first annotation involving the KO, so an object for it already - # exists. - ko = network.kos[ko_id] - except KeyError: - ko = KO() - ko.id = ko_id - ko.name = ko_name - network.kos[ko_id] = ko - gene.kos[ko_id] = ko - gene.e_values[ko_id] = e_value - - self._load_modelseed_reactions(cdb, network) - self._load_modelseed_compounds(cdb, network) - - # Remove any trace of genes that do not contribute to the reaction network. Also remove - # unnetworked KO links to genes. - unnetworked_gcids = [] - for gcid, gene in network.genes.items(): - gene_in_network = False - unnetworked_kos: List[str] = [] - for ko_id, ko in gene.kos.items(): - if ko.reactions: - gene_in_network = True - else: - unnetworked_kos.append(ko_id) - if gene_in_network: - for unnetworked_ko_id in unnetworked_kos: - gene.kos.pop(unnetworked_ko_id) - gene.e_values.pop(unnetworked_ko_id) - else: - unnetworked_gcids.append(gcid) - for gcid in unnetworked_gcids: - network.genes.pop(gcid) - - # Remove any trace of KOs that do not contribute to the reaction network. - unnetworked_ko_ids = [] - for ko_id, ko in network.kos.items(): - if not ko.reactions: - unnetworked_ko_ids.append(ko_id) - for ko_id in unnetworked_ko_ids: - network.kos.pop(ko_id) - - # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO KEGG - # REACTION ID aliases if no such aliases were found to exist. 
- modelseed_reaction_ids = [] - for modelseed_reaction_id, kegg_reaction_ids in network.modelseed_kegg_aliases.items(): - if not kegg_reaction_ids: - modelseed_reaction_ids.append(modelseed_reaction_id) - for modelseed_reaction_id in modelseed_reaction_ids: - network.modelseed_kegg_aliases.pop(modelseed_reaction_id) - - # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO EC number - # aliases if no such aliases were found to exist. - modelseed_reaction_ids = [] - for modelseed_reaction_id, ec_numbers in network.modelseed_ec_number_aliases.items(): - if not ec_numbers: - modelseed_reaction_ids.append(modelseed_reaction_id) - for modelseed_reaction_id in modelseed_reaction_ids: - network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) - - if load_protein_abundances or load_metabolite_abundances: - network.profile_db_source_path = os.path.abspath(profile_db) - pdb = ProfileDatabase(profile_db) - if load_protein_abundances: - self._load_protein_abundances(pdb, cdb, network) - if load_metabolite_abundances: - self._load_metabolite_abundances(pdb, network) - pdb.disconnect() - - if quiet and not stats_file: - return network - - precomputed_counts = { - 'total_genes': cdb.db.get_row_counts_from_table('genes_in_contigs'), - 'genes_assigned_kos': len(network.genes) + len(unnetworked_gcids), - 'kos_assigned_genes': len(network.kos) + len(unnetworked_ko_ids) - } - cdb.disconnect() - stats = network.get_overview_statistics(precomputed_counts=precomputed_counts) - if not quiet: - network.print_overview_statistics(stats=stats) - if stats_file: - network.write_overview_statistics(stats_file, stats=stats) - - return network - - def _load_protein_abundances( - self, - profile_database: ProfileDatabase, - contigs_database: ContigsDatabase, - network: GenomicNetwork - ) -> None: - """ - Load abundance data for proteins that can be expressed by genes in the metabolic network. - - Protein isoforms are not supported. 
- - Parameters - ========== - profile_database : ProfileDatabase - The database storing protein measurements. - - contigs_database : ContigsDatabase - The database storing associations between genes and proteins. - - network : GenomicNetwork - The genomic network under construction. - - Returns - ======= - None - """ - protein_abundances_table = profile_database.db.get_table_as_dataframe( - tables.protein_abundances_table_name - ) - if len(protein_abundances_table) == 0: - return - - gene_functions_table = contigs_database.db.get_table_as_dataframe( - 'gene_functions', columns_of_interest=['gene_callers_id', 'source', 'accession'] - ) - gene_functions_table = gene_functions_table[ - gene_functions_table['gene_callers_id'].isin(network.genes) - ] - gene_functions_table = gene_functions_table.rename( - {'source': 'reference_source', 'accession': 'reference_id'}, axis=1 - ) - - protein_abundances_table = protein_abundances_table.merge( - gene_functions_table, how='inner', on=['reference_source', 'reference_id'] - ) - - multiprotein_genes: Dict[int, List[int]] = {} - for key, protein_table in protein_abundances_table.groupby( - ['protein_id', 'reference_source', 'reference_id'] - ): - protein_id = key[0] - protein = Protein() - network.proteins[protein_id] = protein - protein.id = protein_id - for gcid in protein_table['gene_callers_id'].unique(): - gene = network.genes[gcid] - protein.genes[gcid] = gene - if gene.protein: - try: - multiprotein_genes[gene].append(protein_id) - except KeyError: - multiprotein_genes[gene] = [protein_id] - else: - gene.protein = protein - for row in protein_table.itertuples(): - protein.abundances[row.sample_name] = row.abundance_value - - if multiprotein_genes: - s = "" - for gcid, protein_ids in multiprotein_genes.items(): - s += f"{gcid}: {', '.join(protein_ids)}; " - s = s[: -1] - raise ConfigError( - f"""\ - Certain genes were unexpectedly associated with multiple proteins with abundance - data. 
These are as follows, with the gene callers ID separated by a comma-separated - list of protein IDs. {s}\ - """ - ) - - def _load_metabolite_abundances( - self, - profile_database: ProfileDatabase, - network: GenomicNetwork - ) -> None: - """ - Load abundance data for metabolites represented in the metabolic network. - - Parameters - ========== - profile_database : ProfileDatabase - The database storing protein measurement data that is loaded into the genomic network. - - network : GenomicNetwork - The genomic network under construction. - - Returns - ======= - None - """ - metabolite_abundances_table = profile_database.db.get_table_as_dataframe( - tables.metabolite_abundances_table_name - ) - metabolite_abundances_table = metabolite_abundances_table[ - metabolite_abundances_table['reference_id'].isin(network.metabolites) - ] - if len(metabolite_abundances_table) == 0: - return - - for modelseed_compound_id, metabolite_table in metabolite_abundances_table.groupby( - 'reference_id' - ): - metabolite = network.metabolites[modelseed_compound_id] - for row in metabolite_table.itertuples(): - metabolite.abundances[row.sample_name] = row.abundance_value - - def load_pan_database_network( - self, - pan_db: str, - genomes_storage_db: str, - check_gene_annotations: bool = True, - quiet: bool = False, - stats_file: str = None - ) -> PangenomicNetwork: - """ - Load reaction network data stored in a pan database as a reaction network object. - - Parameters - ========== - pan_db : str - Path to a pan database in which a reaction network is stored. - - genomes_storage_db : str - Path to the genomes storage database associated with the pan database. - - check_annotations : bool, True - If True, as by default, check that the reaction network stored in the pan database was - made from the set of gene KO annotations currently stored in the associated genomes - storage database. An exception is raised if this is not the case. 
If False, allow the - stored reaction network to have been made from a different set of gene KO annotations - than is currently stored in the genomes storage database. This can result in different - consensus KOs assigned to gene clusters in the returned PangenomicNetwork than in the - original network that was stored. - - quiet : bool, False - Print network overview statistics to the terminal if False. - - stats_file : str, None - Write network overview statistics to a tab-delimited file at this output path. - - Returns - ======= - PangenomicNetwork - The network derived from the pangenomic databases. - """ - # Preemptively check the statistics file path. - if stats_file is not None: - filesnpaths.is_output_file_writable(stats_file) - - # Load the pan database. - pan_db_info = dbinfo.PanDBInfo(pan_db) - self_table = pan_db_info.get_self_table() - # No consensus threshold may have been used in network construction, in which case the value - # of the parameter is None. - consensus_threshold = self_table['reaction_network_consensus_threshold'] - if consensus_threshold is not None: - consensus_threshold = float(consensus_threshold) - discard_ties = bool(int(self_table['reaction_network_discard_ties'])) - args = argparse.Namespace() - args.pan_db = pan_db - args.genomes_storage = genomes_storage_db - args.consensus_threshold = consensus_threshold - args.discard_ties = discard_ties - pan_super = PanSuperclass(args, r=run_quiet) - pan_super.init_gene_clusters() - pan_super.init_gene_clusters_functions() - pan_super.init_gene_clusters_functions_summary_dict() - gene_clusters_functions_summary_dict: Dict = pan_super.gene_clusters_functions_summary_dict - - # Check that the network stored in the pan database was made from the same set of KO gene - # annotations currently in the associated genomes storage database. 
- stored_hash = self_table['reaction_network_ko_annotations_hash'] - current_hash = self.hash_pan_db_ko_annotations( - genomes_storage_db, - gene_clusters_functions_summary_dict, - consensus_threshold, - discard_ties - ) - if stored_hash != current_hash: - if check_gene_annotations: - # Note that another unstated possible cause of the error could be due to manual - # meddling with the metavariables, 'consensus_threshold' and 'discard_ties', in the - # database. Assume that the user was not engaged in mischief. - raise ConfigError( - "The reaction network stored in the pan database was made from a different set " - "of KO gene annotations than is currently in the associated genomes storage " - "database. There are two solutions to this problem. First, the program, " - "'anvi-reaction-network', can be run again to overwrite the existing network " - "stored in the pan database with a new network from the new KO gene " - "annotations. Second, 'check_gene_annotations' can be given an argument of " - "False instead of True, preventing this exception from being raised if the " - "stored network was made from a different set of KO gene annotations than is " - "currently in the genomes storage database. This can result in different " - "consensus KOs assigned to gene clusters in the returned PangenomicNetwork " - "than in the original network that was stored. The available version of the KO " - "database that has been set up by anvi'o is used to fill in data for any KOs " - "in the network that are not current gene annotations in the genomes storage " - "database." - ) - self.run.warning( - "The reaction network stored in the pan database was made from a different set of " - "KO gene annotations than is currently in the genomes storage database. This will " - "be ignored since 'check_gene_annotations' is False. This can result in different " - "consensus KO assignments to gene clusters in the returned PangenomicNetwork than " - "in the original network that was stored." 
- ) - - network = PangenomicNetwork(run=self.run, progress=self.progress) - network.pan_db_source_path = os.path.abspath(pan_db) - network.genomes_storage_db_source_path = os.path.abspath(genomes_storage_db) - network.consensus_threshold = consensus_threshold - network.discard_ties = discard_ties - if stored_hash == current_hash: - network.consistent_annotations = True - else: - network.consistent_annotations = False - - # Make objects representing all gene clusters with consensus KO annotations. - for cluster_id, gene_cluster_functions_data in gene_clusters_functions_summary_dict.items(): - # Retrieve the consensus KO across genes in the cluster. Parameterization of the method - # used to select consensus KOs occurred in pan super initialization. Parameter values - # were loaded from pan database metavariables. - gene_cluster_ko_data = gene_cluster_functions_data['KOfam'] - if gene_cluster_ko_data == {'function': None, 'accession': None}: - # No KO was assigned to the cluster. - continue - ko_id = gene_cluster_ko_data['accession'] - - gene_cluster = GeneCluster() - gene_cluster.gene_cluster_id = cluster_id - gene_cluster.genomes = list(pan_super.gene_clusters[cluster_id]) - # Add the gene cluster to the network, regardless of whether it yields reactions. Gene - # clusters not contributing to the reaction network are removed later. - network.gene_clusters[cluster_id] = gene_cluster - - try: - # This is not the first gene cluster that has been encountered with the KO assigned - # to it, so an object for the KO already exists. - ko = network.kos[ko_id] - except KeyError: - ko = KO() - ko.id = ko_id - ko.name = gene_cluster_ko_data['function'] - network.kos[ko_id] = ko - gene_cluster.ko = ko - - pdb = PanDatabase(pan_db) - self._load_modelseed_reactions(pdb, network) - self._load_modelseed_compounds(pdb, network) - - # Remove any trace of gene clusters that do not contribute to the reaction network. 
- unnetworked_cluster_ids = [] - for cluster_id, gene_cluster in network.gene_clusters.items(): - if gene_cluster.ko.reactions: - continue - unnetworked_cluster_ids.append(cluster_id) - for cluster_id in unnetworked_cluster_ids: - network.gene_clusters.pop(cluster_id) - - # Remove any trace of KOs that do not contribute to the reaction network. - unnetworked_ko_ids = [] - for ko_id, ko in network.kos.items(): - if not ko.reactions: - unnetworked_ko_ids.append(ko_id) - for ko_id in unnetworked_ko_ids: - network.kos.pop(ko_id) - - # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO KEGG - # REACTION ID aliases if no such aliases were found to exist. - modelseed_reaction_ids = [] - for modelseed_reaction_id, kegg_reaction_ids in network.modelseed_kegg_aliases.items(): - if not kegg_reaction_ids: - modelseed_reaction_ids.append(modelseed_reaction_id) - for modelseed_reaction_id in modelseed_reaction_ids: - network.modelseed_kegg_aliases.pop(modelseed_reaction_id) - - # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO EC number - # aliases if no such aliases were found to exist. 
- modelseed_reaction_ids = [] - for modelseed_reaction_id, ec_numbers in network.modelseed_ec_number_aliases.items(): - if not ec_numbers: - modelseed_reaction_ids.append(modelseed_reaction_id) - for modelseed_reaction_id in modelseed_reaction_ids: - network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) - - if quiet and not stats_file: - return network - - if network.consistent_annotations: - precomputed_counts = { - 'total_gene_clusters': pdb.meta['num_gene_clusters'], - 'gene_clusters_assigned_ko': len(network.gene_clusters) + len(unnetworked_cluster_ids), - 'kos_assigned_gene_clusters': len(network.kos) + len(unnetworked_ko_ids) - } - else: - precomputed_counts = { - 'total_gene_clusters': pdb.meta['num_gene_clusters'], - 'gene_clusters_assigned_ko': None, - 'kos_assigned_gene_clusters': None - } - pdb.disconnect() - stats = network.get_overview_statistics(precomputed_counts=precomputed_counts) - if not quiet: - network.print_overview_statistics(stats=stats) - if stats_file: - network.write_overview_statistics(stats_file, stats=stats) - - return network - - def _load_modelseed_reactions( - self, - database: Union[ContigsDatabase, PanDatabase], - network: Union[GenomicNetwork, PangenomicNetwork] - ) -> None: - """ - Add ModelSEED reactions to the network being loaded from the database. - - ModelSEED reaction objects are related to KOs through KEGG REACTION and EC number aliases. - - Parameters - ========== - database : ContigsDatabase or PanDatabase - The database storing a reaction network. In loading a genomic network, provide a contigs - database; in loading a pangenomic network, provide a pan database. - - network : GenomicNetwork or PangenomicNetwork - The reaction network under construction. - - Returns - ======= - None - """ - # Load the table of reactions data. 
- if type(database) is ContigsDatabase: - reactions_table = database.db.get_table_as_dataframe('gene_function_reactions') - if type(network) is not GenomicNetwork: - raise ConfigError( - "The provided 'database' was of type 'ContigsDatabase', so the provided " - "'network' must be of type 'GenomicNetwork'. Instead, the reaction network " - f"argument was of type '{type(network)}'." - ) - elif type(database) is PanDatabase: - reactions_table = database.db.get_table_as_dataframe('gene_cluster_function_reactions') - if type(network) is not PangenomicNetwork: - raise ConfigError( - "The provided 'database' was of type 'PanDatabase', so the provided 'network' " - "must be of type 'PangenomicNetwork'. Instead, the reaction network argument " - f"was of type '{type(network)}'." - ) - else: - raise ConfigError( - "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. " - f"Instead, the argument was of type '{type(database)}'." - ) - - # The KO database is needed if KOs in the stored network aren't among the current gene - # annotations. - try: - ko_db = KODatabase(ko_dir=self.ko_dir) - except ConfigError as e: - raise ConfigError( - f"{e} Please set up the KO database in the default directory with the program, " - "'anvi-reaction-network'." - ) - - for row in reactions_table.itertuples(): - # Each row of the table contains information on a different ModelSEED reaction. - reaction = ModelSEEDReaction() - modelseed_reaction_id: str = row.modelseed_reaction_id - reaction.modelseed_id = modelseed_reaction_id - reaction.modelseed_name = row.modelseed_reaction_name - network.reactions[modelseed_reaction_id] = reaction - - modelseed_compound_ids: str = row.metabolite_modelseed_ids - reaction.compounds = [] - for modelseed_compound_id in modelseed_compound_ids.split(', '): - try: - # This is not the first reaction involving the compound, so an object for it - # already exists. 
- compound = network.metabolites[modelseed_compound_id] - except KeyError: - compound = ModelSEEDCompound() - compound.modelseed_id = modelseed_compound_id - network.metabolites[modelseed_compound_id] = compound - reaction.compounds.append(compound) - reaction.compounds = tuple(reaction.compounds) - - stoichiometry: str = row.stoichiometry - reaction.coefficients = tuple(int(coeff) for coeff in stoichiometry.split(', ')) - compartments: str = row.compartments - reaction.compartments = tuple(compartments.split(', ')) - reversibility: int = row.reversibility - reaction.reversibility = bool(reversibility) - - # Map KEGG reaction aliases of the ModelSEED reaction to all KOs that were associated - # with the KEGG reaction. - kegg_reaction_ko_ids: Dict[str, List[str]] = {} - kegg_reaction_sources: str = row.ko_kegg_reaction_source - for kegg_reaction_item in kegg_reaction_sources.split('; '): - if not kegg_reaction_item: - # The ModelSEED reaction was not sourced from KEGG reactions. - continue - kegg_reaction_id, ko_ids = kegg_reaction_item.split(': (') - ko_ids = ko_ids[:-1].split(', ') - kegg_reaction_ko_ids[kegg_reaction_id] = ko_ids - # Record *all* KEGG reaction aliases of the ModelSEED reaction, including those not - # associated with KO annotations. - other_kegg_reaction_ids: str = row.other_kegg_reaction_ids - reaction.kegg_aliases = list(kegg_reaction_ko_ids) - if other_kegg_reaction_ids: - reaction.kegg_aliases += other_kegg_reaction_ids.split(', ') - reaction.kegg_aliases = tuple(reaction.kegg_aliases) - - network.modelseed_kegg_aliases[modelseed_reaction_id] = modelseed_kegg_aliases = [] - orphan_ko_ids = [] - reaction_added_to_ko = False - for kegg_reaction_id, ko_ids in kegg_reaction_ko_ids.items(): - # Record the ModelSEED reaction as one of the aliases of the KEGG reaction in the - # network. 
- try: - network.kegg_modelseed_aliases[kegg_reaction_id].append(modelseed_reaction_id) - except KeyError: - network.kegg_modelseed_aliases[kegg_reaction_id] = [modelseed_reaction_id] - modelseed_kegg_aliases.append(kegg_reaction_id) - for ko_id in ko_ids: - try: - ko = network.kos[ko_id] - except KeyError: - # In the case of a genomic network, this error arises when the current set - # of gene KO annotations in the contigs database does not match the set from - # which the reaction network was originally made, and the KO under - # consideration in the network is no longer a gene annotation in the - # database. In the case of a pangenomic network, this error arises when the - # current set of gene cluster consensus KO annotations does not match the - # set from which the reaction network was originally made and the consensus - # KO under consideration in the network no longer annotates a gene cluster - # in the pan database. (The current set of gene cluster consensus KO - # annotations is derived from the pan and genomes storage databases using - # the parameters, 'consensus_threshold' and 'discard_ties'.) - ko = KO() - ko.ko_id = ko_id - # The KO name is unknown from the database, so take it from the KO database. - ko.ko_name = ko_db.ko_table.loc[ko_id, 'name'] - network.kos[ko_id] = ko - orphan_ko_ids.append(ko_id) - if not reaction_added_to_ko: - # This is the first encounter with the reaction for the KO. - ko.reactions[modelseed_reaction_id] = reaction - reaction_added_to_ko = True - try: - ko.kegg_reaction_aliases[modelseed_reaction_id].append(kegg_reaction_id) - except KeyError: - ko.kegg_reaction_aliases[modelseed_reaction_id] = [kegg_reaction_id] - - # Map EC number aliases of the ModelSEED reaction to all KOs that were associated with - # the EC number. 
- ec_number_ko_ids: Dict[str, List[str]] = {} - ec_number_sources: str = row.ko_ec_number_source - for ec_number_item in ec_number_sources.split('; '): - if not ec_number_item: - # The ModelSEED reaction was not sourced from EC numbers. - continue - ec_number, ko_ids = ec_number_item.split(': (') - ko_ids = ko_ids[:-1].split(', ') - ec_number_ko_ids[ec_number] = ko_ids - # Record *all* EC number aliases of the ModelSEED reaction, including those not - # associated with KO annotations. - other_ec_numbers: str = row.other_ec_numbers - reaction.ec_number_aliases = list(ec_number_ko_ids) - if other_ec_numbers: - reaction.ec_number_aliases += other_ec_numbers.split(', ') - reaction.ec_number_aliases = tuple(reaction.ec_number_aliases) - - network.modelseed_ec_number_aliases[modelseed_reaction_id] = modelseed_ec_number_aliases = [] - for ec_number, ko_ids in ec_number_ko_ids.items(): - # Record the ModelSEED reaction as one of the aliases of the EC number in the - # network. - try: - network.ec_number_modelseed_aliases[ec_number].append(modelseed_reaction_id) - except KeyError: - network.ec_number_modelseed_aliases[ec_number] = [modelseed_reaction_id] - modelseed_ec_number_aliases.append(ec_number) - for ko_id in ko_ids: - try: - ko = network.kos[ko_id] - except KeyError: - # This error arises for the same reason as before (processing KEGG reactions). - ko = KO() - ko.ko_id = ko_id - # The KO name is unknown from the database, so take it from the KO database. - ko.ko_name = ko_db.ko_table.loc[ko_id, 'name'] - network.kos[ko_id] = ko - orphan_ko_ids.append(ko_id) - if not reaction_added_to_ko: - # This is the first encounter with the reaction for the KO. 
- ko.reactions[modelseed_reaction_id] = reaction - reaction_added_to_ko = True - try: - ko.ec_number_aliases[modelseed_reaction_id].append(ec_number) - except KeyError: - ko.ec_number_aliases[modelseed_reaction_id] = [ec_number] - - if DEBUG: - # "Orphan" KOs can only arise when 'check_gene_annotations' is False in the calling - # method, 'load_contigs_database_network' or 'load_pan_database_network'. - if type(network) is GenomicNetwork: - self.run.info_single( - "The following KOs are found in the stored reaction network in the contigs " - "database, but they are not found among the current gene KO annotations in " - "the contigs database. The available version of the KO database set up by " - "anvi'o was used to retrieve the function 'names' of these KOs: " - f"{', '.join(orphan_ko_ids)}" - ) - elif type(network) is PangenomicNetwork: - self.run.info_single( - "The following KOs are found in the stored reaction network in the pan " - "database, but they are not found among the current gene KO annotations in " - "the genomes storage database. The available version of the KO database " - "set up by anvi'o was used to retrieve the function 'names' of these KOs: " - f"{', '.join(orphan_ko_ids)}" - ) - - def _load_modelseed_compounds( - self, - database: Union[ContigsDatabase, PanDatabase], - network: Union[GenomicNetwork, PangenomicNetwork] - ) -> None: - """ - Add ModelSEED compounds to the network being loaded from the database. - - Parameters - ========== - database : ContigsDatabase or PanDatabase - The database storign a reaction network. In loading a genomic network, provide a contigs - database; in loading a pangenomic network, provide a pan database. - - network : GenomicNetwork or PangenomicNetwork - The reaction network under construction. - - Returns - ======= - None - """ - # Load the table of compounds data. 
- if type(database) is ContigsDatabase: - metabolites_table = database.db.get_table_as_dataframe('gene_function_metabolites') - if type(network) is not GenomicNetwork: - raise ConfigError( - "The provided 'database' was of type 'ContigsDatabase', so the provided " - "'network' must be of type 'GenomicNetwork'. Instead, the reaction network " - f"argument was of type '{type(network)}'." - ) - elif type(database) is PanDatabase: - metabolites_table = database.db.get_table_as_dataframe('gene_cluster_function_metabolites') - if type(network) is not PangenomicNetwork: - raise ConfigError( - "The provided 'database' was of type 'PanDatabase', so the provided 'network' " - "must be of type 'PangenomicNetwork'. Instead, the reaction network argument " - f"was of type '{type(database)}'." - ) - else: - raise ConfigError( - "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. " - f"Instead, the argument was of type '{type(database)}'." - ) - - for row in metabolites_table.itertuples(): - # Each row of the table contains information on a different ModelSEED compound. - modelseed_compound_id = row.modelseed_compound_id - compound = network.metabolites[modelseed_compound_id] - modelseed_compound_name: str = row.modelseed_compound_name - compound.modelseed_name = modelseed_compound_name - kegg_aliases: str = row.kegg_aliases - compound.kegg_aliases = tuple(kegg_aliases.split(', ')) - # Compounds without a formula, recorded here as None, have a nominal charge of 10000000 - # in the ModelSEED compounds database. This is replaced by NaN in the table and here as - # None in the reaction network. 
- formula: str = row.formula - compound.formula = formula - charge: int = row.charge - compound.charge = charge if not np.isnan(charge) else None - - def make_network( - self, - contigs_db: str = None, - pan_db: str = None, - genomes_storage_db: str = None, - store: bool = True, - overwrite_existing_network: bool = False, - consensus_threshold: float = None, - discard_ties: bool = False, - stats_file: str = None - ) -> ReactionNetwork: - """ - Make a metabolic reaction network from KEGG Orthologs stored in an anvi'o database, - associated KEGG annotations, and the ModelSEED Biochemistry database. - - Parameters - ========== - contigs_db : str, None - Path to a contigs database. The database can represent different types of samples, - including a single genome, metagenome, or transcriptome. The network is derived from - gene KO annotations stored in the database. If 'store' is True, the network is saved in - the database. - - pan_db : str, None - Path to a pan database. The pangenomic network is determined for gene clusters stored in - the database. If 'store' is True, the network is saved in the database. - An argument for the paired 'genomes_storage_db' is also required. - - genomes_storage_db : str, None - Path to a genomes storage database. The pangenomic network is derived from gene KO - annotations stored in the database. An argument for the paired 'pan_db' is also - required. - - store : bool, True - Save the network. A network constructed from a contigs database is stored in that - database. A pangenomic network constructed from a genomes stroage database and pan - database is stored in the pan database. - - overwrite_existing_network : bool, False - Overwrite an existing network stored in the contigs or pan database. 'store' is also - required. - - consensus_threshold : float, None - This parameter applies to pangenomes. With the default of None, the protein annotation - most frequent among genes in a cluster is assigned to the cluster itself. 
If a - non-default argument is provided (a value on [0, 1]), at least this proportion of genes - in the cluster must have the most frequent annotation for the cluster to be annotated. - - discard_ties : bool, False - This parameter applies to pangenomes. If multiple protein annotations are most frequent - among genes in a cluster, then do not assign an annotation to the cluster itself when - this argument is True. By default, this argument is False, so one of the most frequent - annotations would be arbitrarily chosen. - - stats_file : str, None - Write network overview statistics to a tab-delimited file at this output path. - - Returns - ======= - ReactionNetwork - Reaction network loaded from the input database. - """ - if contigs_db and (pan_db or genomes_storage_db): - raise ConfigError( - "Either a contigs database OR both a pan database and genomes storage database are required " - "to make either a (meta)genomic reaction network or a pangenomic reaction network, respectively." - ) - elif contigs_db: - self.run.info_single( - "A reaction network will be made from protein orthology annotations in the contigs database." - ) - network = self.make_contigs_database_network( - contigs_db, - store=store, - overwrite_existing_network=overwrite_existing_network, - stats_file=stats_file - ) - elif genomes_storage_db or pan_db: - self.run.info_single( - "A pangenomic reaction network will be made from protein orthology annotations " - "in the genomes storage database and gene clusters in the pan database." - ) - network = self.make_pangenomic_network( - pan_db, - genomes_storage_db, - store=store, - overwrite_existing_network=overwrite_existing_network, - consensus_threshold=consensus_threshold, - discard_ties=discard_ties, - stats_file=stats_file - ) - else: - raise ConfigError( - "A reaction network cannot be made without a database source. 
Either a contigs database OR " - "a pan database and genomes storage database are required to make either a (meta)genomic " - "reaction network or a pangenomic reaction network, respectively." - ) - return network - - def make_contigs_database_network( - self, - contigs_db: str, - store: bool = True, - overwrite_existing_network: bool = False, - stats_file: str = None - ) -> GenomicNetwork: - """ - Make a metabolic reaction network from KEGG Orthologs stored in a contigs database. - - Parameters - ========== - contigs_db : str - Path to a contigs database. The database can represent different types of samples, - including a single genome, metagenome, or transcriptome. The network is derived from - gene KO annotations stored in the database. - - store : bool, True - Save the network to the contigs database. - - overwrite_existing_network : bool, False - Overwrite an existing network stored in the contigs database. 'store' is also required. - - stats_file : str, None - Write network overview statistics to a tab-delimited file at this output path. - - Returns - ======= - GenomicNetwork - The network derived from the contigs database. - """ - # Here is an example of the information used to create a genomic network. - # gene 1 ---> KO 1 ---> KEGG rxn 1 ---> ModelSEED rxn 1 ---> ModelSEED metabs 1, 2, ... - # | | | - # | | ---> EC number 1 --> ModelSEED rxn 1 ---> ModelSEED metabs 1, 2, ... - # | | | | - # | | | --> ModelSEED rxn 2 ---> ... - # | | | - # | | ---> EC number 2 --> ... - # | | - # | ---> KO 2 ---> ... - # | - # gene 2 ---> ... - - # Preemptively check the statistics file path. - if stats_file is not None: - filesnpaths.is_output_file_writable(stats_file) - - # Load the contigs database. 
- self.run.info("Contigs database", contigs_db) - utils.is_contigs_db(contigs_db) - args = argparse.Namespace() - args.contigs_db = contigs_db - contigs_super = ContigsSuperclass(args, r=run_quiet) - if ( - store and - contigs_super.a_meta['reaction_network_ko_annotations_hash'] and - not overwrite_existing_network - ): - raise ConfigError( - f"""\ - The existing reaction network in the contigs database must be explicitly - overwritten.\ - """ - ) - contigs_super.init_functions(requested_sources=['KOfam']) - - self.progress.new("Building reaction network") - self.progress.update("Loading reference databases") - - ko_db = KODatabase(self.ko_dir) - modelseed_db = ModelSEEDDatabase(self.modelseed_dir) - - network = GenomicNetwork(run=self.run, progress=self.progress) - network.contigs_db_source_path = os.path.abspath(contigs_db) - - modelseed_kegg_reactions_table = modelseed_db.kegg_reactions_table - modelseed_ec_reactions_table = modelseed_db.ec_reactions_table - modelseed_compounds_table = modelseed_db.compounds_table - - # List KOs that annotated genes in the contigs database but for some reason are not found in - # the KO database. - undefined_ko_ids = [] - - # Parse gene-KO matches recorded in the contigs database. - gene_function_calls_dict: Dict = contigs_super.gene_function_calls_dict - total_ko_matches = len(gene_function_calls_dict) - num_ko_matches_parsed = -1 - for gcid, gene_dict in gene_function_calls_dict.items(): - num_ko_matches_parsed += 1 - self.progress.update( - f"Gene-KO matches parsed: {num_ko_matches_parsed} / {total_ko_matches}" - ) - - if gcid in network.genes: - # An object representing the gene was already added to the network. - gene = network.genes[gcid] - else: - gene = Gene() - gene.gcid = gcid - # Add the gene to the network, regardless of whether it yields reactions. Genes not - # contributing to the reaction network are removed later. 
- network.genes[gcid] = gene - - ko_data = gene_dict['KOfam'] - ko_id = ko_data[0] - gene.e_values[ko_id] = float(ko_data[2]) - if ko_id in network.kos: - # The KO was associated with an already encountered gene and added to the network. - # Objects representing ModelSEED reactions and metabolites and other data associated - # with the KO were added to the network as well. - gene.kos[ko_id] = network.kos[ko_id] - continue - ko = KO() - ko.id = ko_id - ko.name = ko_data[1] - gene.kos[ko_id] = ko - # Add the KO to the network, regardless of whether it yields reactions. KOs not - # contributing to the network are removed later. - network.kos[ko_id] = ko - - # Find KEGG reactions and EC numbers associated with the newly encountered KO. - try: - ko_info = ko_db.ko_table.loc[ko.id] - except KeyError: - undefined_ko_ids.append(ko_id) - continue - ko_kegg_reaction_info: str = ko_info.loc['reactions'] - if pd.isna(ko_kegg_reaction_info): - # The KO is not associated with KEGG reactions. - ko_kegg_reaction_ids = [] - else: - ko_kegg_reaction_ids = ko_kegg_reaction_info.split() - ko_ec_number_info: str = ko_info.loc['ec_numbers'] - if pd.isna(ko_ec_number_info): - # The KO is not associated with EC numbers. - ko_ec_numbers = [] - else: - ko_ec_numbers = ko_ec_number_info.split() - - if not (ko_kegg_reaction_ids or ko_ec_numbers): - # The KO is not associated with any KEGG reactions or EC numbers, and thereby cannot - # be associated with ModelSEED reactions. - continue - - new_kegg_reaction_ids = self._parse_ko_kegg_reaction_ids( - network, ko, ko_kegg_reaction_ids, ko_ec_numbers - ) - new_ec_numbers = self._parse_ko_ec_numbers( - network, ko, ko_ec_numbers, ko_kegg_reaction_ids - ) - if not (new_kegg_reaction_ids or new_ec_numbers): - # All of the KEGG reactions and EC numbers associated with the KO have already been - # encountered in previously processed KOs and added to the network, so proceed to - # the next gene KO annotation. 
- continue - modelseed_reactions_data = self._get_modelseed_reactions_data( - network, - new_kegg_reaction_ids, - new_ec_numbers, - modelseed_kegg_reactions_table, - modelseed_ec_reactions_table - ) - if not modelseed_reactions_data: - # The newly encountered KEGG REACTION IDs and EC numbers do not map to ModelSEED - # reactions (are not in the table). - continue - - # Process the ModelSEED reactions aliased by newly encountered KEGG reactions and EC - # numbers. - for modelseed_reaction_id, modelseed_reaction_data in modelseed_reactions_data.items(): - if modelseed_reaction_id in network.reactions: - # The ModelSEED reaction is aliased by previously encountered KEGG reactions and - # EC numbers, and so has already been added to the network. - continue - # Make a new reaction object for the ModelSEED ID. This object does not yet have - # metabolite objects (for the ModelSEED compound IDs) added to it yet. - reaction, modelseed_compound_ids = self._get_modelseed_reaction( - modelseed_reaction_data - ) - if reaction is None: - # For some reason, the reaction does not have a equation in the ModelSEED - # database. Associations between such reactions without equations and sourcing - # KEGG reactions and EC numbers are later removed from the network attributes, - # 'kegg_modelseed_aliases', 'ec_number_modelseed_aliases', - # 'modelseed_kegg_aliases', and 'modelseed_ec_number_aliases'. - continue - self._add_modelseed_reaction( - network, - ko, - reaction, - new_kegg_reaction_ids, - new_ec_numbers, - modelseed_compound_ids, - modelseed_compounds_table - ) - - # List genes that do not contribute to the reaction network. Remove any trace of these genes - # from the network. - unnetworked_gcids = [] - for gcid, gene in network.genes.items(): - for ko in gene.kos.values(): - if ko.reactions: - break - else: - unnetworked_gcids.append(gcid) - for gcid in unnetworked_gcids: - network.genes.pop(gcid) - - # List KOs that do not contribute to the reaction network. 
Remove any trace of these KOs - # from the network. - unnetworked_ko_ids = [] - for ko_id, ko in network.kos.items(): - if not ko.reactions: - unnetworked_ko_ids.append(ko_id) - for ko_id in unnetworked_ko_ids: - network.kos.pop(ko_id) - - # List KO KEGG reactions that do not map to ModelSEED reactions. Remove any trace of these - # KEGG reactions from the network. - unnetworked_kegg_reaction_ids = [] - for kegg_reaction_id, modelseed_reaction_ids in network.kegg_modelseed_aliases.items(): - if not modelseed_reaction_ids: - unnetworked_kegg_reaction_ids.append(kegg_reaction_id) - for kegg_reaction_id in unnetworked_kegg_reaction_ids: - network.kegg_modelseed_aliases.pop(kegg_reaction_id) - - # List KO EC numbers that do not map to ModelSEED reactions. Remove any trace of these EC - # numbers from the network. - unnetworked_ec_numbers = [] - for ec_number, modelseed_reaction_ids in network.ec_number_modelseed_aliases.items(): - if not modelseed_reaction_ids: - unnetworked_ec_numbers.append(ec_number) - for ec_number in unnetworked_ec_numbers: - network.ec_number_modelseed_aliases.pop(ec_number) - - # List aliased ModelSEED reactions that did not yield a ModelSEEDReaction object due to the - # lack of an equation for the reaction in the ModelSEED database. Remove any trace of these - # reactions from the network. 
- undefined_modelseed_reaction_ids = list( - set(network.modelseed_kegg_aliases).difference(set(network.reactions)) - ) - for modelseed_reaction_id in undefined_modelseed_reaction_ids: - network.modelseed_kegg_aliases.pop(modelseed_reaction_id) - network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) - self.progress.end() - - if DEBUG: - self.run.info_single( - f"""\ - The following ModelSEED reactions would have been added to the reaction network had - there been a chemical equation in the ModelSEED database; perhaps it is worth - investigating the ModelSEED reactions table to understand why this is not the case: - {', '.join(undefined_modelseed_reaction_ids)}\ - """ - ) - - if undefined_ko_ids: - self.run.info_single( - f"""\ - Certain genes matched KOs that were not found in the reference KO database. These - KOs will not be used in network construction. It could be that the KOfams used to - annotate genes were not from the same KEGG database version as the reference KO - files. 
Here are the unrecognized KO IDs from the contigs database: - {','.join(undefined_ko_ids)}\ - """ - ) - - ko_dir = KODatabase.default_dir if self.ko_dir is None else self.ko_dir - if self.modelseed_dir is None: - modelseed_dir = ModelSEEDDatabase.default_dir - else: - modelseed_dir = self.modelseed_dir - self.run.info("Reference KEGG KO database directory", ko_dir, nl_before=1) - self.run.info("Reference ModelSEED database directory", modelseed_dir) - - if store: - if contigs_super.a_meta['reaction_network_ko_annotations_hash']: - self.run.warning("Deleting existing reaction network from contigs database") - cdb = ContigsDatabase(contigs_db) - cdb.db._exec(f'''DELETE from {tables.gene_function_reactions_table_name}''') - cdb.db._exec(f'''DELETE from {tables.gene_function_metabolites_table_name}''') - cdb.disconnect() - self.run.info_single( - "Deleted data in gene function reactions and metabolites tables", nl_after=1 - ) - - self.progress.new("Saving reaction network to contigs database") - self.progress.update("Reactions table") - reactions_table = self._get_database_reactions_table(network) - cdb = ContigsDatabase(contigs_db) - sql_statement = ( - f"INSERT INTO {tables.gene_function_reactions_table_name} VALUES " - f"({','.join('?' * len(tables.gene_function_reactions_table_structure))})" - ) - cdb.db._exec_many(sql_statement, reactions_table.values) - cdb.disconnect() - self.progress.update("Metabolites table") - metabolites_table = self._get_database_metabolites_table(network) - cdb = ContigsDatabase(contigs_db) - sql_statement = ( - f"INSERT INTO {tables.gene_function_metabolites_table_name} VALUES " - f"({','.join('?' 
* len(tables.gene_function_metabolites_table_structure))})" - ) - cdb.db._exec_many(sql_statement, metabolites_table.values) - cdb.disconnect() - - self.progress.update("Metadata") - ko_annotations_hash = self.hash_contigs_db_ko_annotations(gene_function_calls_dict) - cdb = ContigsDatabase(contigs_db) - cdb.db.set_meta_value('reaction_network_ko_annotations_hash', ko_annotations_hash) - cdb.db.set_meta_value('reaction_network_kegg_database_release', ko_db.release) - cdb.db.set_meta_value('reaction_network_modelseed_database_sha', modelseed_db.sha) - cdb.disconnect() - self.progress.end() - - cdb = ContigsDatabase(contigs_db) - precomputed_counts = { - 'total_genes': cdb.db.get_row_counts_from_table('genes_in_contigs'), - 'genes_assigned_kos': len(network.genes) + len(unnetworked_gcids), - 'kos_assigned_genes': len(network.kos) + len(unnetworked_ko_ids) - } - cdb.disconnect() - stats = network.get_overview_statistics(precomputed_counts=precomputed_counts) - network.print_overview_statistics(stats=stats) - if stats_file: - network.write_overview_statistics(stats_file, stats=stats) - - return network - - def make_pangenomic_network( - self, - pan_db: str, - genomes_storage_db: str, - store: bool = True, - overwrite_existing_network: bool = False, - consensus_threshold: float = None, - discard_ties: bool = False, - stats_file: str = None - ) -> PangenomicNetwork: - """ - Make a pangenomic metabolic reaction network from KEGG Orthologs stored a genomes storage - database and gene clusters stored in a pan database. - - Parameters - ========== - pan_db : str - Path to a pan database. The pangenomic network is determined for gene clusters stored in - the database. - - genomes_storage_db : str - Path to a genomes storage database. The pangenomic network is derived from gene KO - annotations stored in the database. - - store : bool, True - Save the network to the pan database. 
- - overwrite_existing_network : bool, False - Overwrite an existing network stored in the pan database. 'store' is also required. - - consensus_threshold : float, None - With the default of None, the protein annotation most frequent among genes in a cluster - is assigned to the cluster itself. If a non-default argument is provided (a value on [0, - 1]), at least this proportion of genes in the cluster must have the most frequent - annotation for the cluster to be annotated. - - discard_ties : bool, False - If multiple protein annotations are most frequent among genes in a cluster, then do not - assign an annotation to the cluster itself when this argument is True. By default, this - argument is False, so one of the most frequent annotations would be arbitrarily chosen. - - stats_file : str, None - Write network overview statistics to a tab-delimited file at this output path. - - Returns - ======= - PangenomicNetwork - The network derived from the pangenomic databases. - """ - # Preemptively check the statistics file path. - if stats_file is not None: - filesnpaths.is_output_file_writable(stats_file) - - # Load the pan database. - args = Namespace() - args.pan_db = pan_db - args.genomes_storage = genomes_storage_db - args.discard_ties = discard_ties - args.consensus_threshold = consensus_threshold - pan_super = PanSuperclass(args, r=run_quiet) - - if ( - store and - pan_super.p_meta['reaction_network_ko_annotations_hash'] and - not overwrite_existing_network - ): - raise ConfigError( - "The existing reaction network in the pan database must be explicitly overwritten." - ) - - # Check that genome contigs databases were annotated with KOs before building the pan - # database. Unlike in contigs super, the initialization of functions by a method of pan - # super does not allow specification of particular functional annotation sources, with - # concomitant checks for their existence. 
- gs_info = dbinfo.GenomeStorageDBInfo(genomes_storage_db) - gs_sources: str = gs_info.get_self_table()['gene_function_sources'] - if 'KOfam' not in [source.strip() for source in gs_sources.split(',')]: - raise ConfigError( - f"""\ - The genomes of the pangenome were not annotated with KOs, which can be rectified - by running `anvi-run-kegg-kofams` on the genome contigs databases and remaking - the pangenome.\ - """ - ) - pan_super.init_gene_clusters() - pan_super.init_gene_clusters_functions() - pan_super.init_gene_clusters_functions_summary_dict() - - self.progress.new("Building reaction network") - self.progress.update("Loading reference databases") - - # Load the required orthology reference databases set up by anvi'o. - ko_db = KODatabase(self.ko_dir) - modelseed_db = ModelSEEDDatabase(self.modelseed_dir) - - network = PangenomicNetwork(run=self.run, progress=self.progress) - network.pan_db_source_path = os.path.abspath(pan_db) - network.genomes_storage_db_source_path = os.path.abspath(genomes_storage_db) - network.consensus_threshold = consensus_threshold - network.discard_ties = discard_ties - network.consistent_annotations = True - - modelseed_kegg_reactions_table = modelseed_db.kegg_reactions_table - modelseed_ec_reactions_table = modelseed_db.ec_reactions_table - modelseed_compounds_table = modelseed_db.compounds_table - - # List KOs that annotated gene clusters in the pan database but for some reason are not - # found in the KO database. - undefined_ko_ids = [] - - # Parse gene clusters. - gene_clusters_functions_summary_dict: Dict = pan_super.gene_clusters_functions_summary_dict - total_gene_clusters = len(pan_super.gene_clusters) - num_gene_clusters_parsed = -1 - for cluster_id, gene_cluster_functions_data in gene_clusters_functions_summary_dict.items(): - num_gene_clusters_parsed += 1 - self.progress.update( - f"Gene clusters parsed: {num_gene_clusters_parsed} / {total_gene_clusters}" - ) - # Retrieve the consensus KO across genes in the cluster. 
Parameterization of the method - # used to select consensus KOs occurred in pan super initialization. - gene_cluster_ko_data = gene_cluster_functions_data['KOfam'] - if gene_cluster_ko_data == {'function': None, 'accession': None}: - # No KO was assigned to the cluster. - continue - ko_id = gene_cluster_ko_data['accession'] - - gene_cluster = GeneCluster() - gene_cluster.gene_cluster_id = cluster_id - gene_cluster.genomes = list(pan_super.gene_clusters[cluster_id]) - # Add the gene cluster to the network, regardless of whether it yields reactions. Gene - # clusters not contributing to the reaction network are removed later. - network.gene_clusters[cluster_id] = gene_cluster - - if ko_id in network.kos: - # The KO was assigned to another gene cluster that was already processed and added - # to the network. Objects representing ModelSEED reactions and metabolites and other - # data associated with the KO were added to the network in addition to a KO object. - gene_cluster.ko = network.kos[ko_id] - continue - ko = KO() - ko.id = ko_id - ko.name = gene_cluster_ko_data['function'] - gene_cluster.ko = ko - # Add the newly encountered KO to the network, regardless of whether it yields - # reactions. KOs not contributing to the network are removed later. - network.kos[ko_id] = ko - - # Find KEGG reactions and EC numbers associated with the newly encountered KO. - try: - ko_info = ko_db.ko_table.loc[ko.id] - except KeyError: - undefined_ko_ids.append(ko_id) - continue - ko_kegg_reaction_info: str = ko_info.loc['reactions'] - if pd.isna(ko_kegg_reaction_info): - # The KO is not associated with KEGG reactions. - ko_kegg_reaction_ids = [] - else: - ko_kegg_reaction_ids = ko_kegg_reaction_info.split() - ko_ec_number_info: str = ko_info.loc['ec_numbers'] - if pd.isna(ko_ec_number_info): - # The KO is not associated with EC numbers. 
- ko_ec_numbers = [] - else: - ko_ec_numbers = ko_ec_number_info.split() - - if not (ko_kegg_reaction_ids or ko_ec_numbers): - # The KO is not associated with any KEGG reactions or EC numbers, and thereby cannot - # be associated with ModelSEED reactions. - continue - - new_kegg_reaction_ids = self._parse_ko_kegg_reaction_ids( - network, ko, ko_kegg_reaction_ids, ko_ec_numbers - ) - new_ec_numbers = self._parse_ko_ec_numbers( - network, ko, ko_ec_numbers, ko_kegg_reaction_ids - ) - if not (new_kegg_reaction_ids or new_ec_numbers): - # All of the KEGG reactions and EC numbers associated with the KO have already been - # encountered in previously processed KOs and added to the network, so proceed to - # the next gene cluster. - continue - modelseed_reactions_data = self._get_modelseed_reactions_data( - network, - new_kegg_reaction_ids, - new_ec_numbers, - modelseed_kegg_reactions_table, - modelseed_ec_reactions_table - ) - if not modelseed_reactions_data: - # The newly encountered KEGG REACTION IDs and EC numbers do not map to ModelSEED - # reactions (are not in the ModelSEED table). - continue - - # Process the ModelSEED reactions aliased by newly encountered KEGG reactions and EC - # numbers. - for modelseed_reaction_id, modelseed_reaction_data in modelseed_reactions_data.items(): - if modelseed_reaction_id in network.reactions: - # The ModelSEED reaction is aliased by previously encountered KEGG reactions and - # EC numbers, and so has already been added to the network. - continue - # Make a new reaction object for the ModelSEED ID. This object does not yet have - # metabolite objects (for the ModelSEED compound IDs) added to it yet. - reaction, modelseed_compound_ids = self._get_modelseed_reaction( - modelseed_reaction_data - ) - if reaction is None: - # For some reason, the reaction does not have a equation in the ModelSEED - # database. 
Associations between such reactions without equations and sourcing - # KEGG reactions and EC numbers are later removed from the network attributes, - # 'kegg_modelseed_aliases', 'ec_number_modelseed_aliases', - # 'modelseed_kegg_aliases', and 'modelseed_ec_number_aliases'. - continue - self._add_modelseed_reaction( - network, - ko, - reaction, - new_kegg_reaction_ids, - new_ec_numbers, - modelseed_compound_ids, - modelseed_compounds_table - ) - - # List gene clusters and KOs that do not contribute to the reaction network. Remove any - # trace of these gene clusters and KOs from the network. - unnetworked_cluster_ids = [] - unnetworked_ko_ids = [] - for cluster_id, gene_cluster in network.gene_clusters.items(): - ko = gene_cluster.ko - if ko.reactions: - break - unnetworked_cluster_ids.append(cluster_id) - unnetworked_ko_ids.append(ko.id) - for cluster_id in unnetworked_cluster_ids: - network.gene_clusters.pop(cluster_id) - for ko_id in unnetworked_ko_ids: - network.kos.pop(ko_id) - - # List KO KEGG reactions that do not map to ModelSEED reactions. Remove any trace of these - # KEGG reactions from the network. - unnetworked_kegg_reaction_ids = [] - for kegg_reaction_id, modelseed_reaction_ids in network.kegg_modelseed_aliases.items(): - if not modelseed_reaction_ids: - unnetworked_kegg_reaction_ids.append(kegg_reaction_id) - for kegg_reaction_id in unnetworked_kegg_reaction_ids: - network.kegg_modelseed_aliases.pop(kegg_reaction_id) - - # List KO EC numbers that do not map to ModelSEED reactions. Remove any trace of these EC - # numbers from the network. 
- unnetworked_ec_numbers = [] - for ec_number, modelseed_reaction_ids in network.ec_number_modelseed_aliases.items(): - if not modelseed_reaction_ids: - unnetworked_ec_numbers.append(ec_number) - for ec_number in unnetworked_ec_numbers: - network.ec_number_modelseed_aliases.pop(ec_number) - - # List aliased ModelSEED reactions that did not yield a ModelSEEDReaction object due to the - # lack of an equation for the reaction in the ModelSEED database. Remove any trace of these - # reactions from the network. - undefined_modelseed_reaction_ids = list( - set(network.modelseed_kegg_aliases).difference(set(network.reactions)) - ) - for modelseed_reaction_id in undefined_modelseed_reaction_ids: - network.modelseed_kegg_aliases.pop(modelseed_reaction_id) - network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) - self.progress.end() - - if DEBUG: - self.run.info_single( - f"""\ - The following ModelSEED reactions would have been added to the reaction network had - there been a chemical equation in the ModelSEED database; perhaps it is worth - investigating the ModelSEED reactions table to understand why this is not the case: - {', '.join(undefined_modelseed_reaction_ids)}\ - """ - ) - - if undefined_ko_ids: - self.run.info_single( - f"""\ - Certain gene clusters were assigned consensus KOs that were not found in the - reference KO database. These consensus KOs will not be used in network construction. - It could be that the KOfams used to annotate gene clusters were not from the same - KEGG database version as the reference KO files. 
Here are the unrecognized KO IDs - from the pan database: {', '.join(undefined_ko_ids)}\ - """ - ) - - ko_dir = KODatabase.default_dir if self.ko_dir is None else self.ko_dir - if self.modelseed_dir is None: - modelseed_dir = ModelSEEDDatabase.default_dir - else: - modelseed_dir = self.modelseed_dir - self.run.info("Reference KEGG KO database directory", ko_dir, nl_before=1) - self.run.info("Reference ModelSEED database directory", modelseed_dir) - - if store: - if pan_super.p_meta['reaction_network_ko_annotations_hash']: - self.run.warning("Deleting existing reaction network from pan database") - pdb = PanDatabase(pan_db) - pdb.db._exec( - f'''DELETE from {tables.pan_gene_cluster_function_reactions_table_name}''' - ) - pdb.db._exec( - f'''DELETE from {tables.pan_gene_cluster_function_metabolites_table_name}''' - ) - pdb.disconnect() - self.run.info_single( - "Deleted data in gene cluster function reactions and metabolites tables", - nl_after=1 - ) - - self.progress.new("Saving reaction network to pan database") - self.progress.update("Reactions table") - reactions_table = self._get_database_reactions_table(network) - pdb = PanDatabase(pan_db) - table_name = tables.pan_gene_cluster_function_reactions_table_name - table_structure = tables.pan_gene_cluster_function_reactions_table_structure - pdb.db._exec_many( - f'''INSERT INTO {table_name} VALUES ({','.join('?' * len(table_structure))})''', - reactions_table.values - ) - pdb.disconnect() - self.progress.update("Metabolites table") - metabolites_table = self._get_database_metabolites_table(network) - pdb = PanDatabase(pan_db) - table_name = tables.pan_gene_cluster_function_metabolites_table_name - table_structure = tables.gene_function_metabolites_table_structure - pdb.db._exec_many( - f'''INSERT INTO {table_name} VALUES ({','.join('?' 
* len(table_structure))})''', - metabolites_table.values - ) - pdb.disconnect() - - self.progress.update("Metadata") - ko_annotations_hash = self.hash_pan_db_ko_annotations( - genomes_storage_db, - gene_clusters_functions_summary_dict, - consensus_threshold=consensus_threshold, - discard_ties=discard_ties - ) - pdb = PanDatabase(pan_db) - pdb.db.set_meta_value('reaction_network_ko_annotations_hash', ko_annotations_hash) - pdb.db.set_meta_value('reaction_network_kegg_database_release', ko_db.release) - pdb.db.set_meta_value('reaction_network_modelseed_database_sha', modelseed_db.sha) - pdb.db.set_meta_value('reaction_network_consensus_threshold', consensus_threshold) - pdb.db.set_meta_value('reaction_network_discard_ties', int(discard_ties)) - pdb.disconnect() - self.progress.end() - - pdb = PanDatabase(pan_db) - precomputed_counts = { - 'total_gene_clusters': pdb.meta['num_gene_clusters'], - 'gene_clusters_assigned_ko': len(network.gene_clusters) + len(unnetworked_cluster_ids), - 'kos_assigned_gene_clusters': len(network.kos) + len(unnetworked_ko_ids) - } - pdb.disconnect() - stats = network.get_overview_statistics(precomputed_counts=precomputed_counts) - network.print_overview_statistics(stats=stats) - if stats_file: - network.write_overview_statistics(stats_file, stats=stats) - - return network - - def _parse_ko_kegg_reaction_ids( - self, - network: ReactionNetwork, - ko: KO, - ko_kegg_reaction_ids: Iterable[str], - ko_ec_numbers: Iterable[str] - ) -> List[str]: - """ - Parse KEGG reactions associated with a KO in the process of building a reaction network. - - Report KEGG REACTION IDs that have not been encountered in association with previously - processed KOs. Record the existence of these KEGG reactions in the reaction network object. - For previously encountered KEGG reactions, retrieve data on aliased ModelSEED reactions and - record that data in the KO object. 
- - Parameters - ========== - network : ReactionNetwork - The reaction network object being built. - - ko : KO - The representation of the KO being processed. - - ko_kegg_reaction_ids : Iterable[str] - KEGG REACTION IDs associated with the KO. - - ko_ec_numbers: Iterable[str] - EC numbers associated with the KO. - - Returns - ======= - list - Newly encountered KEGG REACTION IDs not associated with previously processed KOs - """ - # If a KEGG reaction has already been encountered, then aliased ModelSEED reactions have - # also been processed and added as ModelSEEDReaction objects to the network. Therefore, KEGG - # reactions that have already been encountered are treated differently than KEGG reactions - # encountered for the first time. - new_kegg_reaction_ids = [] - for kegg_reaction_id in ko_kegg_reaction_ids: - try: - # The KEGG reaction has already been encountered. Retrieve ModelSEED reactions - # aliased by the KEGG reaction. - modelseed_reaction_ids = network.kegg_modelseed_aliases[kegg_reaction_id] - except KeyError: - new_kegg_reaction_ids.append(kegg_reaction_id) - # The following list of ModelSEED reaction IDs associated with the KEGG reaction - # is filled in later. If no ModelSEED reactions are associated with the KEGG - # reaction, the entry in the dictionary will be removed. - network.kegg_modelseed_aliases[kegg_reaction_id] = [] - continue - for modelseed_reaction_id in modelseed_reaction_ids: - try: - # Retrieve the existing ModelSEEDReaction object. - reaction = network.reactions[modelseed_reaction_id] - except KeyError: - # The ModelSEED reaction associated with the EC number did not have valid - # data: for example, when the 'stoichiometry' field is empty. - continue - # Associate the ModelSEED reaction with the newly encountered KO. - ko.reactions[modelseed_reaction_id] = reaction - # Record which KEGG REACTION IDs and EC numbers from the KO yield the ModelSEED reaction. 
- ko.kegg_reaction_aliases[modelseed_reaction_id] = list( - set(ko_kegg_reaction_ids).intersection(set(reaction.kegg_aliases)) - ) - ko.ec_number_aliases[modelseed_reaction_id] = list( - set(ko_ec_numbers).intersection(set(reaction.ec_number_aliases)) - ) - return new_kegg_reaction_ids - - def _parse_ko_ec_numbers( - self, - network: ReactionNetwork, - ko: KO, - ko_ec_numbers: Iterable[str], - ko_kegg_reaction_ids: Iterable[str] - ) -> List[str]: - """ - Parse EC numbers associated with a KO in the process of building a reaction network. - - Report EC numbers that have not been encountered in association with previously processed - KOs. Record the existence of these EC numbers in the reaction network object. For previously - encountered EC numbers, retrieve data on aliased ModelSEED reactions and record that data in - the KO object. - - Parameters - ========== - network : ReactionNetwork - The reaction network object being built. - - ko : KO - The representation of the KO being processed. - - ko_ec_numbers: Iterable[str] - EC numbers associated with the KO. - - ko_kegg_reaction_ids : Iterable[str] - KEGG REACTION IDs associated with the KO. - - Returns - ======= - list - Newly encountered EC numbers not associated with previously processed KOs. - """ - # As before with KEGG reactions, if an EC number has already been encountered, then aliased - # ModelSEED reactions have also been processed and added as ModelSEEDReaction objects to the - # network. Therefore, EC numbers that have already been encountered are treated differently - # than EC numbers encountered for the first time. - new_ec_numbers = [] - for ec_number in ko_ec_numbers: - try: - # The EC number has already been encountered. Retrieve ModelSEED reactions - # aliased by the EC number. 
- modelseed_reaction_ids = network.ec_number_modelseed_aliases[ec_number] - except KeyError: - new_ec_numbers.append(ec_number) - # The following list of ModelSEED reaction IDs associated with the EC number is - # filled in later. If no ModelSEED reactions are associated with the EC number, - # the entry in the dictionary will be removed. - network.ec_number_modelseed_aliases[ec_number] = [] - continue - for modelseed_reaction_id in modelseed_reaction_ids: - try: - # Retrieve the existing ModelSEEDReaction object. - reaction = network.reactions[modelseed_reaction_id] - except KeyError: - # The ModelSEED reaction associated with the EC number did not have valid - # data: for example, when the 'stoichiometry' field is empty. - continue - if modelseed_reaction_id in reaction.ec_number_aliases: - # A KEGG reaction associated with the newly encountered KO was also - # associated with the ModelSEED reaction. KO EC number aliases were - # previously recorded along with KO KEGG reaction aliases. Redundant work - # can be avoided here linking the ModelSEED reaction to the KO in the network. - continue - ko.reactions[modelseed_reaction_id] = reaction - ko.kegg_reaction_aliases[modelseed_reaction_id] = list( - set(ko_kegg_reaction_ids).intersection(set(reaction.kegg_aliases)) - ) - ko.ec_number_aliases[modelseed_reaction_id] = list( - set(ko_ec_numbers).intersection(set(reaction.ec_number_aliases)) - ) - return new_ec_numbers - - def _get_modelseed_reactions_data( - self, - network: ReactionNetwork, - new_kegg_reaction_ids: List[str], - new_ec_numbers: List[str], - modelseed_kegg_reactions_table: pd.DataFrame, - modelseed_ec_reactions_table: pd.DataFrame - ) -> Dict: - """ - Get data on ModelSEED reactions aliased by newly encountered KEGG REACTION IDs and EC numbers. - - Parameters - ========== - network : ReactionNetwork - The reaction network object being built. 
- - new_kegg_reaction_ids : List[str] - Newly encountered KEGG REACTION IDs not associated with previously processed KOs. - - new_ec_numbers : List[str] - Newly encountered EC numbers not associated with previously processed KOs. - - modelseed_kegg_reactions_table : pd.DataFrame - Loaded ModelSEED Biochemistry reactions database structured by KEGG REACTION ID. - - modelseed_ec_reactions_table : pd.DataFrame - Loaded ModelSEED Biochemistry reactions database structured by EC number. - - Returns - ======= - dict - Data on the reaction sourced from the ModelSEED Biochemistry database. - """ - modelseed_reactions_data = {} - if new_kegg_reaction_ids: - # Each row of the table represents a unique KEGG reaction -> ModelSEED reaction mapping. - modelseed_kegg_reactions_dict: Dict[str, Dict] = modelseed_kegg_reactions_table[ - modelseed_kegg_reactions_table['KEGG_REACTION_ID'].isin(new_kegg_reaction_ids) - ].to_dict(orient='index') - for modelseed_reaction_data in modelseed_kegg_reactions_dict.values(): - kegg_reaction_id = modelseed_reaction_data['KEGG_REACTION_ID'] - modelseed_reaction_id = modelseed_reaction_data['id'] - # Record the association between the KEGG reaction and ModelSEED reaction in the - # network, and vice versa. - network.kegg_modelseed_aliases[kegg_reaction_id].append(modelseed_reaction_id) - try: - network.modelseed_kegg_aliases[modelseed_reaction_id].append(kegg_reaction_id) - except KeyError: - # This is the first time the ModelSEED reaction has been encountered. - network.modelseed_kegg_aliases[modelseed_reaction_id] = [kegg_reaction_id] - network.modelseed_ec_number_aliases[modelseed_reaction_id] = [] - if modelseed_reaction_id in modelseed_reactions_data: - # One of the other newly encountered KEGG reactions also mapped to this - # ModelSEED reaction, so do not record redundant ModelSEED reaction data. 
- continue - modelseed_reactions_data[modelseed_reaction_id] = modelseed_reaction_data - if new_ec_numbers: - # Each row of the table represents a unique EC number -> ModelSEED reaction mapping. - modelseed_ec_reactions_dict: Dict[str, Dict] = modelseed_ec_reactions_table[ - modelseed_ec_reactions_table['EC_number'].isin(new_ec_numbers) - ].to_dict(orient='index') - for modelseed_reaction_data in modelseed_ec_reactions_dict.values(): - ec_number = modelseed_reaction_data['EC_number'] - modelseed_reaction_id = modelseed_reaction_data['id'] - # Record the association between the EC number and ModelSEED reaction in the - # network, and vice versa. - network.ec_number_modelseed_aliases[ec_number].append(modelseed_reaction_id) - try: - network.modelseed_ec_number_aliases[modelseed_reaction_id].append(ec_number) - except KeyError: - # This is the first time the ModelSEED reaction has been encountered. - network.modelseed_ec_number_aliases[modelseed_reaction_id] = [ec_number] - network.modelseed_kegg_aliases[modelseed_reaction_id] = [] - if modelseed_reaction_id in modelseed_reactions_data: - # One of the other newly encountered KEGG reactions or EC numbers also - # mapped to this ModelSEED reaction, so do not record redundant ModelSEED reaction data. - continue - modelseed_reactions_data[modelseed_reaction_id] = modelseed_reaction_data - return modelseed_reactions_data - - def _add_modelseed_reaction( - self, - network: ReactionNetwork, - ko: KO, - reaction: ModelSEEDReaction, - new_kegg_reaction_ids: List[str], - new_ec_numbers: List[str], - modelseed_compound_ids: List[str], - modelseed_compounds_table: pd.DataFrame - ) -> None: - """ - Add an object representing the ModelSEED reaction and objects representing associated - ModelSEED compounds to the reaction network. - - Parameters - ========== - network : ReactionNetwork - The reaction network object being built. - - ko : KO - The representation of the KO being processed. 
- - reaction : ModelSEEDReaction - The representation of the reaction with data sourced from ModelSEED Biochemistry. - - new_kegg_reaction_ids : List[str] - Newly encountered KEGG REACTION IDs not associated with previously processed KOs. - - new_ec_numbers : List[str] - Newly encountered EC numbers not associated with previously processed KOs. - - modelseed_compound_ids : List[str] - ModelSEED compound IDs of the reactants and products in the reaction. - - modelseed_compounds_table : pd.DataFrame - Loaded ModelSEED Biochemistry compounds database. - - Returns - ======= - None - """ - modelseed_reaction_id = reaction.modelseed_id - ko.reactions[modelseed_reaction_id] = reaction - # Record which KEGG REACTION IDs and EC numbers from the KO yield the ModelSEED reaction. - ko.kegg_reaction_aliases[modelseed_reaction_id] = list( - set(new_kegg_reaction_ids).intersection(set(reaction.kegg_aliases)) - ) - ko.ec_number_aliases[modelseed_reaction_id] = list( - set(new_ec_numbers).intersection(set(reaction.ec_number_aliases)) - ) - network.reactions[modelseed_reaction_id] = reaction - - reaction_compounds = [] - for modelseed_compound_id in modelseed_compound_ids: - if modelseed_compound_id in network.metabolites: - # The ModelSEED compound ID has been encountered in previously processed reactions, - # so there is already a ModelSEEDCompound object for it. - reaction_compounds.append(network.metabolites[modelseed_compound_id]) - continue - - # Generate new metabolite objects in the network. - try: - modelseed_compound_series = modelseed_compounds_table.loc[modelseed_compound_id] - except KeyError: - raise ConfigError( - f"""\ - A row for the ModelSEED compound ID, '{modelseed_compound_id}', was expected but - not found in the ModelSEED compounds table. 
This ID was found in the equation - for the ModelSEED reaction, '{modelseed_reaction_id}'.\ - """ - ) - modelseed_compound_series: pd.Series - modelseed_compound_data = modelseed_compound_series.to_dict() - modelseed_compound_data['id'] = modelseed_compound_id - compound = self._get_modelseed_compound(modelseed_compound_data) - reaction_compounds.append(compound) - network.metabolites[modelseed_compound_id] = compound - reaction.compounds = tuple(reaction_compounds) - - def _get_modelseed_reaction( - self, - modelseed_reaction_data: Dict - ) -> Tuple[ModelSEEDReaction, List[str]]: - """ - Generate a ModelSEED reaction object and list of associated ModelSEED compound IDs from the - ModelSEED reaction table entry. The reaction object is not populated with metabolite objects - from the list of associated compound IDs. - - Parameters - ========== - modelseed_reaction_data : Dict - A dictionary representation of a row for a reaction in the ModelSEED reaction table set - up by anvi'o. - - Returns - ======= - ModelSEEDReaction - An object representation of the ModelSEED reaction. - - List[str] - ModelSEED compound IDs of reactants and products. - """ - stoichiometry: str = modelseed_reaction_data['stoichiometry'] - if pd.isna(stoichiometry): - # ignore any reaction lacking a chemical equation for some reason - return None, None - - reaction = ModelSEEDReaction() - - modelseed_id = modelseed_reaction_data['id'] - if pd.isna(modelseed_id): - raise ConfigError( - "The row for the reaction in the ModelSEED table does not but should have an ID. 
" - f"Here is the data in the row: '{modelseed_reaction_data}'" - ) - reaction.modelseed_id = modelseed_id - - modelseed_name = modelseed_reaction_data['name'] - if pd.isna(modelseed_name): - reaction.modelseed_name = None - else: - reaction.modelseed_name = modelseed_name - - kegg_reaction_ids: str = modelseed_reaction_data['KEGG'] - if pd.isna(kegg_reaction_ids): - reaction.kegg_aliases = tuple() - else: - reaction.kegg_aliases = tuple(kegg_reaction_ids.split('; ')) - - ec_numbers: str = modelseed_reaction_data['ec_numbers'] - if pd.isna(ec_numbers): - reaction.ec_number_aliases = [] - else: - reaction.ec_number_aliases = ec_numbers.split('|') - - reversibility = modelseed_reaction_data['reversibility'] - if pd.isna(reversibility): - raise ConfigError( - "The row for the reaction in the ModelSEED table was expected to have a 'reversibility' value. " - f"Here is the data in the row: '{modelseed_reaction_data}'" - ) - if reversibility == '=' or reversibility == '?': - # Assume that reactions lacking data ('?') are reversible. - reaction.reversibility = True - else: - reaction.reversibility = False - - decimal_reaction_coefficients = [] - split_stoichiometry = stoichiometry.split(';') - modelseed_compound_ids = [] - compartments = [] - for entry in split_stoichiometry: - split_entry = entry.split(':') - decimal_reaction_coefficients.append(split_entry[0]) - modelseed_compound_ids.append(split_entry[1]) - compartments.append(ModelSEEDDatabase.compartment_ids[int(split_entry[2])]) - reaction.compartments = tuple(compartments) - reaction_coefficients = to_lcm_denominator(decimal_reaction_coefficients) - direction = modelseed_reaction_data['direction'] - if pd.isna(direction): - raise ConfigError( - "The row for the reaction in the ModelSEED table was expected to have a 'direction' value. 
" - f"Here is the data in the row: '{modelseed_reaction_data}'" - ) - if (direction == '>' and reversibility == '<') or (direction == '<' and reversibility == '>'): - # The way the reaction is written is the opposite of the way the reaction proceeds. - reaction_coefficients = [-c for c in reaction_coefficients] - reaction.coefficients = tuple(reaction_coefficients) - - return reaction, modelseed_compound_ids - - def _get_modelseed_compound(self, modelseed_compound_data: Dict) -> ModelSEEDCompound: - """ - Generate a ModelSEED compound object from its entry in the ModelSEED table. - - Parameters - ========== - modelseed_compound_data : Dict - A dictionary representation of a row for a compound in the ModelSEED compound table set - up by anvi'o. - - Returns - ======= - ModelSEEDCompound - An object representation of the ModelSEED compound. - """ - compound = ModelSEEDCompound() - compound.modelseed_id = modelseed_compound_data['id'] - - modelseed_name = modelseed_compound_data['name'] - if pd.isna(modelseed_name): - compound.modelseed_name = None - else: - compound.modelseed_name = modelseed_name - - kegg_aliases: str = modelseed_compound_data['KEGG'] - if pd.isna(kegg_aliases): - compound.kegg_aliases = tuple() - else: - compound.kegg_aliases = tuple(kegg_aliases.split('; ')) - - formula = modelseed_compound_data['formula'] - if pd.isna(formula): - compound.formula = None - # compounds without formulas have a nominal charge of 10000000 in compounds.tsv - compound.charge = None - else: - compound.formula = formula - charge = modelseed_compound_data['charge'] - if pd.isna(charge): - raise ConfigError( - f"The charge of a ModelSEED compound, '{compound.modelseed_id}', was not recorded " - "in 'compounds.tsv' but is expected to be present as an integer. 
Here is the data " - f"in the row for the compound: '{modelseed_compound_data}'" - ) - compound.charge = charge - - return compound - - def _get_database_reactions_table(self, network: ReactionNetwork) -> pd.DataFrame: - """ - Make a reactions table that can be stored in either a contigs or pan database, as the tables - have the same structure. A `ReactionNetwork` can be reconstructed with the same data from - the reactions and metabolites tables of the database. - - Parameters - ========== - network : ReactionNetwork - The reaction network generated from gene or gene cluster KO annotations. - - Returns - ======= - pd.DataFrame - The table of reactions data to be stored in the contigs or pan database. - """ - assert tables.gene_function_reactions_table_structure == tables.pan_gene_cluster_function_reactions_table_structure - assert tables.gene_function_reactions_table_types == tables.pan_gene_cluster_function_reactions_table_types - - # Transfer data from reaction objects to dictionaries mapping to table entries. - reactions_data: Dict[str, Dict] = {} - for modelseed_reaction_id, reaction in network.reactions.items(): - reaction_data = {} - reaction_data['modelseed_reaction_id'] = modelseed_reaction_id - reaction_data['modelseed_reaction_name'] = reaction.modelseed_name - reaction_data['metabolite_modelseed_ids'] = ', '.join([c.modelseed_id for c in reaction.compounds]) - reaction_data['stoichiometry'] = ', '.join([str(c) for c in reaction.coefficients]) - reaction_data['compartments'] = ', '.join(reaction.compartments) - reaction_data['reversibility'] = reaction.reversibility - # Record KEGG REACTION IDs and EC numbers that are aliases of ModelSEED reactions but - # are *NOT* associated with gene KO annotations; associated aliases are recorded later. 
- reaction_data['other_kegg_reaction_ids'] = ', '.join( - set(reaction.kegg_aliases).difference(set(network.modelseed_kegg_aliases[modelseed_reaction_id])) - ) - reaction_data['other_ec_numbers'] = ', '.join( - set(reaction.ec_number_aliases).difference(set(network.modelseed_ec_number_aliases[modelseed_reaction_id])) - ) - reactions_data[modelseed_reaction_id] = reaction_data - - # Get *KO* KEGG REACTION ID and EC number aliases of each ModelSEED reaction. These are not - # all possible aliases, but only those associated with KOs that matched genes. Structure - # alias data as follows: - # : { - # : [], - # : [], - # ... - # } - # : { - # : [], - # : [], - # ... - # } - ko_reaction_aliases: Dict[str, Tuple[Dict[str, List[str]], Dict[str, List[str]]]] = { - modelseed_reaction_id: ({}, {}) for modelseed_reaction_id in reactions_data - } - for ko_id, ko in network.kos.items(): - for modelseed_reaction_id, reaction in ko.reactions.items(): - aliases = ko_reaction_aliases[modelseed_reaction_id] - - kegg_reaction_aliases = aliases[0] - kegg_reaction_ids = ko.kegg_reaction_aliases[modelseed_reaction_id] - for kegg_reaction_id in kegg_reaction_ids: - try: - ko_ids: List = kegg_reaction_aliases[kegg_reaction_id] - except KeyError: - kegg_reaction_aliases[kegg_reaction_id] = ko_ids = [] - ko_ids.append(ko_id) - - ec_number_aliases = aliases[1] - ec_numbers = ko.ec_number_aliases[modelseed_reaction_id] - for ec_number in ec_numbers: - try: - ko_ids: List = ec_number_aliases[ec_number] - except KeyError: - ec_number_aliases[ec_number] = ko_ids = [] - ko_ids.append(ko_id) - for modelseed_reaction_id, aliases in ko_reaction_aliases.items(): - reaction_data = reactions_data[modelseed_reaction_id] - - # Make the entry for KO KEGG REACTION aliases, which looks akin to the following arbitrary example: - # 'R00001: (K00010, K00100); R01234: (K54321)' - kegg_reaction_aliases = aliases[0] - entry = [] - for kegg_reaction_id, ko_ids in kegg_reaction_aliases.items(): - 
entry.append(f'{kegg_reaction_id}: ({", ".join(sorted(ko_ids))})') - reaction_data['ko_kegg_reaction_source'] = '; '.join(sorted(entry)) - - # Make the entry for KO EC number aliases, which looks akin to the following arbitrary example: - # '1.1.1.1: (K00010, K00100); 1.2.3.4: (K12345); 6.7.8.99: (K65432) - ec_number_aliases = aliases[1] - entry = [] - for ec_number, ko_ids in ec_number_aliases.items(): - entry.append(f'{ec_number}: ({", ".join(sorted(ko_ids))})') - reaction_data['ko_ec_number_source'] = '; '.join(sorted(entry)) - - reactions_table = pd.DataFrame.from_dict(reactions_data, orient='index').reset_index(drop=True).sort_values('modelseed_reaction_id') - reactions_table = reactions_table[tables.gene_function_reactions_table_structure] - return reactions_table - - def _get_database_metabolites_table(self, network: ReactionNetwork) -> pd.DataFrame: - """ - Make a metabolites table that can be stored in either a contigs or pan database, as the tables - have the same structure. A `ReactionNetwork` can be reconstructed with the same data from - the reactions and metabolites tables of the database. - - Parameters - ========== - network : ReactionNetwork - The reaction network generated from gene or gene cluster KO annotations. - - Returns - ======= - pd.DataFrame - The table of metabolites data to be stored in the contigs or pan database. - """ - assert tables.gene_function_metabolites_table_structure == tables.pan_gene_cluster_function_metabolites_table_structure - assert tables.gene_function_metabolites_table_types == tables.pan_gene_cluster_function_metabolites_table_types - - # Transfer data from metabolite objects to dictionaries mapping to table entries. 
- metabolites_data = {} - for modelseed_compound_id, compound in network.metabolites.items(): - metabolite_data = {} - metabolite_data['modelseed_compound_id'] = modelseed_compound_id - metabolite_data['modelseed_compound_name'] = compound.modelseed_name - metabolite_data['kegg_aliases'] = ', '.join(compound.kegg_aliases) - metabolite_data['formula'] = compound.formula - metabolite_data['charge'] = compound.charge - metabolites_data[modelseed_compound_id] = metabolite_data - - metabolites_table = pd.DataFrame.from_dict(metabolites_data, orient='index').reset_index(drop=True).sort_values('modelseed_compound_id') - metabolites_table = metabolites_table[tables.gene_function_metabolites_table_structure] - return metabolites_table - - def hash_contigs_db_ko_annotations(self, gene_function_calls_dict: Dict) -> str: - """ - To concisely represent the data underlying a reaction network, hash all gene KO annotations - in the contigs database. - - Parameters - ========== - gene_function_calls_dict : str - This dictionary is loaded by a contigs superclass and contains gene KO annotations. - - Returns - ======= - str - Hash representation of all gene KO annotations. 
- """ - ko_annotations = [] - for gcid, gene_dict in gene_function_calls_dict.items(): - ko_data = gene_dict['KOfam'] - ko_id = ko_data[0] - ko_name = ko_data[1] - e_value = ko_data[2] - ko_annotations.append((str(gcid), ko_id, ko_name, str(e_value))) - ko_annotations = sorted(ko_annotations, key=lambda x: (x[0], x[1])) - - ko_annotations_string = '' - for ko_annotation in ko_annotations: - ko_annotations_string += ''.join(ko_annotation) - - hashed_ko_annotations = hashlib.sha1(ko_annotations_string.encode('utf-8')).hexdigest() - return hashed_ko_annotations - - def hash_pan_db_ko_annotations( - self, - genomes_storage_db: str, - gene_clusters_functions_summary_dict: Dict, - consensus_threshold: float, - discard_ties: bool - ) -> str: - """ - To concisely represent the data underlying a reaction network, hash all gene KO annotations - in the constituent genomes, all consensus KO annotations of the gene clusters, and - parameters used to select consensus KOs. - - Parameters - ========== - genomes_storage_db : str - This is the path to a genomes storage database with the underlying gene KO annotations. - - gene_clusters_functions_summary_dict : dict - This dictionary is loaded by a pan superclass and contains gene cluster KO annotations. - - consensus_threshold : float, None - This parameter was used in setting consensus KO annotations of gene clusters. - - discard_ties : bool, False - This parameter was used in setting consensus KO annotations of gene clusters. - - Returns - ======= - str - Hash representation of all gene cluster consensus KO annotations and the parameters used - to select consensus KOs. 
- """ - gsdb = dbinfo.GenomeStorageDBInfo(genomes_storage_db).load_db() - functions_table = gsdb.get_table_as_dataframe('gene_function_calls', where_clause='source = "KOfam"') - gsdb.disconnect() - ko_annotations = [] - for row in functions_table.itertuples(index=False): - ko_annotations.append((row.genome_name, str(row.gene_callers_id), row.accession, row.function, str(row.e_value))) - ko_annotations = sorted(ko_annotations, key=lambda x: (x[0], x[1], x[2])) - - ko_annotations = [] - for cluster_id, gene_cluster_dict in gene_clusters_functions_summary_dict.items(): - ko_data = gene_cluster_dict['KOfam'] - ko_id = ko_data['accession'] - ko_name = ko_data['function'] - # When the KO ID and name are None, convert them into 'None'. - ko_annotations.append((str(cluster_id), str(ko_id), str(ko_name))) - ko_annotations = sorted(ko_annotations, key=lambda x: x[0]) - - ko_annotations_string = f'{consensus_threshold}_{int(discard_ties)}_' - for ko_annotation in ko_annotations: - ko_annotations_string += ''.join(ko_annotation) - - hashed_ko_annotations = hashlib.sha1(ko_annotations_string.encode('utf-8')).hexdigest() - return hashed_ko_annotations - -class Tester: - """ - This class tests reaction network construction and operations. - - Attributes - ========== - ko_dir : str, None - The directory containing reference KEGG Orthology (KO) tables set up by anvi'o. This - attribute is assigned the argument of the same name upon initialization. - - modelseed_dir : str, None - The directory containing reference ModelSEED Biochemistry tables set up by anvi'o. This - attribute is assigned the argument of the same name upon initialization. - - test_dir : str, None - The directory storing test files, including copied input files and output files. With the - default value of None, temporary directories are created and deleted as needed by methods. - None of the test files in a provided directory, in contrast, are deleted. 
This attribute is - assigned the argument of the same name upon initialization. - - run : anvio.terminal.Run, anvio.terminal.Run() - This object prints run information to the terminal. This attribute is assigned the argument - of the same name upon initialization. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object prints transient progress information to the terminal. This attribute is - assigned the argument of the same name upon initialization. - """ - def __init__( - self, - ko_dir: str = None, - modelseed_dir: str = None, - test_dir: str = None, - run: terminal.Run = terminal.Run(), - progress: terminal.Progress = terminal.Progress() - ) -> None: - """ - Parameters - ========== - ko_dir : str, None - The directory containing reference KEGG Orthology (KO) tables set up by anvi'o. The - default argument of None expects KO data to be set up in the default anvi'o directory - used by the program `anvi-setup-kegg-data`. - - modelseed_dir : str, None - The directory containing reference ModelSEED Biochemistry tables set up by anvi'o. The - default argument of None expects ModelSEED data to be set up in the default anvi'o - directory used by the program `anvi-setup-modelseed-database`. - - test_dir : str, None - The directory storing test files. With the default value of None, temporary test - directories are created and deleted by Tester methods; these methods operate on copies - of input files in the test directories. In contrast, a provided directory will not be - deleted, which can be useful for further work on output files. - - run : anvio.terminal.Run, anvio.terminal.Run() - This object prints run information to the terminal. - - progress : anvio.terminal.Progress, anvio.terminal.Progress() - This object prints transient progress information to the terminal. 
- """ - self.ko_dir = ko_dir - self.modelseed_dir = modelseed_dir - self.test_dir = test_dir - self.run = run - self.progress = progress - - def test_contigs_database_network(self, contigs_db: str) -> None: - """ - Test the construction of a reaction network from a contigs database, and test that network - methods are able to run and do not fail certain basic (by no means comprehensive) tests. - - Parameters - ========== - contigs_db : str - Path to a contigs database. The database can represent different types of samples, - including a single genome, metagenome, or transcriptome. The network is derived from - gene KO annotations stored in the database. - - Returns - ======= - None - """ - if self.test_dir is None: - test_dir = filesnpaths.get_temp_directory_path() - else: - test_dir = self.test_dir - self.run.info("Test directory", test_dir, nl_after=1) - - self.run.info_single("NETWORK CONSTRUCTION:", mc='magenta', level=0) - network, temp_dir = self.make_contigs_database_network(contigs_db) - - self.run.info_single( - "PURGE OF METABOLITES WITHOUT FORMULA:", mc='magenta', nl_before=1, level=0 - ) - network.copy().remove_metabolites_without_formula( - output_path=os.path.join(test_dir, "removed.tsv") - ) - print() - - self.progress.new("Testing network purge methods") - self.progress.update("...") - # Network pruning tests use a random sample of half the network items (nodes) of each type. 
- random.seed(RANDOM_SEED) - metabolite_sample = set(random.sample( - list(network.metabolites), math.ceil(len(network.metabolites) / 2) - )) - random.seed(RANDOM_SEED) - reaction_sample = set(random.sample( - list(network.reactions), math.ceil(len(network.reactions) / 2) - )) - random.seed(RANDOM_SEED) - ko_sample = set(random.sample(list(network.kos), math.ceil(len(network.kos) / 2))) - random.seed(RANDOM_SEED) - gene_sample = set(random.sample(list(network.genes), math.ceil(len(network.genes) / 2))) - - copied_network = network.copy() - # The basic tests of the copy method check that the network-level attributes appear to - # contain the same items. What remains untested is that all of the references between nodes - # are identical, e.g., the reactions referenced by each KO. - assert list(network.metabolites) == list(copied_network.metabolites) - assert list(network.reactions) == list(copied_network.reactions) - assert list(network.kos) == list(copied_network.kos) - assert list(network.genes) == list(copied_network.genes) - assert list(network.proteins) == list(copied_network.proteins) - assert network.kegg_modelseed_aliases == copied_network.kegg_modelseed_aliases - assert network.modelseed_kegg_aliases == copied_network.modelseed_kegg_aliases - assert network.ec_number_modelseed_aliases == copied_network.ec_number_modelseed_aliases - assert network.modelseed_ec_number_aliases == copied_network.modelseed_ec_number_aliases - removed = copied_network.purge_metabolites(metabolite_sample) - # The most basic test of the purge (pruning) method is that the network no longer contains - # the items that were requested to be removed. What remains untested, and would require a - # curated test dataset, is the removal of certain other "upstream" and "downstream" nodes - # associated with the nodes requested to be removed, e.g., KOs and genes upstream and - # metabolites downstream of requested reactions. 
- assert metabolite_sample.difference(set(copied_network.metabolites)) == metabolite_sample - assert not metabolite_sample.difference( - set([metabolite.modelseed_id for metabolite in removed['metabolite']]) - ) - - copied_network = network.copy() - removed = copied_network.purge_reactions(reaction_sample) - assert reaction_sample.difference(set(copied_network.reactions)) == reaction_sample - assert not reaction_sample.difference( - set([reaction.modelseed_id for reaction in removed['reaction']]) - ) - - copied_network = network.copy() - removed = copied_network.purge_kos(ko_sample) - assert ko_sample.difference(set(copied_network.kos)) == ko_sample - assert not ko_sample.difference(set([ko.id for ko in removed['ko']])) - - copied_network = network.copy() - removed = copied_network.purge_genes(gene_sample) - assert gene_sample.difference(set(copied_network.genes)) == gene_sample - assert not gene_sample.difference(set([gene.gcid for gene in removed['gene']])) - self.progress.end() - - self.progress.new("Testing network subset methods") - self.progress.update("...") - subnetwork = network.subset_network(metabolites_to_subset=metabolite_sample) - # The most basic test of the subset method is that the new network contains the requested - # items. What remains untested, and would require a curated test dataset, is the inclusion - # of certain other "upstream" and "downstream" nodes associated with the nodes requested to - # be removed, e.g., KOs and genes upstream and metabolites downstream of requested - # reactions. 
- assert not metabolite_sample.difference(set(subnetwork.metabolites)) - - subnetwork = network.subset_network(reactions_to_subset=reaction_sample) - assert not reaction_sample.difference(set(subnetwork.reactions)) - - subnetwork = network.subset_network(kos_to_subset=ko_sample) - assert not ko_sample.difference(set(subnetwork.kos)) - - subnetwork = network.subset_network(genes_to_subset=gene_sample) - assert not gene_sample.difference(set(subnetwork.genes)) - - # Network merging functionality is tested within the following command. - subnetwork = network.subset_network( - genes_to_subset=gene_sample, - kos_to_subset=ko_sample, - reactions_to_subset=reaction_sample, - metabolites_to_subset=metabolite_sample - ) - assert not metabolite_sample.difference(set(subnetwork.metabolites)) - assert not reaction_sample.difference(set(subnetwork.reactions)) - assert not ko_sample.difference(set(subnetwork.kos)) - assert not gene_sample.difference(set(subnetwork.genes)) - self.progress.end() - - if temp_dir is not None: - shutil.rmtree(temp_dir) - - self.run.info_single( - "All tests passed for the contigs database reaction network", mc='magenta', level=0 - ) - self.run.info_single("Network construction and storage in the contigs database") - self.run.info_single("Purge metabolites without formula") - self.run.info_single("Purge select metabolites") - self.run.info_single("Purge select reactions") - self.run.info_single("Purge select KOs") - self.run.info_single("Purge select genes") - self.run.info_single("Subset select metabolites") - self.run.info_single("Subset select reactions") - self.run.info_single("Subset select KOs") - self.run.info_single("Subset select genes") - self.run.info_single("Subset select metabolites, reactions, KOs, and genes") - - - def make_contigs_database_network( - self, - contigs_db: str, - store: bool = True, - overwrite_existing_network: bool = True, - stats_file: str = "contigs_db_network_stats.tsv" - ) -> Tuple[GenomicNetwork, str]: - """ - Test 
reaction network construction from a contigs database. - - Parameters - ========== - contigs_db : str - Path to a contigs database. The database can represent different types of samples, - including a single genome, metagenome, or transcriptome. The network is derived from - gene KO annotations stored in the database. - - store : bool, True - Save the network to a copy of the contigs database, stored with the same filename in the - test directory. If a contigs database already exists at this location, it is retained - and the network is saved to it, rather than overwriting the file with a copy. - - overwrite_existing_network : bool, True - Overwrite an existing network stored in the copy of the contigs database in the test - directory. 'store' is also required. - - stats_file : str, 'stats.tsv' - Write network overview statistics to a tab-delimited file with this filename in the test - directory. If this file already exists, it is overwritten. - - Returns - ======= - GenomicNetwork - The network derived from the contigs database. - - str - The path to the temporary directory in which a copy of the input contigs database and - output stats files are stored. If no temporary directory is created, then this return - value is None. - """ - utils.is_contigs_db(contigs_db) - - if self.test_dir is None: - test_dir = temp_dir = filesnpaths.get_temp_directory_path() - else: - test_dir = self.test_dir - temp_dir = None - - if store: - # Store the network in a copy of the input database. - contigs_db_target = os.path.join(test_dir, os.path.basename(contigs_db)) - if filesnpaths.is_file_exists(contigs_db_target, dont_raise=True): - raise ConfigError( - f"""\ - The contigs database will not be copied to a location in the test directory with - an existing file: {contigs_db_target}\ - """ - ) - shutil.copy(contigs_db, contigs_db_target) - else: - # The network is not stored, so the input file is used and remains unmodified. 
- contigs_db_target = contigs_db - - con = Constructor( - ko_dir=self.ko_dir, - modelseed_dir=self.modelseed_dir, - run=self.run, - progress=self.progress - ) - - if stats_file: - stats_file_target = os.path.join(test_dir, stats_file) - else: - stats_file_target = None - - network = con.make_contigs_database_network( - contigs_db=contigs_db_target, - store=store, - overwrite_existing_network=overwrite_existing_network, - stats_file=stats_file_target - ) - return network, temp_dir - - def test_pan_database_network( - self, - pan_db: str, - genomes_storage_db: str, - consensus_threshold: float = None, - discard_ties: bool = False - ) -> None: - """ - Test the construction of a reaction network from a pan database, and test that network - methods are able to run and do not fail certain basic (by no means comprehensive) tests. - - Parameters - ========== - pan_db : str - Path to a pan database. The pangenomic network is determined for gene clusters stored in - the database. - - genomes_storage_db : str - Path to a genomes storage database. The pangenomic network is derived from gene KO - annotations stored in the database. - - consensus_threshold : float, None - With the default of None, the protein annotation most frequent among genes in a cluster - is assigned to the cluster itself. If a non-default argument is provided (a value on [0, - 1]), at least this proportion of genes in the cluster must have the most frequent - annotation for the cluster to be annotated. - - discard_ties : bool, False - If multiple protein annotations are most frequent among genes in a cluster, then do not - assign an annotation to the cluster itself when this argument is True. By default, this - argument is False, so one of the most frequent annotations would be arbitrarily chosen. 
- - Returns - ======= - None - """ - if self.test_dir is None: - test_dir = filesnpaths.get_temp_directory_path() - else: - test_dir = self.test_dir - self.run.info("Test directory", test_dir, nl_after=1) - - self.run.info_single("NETWORK CONSTRUCTION:", mc='magenta', level=0) - network, temp_dir = self.make_pan_database_network( - pan_db, - genomes_storage_db, - consensus_threshold=consensus_threshold, - discard_ties=discard_ties - ) - - self.run.info_single( - "PURGE OF METABOLITES WITHOUT FORMULA:", mc='magenta', nl_before=1, level=0 - ) - network.copy().remove_metabolites_without_formula( - output_path=os.path.join(test_dir, "removed.tsv") - ) - print() - - self.progress.new("Testing network purge methods") - self.progress.update("...") - # Network pruning tests use a random sample of half the network items (nodes) of each type. - random.seed(RANDOM_SEED) - metabolite_sample = set(random.sample( - list(network.metabolites), math.ceil(len(network.metabolites) / 2) - )) - random.seed(RANDOM_SEED) - reaction_sample = set(random.sample( - list(network.reactions), math.ceil(len(network.reactions) / 2) - )) - random.seed(RANDOM_SEED) - ko_sample = set(random.sample(list(network.kos), math.ceil(len(network.kos) / 2))) - random.seed(RANDOM_SEED) - gene_cluster_sample = set(random.sample( - list(network.gene_clusters), math.ceil(len(network.gene_clusters) / 2) - )) - - copied_network = network.copy() - # The basic tests of the copy method check that the network-level attributes appear to - # contain the same items. What remains untested is that all of the references between nodes - # are identical, e.g., the reactions referenced by each KO. 
- assert list(network.metabolites) == list(copied_network.metabolites) - assert list(network.reactions) == list(copied_network.reactions) - assert list(network.kos) == list(copied_network.kos) - assert list(network.gene_clusters) == list(copied_network.gene_clusters) - assert network.kegg_modelseed_aliases == copied_network.kegg_modelseed_aliases - assert network.modelseed_kegg_aliases == copied_network.modelseed_kegg_aliases - assert network.ec_number_modelseed_aliases == copied_network.ec_number_modelseed_aliases - assert network.modelseed_ec_number_aliases == copied_network.modelseed_ec_number_aliases - removed = copied_network.purge_metabolites(metabolite_sample) - # The most basic test of the purge (pruning) method is that the network no longer contains - # the items that were requested to be removed. What remains untested, and would require a - # curated test dataset, is the removal of certain other "upstream" and "downstream" nodes - # associated with the nodes requested to be removed, e.g., KOs and gene clusters upstream - # and metabolites downstream of requested reactions. 
- assert metabolite_sample.difference(set(copied_network.metabolites)) == metabolite_sample - assert not metabolite_sample.difference( - set([metabolite.modelseed_id for metabolite in removed['metabolite']]) - ) - - copied_network = network.copy() - removed = copied_network.purge_reactions(reaction_sample) - assert reaction_sample.difference(set(copied_network.reactions)) == reaction_sample - assert not reaction_sample.difference( - set([reaction.modelseed_id for reaction in removed['reaction']]) - ) - - copied_network = network.copy() - removed = copied_network.purge_kos(ko_sample) - assert ko_sample.difference(set(copied_network.kos)) == ko_sample - assert not ko_sample.difference(set([ko.id for ko in removed['ko']])) - - copied_network = network.copy() - removed = copied_network.purge_gene_clusters(gene_cluster_sample) - assert ( - gene_cluster_sample.difference(set(copied_network.gene_clusters)) == - gene_cluster_sample - ) - assert not gene_cluster_sample.difference( - set([gene_cluster.gene_cluster_id for gene_cluster in removed['gene_cluster']]) - ) - self.progress.end() - - self.progress.new("Testing network subset methods") - self.progress.update("...") - subnetwork = network.subset_network(metabolites_to_subset=metabolite_sample) - # The most basic test of the subset method is that the new network contains the requested - # items. What remains untested, and would require a curated test dataset, is the inclusion - # of certain other "upstream" and "downstream" nodes associated with the nodes requested to - # be removed, e.g., KOs and gene clusters upstream and metabolites downstream of requested - # reactions. 
- assert not metabolite_sample.difference(set(subnetwork.metabolites)) - - subnetwork = network.subset_network(reactions_to_subset=reaction_sample) - assert not reaction_sample.difference(set(subnetwork.reactions)) - - subnetwork = network.subset_network(kos_to_subset=ko_sample) - assert not ko_sample.difference(set(subnetwork.kos)) - - subnetwork = network.subset_network(gene_clusters_to_subset=gene_cluster_sample) - assert not gene_cluster_sample.difference(set(subnetwork.gene_clusters)) - - # Network merging functionality is tested within the following command. - subnetwork = network.subset_network( - gene_clusters_to_subset=gene_cluster_sample, - kos_to_subset=ko_sample, - reactions_to_subset=reaction_sample, - metabolites_to_subset=metabolite_sample - ) - assert not metabolite_sample.difference(set(subnetwork.metabolites)) - assert not reaction_sample.difference(set(subnetwork.reactions)) - assert not ko_sample.difference(set(subnetwork.kos)) - assert not gene_cluster_sample.difference(set(subnetwork.gene_clusters)) - self.progress.end() - - if temp_dir is not None: - shutil.rmtree(temp_dir) - - self.run.info_single( - "All tests passed for the pan database reaction network", mc='magenta', level=0 - ) - self.run.info_single("Network construction and storage in the pan database") - self.run.info_single("Purge metabolites without formula") - self.run.info_single("Purge select metabolites") - self.run.info_single("Purge select reactions") - self.run.info_single("Purge select KOs") - self.run.info_single("Purge select gene clusters") - self.run.info_single("Subset select metabolites") - self.run.info_single("Subset select reactions") - self.run.info_single("Subset select KOs") - self.run.info_single("Subset select gene clusters") - self.run.info_single("Subset select metabolites, reactions, KOs, and gene clusters") - - def make_pan_database_network( - self, - pan_db: str, - genomes_storage_db: str, - store: bool = True, - overwrite_existing_network: bool = True, - 
consensus_threshold: float = None, - discard_ties: bool = False, - stats_file: str = "pan_db_network_stats.tsv" - ) -> Tuple[PangenomicNetwork, str]: - """ - Test reaction network construction from a pan database. - - Parameters - ========== - pan_db : str - Path to a pan database. The pangenomic network is determined for gene clusters stored in - the database. - - genomes_storage_db : str - Path to a genomes storage database. The pangenomic network is derived from gene KO - annotations stored in the database. - - store : bool, True - Save the network to a copy of the pan database, stored with the same filename in the - test directory. If this file copy already exists, it is retained and used. - - overwrite_existing_network : bool, False - Overwrite an existing network stored in the copy of the pan database in the test - directory. 'store' is also required. - - consensus_threshold : float, None - With the default of None, the protein annotation most frequent among genes in a cluster - is assigned to the cluster itself. If a non-default argument is provided (a value on [0, - 1]), at least this proportion of genes in the cluster must have the most frequent - annotation for the cluster to be annotated. - - discard_ties : bool, False - If multiple protein annotations are most frequent among genes in a cluster, then do not - assign an annotation to the cluster itself when this argument is True. By default, this - argument is False, so one of the most frequent annotations would be arbitrarily chosen. - - stats_file : str, None - Write network overview statistics to a tab-delimited file with this filename in the test - directory. If this file already exists, it is overwritten. - - Returns - ======= - PangenomicNetwork - The network derived from the pangenomic databases. - - str - The path to the temporary directory in which a copy of the input pan database and output - stats files are stored. If no temporary directory is created, then this return value is - None. 
- """ - utils.is_pan_db(pan_db) - utils.is_genome_storage(genomes_storage_db) - - if self.test_dir is None: - test_dir = temp_dir = filesnpaths.get_temp_directory_path() - else: - test_dir = self.test_dir - temp_dir = None - - if store: - # Store the network in a copy of the input database. - pan_db_target = os.path.join(test_dir, os.path.basename(pan_db)) - if filesnpaths.is_file_exists(pan_db_target, dont_raise=True): - raise ConfigError( - f"""\ - The pan database will not be copied to a location in the test directory with an - existing file: {pan_db_target}\ - """ - ) - shutil.copy(pan_db, pan_db_target) - else: - # The network is not stored, so the input file is used and remains unmodified. - pan_db_target = pan_db - - con = Constructor( - ko_dir=self.ko_dir, - modelseed_dir=self.modelseed_dir, - run=self.run, - progress=self.progress - ) - - if stats_file: - stats_file_target = os.path.join(test_dir, stats_file) - else: - stats_file_target = None - - network = con.make_pangenomic_network( - pan_db=pan_db_target, - genomes_storage_db=genomes_storage_db, - store=store, - overwrite_existing_network=overwrite_existing_network, - consensus_threshold=consensus_threshold, - discard_ties=discard_ties, - stats_file=stats_file_target - ) - return network, temp_dir - -def get_chemical_equation(reaction: ModelSEEDReaction) -> str: - """ - Get a decent-looking chemical equation. - - Parameters - ========== - reaction : ModelSEEDReaction - The representation of the reaction with data sourced from ModelSEED Biochemistry. - - Returns - ======= - str - The stoichiometric equation has integer coefficients; reactants and products are represented - by ModelSEED Biochemistry compound names and compartment symbols "(c)" if cytosolic and - "(e)" if extracellular; and a unidirectional arrow, "->", if irreversible and bidirectional - arrow, "<->", if reversible. 
- """ - equation = "" - leftside = True - for coefficient, metabolite, compartment in zip( - reaction.coefficients, reaction.compounds, reaction.compartments - ): - if leftside and coefficient > 0: - leftside = False - if reaction.reversibility: - equation += "<-> " - else: - equation += "-> " - - if leftside: - coeff = -coefficient - else: - coeff = coefficient - equation += f"{coeff} {metabolite.modelseed_name} [{compartment}] + " - - return equation.rstrip('+ ') - -def to_lcm_denominator(floats: Iterable[float]) -> Tuple[int]: - """ - Convert a list of floats to a list of integers, with a list containing fractional numbers - transformed to a list of lowest common integer multiples. - - Parameters - ========== - floats : Iterable[float] - Numbers to convert. - - Returns - ======= - List[int] - List of integers transformed from the input list. - """ - def lcm(a, b): - return a * b // math.gcd(a, b) - - rationals = [fractions.Fraction(f).limit_denominator() for f in floats] - lcm_denom = functools.reduce(lcm, [r.denominator for r in rationals]) - - return list(int(r.numerator * lcm_denom / r.denominator) for r in rationals) - -def _download_worker( - input_queue: mp.Queue, - output_queue: mp.Queue, - max_num_tries: int = 100, - wait_secs: float = 10.0 -) -> None: - """ - Multiprocessing worker to download files from a queue. - - Parameters - ========== - input_queue : multiprocessing.Queue - Queue of length-two iterables of the URL and local path for each file to download. - - output_queue : multiprocessing.Queue - Queue in which the success of each download operation is recorded, with True put in the - output queue if the download succeeded and the local path from the input queue put in the - output queue if the download failed (after exceeding the maximum number of tries). - - max_num_tries : int, 100 - The maximum number of times to try downloading a file (in case of a connection reset). 
- - wait_secs : float, 10.0 - The number of seconds to wait between each file download attempt. - - Returns - ======= - None - """ - while True: - url, path = input_queue.get() - num_tries = 0 - while True: - try: - utils.download_file(url, path) - output = True - break - except (ConfigError, ConnectionResetError) as e: - num_tries += 1 - if num_tries > max_num_tries: - output = path - break - time.sleep(wait_secs) - output_queue.put(output) diff --git a/anvio/biochemistry/refdbs.py b/anvio/biochemistry/refdbs.py deleted file mode 100644 index c7a1db0adb..0000000000 --- a/anvio/biochemistry/refdbs.py +++ /dev/null @@ -1,713 +0,0 @@ -# -*- coding: utf-8 -# pylint: disable=line-too-long -"""Reference databases of protein properties.""" - -import os -import re -import time -import tarfile -import pandas as pd -import multiprocessing as mp - -from math import gcd -from glob import glob -from shutil import rmtree -from functools import reduce -from fractions import Fraction -from typing import Dict, List, Tuple -from abc import ABC, abstractmethod, abstractproperty - -import anvio.biochemistry.protein as protein - -from anvio.errors import ConfigError -from anvio.utils import download_file -from anvio.terminal import Progress, Run -from anvio.filesnpaths import check_output_directory -from anvio import __file__ as ANVIO_PATH, __version__ as VERSION - -__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" -__credits__ = [] -__license__ = "GPL 3.0" -__version__ = VERSION -__maintainer__ = "Samuel Miller" -__email__ = "samuelmiller10@gmail.com" -__status__ = "Development" - - -class ProteinReferenceDatabase(ABC): - """Protein reference database framework.""" - # By default, files for each database are stored in a subdirectory with the name of the database - # (e.g., 'modelseed', 'kegg') of the following superdirectory. 
- default_superdir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/misc/') - db_name: str - pretty_db_name: str - # These are the final files stored in the database subdirectory. - files: List[str] - - @property - def default_db_dir(self) -> str: - return os.path.join(self.default_superdir, self.pretty_db_name) - - @abstractproperty - def loaded(self) -> bool: - raise NotImplementedError - - @abstractmethod - def download(self, reset: bool = False) -> None: - """Download database files.""" - raise NotImplementedError - - @abstractmethod - def load(self) -> None: - """Load database files into memory.""" - raise NotImplementedError - - def get_missing_files(self) -> List[str]: - """Find missing files that should have been downloaded to the database directory.""" - missing = [] - for f in self.files: - path = os.path.join(self.db_dir, f) - if os.path.isfile(path): - continue - missing.append(path) - return missing - - def raise_missing_files(self, missing: List[str]) -> None: - """Raise an exception if there are missing database files.""" - if len(missing) == len(self.files): - raise ConfigError( - f"No {self.pretty_db_name} reference database files were found in the database " - f"directory, '{self.pretty_db_name}'. Download the reference database to a default " - "directory with the command, 'anvi-get-metabolic-model-file --download-references " - f"{self.pretty_db_name}." - ) - elif 0 < len(missing) < len(self.files): - raise ConfigError( - f"{len(self.files) - len(missing)} of {len(self.files)} reference database files " - f"were found in the database directory, '{self.db_dir}'. Re-download the reference " - "database to a default directory with the command, 'anvi-get-metabolic-model-file " - f"--download-references {self.pretty_db_name}." 
- ) - - def _set_up_db_dir(self, reset: bool) -> None: - if os.path.split(self.db_dir)[0] == self.default_superdir and not os.path.exists(self.default_superdir): - os.mkdir(self.default_superdir) - if os.path.exists(self.db_dir): - if reset: - rmtree(self.db_dir) - else: - raise ConfigError( - f"The database directory, {self.db_dir}, already exists. The 'reset' option can " - "be used to remove the database and set it up again." - ) - os.mkdir(self.db_dir) - - def _check_reference_database_initialization(self) -> None: - if not self.loaded: - raise ConfigError( - f"The input {self.pretty_db_name} database is not initialized. The 'load' method " - "must be called." - ) - -class ModelSEEDDatabase(ProteinReferenceDatabase): - """ - The ModelSEED biochemistry database is designed for use in metabolic modeling of plants, fungi, - and microbes. - - The database is set up in a default directory if a directory is not provided. - """ - db_name = 'modelseed' - pretty_db_name = 'ModelSEED' - dl_root = 'https://raw.githubusercontent.com/ModelSEED/ModelSEEDDatabase/master/Biochemistry/' - # These files have the same names as the downloaded files but are changed by setup. - files = ('compounds.tsv', 'reactions.tsv') - # Compounds are identified as cytosolic or extracellular in reactions. 
- compartment_ids = {0: 'c', 1: 'e'} - - def __init__(self, db_superdir: str = None, run: Run = Run(), progress: Progress = Progress()) -> None: - if db_superdir: - check_output_directory(db_superdir, ok_if_exists=True) - self.db_dir = os.path.join(db_superdir, self.pretty_db_name) - else: - self.db_dir = self.default_db_dir - self.run = run - self.progress = progress - self.reactions_table: pd.DataFrame = None - self.compounds_table: pd.DataFrame = None - - def download(self, reset: bool = False) -> None: - """Download and set up biochemistry tables.""" - self._set_up_db_dir(reset=reset) - for f in self.files: - url = os.path.join(self.dl_root, f) - path = os.path.join(self.db_dir, f) - download_file(url, path, progress=self.progress) - self._set_up_reactions_table() - self._set_up_compounds_table() - - def load(self) -> None: - """Load the reaction and compound tables as DataFrame attributes.""" - missing = self.get_missing_files() - self.raise_missing_files(missing) - self.reactions_table = self._load_reactions() - self.compounds_table = self._load_compounds() - - @property - def loaded(self) -> bool: - if self.reactions_table is None or self.compounds_table is None: - return False - return True - - def _set_reaction_lookup_table(self, cross_reference: str) -> None: - """ - Store a modified version of the reactions table that can be used to look up reactions in the - cross-referenced database of interest. The new table is stored in an attribute called - 'reaction_lookup_tables', which is created if it does not already exist. The name of the - cross-referenced database, such as 'KEGG' or 'ec_numbers', must correspond to a column of - the reactions table. - - Parameters - ========== - cross_reference : str - The cross-referenced database name found in the reactions table header. 
- """ - col_names = self.reactions_table.columns.tolist() - alias_col_names = col_names[col_names.index('ec_numbers') + 1: ] - if cross_reference in alias_col_names: - formatted_reactions_table = self._get_reactions_table_per_alias( - self.reactions_table, cross_reference - ) - elif cross_reference == 'ec_numbers': - formatted_reactions_table = self._get_reactions_table_per_ec_number(self.reactions_table) - else: - raise ConfigError( - f"The source, '{cross_reference}', is not recognized as the name of a database " - "cross-referenced to ModelSEED." - ) - if hasattr(self, 'reaction_lookup_tables'): - self.reaction_lookup_tables[cross_reference] = formatted_reactions_table - else: - self.reaction_lookup_tables = {cross_reference: formatted_reactions_table} - - def get_reaction(self, reaction_data: Dict) -> protein.Reaction: - """ - Get a reaction object from information in the ModelSEED database. - - Parameters - ========== - reaction_data : Dict - A dictionary representation of a ModelSEED reactions table row. - - Returns - ======= - anvio.biochemistry.protein.Reaction - """ - self._check_reference_database_initialization() - stoichiometry: str = reaction_data['stoichiometry'] - if pd.isna(stoichiometry): - # Ignore a reaction if it does not have a chemical equation for some reason. - return None - reaction = protein.Reaction() - modelseed_id = reaction_data['id'] - if pd.isna(modelseed_id): - raise ConfigError( - "The row for the reaction in the ModelSEED table does not but should have an ID. 
" - f"Here is the data in the row: '{reaction_data}'" - ) - self._add_ids(reaction, reaction_data, 'id') - self._add_ids(reaction, reaction_data, 'name') - self._add_ids(reaction, reaction_data, 'ec_numbers') - self._add_ids(reaction, reaction_data, 'BiGG') - self._add_ids(reaction, reaction_data, 'KEGG') - self._add_ids(reaction, reaction_data, 'MetaCyc') - self._add_ids(reaction, reaction_data, 'Name') - reversibility = reaction_data['reversibility'] - if reversibility == '=' or reversibility == '?': - # Assume that reactions lacking data ('?') are reversible. - reaction.reversibility = True - else: - reaction.reversibility = False - decimal_reaction_coefficients = [] - for entry in stoichiometry.split(';'): - decimal_reaction_coefficients.append(entry.split(':')[0]) - reaction_coefficients = self._to_lcm_denominator(decimal_reaction_coefficients) - direction = reaction_data['direction'] - if (direction == '>' and reversibility == '<') or (direction == '<' and reversibility == '>'): - # The way the reaction is written is the opposite of the way the reaction proceeds. 
- reaction_coefficients = [-c for c in reaction_coefficients] - for chemical_entry, int_coefficient in zip(stoichiometry.split(';'), reaction_coefficients): - split_entry = chemical_entry.split(':') - reaction.coefficients.append(int_coefficient) - reaction.compartments.append(self.compartment_ids[int(split_entry[2])]) - chemical = protein.Chemical() - compound_id = split_entry[1] - chemical_data = self.compounds_table.loc[compound_id].to_dict() - chemical.reference_ids['ModelSEED_ID'] = [compound_id] - self._add_ids(chemical, chemical_data, 'name') - self._add_ids(chemical, chemical_data, 'inchikey') - self._add_ids(chemical, chemical_data, 'BiGG') - self._add_ids(chemical, chemical_data, 'KEGG') - self._add_ids(chemical, chemical_data, 'MetaCyc') - self._add_ids(chemical, chemical_data, 'Name') - if pd.notna(chemical_data['charge']): - chemical.charge = chemical_data['charge'] - if pd.notna(chemical_data['formula']): - chemical.formula = chemical_data['formula'] - if pd.notna(chemical_data['inchikey']): - chemical.inchi_key = chemical_data['inchikey'] - if pd.notna(chemical_data['smiles']): - chemical.smiles_string = chemical_data['smiles'] - reaction.chemicals.append(chemical) - return reaction - - def _add_ids(self, obj: protein.Reaction, data: Dict, source: str) -> bool: - """Add reference IDs to the reaction or metabolite object. Return True if ID(s) exist for - the reaction or metabolite in the ModelSEED database else False.""" - ref_ids: str = data[source] - if pd.isna(ref_ids): - return False - # IDs in "aliases" should be delimited by '; '. EC numbers should be delimited by '|'. - if source == 'id': - # This is an "id" entry in the ModelSEED reaction table, a single ModelSEED ID for the - # reaction or compound. - obj.reference_ids['ModelSEED_ID'] = [ref_ids] - elif source == 'name': - # This is a "name" entry in the ModelSEED reactions table, or a single name for the - # reaction or compound. 
This is found among the "Name" values (see below), or, absent - # "Name" values, this is the same as the ModelSEED "id" entry. - obj.reference_ids['ModelSEED_Name'] = [ref_ids] - elif source == 'inchikey': - obj.reference_ids['InChIKey'] = [ref_ids] - elif source == 'ec_numbers': - obj.reference_ids['EC'] = ref_ids.split('|') - elif source == 'Name': - # In the original ModelSEED reactions table, this is the "Name" field of an entry in the - # "aliases" column. - obj.reference_ids['ModelSEED_Alternate_Name'] = ref_ids.split('; ') - else: - obj.reference_ids[source] = ref_ids.split('; ') - return True - - def _to_lcm_denominator(self, floats) -> Tuple[int]: - def lcm(a, b): - return a * b // gcd(a, b) - rationals = [Fraction(f).limit_denominator() for f in floats] - lcm_denom = reduce(lcm, [r.denominator for r in rationals]) - return tuple(int(r.numerator * lcm_denom / r.denominator) for r in rationals) - - def _set_up_reactions_table(self) -> None: - """Reorganize the downloaded reaction table, storing in the same location.""" - reactions = self._load_reactions() - reactions = self._expand_aliases(reactions) - # Select a BiGG ID for each reaction, inserting the column of select BiGG IDs to the left of - # the new alias columns. - cols = reactions.columns.tolist() - reactions.insert( - cols.index('source') + 1, - 'select_bigg_id', - self._select_bigg_ids(reactions) - ) - path = os.path.join(self.db_dir, 'reactions.tsv') - reactions.to_csv(path, sep='\t', index=None) - self.run.info("Set-up reactions table", path) - - def _set_up_compounds_table(self) -> None: - """Change the stored compound table from the one downloaded.""" - path = os.path.join(self.db_dir, 'compounds.tsv') - compounds = pd.read_csv(path, sep='\t', header=0, low_memory=False) - compounds = self._expand_aliases(compounds) - # Select a BiGG ID for each compound, inserting the column of select BiGG IDs to the left of - # the new alias columns. 
- cols = compounds.columns.tolist() - compounds.insert( - cols.index('source') + 1, - 'select_bigg_id', - self._select_bigg_ids(compounds) - ) - compounds.to_csv(path, sep='\t', index=None) - self.run.info("Set-up compounds table", path) - - def _load_reactions(self) -> pd.DataFrame: - """Load the reaction table as a DataFrame.""" - path = os.path.join(self.db_dir, 'reactions.tsv') - reactions = pd.read_csv(path, sep='\t', header=0, low_memory=False) - return reactions - - def _load_compounds(self) -> pd.DataFrame: - """Load the compound table as a DataFrame.""" - path = os.path.join(self.db_dir, 'compounds.tsv') - compounds = pd.read_csv(path, sep='\t', header=0, index_col='id', low_memory=False) - return compounds - - def _expand_aliases(self, table: pd.DataFrame) -> pd.DataFrame: - """The downloaded reaction and compound tables each have a column of aliases: IDs - from different databases and common names. Split these IDs into separate columns.""" - rows = [] - for aliases in table.aliases: - aliases: str - row = {} - if pd.isna(aliases): - rows.append(row) - continue - split_aliases = aliases.split('|') - for alias in split_aliases: - sep_index = alias.index(': ') - alias_key = alias[: sep_index] - alias_value = alias[sep_index + 2: ].lstrip() - row[alias_key] = alias_value - rows.append(row) - alias_df = pd.DataFrame(rows) - alias_df.fillna('') - expanded_df = pd.concat([table.drop('aliases', axis=1), alias_df], axis=1) - return expanded_df - - def _select_bigg_ids(self, table: pd.DataFrame) -> List[str]: - """ - Select a single BiGG ID per compound or reaction. - - If there are multiple BiGG IDs, prefer one that matches the compound or reaction's - abbreviation entry, else return the first BiGG ID. 
- """ - select_ids = [] - for bigg_entry, abbreviation in zip(table.BiGG, table.abbreviation): - bigg_entry: str - abbreviation: str - if pd.isna(bigg_entry): - select_ids.append(None) - continue - bigg_ids = [b.strip() for b in bigg_entry.split(';')] - if len(bigg_ids) == 1: - select_ids.append(bigg_ids[0]) - continue - for bigg_id in bigg_ids: - if not abbreviation: - continue - if bigg_id == abbreviation: - select_ids.append(bigg_id) - break - else: - select_ids.append(bigg_ids[0]) - return select_ids - - def _get_reactions_table_per_alias( - self, - reactions: pd.DataFrame, - alias: str, - sep: str = '; ' - ) -> pd.DataFrame: - """ - Modify a ModelSEED reactions DataFrame, dropping reaction rows without IDs for the alias of - interest and expanding rows with multiple alias IDs so there is a row per ID. - - Inspection of the reactions table indicates, and it is therefore assumed, that IDs for all - aliases are delimited by '; '. However, this can be manually changed using 'sep'. - """ - reactions = reactions.dropna(subset=[alias]) - expanded = [] - alias_col = [] - for ids, row in zip(reactions[alias], reactions.drop(alias, axis=1).itertuples(index=False)): - ids: str - for id in ids.split(sep): - alias_col.append(id) - expanded.append(row) - reactions = pd.DataFrame(expanded) - reactions[alias] = alias_col - return reactions - - def _get_reactions_table_per_ec_number(self, reactions: pd.DataFrame) -> pd.DataFrame: - """ - Modify a ModelSEED reactions DataFrame, dropping reaction rows without EC number references - and expanding rows with multiple EC numbers so there is a row per EC number. - - Unlike alias IDs, EC numbers are delimited by '|'. 
- """ - reactions = self._get_reactions_table_per_alias(reactions, 'ec_numbers', sep='|') - return reactions - -class KEGGDatabase(ProteinReferenceDatabase): - db_name = 'kegg' - pretty_db_name = 'KEGG' - dl_root = 'https://rest.kegg.jp/' # See: https://www.kegg.jp/kegg/rest/keggapi.html - # These files are set up from downloaded files. - files = ('ko_data.tsv', 'reaction_data.tsv') - # Download files from the following KEGG databases. - db_categories = {'ko': 'KO', 'reaction': 'Reaction'} - - def __init__( - self, - db_superdir: str = None, - num_threads: int = 1, - run: Run = Run(), - progress: Progress = Progress() - ) -> None: - if db_superdir: - check_output_directory(db_superdir, ok_if_exists=True) - self.db_dir = os.path.join(db_superdir, self.pretty_db_name) - else: - self.db_dir = self.default_db_dir - self.num_threads = num_threads - self.run = run - self.progress = progress - self.ko_data: pd.DataFrame = None - self.reaction_data: pd.DataFrame = None - - def download(self, reset: bool = False) -> None: - """Download KEGG files and set up relational tables.""" - self._set_up_db_dir(reset=reset) - if self.num_threads == 1: - self.run.warning( - "Only 1 thread is being used to download from KEGG. It is advisable to set a " - "higher number of threads to download faster." 
- ) - for db_category in self.db_categories: - self._download_kegg_db_txt_files(db_category) - self._make_kegg_db_table(db_category) - self._archive_kegg_db(db_category) - - def load(self): - """Load the KO and reaction tables as DataFrame attributes.""" - missing = self.get_missing_files() - self.raise_missing_files(missing) - self.ko_data = self._load_ko_data() - self.reaction_data = self._load_reaction_data() - - @property - def loaded(self): - if self.ko_data is None or self.reaction_data is None: - return False - return True - - def _load_ko_data(self) -> pd.DataFrame: - """Load the KO data table and set it up as a DataFrame.""" - path = os.path.join(self.db_dir, 'ko_data.tsv') - ko_data = pd.read_csv(path, sep='\t', header=0, index_col=0, low_memory=False) - return ko_data - - def _load_reaction_data(self) -> pd.DataFrame: - """Load the reaction data table and set it up as a DataFrame.""" - path = os.path.join(self.db_dir, 'reaction_data.tsv') - reaction_data = pd.read_csv(path, sep='\t', header=0, index_col=0, low_memory=False) - return reaction_data - - def _download_kegg_db_txt_files(self, db_category: str) -> None: - """ - Download flat files for all entries in a KEGG database. - - Parameters - ========== - db_category : str - Lowercase name of KEGG database with downloadable flat files for each entry, e.g., 'ko', - 'reaction', 'compound'. 
- """ - kegg_ids = self._get_kegg_ids(db_category) - category_dir = os.path.join(self.db_dir, db_category) - os.mkdir(category_dir) - manager = mp.Manager() - input_queue = manager.Queue() - output_queue = manager.Queue() - for kegg_id in kegg_ids: - url = f'{self.dl_root}get/{kegg_id}' - path = os.path.join(category_dir, f'{kegg_id}.txt') - input_queue.put((url, path)) - workers: List[mp.Process] = [] - for _ in range(self.num_threads): - worker = mp.Process(target=_download_worker, args=(input_queue, output_queue)) - workers.append(worker) - worker.start() - self.progress.new(f"Downloading KEGG {self.db_categories[db_category]} entry files") - num_downloaded = 0 - total = len(kegg_ids) - while num_downloaded < total: - output_queue.get() - num_downloaded += 1 - self.progress.update(f"{num_downloaded} / {total}") - self.progress.end() - for worker in workers: - worker.terminate() - - def _get_kegg_ids(self, db_category: str) -> List[str]: - """ - Get all KEGG entry IDs from the database of the given category. - - Parameters - ========== - db_category : str - Lowercase name of KEGG database, e.g., 'ko', 'reaction', 'compound'. - - Returns - ======= - list - List of entry IDs. The 'KO', 'Reaction', and 'Compound' databases have IDs formatted as - the first letter of the database followed by five digits, e.g., 'K00001', 'R00010'. - """ - url = f'{self.dl_root}list/{db_category}' - path = os.path.join(self.db_dir, f'{db_category}.txt') - download_file(url, path) - kegg_ids = [] - f = open(path) - for line in f: - line.split()[0] - kegg_ids.append(line[: 6]) - f.close() - os.remove(path) - return kegg_ids - - def _make_kegg_db_table(self, db_category: str) -> None: - """ - Store a tab-delimited file for a KEGG database (e.g., KO, Reaction, Compound) derived - from downloaded text files for database entries. - - Parameters - ========== - db_category : str - Lowercase name of KEGG database, e.g., 'ko', 'reaction', 'compound'. 
- """ - kegg_db_dir = os.path.join(self.db_dir, db_category) - select_data = {} - self.progress.new(f"Processing KEGG {self.db_categories[db_category]} files") - txt_files = glob(os.path.join(kegg_db_dir, '*')) - total = len(txt_files) - for num_processed, path in enumerate(txt_files): - id = os.path.splitext(os.path.basename(path))[0] - if db_category == 'ko': - select_data[id] = self._get_ko_data(path) - elif db_category == 'reaction': - select_data[id] = self._get_reaction_data(path) - self.progress.update(f"{num_processed} / {total}") - self.progress.end() - if db_category == 'ko': - header = ['name', 'reactions', 'ec_numbers'] - elif db_category == 'reaction': - header = ['orthology'] - columns = {h: [] for h in header} - for data in select_data.values(): - for h, column in columns.items(): - try: - value = data[h] - except KeyError: - value = None - column.append(value) - table: pd.DataFrame = pd.DataFrame.from_dict(columns) - table.index = select_data - table = table.sort_index() - table_path = os.path.join(self.db_dir, f'{db_category}_data.tsv') - table.to_csv(table_path, sep='\t') - - def _get_ko_data(self, path: str) -> Dict: - """ - Get data from a KO database entry. - - Parameters - ========== - path : str - Flat file for KO entry. - - Returns - ======= - dict - Data of interest extracted from the file. - """ - data = {} - section = None - f = open(path) - for line in f: - if line[0] == ' ': - pass - else: - section = line.split()[0] - if section == 'NAME': - # The name value follows 'NAME' at the beginning of the line. - data['name'] = line[4: ].lstrip().rstrip() - # EC numbers associated with the KO are recorded at the end of the name value. - ec_string = re.search('\[EC:.*\]', line) - if ec_string: - data['ec_numbers'] = ec_string[0][4: -1] - elif section == 'DBLINKS': - # There is a row for each linked databaes in this section. There can be a row for - # KEGG Reaction database entries. 
The first line of the section starts with - # 'DBLINKS' and is followed by a value for a linked database. Values from the linked - # database are separated by ': ' from the name of the database, e.g., 'RN: R00001'. - split_line = line.split() - try: - rn_index = split_line.index('RN:') - except ValueError: - continue - data['reactions'] = ' '.join(split_line[rn_index + 1: ]) - f.close() - return data - - def _get_reaction_data(self, path : str) -> Dict: - """ - Get data from a Reaction database entry. - - Parameters - ========== - path : str - Flat file for Reaction entry. - - Returns - ======= - dict - Data of interest extracted from the file. - """ - data = {} - section = None - f = open(path) - for line in f: - if line[0] == ' ': - pass - else: - section = line.split()[0] - if section == 'ORTHOLOGY': - # A reaction may or may not be associated with KOs that can be involved in its - # catalysis. Each KO ID, formatted 'Kxxxxx', where each 'x' is a digit, is on a - # separate line in the section. The first line starts with 'ORTHOLOGY'. - if line[: 9] == 'ORTHOLOGY': - ko = line[9: ].lstrip()[: 6] - else: - ko = line.lstrip()[: 6] - try: - data['orthology'] += f' {ko}' - except KeyError: - data['orthology'] = ko - f.close() - return data - - def _archive_kegg_db(self, db_category: str) -> None: - """ - Turn the directory of downloaded KEGG database files into a tarball. - - Parameters - ========== - db_category : str - Lowercase name of KEGG database, e.g., 'ko', 'reaction', 'compound'. 
- """ - self.progress.new(f"Compressing downloaded KEGG {self.db_categories[db_category]} files") - self.progress.update("...") - tar_path = os.path.join(self.db_dir, f'{db_category}.tar.gz') - db_path = os.path.join(self.db_dir, db_category) - with tarfile.open(tar_path, mode='w:gz') as tar: - tar.add(db_path, arcname='.') - self.progress.end() - rmtree(db_path) - self.run.info(f"Downloaded KEGG {self.db_categories[db_category]} files", tar_path) - -def _download_worker( - input_queue: mp.Queue, - output_queue: mp.Queue, - max_num_tries: int = 10, - wait_secs: float = 10.0) -> None: - """Multiprocessing download worker.""" - while True: - url, path = input_queue.get() - num_tries = 0 - while True: - try: - download_file(url, path) - break - except ConfigError as e: - num_tries += 1 - if num_tries > max_num_tries: - raise e - time.sleep(wait_secs) - output_queue.put(True) diff --git a/anvio/data/interactive/js/bin.js b/anvio/data/interactive/js/bin.js index fb85dde8ec..42a0acf569 100644 --- a/anvio/data/interactive/js/bin.js +++ b/anvio/data/interactive/js/bin.js @@ -93,7 +93,7 @@ Bins.prototype.NewBin = function(id, binState) { `}
- ${ mode === 'full' || mode === 'refine' ? ` + ${ mode === 'full' || mode === 'refine' || mode === 'manual' ? `     diff --git a/anvio/data/misc/KEGG-SNAPSHOTS.yaml b/anvio/data/misc/KEGG-SNAPSHOTS.yaml index a74cd6eea1..6324ef9de2 100644 --- a/anvio/data/misc/KEGG-SNAPSHOTS.yaml +++ b/anvio/data/misc/KEGG-SNAPSHOTS.yaml @@ -8,6 +8,8 @@ v2020-04-27: modules_db_version: 1 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2020-06-23: url: https://ndownloader.figshare.com/files/23701919 @@ -16,6 +18,8 @@ v2020-06-23: modules_db_version: 2 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2020-08-06: url: https://ndownloader.figshare.com/files/25464530 @@ -24,6 +28,8 @@ v2020-08-06: modules_db_version: 2 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2020-12-23: url: https://ndownloader.figshare.com/files/25878342 @@ -32,6 +38,8 @@ v2020-12-23: modules_db_version: 2 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2021-12-18: url: https://figshare.com/ndownloader/files/31959416 @@ -40,6 +48,8 @@ v2021-12-18: modules_db_version: 3 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2022-04-14: url: https://figshare.com/ndownloader/files/34817812 @@ -48,6 +58,8 @@ v2022-04-14: modules_db_version: 4 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2023-01-10: url: https://figshare.com/ndownloader/files/38799687 @@ -56,6 +68,8 @@ v2023-01-10: modules_db_version: 4 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2023-09-18: url: https://figshare.com/ndownloader/files/42381873 @@ -64,6 +78,8 @@ v2023-09-18: modules_db_version: 4 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True v2023-09-22: url: https://figshare.com/ndownloader/files/42428115 @@ -71,6 +87,8 @@ v2023-09-22: hash: 
a2b5bde358bb modules_db_version: 4 no_stray_KOs: True + no_binary_relations: True + no_maps: True v2024-03-09: url: https://figshare.com/ndownloader/files/44953354 @@ -78,6 +96,21 @@ v2024-03-09: hash: 23910d68b4f2 modules_db_version: 4 no_modeling_data: True + no_binary_relations: True + no_maps: True + +v2024-08-30: + url: https://figshare.com/ndownloader/files/48903154 + archive_name: KEGG_build_2024-08-30_6b658b5c4379.tar.gz + hash: 6b658b5c4379 + modules_db_version: 4 + no_stray_KOs: True + +v2024-09-08: + url: https://figshare.com/ndownloader/files/49080904 + archive_name: KEGG_build_2024-09-08_5a9644d40061.tar.gz + hash: 5a9644d40061 + modules_db_version: 4 # How to add a new KEGG snapshot to this file: # 1. download the latest data directly from KEGG by running diff --git a/anvio/data/misc/PEOPLE/DEVELOPERS.yaml b/anvio/data/misc/PEOPLE/DEVELOPERS.yaml index 3f8b4a804c..0944f02335 100644 --- a/anvio/data/misc/PEOPLE/DEVELOPERS.yaml +++ b/anvio/data/misc/PEOPLE/DEVELOPERS.yaml @@ -24,7 +24,6 @@ - title: Fellow inst: Marine Biological Laboratory inst_link: http://www.mbl.edu/ - current: true - title: Fellow inst: Alfred P. 
Sloan Fondation inst_link: https://sloan.org/grant-detail/9260 diff --git a/anvio/dbops.py b/anvio/dbops.py index 7ffa12984b..3b9ef769d4 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -4218,8 +4218,9 @@ def touch(self): # creating empty default tables for pan specific operations: self.db.create_table(t.pan_gene_clusters_table_name, t.pan_gene_clusters_table_structure, t.pan_gene_clusters_table_types) - self.db.create_table(t.pan_gene_cluster_function_reactions_table_name, t.pan_gene_cluster_function_reactions_table_structure, t.pan_gene_cluster_function_reactions_table_types) - self.db.create_table(t.pan_gene_cluster_function_metabolites_table_name, t.pan_gene_cluster_function_metabolites_table_structure, t.pan_gene_cluster_function_metabolites_table_types) + self.db.create_table(t.pan_reaction_network_reactions_table_name, t.pan_reaction_network_reactions_table_structure, t.pan_reaction_network_reactions_table_types) + self.db.create_table(t.pan_reaction_network_metabolites_table_name, t.pan_reaction_network_metabolites_table_structure, t.pan_reaction_network_metabolites_table_types) + self.db.create_table(t.pan_reaction_network_kegg_table_name, t.pan_reaction_network_kegg_table_structure, t.pan_reaction_network_kegg_table_types) # creating empty default tables for standard anvi'o pan dbs self.db.create_table(t.item_additional_data_table_name, t.item_additional_data_table_structure, t.item_additional_data_table_types) @@ -4354,8 +4355,9 @@ def touch(self, db_variant='unknown'): self.db.create_table(t.genes_taxonomy_table_name, t.genes_taxonomy_table_structure, t.genes_taxonomy_table_types) self.db.create_table(t.contig_sequences_table_name, t.contig_sequences_table_structure, t.contig_sequences_table_types) self.db.create_table(t.gene_function_calls_table_name, t.gene_function_calls_table_structure, t.gene_function_calls_table_types) - self.db.create_table(t.gene_function_reactions_table_name, t.gene_function_reactions_table_structure, 
t.gene_function_reactions_table_types) - self.db.create_table(t.gene_function_metabolites_table_name, t.gene_function_metabolites_table_structure, t.gene_function_metabolites_table_types) + self.db.create_table(t.reaction_network_reactions_table_name, t.reaction_network_reactions_table_structure, t.reaction_network_reactions_table_types) + self.db.create_table(t.reaction_network_metabolites_table_name, t.reaction_network_metabolites_table_structure, t.reaction_network_metabolites_table_types) + self.db.create_table(t.reaction_network_kegg_table_name, t.reaction_network_kegg_table_structure, t.reaction_network_kegg_table_types) self.db.create_table(t.gene_amino_acid_sequences_table_name, t.gene_amino_acid_sequences_table_structure, t.gene_amino_acid_sequences_table_types) self.db.create_table(t.splits_info_table_name, t.splits_info_table_structure, t.splits_info_table_types) self.db.create_table(t.contigs_info_table_name, t.contigs_info_table_structure, t.contigs_info_table_types) @@ -4784,6 +4786,8 @@ def create(self, args): self.db.set_meta_value('reaction_network_ko_annotations_hash', None) self.db.set_meta_value('reaction_network_kegg_database_release', None) self.db.set_meta_value('reaction_network_modelseed_database_sha', None) + self.db.set_meta_value('reaction_network_consensus_threshold', None) + self.db.set_meta_value('reaction_network_discard_ties', None) self.db.set_meta_value('creation_date', self.get_date()) self.disconnect() diff --git a/anvio/docs/__init__.py b/anvio/docs/__init__.py index ec4ba01b4d..14d26e78aa 100644 --- a/anvio/docs/__init__.py +++ b/anvio/docs/__init__.py @@ -615,6 +615,12 @@ "provided_by_anvio": True, "provided_by_user": False }, + "kegg-pathway-map": { + "name": "KEGG PATHWAY MAP", + "type": "DISPLAY", + "provided_by_anvio": True, + "provided_by_user": False + }, "interactive": { "name": "INTERACTIVE DISPLAY", "type": "DISPLAY", @@ -944,7 +950,8 @@ "type": "CONCEPT", "provided_by_anvio": True, "provided_by_user": False - }, 
"contig-rename-report-txt": { + }, + "contig-rename-report-txt": { "name": "CONTIG RENAME REPORT TXT", "type": "TXT", "provided_by_anvio": True, diff --git a/anvio/docs/artifacts/kegg-pathway-map.md b/anvio/docs/artifacts/kegg-pathway-map.md new file mode 100644 index 0000000000..3a71bbe5ba --- /dev/null +++ b/anvio/docs/artifacts/kegg-pathway-map.md @@ -0,0 +1 @@ +Output PDF files produced by %(anvi-draw-kegg-pathways)s which show KEGG pathway map images. Files can consist of a single map or a grid of the same map with different information on each. diff --git a/anvio/docs/artifacts/reaction-network.md b/anvio/docs/artifacts/reaction-network.md index bb9df9371a..63c0e296f9 100644 --- a/anvio/docs/artifacts/reaction-network.md +++ b/anvio/docs/artifacts/reaction-network.md @@ -1,5 +1,15 @@ -This artifact represents **the metabolic reaction network stored in a %(contigs-db)s or a %(pan-db)s by %(anvi-reaction-network)s.** +This artifact represents the metabolic reaction network, which can be stored in a %(contigs-db)s or a %(pan-db)s by %(anvi-reaction-network)s. -The program, %(anvi-reaction-network)s, generates a reaction network from genes encoding enzymes in the %(contigs-db)s or from gene clusters with consensus enzyme annotations in the %(pan-db)s. The reaction network represents biochemical reactions and the constituent metabolites predicted from the genome or pangenome. The program relies upon [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) annotations of protein-coding genes and reference data in the [ModelSEED Biochemistry database](https://github.com/ModelSEED/ModelSEEDDatabase), and is therefore subject to all the limitations thereof, including incomplete annotation of genes with protein orthologs and imprecise knowledge of the reactions catalyzed by enzymes. 
+Reaction networks enable investigations of biochemical pathways at the molecular level, protein and metabolite abundances, and the relation of these to genomic data stored in anvi'o databases. Networks can be exported as a file formatted for compatibility with metabolic modeling packages such as COBRApy: see %(anvi-get-metabolic-model-file)s. -The representation of the reaction network in two tables of the %(contigs-db)s, `gene_function_reactions` and `gene_function_metabolites`, is generalizable to other sources of metabolic data, linking genes to predicted functional orthologs and the associated reactions and metabolites. Reaction and metabolite data are likewise stored in the identically formatted tables, `gene_cluster_function_reactions` and `gene_cluster_function_metabolites`, in the %(pan-db)s. This data can be exported to a JSON-formatted file by %(anvi-get-metabolic-model-file)s for inspection and metabolic model analyses. +## Network structure + +The biochemical reactions used to construct a network are derived from [KEGG protein ortholog (KO)](https://www.genome.jp/kegg/ko.html) annotations of genes. Many KOs reference [KEGG REACTION IDs](https://www.genome.jp/kegg/reaction/) and [Enzyme Commission (EC) numbers](https://www.enzyme-database.org/class.php). These reaction accessions are in turn related to the [ModelSEED Biochemistry database](https://github.com/ModelSEED/ModelSEEDDatabase) to retrieve standardized data on reaction properties. The network's representation of metabolism is subject to all the limitations of these references, including missing protein annotations of genes and imprecise knowledge of the reactions catalyzed by enzymes. The creation of a functional flux balance model from the draft model represented by the network requires manual curation, including gap-filling of missing reactions. 
+ +![The basic structure of a reaction network](../../images/network_basic.png){:.center-img .width-50} + +A reaction network has the following structure. There are four types of nodes representing genes (or gene clusters in a pangenomic network), KOs, (ModelSEED) reactions, and metabolites (ModelSEED compounds). When the network is constructed, only gene KO annotations with stoichiometrically defined reactions are included. Each gene node references one or more KO nodes (each gene cluster node references only one KO node); KO nodes reference one or more reaction nodes; reaction nodes reference the metabolite nodes defining the reaction. The network stores a variety of information, including: KEGG Reaction and EC number aliases of the ModelSEED reactions; the strength of gene-KO annotations (e-values); reaction stoichiometry, reversibility, and substrate compartmentalization ("cytosolic" or "extracellular") according to the ModelSEED database; and compound formulae, structures as SMILES strings, and charges according to the ModelSEED database. + +![The structure of a reaction network with imported protein and metabolite abundances](../../images/network_abunds.png){:.center-img .width-50} + +Protein and metabolite abundance data can be imported into a reaction network, enabling analysis of proteomics and metabolomics data in the context of biochemical pathways. %(anvi-import-protein-profile)s and %(anvi-import-metabolite-profile)s load abundance data into the network from tab-delimited files. New nodes are added to the network for each protein with abundance data; metabolite abundances are assigned to metabolite nodes already in the network. Protein and gene nodes reference each other, and each gene is currently only allowed to express a single protein. 
diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_color_blue.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_color_blue.png new file mode 100644 index 0000000000..d58637a7e2 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_color_blue.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_color_original.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_color_original.png new file mode 100644 index 0000000000..159cb7a900 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_color_original.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_database_grid.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_database_grid.png new file mode 100644 index 0000000000..eefa3f2179 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_database_grid.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_pan.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_pan.png new file mode 100644 index 0000000000..882ff43996 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_pan.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_pan_grid.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_pan_grid.png new file mode 100644 index 0000000000..23cd723de1 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_pan_grid.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_reverse_colormap.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_reverse_colormap.png new file mode 100644 index 0000000000..b6eb11acbe Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_reverse_colormap.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_single_contigs_db.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_single_contigs_db.png new file mode 100644 index 0000000000..c649fc4e77 
Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_single_contigs_db.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_six_contigs_dbs.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_six_contigs_dbs.png new file mode 100644 index 0000000000..397f1cedd4 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_six_contigs_dbs.png differ diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_three_contigs_dbs.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_three_contigs_dbs.png new file mode 100644 index 0000000000..fbc1e042d7 Binary files /dev/null and b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_three_contigs_dbs.png differ diff --git a/anvio/docs/images/png/network_abunds.png b/anvio/docs/images/png/network_abunds.png new file mode 100644 index 0000000000..f0082ad4aa Binary files /dev/null and b/anvio/docs/images/png/network_abunds.png differ diff --git a/anvio/docs/images/png/network_basic.png b/anvio/docs/images/png/network_basic.png new file mode 100644 index 0000000000..eed9188f5b Binary files /dev/null and b/anvio/docs/images/png/network_basic.png differ diff --git a/anvio/docs/programs/anvi-compute-functional-enrichment.md b/anvio/docs/programs/anvi-compute-functional-enrichment.md deleted file mode 100644 index 199659020f..0000000000 --- a/anvio/docs/programs/anvi-compute-functional-enrichment.md +++ /dev/null @@ -1,5 +0,0 @@ -Hi there :) This program is now deprecated. 
Please continue with one of these below instead: - -* %(anvi-compute-metabolic-enrichment)s -* %(anvi-compute-functional-enrichment-in-pan)s -* %(anvi-compute-functional-enrichment-across-genomes)s diff --git a/anvio/docs/programs/anvi-draw-kegg-pathways.md b/anvio/docs/programs/anvi-draw-kegg-pathways.md new file mode 100644 index 0000000000..d6aabf654e --- /dev/null +++ b/anvio/docs/programs/anvi-draw-kegg-pathways.md @@ -0,0 +1,204 @@ +%(anvi-draw-kegg-pathways)s draws %(kegg-pathway-map)s files incorporating data from anvi'o databases. The visualization of user data in the context of KEGG's curated biochemical pathways can reveal patterns in metabolism. + +## Setup + +There are hundreds of pathway maps, listed and categorized [here](https://www.genome.jp/kegg/pathway.html). %(anvi-setup-kegg-data)s downloads, among other files, the maps that have corresponding [XML files](https://www.kegg.jp/kegg/xml/) that allow elements of the map to be modified. The following command sets up the database in a default anvi'o directory. + +{{ codestart }} +anvi-setup-kegg-data +{{ codestop }} + +Additional Python packages may be needed if you installed anvi'o `v8.0-dev` before this program's package requirements were included. These can be installed with the following command. + +{{ codestart }} +pip install biopython ReportLab pymupdf frontend +{{ codestop }} + +### Download newest available files + +Alternatively, KEGG data can be set up not from a snapshot but by downloading the newest files available from KEGG using the `-D` flag. In the following command, a higher number of download threads than the default of 1 is provided by `-T`, which significantly speeds up downloading. + +{{ codestart }} +anvi-setup-kegg-data -D -T 5 +{{ codestop }} + +### Install in non-default location + +To preserve KEGG data that you already have set up for whatever reason, the new snapshot or download can be placed in a non-default location using the option, `--kegg-data-dir`. 
+ +{{ codestart }} +anvi-setup-kegg-data --kegg-data-dir path/to/other/directory +{{ codestop }} + +`anvi-draw-kegg-pathways` requires a `--kegg-dir` argument to seek KEGG data in a non-default location. + +## Pathway selection + +By default, this program draws the maps that contain data of interest, e.g., KO gene annotations in a %(contigs-db)s. + +To draw _all_ maps available in %(kegg-data)s, including those that don't contain data of interest, use the flag, `--draw-bare-maps`. + +The option, `--pathway-numbers`, limits the output to maps of interest. A single ID number can be provided, e.g., `00010` for `Glycolysis / Gluconeogenesis`, or multiple numbers can be listed, e.g., `00010 00020`. Regular expressions can also be provided, e.g., `011.. 012..`, where `.` represents any character: here the set of numbers given by `011..` corresponds to "global" maps and `012..` to "overview" maps. + +The following command would draw all global maps and the glycolysis map, regardless of whether they contain any anvi'o data of interest (here, KO annotations from a contigs database). + +{{ codestart }} +anvi-draw-kegg-pathways --contigs-dbs %(contigs-db)s \ + -o output_dir \ + --draw-bare-maps \ + --ko \ + --pathway-numbers 011.. 00010 +{{ codestop }} + +## Output file names + +Output file names just contain the ID of each map by default, i.e., `kos_00010.pdf` for `Glycolysis / Gluconeogenesis`. The `--name-files` flag attaches an altered version of the pathway name to the file name, i.e., `kos_00010_Glycolysis_Gluconeogenesis.pdf`. + +## KO occurrence + +Gene sequences in anvi'o databases can be annotated with KEGG Orthologs (KOs): see %(anvi-run-kegg-kofams)s. A KO indicates functional capabilities of the gene product. KO data from one or more contigs databases or a pan database can be mapped using the `--ko` flag, enabling investigation of the metabolic capabilities of individual organisms or multiple organisms, including community samples. 
Reactions associated with KOs are colored on the pathway maps. + +### Single contigs database + +Here is the basic command to draw KO data from a single %(contigs-db)s. + +{{ codestart }} +anvi-draw-kegg-pathways --contigs-dbs %(contigs-db)s \ + -o output_dir \ + --ko +{{ codestop }} + +Here are three maps drawn with this command from a bacterial genomic contigs database. The map in the upper left, `00010 Glycolysis / Gluconeogenesis`, is a "standard" map, in which boxes are associated with a reaction arrow and one or more KOs. The map in the upper right, `01200 Carbon metabolism`, is a metabolic "overview" map. Overview maps have numerical IDs in the range `012XX` and `013XX`. Reaction arrows in overview maps are associated with one or more KOs and are colored and widened if represented by anvi'o KO data. The bottom map, `01100 Metabolic pathways`, is a "global" metabolic map. Global maps have numerical IDs in the range `011XX`. Reaction lines in global maps are associated with one or more KOs and colored if represented by anvi'o KO data. In all maps, circles are colored if the compound they represent is involved in reactions that are also colored. (Occasionally complete data linking reaction and compound graphics is missing from the KEGG reference files, preventing the reaction color from being imparted to the compound. One such error can be seen at the very top of the overview map of `Carbon metabolism`, where `Glucono-1,5-lactone` is white when it should be green.) + +![Three maps showing KOs from a single contigs database](../../images/anvi-draw-kegg-pathways/kos_single_contigs_db.png) + +#### Set color + +The default color can be changed with the `--set-color` option. + +The argument value can be a color hex code, e.g., `"#FF0000"` for red. It is necessary to enclose a color hex code argument value in quotation marks, as `#` otherwise causes the rest of the command to be ignored as a comment. 
+ +{{ codestart }} +anvi-draw-kegg-pathways --contigs-dbs %(contigs-db)s \ + -o output_dir \ + --pathway-numbers 00010 \ + --ko \ + --set-color "#2986cc" +{{ codestop }} + +![Change color to blue](../../images/anvi-draw-kegg-pathways/kos_color_blue.png) + +The argument value can also be the string, `original`, for the original color scheme of the reference map. Global maps are especially colorful, with reactions varying in color across the map as a broad indication of function. + +{{ codestart }} +anvi-draw-kegg-pathways --contigs-dbs %(contigs-db)s \ + -o output_dir \ + --pathway-numbers 00010 01100 01200 \ + --ko \ + --set-color original +{{ codestop }} + +![Use original color scheme](../../images/anvi-draw-kegg-pathways/kos_color_original.png) + +### Multiple contigs databases + +The KO content of multiple contigs databases can be compared. Database file paths can be provided directly on the command line or in an %(external-genomes)s text file. + +{{ codestart }} +anvi-draw-kegg-pathways --contigs-dbs %(contigs-db)s_1 %(contigs-db)s_2 ... %(contigs-db)s_N \ + -o output_dir \ + --ko +{{ codestop }} + +{{ codestart }} +anvi-draw-kegg-pathways --external-genomes %(external-genomes)s \ + -o output_dir \ + --ko +{{ codestop }} + +The images in this section show data from contigs databases of genomes from different strains of the same bacterial species. + +#### Color by database + +When comparing a small number of contigs databases (realistically, two or three), reactions can be colored by their occurrence across databases, with each color representing a different database or combination of databases. A colorbar key is drawn in a separate file in the output directory, `colorbar.pdf`. Compound circles are imparted the color of the associated reaction found in the greatest number of databases. 
+ +![Three maps showing KOs from three contigs databases](../../images/anvi-draw-kegg-pathways/kos_three_contigs_dbs.png) + +#### Color by count + +When comparing a larger number of contigs databases, it makes more sense to color reactions by the number of databases in which they occur using a sequential colormap rather than by database or combination of databases using a qualitative colormap. By default, coloring explicitly by database automatically applies to three or fewer databases, whereas coloring by database count applies to four or more databases. The user can override this default with the argument, `--colormap-scheme`, which accepts the values `by_database` and `by_count`. For example, the user may have three databases but wish to color reactions by database count, and so would specify `--colormap-scheme by_count`. + +![Three maps showing KOs from six contigs databases](../../images/anvi-draw-kegg-pathways/kos_six_contigs_dbs.png) + +#### Reverse colormap + +Changing the colormap can draw attention to different information on maps. When coloring by count, the default sequential colormap, `plasma_r`, goes from dark to light colors; reactions shared among all of the contigs databases are assigned the darkest color, and reactions unique to a single database are assigned the lightest color. The colormap can be reversed to accentuate unshared reactions in the darkest colors and shared reactions in the lightest colors. Reversing the default colormap is accomplished with the option, `--colormap plasma 0.1 0.9`. Note that Matplotlib colormap names differing by `_r` (here, `plasma` and `plasma_r`) have the same colors in reverse. + +The second and third numerical `--colormap` values are not mandatory, but can be provided to trim a fraction of the colormap from each end to eliminate the lightest and darkest colors. The default coloring by database count with `plasma_r` uses limits of `0.1 0.9`. 
Just changing the colormap (e.g., `--colormap plasma`) removes the limits (i.e., changes them to `0.0 1.0`), so exactly reversing the default colormap requires that the same limits be specified. + +The `--reverse-overlay` flag should also be used to reverse the default drawing order. This causes unshared reactions to be rendered above rather than below shared reactions, which is especially important in cluttered global maps. + +{{ codestart }} +anvi-draw-kegg-pathways --external-genomes %(external-genomes)s \ + -o output_dir \ + --ko \ + --colormap plasma 0.1 0.9 \ + --reverse-overlay +{{ codestop }} + +![Emphasize unshared reactions with reversed coloring](../../images/anvi-draw-kegg-pathways/kos_reverse_colormap.png) + +#### Showing individual database maps + +Coloring by count obviously masks the individual contigs databases that contain the different reactions. However, options are provided to enable investigation of the distribution of reactions across databases. + +Standalone map files showing the presence/absence of reactions in individual contigs databases can be drawn by using the flag, `--draw-individual-files`. + +To facilitate comparisons, maps for individual databases can also be drawn alongside the "unified" map containing information from all databases by using the flag, `--draw-grid`. + +The following command would draw individual map files plus grid files; a reverse colormap is used in unified maps to emphasize unshared reactions. + +{{ codestart }} +anvi-draw-kegg-pathways --external-genomes %(external-genomes)s \ + -o output_dir \ + --draw-grid \ + --draw-individual-files \ + --ko \ + --colormap plasma 0.1 0.9 \ + --reverse-overlay +{{ codestop }} + +The following map grid reveals unique aspects of galactose metabolism among six related genomes. + +![Map grid](../../images/anvi-draw-kegg-pathways/kos_database_grid.png) + +### Pangenomic database + +Pangenomes are treated similarly to multiple contigs databases. 
Rather than comparing the occurrence of KOs across contigs databases, consensus KO annotations of gene clusters are compared across genomes in a pangenomic database. Here is the basic structure of the command. + +{{ codestart }} +anvi-draw-kegg-pathways -p %(pan-db)s \ + -g %(genomes-storage-db)s \ + -o output_dir \ + --ko +{{ codestop }} + +The following maps were produced with a basic command using a pangenome constructed from 12 strains of two related bacterial species. + +![Three maps showing KOs from a pangenome](../../images/anvi-draw-kegg-pathways/kos_pan.png) + +As with the comparison of contigs databases, it can be useful to reverse the colormap and create map grids to compare the KO content of genomes in the pangenome. + +{{ codestart }} +anvi-draw-kegg-pathways -p %(pan-db)s \ + -g %(genomes-storage-db)s \ + -o output_dir \ + --draw-grid \ + --ko \ + --colormap plasma 0.1 0.9 \ + --reverse-overlay +{{ codestop }} + +The following map grid reveals certain differences between the strains, and particularly the two species, in carbohydrate metabolism, with *faecalis* enriched in enzymes for xylose metabolism (towards the bottom of the map), and *faecium* enriched in enzymes for uronate metabolism (towards the top of the map). + +![Pangenomic map grid](../../images/anvi-draw-kegg-pathways/kos_pan_grid.png) diff --git a/anvio/docs/programs/anvi-import-metabolite-profile.md b/anvio/docs/programs/anvi-import-metabolite-profile.md new file mode 100644 index 0000000000..e0709ddc22 --- /dev/null +++ b/anvio/docs/programs/anvi-import-metabolite-profile.md @@ -0,0 +1,9 @@ +This program imports a metabolite abundance profile, such as from metabolomic experiments, into a %(profile-db)s. + +This program takes as input a tab-delimited file of metabolite abundance data and a %(profile-db)s. The tabular file must have three columns with the following names: "accession", "sample", and "abundance". 
Each row of the table corresponds to a distinct metabolite abundance measurement. + +- "accession" is the ModelSEED Compound ID, e.g., "cpd00027" for D-glucose. +- "sample" is the name of the sample in which the measurement was made. It need not be the same as any nucleotide sequence samples stored in the profile database. +- "abundance" is the metabolite abundance value, however defined. + +Once metabolite abundances are stored in a profile database, they can be loaded into a metabolic %(reaction-network)s for analysis in the context of biochemical pathways. Metabolites in the network are defined in terms of ModelSEED Compounds. diff --git a/anvio/docs/programs/anvi-import-protein-profile.md b/anvio/docs/programs/anvi-import-protein-profile.md new file mode 100644 index 0000000000..47894b8c52 --- /dev/null +++ b/anvio/docs/programs/anvi-import-protein-profile.md @@ -0,0 +1,10 @@ +This program imports a protein abundance profile, such as from proteomic experiments, into a %(profile-db)s. + +This program takes as input a tab-delimited file of protein abundance data and a %(profile-db)s. The tabular file must have four columns with the following names: "source", "accession", "sample", and "abundance". Each row of the table corresponds to a distinct protein abundance measurement. + +- "source" is the source of the protein accessions. It must be a gene function annotation source stored in the anvi'o %(profile-db)s (available sources can be found with the program, %(anvi-db-info)s). +- "accession" is the protein ID in the annotation source. A contigs database built from a GenBank file, for example, could contain the source, "NCBI_PGAP", and the accession, "WP_011862028.1". +- "sample" is the name of the sample in which the measurement was made. It need not be the same as any nucleotide sequence samples stored in the profile database. +- "abundance" is the protein abundance value, however defined. 
+ +Once protein abundances are stored in a profile database, they can be loaded into a metabolic %(reaction-network)s for analysis in the context of biochemical pathways. diff --git a/anvio/docs/programs/anvi-reaction-network.md b/anvio/docs/programs/anvi-reaction-network.md index 9f36880fb5..055a132a32 100644 --- a/anvio/docs/programs/anvi-reaction-network.md +++ b/anvio/docs/programs/anvi-reaction-network.md @@ -1,25 +1,53 @@ This program **stores a metabolic %(reaction-network)s in a %(contigs-db)s or %(pan-db)s.** -The network consists of data on biochemical reactions predicted to be encoded by the genome or pangenome, referencing the [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) databases. +The network consists of data on biochemical reactions predicted to be encoded by the genome or pangenome. -Information on the predicted reactions and the involved metabolites are stored in two tables of the %(contigs-db)s or %(pan-db)s. The program, %(anvi-get-metabolic-model-file)s, can be used to export the %(reaction-network)s from the database to a %(reaction-network-json)s file formatted for flux balance analysis. +Information on the predicted reactions and the involved metabolites are stored in tables of the %(contigs-db)s or %(pan-db)s. The program, %(anvi-get-metabolic-model-file)s, can be used to export the %(reaction-network)s from the database to a %(reaction-network-json)s file formatted for input into programs for flux balance analysis. -## Usage +## Setup -%(anvi-reaction-network)s takes a either a %(contigs-db)s OR a %(pan-db)s and %(genomes-storage-db)s as required input. Genes stored within the %(contigs-db)s or %(genomes-storage-db)s must have KO protein annotations, which can be assigned by %(anvi-run-kegg-kofams)s. 
+%(anvi-setup-kegg-data)s downloads, among other files, the [binary relations files](https://www.genome.jp/brite/br08906) needed to construct a %(reaction-network)s from [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) sequence annotations. The following command sets up the database in a default anvi'o directory. -The KO and ModelSEED Biochemistry databases must be set up and available to the program. By default, these are expected to be set up in default anvi'o data directories. %(anvi-setup-kegg-data)s and %(anvi-setup-modelseed-database)s must be run to set up these databases. +{{ codestart }} +anvi-setup-kegg-data +{{ codestop }} + +%(anvi-setup-modelseed-database)s sets up the [ModelSEED Biochemistry database](https://github.com/ModelSEED/ModelSEEDDatabase), which harmonizes biochemical data from various reference databases, including KEGG. The following command sets up the database in a default anvi'o directory. {{ codestart }} -anvi-reaction-network -c /path/to/contigs-db +anvi-setup-modelseed-database {{ codestop }} -Custom locations for the reference databases can be provided with the flags, `--ko-dir` and `--modelseed-dir`. +### Download newest available KEGG files + +Alternatively, KEGG data can be set up not from a snapshot but by downloading the newest files available from KEGG using the `-D` flag. In the following command, a higher number of download threads than the default of 1 is provided by `-T`, which significantly speeds up downloading. {{ codestart }} -anvi-reaction-network -c /path/to/contigs-db \ - --ko-dir /path/to/set-up/ko-dir \ - --modelseed-dir /path/to/set-up/modelseed-dir +anvi-setup-kegg-data -D -T 5 +{{ codestop }} + +### Install in non-default location + +To preserve KEGG data that you already have set up for whatever reason, the new snapshot or download can be placed in a non-default location using the option, `--kegg-data-dir`. 
+ +{{ codestart }} +anvi-setup-kegg-data --kegg-data-dir path/to/other/directory +{{ codestop }} + +`anvi-reaction-network` requires a `--kegg-dir` argument to seek KEGG data in a non-default location. + +Likewise, different versions of the ModelSEED Biochemistry database can be set up in non-default locations and used with the `--modelseed-dir` argument. + +{{ codestart }} +anvi-setup-modelseed-database --dir path/to/other/directory +{{ codestop }} + +## Usage + +%(anvi-reaction-network)s takes a either a %(contigs-db)s OR a %(pan-db)s and %(genomes-storage-db)s as required input. Genes stored within the %(contigs-db)s or %(genomes-storage-db)s must have KO protein annotations, which can be assigned by %(anvi-run-kegg-kofams)s. + +{{ codestart }} +anvi-reaction-network -c /path/to/contigs-db {{ codestop }} If a %(contigs-db)s already contains a %(reaction-network)s from a previous run of this program, the flag `--overwrite-existing-network` can overwrite the existing network with a new one. For example, if %(anvi-run-kegg-kofams)s is run again on a database using a newer version of KEGG, then %(anvi-reaction-network)s should be rerun to update the %(reaction-network)s derived from the KO annotations. diff --git a/anvio/docs/programs/anvi-self-test.md b/anvio/docs/programs/anvi-self-test.md new file mode 100644 index 0000000000..0448271666 --- /dev/null +++ b/anvio/docs/programs/anvi-self-test.md @@ -0,0 +1 @@ +This program enables an anvi'o user to run component tests on various anvi'o functionality to ensure they are working on their system. diff --git a/anvio/docs/programs/anvi-setup-kegg-data.md b/anvio/docs/programs/anvi-setup-kegg-data.md index c312d8ffe6..7d246e1129 100644 --- a/anvio/docs/programs/anvi-setup-kegg-data.md +++ b/anvio/docs/programs/anvi-setup-kegg-data.md @@ -75,7 +75,7 @@ Not sure what KEGG snapshots are available for you to request? 
Well, you could c anvi-setup-kegg-data --kegg-snapshot hahaha {{ codestop }} -Note that the latter method only shows you the date that each available snapshot was created. If you need more details about what types of data is included in each snapshot, you should look at the YAML file, which annotates each snapshot with a bit more detail. For example, the following entry does not contain metabolic modeling data OR models/thresholds for 'stray KOs': +Note that the latter method only shows you the date that each available snapshot was created. If you need more details about what types of data is included in each snapshot, you should look at the YAML file, which annotates each snapshot with a bit more detail. For example, the following entry does not contain metabolic modeling data (now obsolete as of anvi'o `v8.0-dev`), models/thresholds for 'stray KOs', binary relations files needed for reaction networks, or maps used for pathway visualization: ``` v2023-09-18: @@ -85,6 +85,8 @@ v2023-09-18: modules_db_version: 4 no_modeling_data: True no_stray_KOs: True + no_binary_relations: True + no_maps: True ``` ## Getting the most up-to-date KEGG data: downloading directly from KEGG @@ -182,15 +184,35 @@ anvi-setup-kegg-data --mode modules \ --overwrite-output-destinations {{ codestop }} -### Avoiding BRITE setup +### Avoiding files used in later versions of anvi'o -As of anvi'o `v7.1-dev` or later, KEGG BRITE hierarchies are added to the %(modules-db)s when running this program with `--mode modules`. If you don't want this cool new feature - because you are a rebel, or adverse to change, or something is not working on your computer, whatever - then fine. You can use the `--skip-brite-hierarchies` flag: +As anvi'o has expanded, new types of KEGG files have been included in the data pack. We highly recommend including these files, as this program does by default, when downloading and setting up KEGG data. 
However, options are available to skip these files -- perhaps you're a rebel, or averse to change, or something is not working on your computer... whatever, all good by us.
Suppose you have been living on the edge and annotating your contigs databases with a non-default version of %(kegg-data)s, and you share these databases with a collaborator who wants to run downstream programs like %(anvi-estimate-metabolism)s on them. Your collaborator (who has a different version of %(kegg-data)s on their computer) will likely get version errors as detailed on the %(anvi-estimate-metabolism)s help page. @@ -217,7 +239,7 @@ Periodically (especially before releasing a new version of anvi'o), we want to a Available KEGG snapshots are stored in the anvi'o code repository in `anvio/data/misc/KEGG-SNAPSHOTS.yaml`. To add a new snapshot, you first need to create one by downloading and processing the data from KEGG, testing to make sure it works, and then updating this file. Here are the steps: -1. Download the latest data directly from KEGG by running `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5`. This will create the new KEGG data folder with its %(modules-db)s in your current working directory. Make sure you use the exact folder name of `./KEGG`, because that is what anvi'o expects to find when it unpacks a KEGG snapshot. You may want to reduce or increase the number of threads (`-T`) according to your available compute resources. +1. Download the latest data directly from KEGG by running `anvi-setup-kegg-data -D --include-stray-KOs --kegg-data-dir ./KEGG -T 5`. This will create the new KEGG data folder with its %(modules-db)s in your current working directory. Make sure you use the exact folder name of `./KEGG`, because that is what anvi'o expects to find when it unpacks a KEGG snapshot. You may want to reduce or increase the number of threads (`-T`) according to your available compute resources. 2. Get the hash value and version info from the MODULES.db by running `anvi-db-info ./KEGG/MODULES.db`. 3. Archive the KEGG data directory by running `tar -czvf KEGG_build_YYYY-MM-DD_HASH.tar.gz ./KEGG`. 
Please remember to replace YYYY-MM-DD with the current date and replace HASH with the MODULES.db hash value obtained in step 2. This convention makes it easier to distinguish between KEGG snapshots by simply looking at the file name. 4. Test that setup works with this archive by running `anvi-setup-kegg-data --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE`. diff --git a/anvio/errors.py b/anvio/errors.py index 9dd1d4f733..59ef0b0444 100644 --- a/anvio/errors.py +++ b/anvio/errors.py @@ -37,8 +37,8 @@ def __init__(self, e=None): return def __str__(self): - max_len = max([len(l) for l in textwrap.fill(self.e, 80).split('\n')]) - error_lines = ['%s%s' % (l, ' ' * (max_len - len(l))) for l in textwrap.fill(self.e, 80).split('\n')] + max_len = max([len(l) for l in textwrap.fill(textwrap.dedent(self.e), 80).split('\n')]) + error_lines = ['%s%s' % (l, ' ' * (max_len - len(l))) for l in textwrap.fill(textwrap.dedent(self.e), 80).split('\n')] error_message = ['%s: %s' % (color_text(self.error_type, 'red'), error_lines[0])] for error_line in error_lines[1:]: diff --git a/anvio/kegg.py b/anvio/kegg.py index 6ffdbc172c..3b58e51fd9 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -17,7 +17,7 @@ import multiprocessing as mp from scipy import stats -from typing import List +from typing import Dict, List, Tuple, Union import anvio import anvio.db as db @@ -26,8 +26,6 @@ import anvio.filesnpaths as filesnpaths import anvio.tables as t import anvio.ccollections as ccollections -import anvio.biochemistry.reactionnetwork as reactionnetwork -from anvio.biochemistry.reactionnetwork import _download_worker from anvio.errors import ConfigError from anvio.drivers.hmmer import HMMer @@ -294,6 +292,10 @@ # and to the relevant step metadata clause in write_stat_to_matrix() STEP_METADATA_HEADERS = ["step_definition"] +# Global and overview map IDs have certain ranges of numbers. 
+GLOBAL_MAP_ID_PATTERN = re.compile(r'\d{1}11\d{2}') +OVERVIEW_MAP_ID_PATTERN = re.compile(r'\d{1}1[23]\d{2}') + class KeggContext(object): """The purpose of this base class is to define shared functions and file paths for all KEGG operations.""" @@ -310,6 +312,38 @@ def __init__(self, args): self.kegg_hmm_data_dir = os.path.join(self.kegg_data_dir, "HMMs") self.pathway_data_dir = os.path.join(self.kegg_data_dir, "pathways") self.brite_data_dir = os.path.join(self.kegg_data_dir, "BRITE") + self.binary_relation_data_dir = os.path.join(self.kegg_data_dir, "binary_relations") + + # The 'KEGG/map_images' directory has a structure of nested directories. 'map_images' + # contains 'png' for image files and 'kgml' for XML mapping files. Within both 'png' and + # 'kgml' are directories, '1x' and '2x', for lower and higher resolution maps. 'png/1x' + # contains 5 directories of image files highlighting different things: 'map', 'ko', 'ec', + # 'rn', and 'org'. 'png/2x' contains 1 directory, 'map', as higher resolution images are + # only available for manually drawn maps. 'kgml/1x' and 'kgml/2x' each contain 4 directories + # of XML files that allow modification of different lower and higher resolution maps: 'ko', + # 'ec', 'rn', and 'org'. 
+ self.map_image_data_dir = os.path.join(self.kegg_data_dir, "map_images") + self.png_dir = os.path.join(self.map_image_data_dir, "png") + self.kgml_dir = os.path.join(self.map_image_data_dir, "kgml") + self.png_1x_dir = os.path.join(self.png_dir, "1x") + self.png_2x_dir = os.path.join(self.png_dir, "2x") + self.png_1x_map_dir = os.path.join(self.png_1x_dir, "map") + self.png_1x_ko_dir = os.path.join(self.png_1x_dir, "ko") + self.png_1x_ec_dir = os.path.join(self.png_1x_dir, "ec") + self.png_1x_rn_dir = os.path.join(self.png_1x_dir, "rn") + self.png_1x_org_dir = os.path.join(self.png_1x_dir, "org") + self.png_2x_map_dir = os.path.join(self.png_2x_dir, "map") + self.kgml_1x_dir = os.path.join(self.kgml_dir, "1x") + self.kgml_2x_dir = os.path.join(self.kgml_dir, "2x") + self.kgml_1x_ko_dir = os.path.join(self.kgml_1x_dir, "ko") + self.kgml_1x_ec_dir = os.path.join(self.kgml_1x_dir, "ec") + self.kgml_1x_rn_dir = os.path.join(self.kgml_1x_dir, "rn") + self.kgml_1x_org_dir = os.path.join(self.kgml_1x_dir, "org") + self.kgml_2x_ko_dir = os.path.join(self.kgml_2x_dir, "ko") + self.kgml_2x_ec_dir = os.path.join(self.kgml_2x_dir, "ec") + self.kgml_2x_rn_dir = os.path.join(self.kgml_2x_dir, "rn") + self.kgml_2x_org_dir = os.path.join(self.kgml_2x_dir, "org") + self.quiet = A('quiet') or False self.just_do_it = A('just_do_it') @@ -323,6 +357,9 @@ def __init__(self, args): self.kegg_pathway_file = os.path.join(self.kegg_data_dir, "pathways.keg") self.kegg_brite_hierarchies_file = os.path.join(self.kegg_data_dir, "hierarchies.json") self.kegg_modules_db_path = os.path.join(self.kegg_data_dir, "MODULES.db") + self.kegg_binary_relation_files = {('KO', 'EC'): "ko2ec.xl", ('KO', 'RN'): "ko2rn.xl"} + self.kegg_pathway_list_file = os.path.join(self.kegg_data_dir, "pathway_list.tsv") + self.kegg_map_image_kgml_file = os.path.join(self.kegg_data_dir, "map_kgml.tsv") if self.user_input_dir: self.user_module_data_dir = os.path.join(self.user_input_dir, "modules") @@ -394,17 +431,17 @@ 
def setup_ko_dict(self, exclude_threshold=True, suppress_warnings=False): if not suppress_warnings: self.run.warning("FYI, we are including KOfams that do not have a bitscore threshold in the analysis.") - + def setup_stray_ko_dict(self, add_entries_to_regular_ko_dict=False): """This class sets up a dictionary of predicted bit score thresholds for stray KOs, if possible. - - Those predicted thresholds are generated during `anvi-setup-kegg-data --include-stray-KOs` + + Those predicted thresholds are generated during `anvi-setup-kegg-data --include-stray-KOs` (see KOfamDownload.process_all_stray_kos()), and are stored in a file that looks like this: knum threshold score_type definition K11700 800.4 full poly(A) RNA polymerase Cid12 [EC:2.7.7.19] K14747_anvio_version 1054.2 full benzoylacetate-CoA ligase [EC:6.2.1.-] - + The dictionary structure is identical to that of self.ko_dict. Note that the `knum` column can contain normal KEGG Ortholog accessions (for KOs whose HMMs we haven't updated) and accessions that end with STRAY_KO_ANVIO_SUFFIX (for KOs that we created new models for). @@ -414,7 +451,7 @@ def setup_stray_ko_dict(self, add_entries_to_regular_ko_dict=False): Parameters ========== add_entries_to_regular_ko_dict : Boolean - If True, we don't create a separate self.stray_ko_dict but instead add the stray KOs to the + If True, we don't create a separate self.stray_ko_dict but instead add the stray KOs to the regular self.ko_dict attribute. Useful if you don't need to keep the two sets separate. 
""" @@ -422,6 +459,8 @@ def setup_stray_ko_dict(self, add_entries_to_regular_ko_dict=False): if add_entries_to_regular_ko_dict: stray_kos = utils.get_TAB_delimited_file_as_dictionary(self.stray_ko_thresholds_file) self.ko_dict.update(stray_kos) + # initialize it to None so that things don't break if we try to access this downstream + self.stray_ko_dict = None else: self.stray_ko_dict = utils.get_TAB_delimited_file_as_dictionary(self.stray_ko_thresholds_file) else: @@ -564,7 +603,8 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): self.only_processing = True if A('only_processing') else False self.skip_init = skip_init self.skip_brite_hierarchies = True if A('skip_brite_hierarchies') else False - + self.skip_binary_relations = True if A('skip_binary_relations') else False + self.skip_map_images = True if A('skip_map_images') else False if self.kegg_archive_path and self.download_from_kegg: raise ConfigError("You provided two incompatible input options, --kegg-archive and --download-from-kegg. 
" @@ -589,7 +629,7 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): KeggContext.__init__(self, self.args) # get KEGG snapshot info for default setup - self.target_snapshot = self.kegg_snapshot or 'v2024-03-09' + self.target_snapshot = self.kegg_snapshot or 'v2024-09-08' self.target_snapshot_yaml = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG-SNAPSHOTS.yaml') self.snapshot_dict = utils.get_yaml_as_dict(self.target_snapshot_yaml) @@ -605,8 +645,6 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): # default download path for KEGG snapshot self.default_kegg_data_url = self.snapshot_dict[self.target_snapshot]['url'] self.default_kegg_archive_file = self.snapshot_dict[self.target_snapshot]['archive_name'] - self.expect_modeling_files_in_archive = True if 'no_modeling_data' in self.snapshot_dict[self.target_snapshot].keys() and \ - (not self.snapshot_dict[self.target_snapshot]['no_modeling_data']) else False # the KEGG API URL, in case its needed downstream self.kegg_rest_api_get = "http://rest.kegg.jp/get" @@ -720,8 +758,10 @@ def setup_from_archive(self): utils.tar_extract_file(self.kegg_archive_path, output_file_path=unpacked_archive_name, keep_original=True) self.progress.update('Checking KEGG archive structure and contents...') - archive_is_ok = self.kegg_archive_is_ok(unpacked_archive_name, no_modeling_is_ok = (not self.expect_modeling_files_in_archive)) + archive_is_ok = self.kegg_archive_is_ok(unpacked_archive_name) archive_contains_brite = self.check_archive_for_brite(unpacked_archive_name) + archive_contains_binary_relations = self.check_archive_for_binary_relations(unpacked_archive_name) + archive_contains_map_images = self.check_archive_for_map_images(unpacked_archive_name) self.progress.end() if archive_is_ok: if os.path.exists(self.kegg_data_dir): @@ -735,6 +775,22 @@ def setup_from_archive(self): "This is not a problem, and KEGG set up proceeded without it. 
BRITE is guaranteed to be set up when " "downloading the latest version of KEGG with `anvi-setup-kegg-data`.") + if not archive_contains_binary_relations and not self.skip_binary_relations: + self.run.warning( + "The KEGG data archive does not contain the binary relation files needed for " + "`anvi-reaction-network`. This is not a problem, and KEGG setup proceeded " + "without it. Binary relation files are guaranteed to be set up when " + "downloading the latest version of KEGG with `anvi-setup-kegg-data`." + ) + + if not archive_contains_map_images and not self.skip_map_images: + self.run.warning( + "The KEGG data archive does not contain the pathway map image files used for " + "pathway visualization. This is not a problem, and KEGG setup proceeded " + "without it. Map image files are guaranteed to be set up when downloading the " + "latest version of KEGG with `anvi-setup-kegg-data`." + ) + # if necessary, warn user about migrating the modules db self.check_modules_db_version() @@ -791,6 +847,65 @@ def check_archive_for_brite(self, unpacked_archive_path): return is_brite_included + def check_archive_for_binary_relations(self, unpacked_archive_path): + """ + Check the archive for the binary relations directory and files. + + It is ok for archives not to have these present, but let the user know. 
+ """ + path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") + binary_relation_data_dir = os.path.join( + path_to_kegg_in_archive, os.path.basename(self.binary_relation_data_dir) + ) + if os.path.isdir(binary_relation_data_dir): + is_binary_relation_dir_included = True + else: + is_binary_relation_dir_included = False + if anvio.DEBUG and not self.skip_binary_relations: + self.run.warning( + "The KEGG archive does not contain the following optional binary relations " + f"directory needed for `anvi-reaction-network`: {binary_relation_data_dir}" + ) + + if is_binary_relation_dir_included: + missing_files = [] + for file in self.kegg_binary_relation_files.values(): + path = os.path.join(binary_relation_data_dir, file) + if not os.path.isfile(path): + missing_files.append(file) + if anvio.DEBUG and missing_files: + self.run.warning( + "The following binary relation files expected in an up-to-date anvi'o KEGG " + f"installation are missing from the directory, '{binary_relation_data_dir}', " + f"in the archive: {', '.join(missing_files)}" + ) + + return is_binary_relation_dir_included + + + def check_archive_for_map_images(self, unpacked_archive_path): + """ + Check the archive for the pathway map directory and image files. + + It is ok for archives not to have these present, but let the user know. + """ + path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") + map_image_data_dir = os.path.join( + path_to_kegg_in_archive, os.path.basename(self.map_image_data_dir) + ) + if os.path.isdir(map_image_data_dir): + is_map_image_dir_included = True + else: + is_map_image_dir_included = False + if anvio.DEBUG and not self.skip_map_images: + self.run.warning( + f"The KEGG archive does not contain the following optional pathway map images " + f"directory, which is used in pathway visualization." 
+ ) + + return is_map_image_dir_included + + def setup_kegg_snapshot(self): """This is the default setup strategy in which we unpack a specific KEGG archive. @@ -818,7 +933,7 @@ def setup_kegg_snapshot(self): "has been kept. You may want to remove it later.") - def kegg_archive_is_ok(self, unpacked_archive_path, no_modeling_is_ok = False): + def kegg_archive_is_ok(self, unpacked_archive_path): """This function checks the structure and contents of an unpacked KEGG archive and returns True if it is as expected. Please note that we check for existence of the files that are necessary to run KEGG scripts, but we don't check the file @@ -835,9 +950,6 @@ def kegg_archive_is_ok(self, unpacked_archive_path, no_modeling_is_ok = False): ========== unpacked_archive_path : str Path to the unpacked archive directory - no_modeling_is_ok : boolean - Whether or not we care if modeling data is not found in the archive. This is added for backwards compatibility to the - previous versions of KEGG archives that do not include this data. 
""" is_ok = True @@ -878,29 +990,6 @@ def kegg_archive_is_ok(self, unpacked_archive_path, no_modeling_is_ok = False): self.run.warning(f"The KEGG archive does not contain the following expected `hmmpress` output: " f"{path_to_expected_hmmpress_file}") - # check modeling files - # this section needs to be kept up to date with any changes to requirements in reactionnetwork.py - # which is a bit silly, but since these two classes don't know about each other it is the workaround we need :( - path_to_modeling_files_in_archive = os.path.join(path_to_kegg_in_archive, "KO_REACTION_NETWORK") - expected_modeling_files = reactionnetwork.KODatabase.expected_files - missing_modeling_files = [] - for f in expected_modeling_files: - path_to_f_in_archive = os.path.join(path_to_modeling_files_in_archive, f) - if not os.path.exists(path_to_f_in_archive): - is_ok = False or no_modeling_is_ok - missing_modeling_files.append(f) - if anvio.DEBUG: - self.run.warning(f"The KEGG archive does not contain the following expected modeling file: " - f"{path_to_f_in_archive}") - - if no_modeling_is_ok and missing_modeling_files: - self.run.warning("Modeling files are missing from the KEGG archive you have set up. However, somebody " - "upstream thinks this is okay. Likely you are setting up an early KEGG snapshot version " - "that doesn't contain this data. That's fine. But please keep in mind that you won't be " - "able to run metabolic modeling. If this is a problem, you should either pick a later " - "KEGG snapshot, or download the modeling data independently using the command " - "`anvi-setup-kegg-data --mode modeling`.") - return is_ok @@ -1214,10 +1303,10 @@ def download_pathways(self): "provide you with a legacy KEGG data archive that you can use to setup KEGG with the --kegg-archive flag." 
% (file_path, last_line)) - + def extract_data_field_from_kegg_file(self, file_path, target_field): """This function parses a KEGG file and returns the data value associated with the given target field. - + It can work on flat-text files obtained via the REST API (ie, self.kegg_rest_api_get). """ @@ -1225,7 +1314,7 @@ def extract_data_field_from_kegg_file(self, file_path, target_field): f = open(file_path, 'r') current_data_name = None - + for line in f.readlines(): line = line.strip('\n') @@ -1491,9 +1580,9 @@ def move_orphan_files(self): f"We have removed those HMM profiles from the final database. You can find them under the directory " f"'{self.orphan_data_dir}'.") - + def exec_hmmpress_command_on_ko_file(self, hmm_file_path, log_file_path): - """Given a path to a set of KO HMMs and a log file path, this function executes the appropriate + """Given a path to a set of KO HMMs and a log file path, this function executes the appropriate `hmmpress` command and deletes the log file afterwards if it was successful. """ @@ -1531,7 +1620,7 @@ def run_hmmpress(self): def download_ko_files(self, kos_to_download, destination_dir, dont_raise=True): """Multi-threaded download of KEGG Orthology files. - + Parameters ========== kos_to_download: list of str @@ -1588,13 +1677,13 @@ def download_ko_files(self, kos_to_download, destination_dir, dont_raise=True): f"{', '.join(undownloaded)}. Since the function responsible for handling this was " f"told to quit should this happen, well, here we are. If skipping these failed KOs " f"is okay, you could always run this function with `dont_raise=True`.") - + return undownloaded - + def download_kegg_genes_files(self, genes_to_download, destination_dir, dont_raise=True): """Multi-threaded download of KEGG GENES files. - + Parameters ========== genes_to_download: list of str @@ -1651,13 +1740,13 @@ def download_kegg_genes_files(self, genes_to_download, destination_dir, dont_rai f"{', '.join(undownloaded)}. 
Since the function responsible for handling this was " f"told to quit should this happen, well, here we are. If skipping these failed KOs " f"is okay, you could always run this function with `dont_raise=True`.") - + return undownloaded def get_kegg_gene_accessions_from_ko_files(self, ko_list, ko_file_dir): """Extracts KEGG GENES accessions from KO files and returns a dictionary mapping KO to its GENES. - + Parameters ========== ko_list: list of str @@ -1680,16 +1769,15 @@ def get_kegg_gene_accessions_from_ko_files(self, ko_list, ko_file_dir): for i, acc in enumerate(genes_acc_list): acc_fields = acc.split(": ") # example accession is "CTC: CTC_p60(tetX)" org_code = acc_fields[0].lower() # the organism code (before the colon) needs to be converted to lowercase - gene_name = acc_fields[1].split('(')[0] # the gene name (after the colon) needs to have anything in parentheses removed + gene_name = acc_fields[1] # sometimes we have multiple genes per organism, like this: "PSOM: 113322169 113340172" - if ' ' in gene_name: - all_genes = gene_name.split(' ') - for g in all_genes: - kegg_genes_code = f"{org_code}:{g}" - kegg_genes_code_list.append(kegg_genes_code) - else: - kegg_genes_code_list.append(f"{org_code}:{gene_name}") + all_genes = gene_name.split(' ') + for g in all_genes: + g = g.split('(')[0] # the gene name needs to have anything in parentheses removed. ex. CTC_p60(tetX) becomes CTC_p60 + kegg_genes_code = f"{org_code}:{g}" + kegg_genes_code_list.append(kegg_genes_code) + ko_to_genes[ko] = kegg_genes_code_list return ko_to_genes @@ -1697,7 +1785,7 @@ def get_kegg_gene_accessions_from_ko_files(self, ko_list, ko_file_dir): def kegg_gene_sequences_to_fasta_file(self, kegg_genes_files, target_fasta_file): """This function extracts the amino acid sequences for a list of KEGG GENES and prints them to a FASTA file. 
- + Parameters ========== kegg_genes_files : List of str @@ -1708,7 +1796,7 @@ def kegg_gene_sequences_to_fasta_file(self, kegg_genes_files, target_fasta_file) Returns ======= seq_tuples : List of tuples - Each sequence added to the FASTA file is also returned in this list, where each tuple contains + Each sequence added to the FASTA file is also returned in this list, where each tuple contains (KEGG GENES name, amino acid sequence). Note that the seq name is taken from the name of the KEGG GENES file. """ @@ -1731,13 +1819,13 @@ def kegg_gene_sequences_to_fasta_file(self, kegg_genes_files, target_fasta_file) def build_HMM_from_seqs(self, hmm_name, tuple_of_seqs, hmm_output_file, log_file_path): """This function aligns sequences and builds an HMM from them using `muscle` and `hmmbuild`. - + Parameters ========== hmm_name : str What to name the model (ie 'NAME' field in the .hmm file) tuple_of_seqs : List of (sequence name, sequence) tuples - The sequences to align with 'muscle' to create the `hmmbuild` input. + The sequences to align with 'muscle' to create the `hmmbuild` input. 
See anvio.drivers.muscle for example format hmm_output_file : str File path where to store the new HMM model @@ -1773,12 +1861,12 @@ def estimate_bitscore_for_ko(self, ko, kegg_genes_for_ko, kegg_genes_fasta, ko_m ko : str KEGG identifier for the KO kegg_genes_for_ko : list of str - List of KEGG GENE accessions that were used to generate the KO model (for sanity check and + List of KEGG GENE accessions that were used to generate the KO model (for sanity check and number of sequences) kegg_genes_fasta : str Path to FASTA file where the KEGG GENES sequences for this KO are stored ko_model_file : str - File path of the .hmm file containg the KO model (doesn't need to contain only this model, + File path of the .hmm file containg the KO model (doesn't need to contain only this model, but must be hmmpressed already) Returns ======= @@ -1791,33 +1879,33 @@ def estimate_bitscore_for_ko(self, ko, kegg_genes_for_ko, kegg_genes_fasta, ko_m self.run.warning(f"The function estimate_bitscore_for_ko() received an empty list of KEGG GENES " f"for {ko}, so it cannot estimate a bit score threshold. The function will return " f"a threshold of `None` for this KO.") - return None + return None # we run hmmscan of the KO against its associated GENES sequences and process the hits target_file_dict = {'AA:GENE': kegg_genes_fasta} hmmer = HMMer(target_file_dict, num_threads_to_use=self.num_threads, progress=progress_quiet, run=run_quiet) hmm_hits_file = hmmer.run_hmmer('KO {ko}', 'AA', 'GENE', None, None, len(kegg_genes_for_ko), ko_model_file, None, None) - + if not hmm_hits_file: raise ConfigError(f"No HMM hits were found for the KO model {ko}. 
This is seriously concerning, because we were running it against " f"gene sequences that were used to generate the model.") - + parser = parser_modules['search']['hmmer_table_output'](hmm_hits_file, alphabet='AA', context='GENE', run=run_quiet) search_results_dict = parser.get_search_results() - + # take the minimum of hits from current KO model as bit score threshold all_relevant_bitscores = [] for hit, hit_info_dict in search_results_dict.items(): if hit_info_dict['gene_name'] == ko or hit_info_dict['gene_name'] == f"{ko}{STRAY_KO_ANVIO_SUFFIX}": all_relevant_bitscores.append(hit_info_dict['bit_score']) - + threshold = min(all_relevant_bitscores) return threshold - - + + def process_all_stray_kos(self): """This driver function processes each stray KO and creates a file of bit score thresholds for them. - + The following steps are run for each stray KO: 1. download of its KO file 2. identification and download of the KEGG GENES sequences in this family @@ -1929,14 +2017,14 @@ def process_all_stray_kos(self): self.progress.update(f"Working on {k} [{cur_num} of {len(ko_files_to_process)}]") self.progress.increment(increment_to=cur_num) downloaded_genes_list = [a for a in ko_to_gene_accessions[k] if a in kegg_genes_downloaded] - threshold_dict[k] = self.estimate_bitscore_for_ko(k, kegg_genes_for_ko=downloaded_genes_list, - kegg_genes_fasta=os.path.join(self.stray_ko_seqs_dir, f"GENES_FOR_{k}.fa"), + threshold_dict[k] = self.estimate_bitscore_for_ko(k, kegg_genes_for_ko=downloaded_genes_list, + kegg_genes_fasta=os.path.join(self.stray_ko_seqs_dir, f"GENES_FOR_{k}.fa"), ko_model_file=self.stray_ko_hmm_file_path) cur_num += 1 self.progress.end() # we need to re-load the ko dictionary so that we have access to the definitions of the stray KOs - # cannot do this before this point because the absence of an stray KO from this dict controls whether it is moved to the + # cannot do this before this point because the absence of an stray KO from this dict controls whether it is 
moved to the # stray data directory (and we want to keep the strays separate since we process them specially) self.setup_ko_dict(exclude_threshold=(not self.include_stray_kos), suppress_warnings=True) @@ -1978,7 +2066,7 @@ def setup_kofams(self): if self.include_stray_kos: self.process_all_stray_kos() - + # there is no reason to keep the original HMM profiles around, unless we are debugging if not anvio.DEBUG: shutil.rmtree(os.path.join(self.kegg_data_dir, "profiles")) @@ -1986,7 +2074,9 @@ def setup_kofams(self): class ModulesDownload(KeggSetup): - """Class for setting up all KEGG data related to pathway prediction, namely KOfam profiles and KEGG MODULES. + """Class for setting up all KEGG data related to pathway prediction, namely KOfam profiles and KEGG MODULES; + reaction networks, which require MODULES, BRITE, and binary relation files; + and pathway map images and reference KO, EC, and RN KGML files. Parameters ========== @@ -2006,6 +2096,8 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): self.progress = progress self.skip_init = skip_init self.skip_brite_hierarchies = A('skip_brite_hierarchies') + self.skip_binary_relations = A('skip_binary_relations') + self.skip_map_images = A('skip_map_images') self.overwrite_modules_db = A('overwrite_output_destinations') self.run.info_single("Info from MODULES Download") @@ -2019,11 +2111,24 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): "does, this data will not be removed. You can always check if the resulting modules database contains BRITE data by " "running `anvi-db-info` on it and looking at the `is_brite_setup` value (which will be 1 if the database contains BRITE data).") + if (not self.download_from_kegg) and self.skip_binary_relations: + self.run.warning( + "Just so you know, the --skip-binary-relations flag does not do anything (besides " + "suppress some warning output) when used without the -D option. 
You are setting up " + "from an archived KEGG snapshot which may already include binary relation files, " + "and if it does, this data will not be removed. `anvi-reaction-network` depends on " + "these files and will let you know if they're missing." + ) + # download from KEGG option: module/pathway map htext files and API link self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" self.kegg_pathway_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=br08901.keg&format=htext&filedir=" + self.kegg_rest_api_get = "http://rest.kegg.jp/get" + self.kegg_binary_relations_download_path = "https://www.genome.jp/kegg-bin/show?file=" # download a json file containing all BRITE hierarchies, which can then be downloaded themselves self.kegg_brite_hierarchies_download_path = os.path.join(self.kegg_rest_api_get, "br:br08902/json") + # download the list of pathways, used for processing map image files + self.kegg_pathway_list_download_path = "https://rest.kegg.jp/list/pathway" # check if the data is already downloaded expected_files_for_modules = [self.kegg_module_file, @@ -2031,6 +2136,10 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): if not self.skip_brite_hierarchies: expected_files_for_modules.append(self.kegg_brite_hierarchies_file) expected_files_for_modules.append(self.brite_data_dir) + if not self.skip_binary_relations: + expected_files_for_modules.append(self.binary_relation_data_dir) + if not self.skip_map_images: + expected_files_for_modules.append(self.map_image_data_dir) if not args.reset and not anvio.DEBUG and not self.skip_init: self.is_database_exists(expected_files_for_modules, fail_if_exists=(not self.only_processing)) @@ -2040,7 +2149,39 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): filesnpaths.gen_output_directory(self.kegg_module_data_dir, delete_if_exists=args.reset) if not self.skip_brite_hierarchies: 
filesnpaths.gen_output_directory(self.brite_data_dir, delete_if_exists=args.reset) - + if not self.skip_binary_relations: + filesnpaths.gen_output_directory( + self.binary_relation_data_dir, delete_if_exists=args.reset + ) + if not self.skip_map_images: + filesnpaths.gen_output_directory( + self.map_image_data_dir, delete_if_exists=args.reset + ) + # Create subdirectories of the map image directory. + for subdir in ( + self.map_image_data_dir, + self.png_dir, + self.kgml_dir, + self.png_1x_dir, + self.png_2x_dir, + self.png_1x_map_dir, + self.png_1x_ko_dir, + self.png_1x_ec_dir, + self.png_1x_rn_dir, + self.png_1x_org_dir, + self.png_2x_map_dir, + self.kgml_1x_dir, + self.kgml_2x_dir, + self.kgml_1x_ko_dir, + self.kgml_1x_ec_dir, + self.kgml_1x_rn_dir, + self.kgml_1x_org_dir, + self.kgml_2x_ko_dir, + self.kgml_2x_ec_dir, + self.kgml_2x_rn_dir, + self.kgml_2x_org_dir + ): + filesnpaths.gen_output_directory(subdir) def download_kegg_module_file(self): """This function downloads the KEGG module file, which tells us which module files to download.""" @@ -2167,12 +2308,13 @@ def download_modules(self): for worker in workers: worker.terminate() + self.progress.end() + if undownloaded: raise ConfigError( "Unfortunately, files for the following modules failed to download despite multiple attempts, " f"and so the database needs to be set up again: {', '.join(undownloaded)}" ) - self.progress.end() def confirm_downloaded_modules(self): @@ -2207,7 +2349,7 @@ def confirm_downloaded_modules(self): def setup_modules_data(self): - """This is a driver function which executes the setup process for pathway prediction data from KEGG.""" + """This is a driver function which executes the setup process for pathway prediction and reaction network data from KEGG.""" # FIXME: we will have to move user setup to a completely separate program at some point # PS. 
user setup related functions belong to the superclass for now @@ -2227,14 +2369,25 @@ def setup_modules_data(self): self.process_brite_hierarchy_of_hierarchies() # get brite dict attribute self.download_brite_hierarchies() self.confirm_downloaded_brite_hierarchies() + + if not self.skip_binary_relations: + self.download_binary_relations() + self.confirm_downloaded_binary_relations() + + if not self.skip_map_images: + self.download_map_images() else: # get required attributes for database setup and make sure all expected files were downloaded self.process_module_file() self.confirm_downloaded_modules() + if not self.skip_brite_hierarchies: self.process_brite_hierarchy_of_hierarchies() self.confirm_downloaded_brite_hierarchies() + if not self.skip_binary_relations: + self.confirm_downloaded_binary_relations() + # process the modules file into a database if not self.only_download: self.setup_modules_db(db_path=self.kegg_modules_db_path, module_data_directory=self.kegg_module_data_dir, brite_data_directory=self.brite_data_dir, skip_brite_hierarchies=self.skip_brite_hierarchies) @@ -2410,6 +2563,333 @@ def confirm_downloaded_brite_hierarchies(self): self.run.info("Number of BRITE hierarchy files found", len(self.brite_dict)) + ###### Binary relations-related functions below ###### + def download_binary_relations(self): + """ + Download binary relations files relating the accession of a type of KEGG data, such as KOs, + to related accessions of another type of data, such as EC numbers. + """ + for file in self.kegg_binary_relation_files.values(): + url = f'{self.kegg_binary_relations_download_path}{file}' + dest = os.path.join(self.binary_relation_data_dir, file) + try: + utils.download_file(url, dest, progress=self.progress, run=self.run) + except Exception as e: + print(e) + raise ConfigError( + f"Anvi'o failed to download the KEGG binary relations file, '{file}', from the " + "KEGG website. Something likely changed on the KEGG end. 
Please contact the " + "developers to see if this is a fixable issue. If it isn't, we may be able to " + "provide you with a legacy KEGG data archive that you can use to set up KEGG " + "with the --kegg-archive flag." + ) + + + def confirm_downloaded_binary_relations(self): + """Verify that all expected binary relations files were downloaded.""" + missing_files = [] + for file in self.kegg_binary_relation_files.values(): + path = os.path.join(self.binary_relation_data_dir, file) + if not os.path.exists(path): + missing_files.append(file) + if missing_files: + raise ConfigError( + "The following binary relation files were not found in the expected directory, " + f"'{self.binary_relation_data_dir}', so the KEGG data should be re-downloaded: " + f"{', '.join(missing_files)}" + ) + self.run.info( + "Number of KEGG binary relations files found", len(self.kegg_binary_relation_files) + ) + + + ###### Pathway map image-related functions below ###### + def download_map_images( + self, + add_global_reaction_line_width: Union[float, None] = 6.0, + global_compound_circle_diameter: Union[float, None] = 17.0 + ) -> None: + """ + Download reference pathway map image files and associated KGML files. + + Only download maps with at least one reference KGML file, since the purpose is to be able to + modify maps with data, and KGML files are required to customize maps. Write a table + indicating which KO, EC, and RN KGML files are available for every map available in KEGG, + including those not downloaded due to an absence of KGML files. + + Different sets of "global" and non-global "standard" and "overview" map images are + downloaded. The following global map images are downloaded: 1x and 2x resolution images with + filenames starting "map", and 1x images starting "ko", "ec", and "rn". 
The "ko", "ec", and + "rn" global maps color reactions with accessions in each of the KEGG KO, EC, and RN + databases, respectively, and the "map" global maps color reactions with accessions in any of + these databases. Non-global 1x and 2x resolution map images starting with "map" are + downloaded. KGML files, which are tailored to the position of features in 1x maps, are + copied to rescale features to match 2x image files. + + Parameters + ========== + add_global_reaction_line_width : Union[float, None], 6.0 + If not None, modify downloaded global map KGML files to add a width attribute to + reaction line graphics elements. The default value of 6 (in the 1x resolution maps, 12 + in the 2x resolution maps) is just wide enough for the lines drawn from the KGML file to + cover up the lines in the base map image. + + global_compound_circle_diameter : Union[float, None], 17.0 + If not None, modify downloaded global map KGML files to adjust the size of compound + circle graphics elements. The argument value is used as the width and height attributes + in 1x resolution maps, with twice the value used in 2x resolution maps. The default + value of 17 is just wide enough for the circles rendered from the KGML file to cover up + the circles in the base map image. + """ + # Download a table from KEGG listing all available pathways. + try: + utils.download_file( + self.kegg_pathway_list_download_path, + self.kegg_pathway_list_file, + progress=self.progress, + run=self.run + ) + except Exception as e: + print(e) + raise ConfigError( + "Anvi'o failed to download a list of pathways from the KEGG website. Something " + "likely changed on the KEGG end. Please contact the developers to see if this is a " + "fixable issue." 
+ ) + pathway_table = pd.read_csv( + self.kegg_pathway_list_file, sep='\t', header=None, names=['id', 'name'] + ) + + # Determine the maximum number of map image files that may be downloaded (image files are + # only downloaded if a corresponding KGML file is available). 5 versions of each global map + # are downloaded: 1x and 2x "map" files and 1x "ko", "ec", and "rn" files. 2 versions of + # each non-global map may be downloaded: 1x and 2x "map" files. + global_map_count = sum( + 1 if re.match(GLOBAL_MAP_ID_PATTERN, pathway_id[-5:]) else 0 + for pathway_id in pathway_table['id'] + ) + nonglobal_map_count = len(pathway_table) - global_map_count + total_dl_count = global_map_count * 5 + nonglobal_map_count * 2 + self.run.info_single( + f"Up to {total_dl_count} map images will be downloaded. \"Up to\" because only maps " + f"found to have associated reference KGML files are downloaded. {self.num_threads} " + "cores (threads) will be used in downloading.", + nl_before=1 + ) + + # Start the worker threads for downloading map image and KGML files. + self.progress.new("Downloading KEGG pathway map files") + self.progress.update("0 pathway maps downloaded") + manager = mp.Manager() + input_queue = manager.Queue() + output_queue = manager.Queue() + for pathway_id in pathway_table['id']: + input_queue.put({ + 'pathway_number': pathway_id[3:], + 'url_stem': self.kegg_rest_api_get, + 'data_dir': self.map_image_data_dir + }) + workers: List[mp.Process] = [] + for _ in range(self.num_threads): + worker = mp.Process( + target=_download_pathway_image_files_worker, args=(input_queue, output_queue) + ) + workers.append(worker) + worker.start() + + # Process the output of download threads. The threads should return items equal to the + # maximum number of image files that may be downloaded. Wait for threads until this number + # of items is reached. 
+ successful_dls: List[str] = [] + failed_dls: List[str] = [] + # Record the paths of KGML files that need to be rescaled to fit 2x resolution images. + kgml_paths: List[str] = [] + # Record which of types of KGML files ('KO', 'EC', 'RN') are available for each downloaded + # pathway map image. + kgml_availability: Dict[str, Dict[str, int]] = {} + processed_count = 0 + while processed_count < total_dl_count: + # For each pathway, a dictionary is returned with keys indicating each type of possible + # map image and KGML file that can be downloaded, and length-2 list values containing + # 1) the possible filepath and 2) an integer value indicating the success or failure + # type of the download. + output: Dict[str, List[str, int]] = output_queue.get() + pathway_id = os.path.splitext(os.path.basename(output['png_1x_map'][0]))[0] + image_keys = ['png_1x_map', 'png_2x_map'] + if re.match(GLOBAL_MAP_ID_PATTERN, pathway_id[-5:]): + image_keys += ['png_1x_ko', 'png_1x_ec', 'png_1x_rn'] + for image_key in image_keys: + if output[image_key][1] == 0: + # This occurs when there were connection errors preventing any KGML files from + # being downloaded, so PNG file downloads were not attempted. + failed_dls.append(output[image_key][0]) + elif output[image_key][1] == 1: + successful_dls.append(output[image_key][0]) + self.progress.update(f"{len(successful_dls)} pathway maps downloaded") + elif output[image_key][1] == 2: + # This indicates that the PNG file was unavailable for download. It should have + # been available given KEGG's pathway list. + failed_dls.append(output[image_key][0]) + elif output[image_key][1] == 3: + # This occurs when connection errors prevented the PNG file from being + # downloaded. + failed_dls.append(output[image_key][0]) + elif output[image_key][1] == 4: + # This indicates that the program did not attempt to download the PNG file + # because there is no KGML file available, e.g., drug maps have no KO, EC, and + # RN KGML files available. 
+ pass + # Record KGML files associated with 2x resolution images. These need to be rescaled. + if image_key == 'png_2x_map': + for kgml_key in ('kgml_ko', 'kgml_ec', 'kgml_rn'): + if output[kgml_key][1] == 1: + kgml_paths.append(output[kgml_key][0]) + processed_count += 1 + # Record data that goes into the table of KGML availability for each pathway. + kgml_availability[pathway_id] = pathway_kgml_availability = {} + for pathway_org in ('ko', 'ec', 'rn'): + if output[f'kgml_{pathway_org}'][1] == 1: + pathway_kgml_availability[pathway_org.upper()] = 1 + else: + pathway_kgml_availability[pathway_org.upper()] = 0 + + # Downloading is complete. Kill the worker threads. + for worker in workers: + worker.terminate() + self.progress.end() + + # Raise an exception when expected files failed to download. Report the failed files by + # pathway ID. + if failed_dls: + failed_dl_groups: Dict[str, List[str]] = {} + for failed_dl in failed_dls: + failed_filename = os.path.basename(failed_dl) + pathway_number = os.path.splitext(failed_filename)[0][3:] + try: + failed_dl_groups[pathway_number].append(failed_filename) + except KeyError: + failed_dl_groups[pathway_number] = [failed_filename] + failed_message = '' + for pathway_number, failed_filenames in failed_dl_groups.items(): + failed_message += f"map{pathway_number}: {', '.join(failed_filenames)}; " + failed_message = failed_message[:-2] + raise ConfigError( + "Unfortunately, files (in parentheses) for the following pathway maps failed to " + "download despite multiple attempts, and so the database needs to be set up again: " + f"{failed_message}" + ) + self.run.info("Number of downloaded map images", len(successful_dls)) + + # Add reaction line widths to global map KGML files. + if add_global_reaction_line_width is not None: + self._add_global_kgml_reaction_line_widths(add_global_reaction_line_width) + + # Rescale compound circles in global map KGML files. 
def _add_global_kgml_reaction_line_widths(self, width: float) -> None:
    """
    Give reaction lines an explicit width in the KGML files of global maps.

    Every XML file in the 1x KO, EC, and RN KGML directories is considered. A file is modified
    only if the last five characters of its filename stem match GLOBAL_MAP_ID_PATTERN. Matching
    files are loaded, changed, and written back to the same path. Newly downloaded files do not
    contain width attributes for these elements.

    The entry type that is modified depends on the directory: 'ortholog' entries in KO files,
    'enzyme' entries in EC files, and 'reaction' entries in RN files.

    Parameters
    ==========
    width : float
        Width value assigned to every graphics element of the affected entries. Must be
        positive.
    """
    assert width > 0

    # anvio.kgml is imported here rather than at module level to avoid a circular import.
    import anvio.kgml as kgml
    kgml_io = kgml.XMLOps()

    # Each KGML directory is paired with the entry type that represents reactions in it.
    dirs_and_entry_types = (
        (self.kgml_1x_ko_dir, 'ortholog'),
        (self.kgml_1x_ec_dir, 'enzyme'),
        (self.kgml_1x_rn_dir, 'reaction'),
    )
    for source_dir, reaction_entry_type in dirs_and_entry_types:
        for xml_path in glob.glob(os.path.join(source_dir, '*.xml')):
            file_stem = os.path.splitext(os.path.basename(xml_path))[0]
            if not re.match(GLOBAL_MAP_ID_PATTERN, file_stem[-5:]):
                # Not a global map: leave the file untouched.
                continue

            pathway = kgml_io.load(xml_path)
            for entry in pathway.get_entries(entry_type=reaction_entry_type):
                for graphics_uuid in entry.children['graphics']:
                    pathway.uuid_element_lookup[graphics_uuid].width = width
            # Overwrite the file in place with the widened lines.
            kgml_io.write(pathway, xml_path)
def _change_global_kgml_compound_circle_diameters(self, diameter: float) -> None:
    """
    Resize compound circles in the KGML files of global maps.

    Every XML file in the 1x KO, EC, and RN KGML directories is considered. A file is modified
    only if the last five characters of its filename stem match GLOBAL_MAP_ID_PATTERN. Matching
    files are loaded, changed, and written back to the same path. The purpose is for circles
    rendered from the KGML file to fully cover the circles in the base map image.

    Parameters
    ==========
    diameter : float
        New diameter of compound circles. It replaces the width and the height of each
        compound graphics element that already has such a value; attributes that are None are
        left as None. Must be positive.
    """
    assert diameter > 0

    # anvio.kgml is imported here rather than at module level to avoid a circular import.
    import anvio.kgml as kgml
    kgml_io = kgml.XMLOps()

    for source_dir in (self.kgml_1x_ko_dir, self.kgml_1x_ec_dir, self.kgml_1x_rn_dir):
        for xml_path in glob.glob(os.path.join(source_dir, '*.xml')):
            file_stem = os.path.splitext(os.path.basename(xml_path))[0]
            if not re.match(GLOBAL_MAP_ID_PATTERN, file_stem[-5:]):
                # Not a global map: leave the file untouched.
                continue

            pathway = kgml_io.load(xml_path)
            for entry in pathway.get_entries(entry_type='compound'):
                for graphics_uuid in entry.children['graphics']:
                    graphics: kgml.Graphics = pathway.uuid_element_lookup[graphics_uuid]
                    # Read both dimensions before changing either one.
                    current_width, current_height = graphics.width, graphics.height
                    if current_width is not None:
                        graphics.width = diameter
                    if current_height is not None:
                        graphics.height = diameter
            # Overwrite the file in place with the resized circles.
            kgml_io.write(pathway, xml_path)
@@ -2475,8 +2955,8 @@ def __init__(self, args, run=run, progress=progress): if not self.skip_brite_hierarchies and not self.kegg_modules_db.db.get_meta_value('is_brite_setup'): self.run.warning("The KEGG Modules database does not contain BRITE hierarchy data, " - "which could very well be useful to you. BRITE is guaranteed to be set up " - "when downloading the latest version of KEGG with `anvi-setup-kegg-data`.") + "which could very well be useful to you. BRITE is guaranteed to be set up " + "when downloading the latest version of KEGG with `anvi-setup-kegg-data`.") else: self.run.warning("No modules database was found in the KEGG data directory you specified. This is fine, but " "you will not get functional annotations related to KEGG MODULES or BRITE hierarchies in your " @@ -2734,7 +3214,7 @@ def update_dict_for_genes_with_missing_annotations(self, gcids_list, super_hits_ The list of gene caller ids in the contigs database. We will use this to figure out which genes have no annotations super_hits_dict : dictionary - A two-level dictionary in which keys are the labels for each set of hits and values are the dictionary output + A two-level dictionary in which keys are the labels for each set of hits and values are the dictionary output from the hmmsearch parser, which should contain all hits from the set (ie, weak hits not yet removed) next_key : int The next integer key that is available for adding functions to self.functions_dict @@ -3005,7 +3485,7 @@ def process_kofam_hmms(self): if not self.skip_bitscore_heuristic: self.update_dict_for_genes_with_missing_annotations(all_gcids_in_contigs_db, super_hits_dict, next_key=next_key_in_functions_dict) - + # add functions and KEGG modules info to database self.store_annotations_in_db() @@ -3222,7 +3702,7 @@ def init_data_from_modules_db(self): # if we have our own versions of any stray KOs, then we include them here to enable lookups downstream k_anvio = f"{k}{STRAY_KO_ANVIO_SUFFIX}" - if self.include_stray_kos and 
k_anvio in self.ko_dict: + if self.include_stray_kos and (k_anvio in self.ko_dict or (self.stray_ko_dict and k_anvio in self.stray_ko_dict)): if k_anvio not in self.all_kos_in_db: src = 'KOfam' func = self.all_modules_in_db[mod]['ORTHOLOGY'][k] if 'ORTHOLOGY' in self.all_modules_in_db[mod] else self.ko_dict[k_anvio]['definition'] @@ -3468,7 +3948,7 @@ def get_ko_metadata_dictionary(self, knum, dont_fail_if_not_found=False): raise ConfigError("Something is mysteriously wrong. You are seeking metadata " f"for enzyme {knum} but this enzyme is not in the enzyme dictionary " "(self.ko_dict, or (self.stray_ko_dict) in some cases). This should never have happened.") - + return metadata_dict @@ -3702,7 +4182,7 @@ def __init__(self, args, run=run, progress=progress): estimation_mode = "List of enzymes" elif self.pan_db_path: estimation_mode = "Gene cluster bins in a pangenome" - + self.run.info('Mode (what we are estimating metabolism for)', estimation_mode, quiet=self.quiet) # a warning for high memory usage with metagenome mode in certain situations @@ -3733,7 +4213,7 @@ def __init__(self, args, run=run, progress=progress): # (henceforth referred to as the KO dict, even though it doesn't only contain KOs for user data) self.setup_ko_dict(exclude_threshold=self.exclude_kos_no_threshold) if self.include_stray_kos: - self.setup_stray_ko_dict(add_entries_to_regular_ko_dict=True) + self.setup_stray_ko_dict(add_entries_to_regular_ko_dict=False) annotation_source_set = set(['KOfam']) # check for kegg modules db @@ -5152,8 +5632,8 @@ def get_step_copy_number(self, step_string, enzyme_hit_counts): operations, replacing commas with + operations, and replacing enzyme accessions with their corresponding hit counts; then returning the value obtained by evaluating the resulting arithmetic expression. - Some steps are defined by other modules. 
When module accessions are found, we initially treat them as having a copy number of 0, but - we re-compute the copy number of the module later once we have the overall copy number of all other modules (and then we use the + Some steps are defined by other modules. When module accessions are found, we initially treat them as having a copy number of 0, but + we re-compute the copy number of the module later once we have the overall copy number of all other modules (and then we use the component module's copy number in the calculation instead). PARAMETERS @@ -5285,21 +5765,21 @@ def get_step_copy_number(self, step_string, enzyme_hit_counts): return 0 return enzyme_hit_counts[step_string] - + def are_enzymes_indirect_alternatives_within_step(self, enzyme_list: list, step: str): - """An overly simplistic function to determine whether the relationship between the provided alternative + """An overly simplistic function to determine whether the relationship between the provided alternative enzymes in the given step is indirect. - - To do this, it simply walks through the step definition string to determine whether each pair of enzymes is separated by + + To do this, it simply walks through the step definition string to determine whether each pair of enzymes is separated by a character symbolizing a more complex relationship. That is, they are not separated only by commas and other enzymes (which indicates a direct relationship, as in the two enzymes are synonymous in the context of the metabolic pathway). - For example, within the step (((K01657+K01658,K13503,K13501,K01656) K00766),K13497), the direct alternatives include - K13503, K13501, and K01656. 
K01657 and K01658 are indirect alternatives to each other because they are two - components of the same enzyme, while K01658 and K00766 are indirect because they catalyze two separate reactions in + For example, within the step (((K01657+K01658,K13503,K13501,K01656) K00766),K13497), the direct alternatives include + K13503, K13501, and K01656. K01657 and K01658 are indirect alternatives to each other because they are two + components of the same enzyme, while K01658 and K00766 are indirect because they catalyze two separate reactions in an alternative branch of the step. - This algorithm is not perfect at identifying all indirect relationships - for instance, given K01658 and K13503 it will + This algorithm is not perfect at identifying all indirect relationships - for instance, given K01658 and K13503 it will wrongly suggest they are direct alternatives. However, it is meant to be used only for identifying putative edge cases for the `get_dereplicated_enzyme_hits_for_step_in_module()` function, and it works well enough for that. @@ -5316,10 +5796,10 @@ def are_enzymes_indirect_alternatives_within_step(self, enzyme_list: list, step: True if the list of provided enzymes contains those that are indirect alternatives within the given step. 
""" - enzyme_data = {e : {'index': step.index(e), - 'direct_alts': [], + enzyme_data = {e : {'index': step.index(e), + 'direct_alts': [], 'indirect_alts': []} for e in enzyme_list} - + contains_indirect = False # get enzyme-specific list of alternatives for e in enzyme_list: @@ -5328,12 +5808,12 @@ def are_enzymes_indirect_alternatives_within_step(self, enzyme_list: list, step: e_index = enzyme_data[e]['index'] z_index = enzyme_data[z]['index'] indirect_alternatives = False - + # indirect alts have a space, parentheses, or plus/minus sign between them for c in step[min(e_index, z_index):max(e_index, z_index)]: if c in [' ', '(', ')', '+', '-']: indirect_alternatives = True - + if indirect_alternatives: enzyme_data[e]['indirect_alts'].append(z) contains_indirect = True @@ -5342,14 +5822,14 @@ def are_enzymes_indirect_alternatives_within_step(self, enzyme_list: list, step: return contains_indirect - + def get_dereplicated_enzyme_hits_for_step_in_module(self, meta_dict_for_mnum: dict, step_to_focus_on: str, mnum: str): - """This function returns a dictionary of enzyme accessions matched to the number of hits, with duplicate hits to the + """This function returns a dictionary of enzyme accessions matched to the number of hits, with duplicate hits to the same gene removed, for the provided step in a metabolic pathway. - Depreplicating the gene calls is necessary because the same gene can be annotated with multiple alternative enzymes for the + Depreplicating the gene calls is necessary because the same gene can be annotated with multiple alternative enzymes for the same reaction, and we don't want these annotations to be double-counted in the stepwise copy number calculation. - + PARAMETERS ========== meta_dict_for_mnum : dictionary of dictionaries @@ -5358,7 +5838,7 @@ def get_dereplicated_enzyme_hits_for_step_in_module(self, meta_dict_for_mnum: di which step in the module to resolve alternative enzymes for, passed as a definition string for the step. 
mnum : string module ID (used only for warning output) - + RETURNS ======= derep_enzyme_hits : dictionary @@ -5381,7 +5861,7 @@ def get_dereplicated_enzyme_hits_for_step_in_module(self, meta_dict_for_mnum: di # and for all other annotations, we reduce the count of hits by one for acc in enzymes[1:]: derep_enzyme_hits[acc] -= 1 - + if self.are_enzymes_indirect_alternatives_within_step(enzymes, step_to_focus_on) and self.add_copy_number: enz_str = ", ".join(enzymes) self.run.warning(f"The gene call {gcid} has multiple annotations to alternative enzymes " @@ -5591,7 +6071,7 @@ def estimate_for_contigs_db_for_metagenome(self, kofam_gene_split_contig, return RETURNS ======= metagenome_metabolism_superdict : dictionary of dictionary of dictionaries - dictionary mapping metagenome name to its metabolism completeness dictionary + dictionary mapping metagenome name to its metabolism completeness dictionary (will be empty dictionary if return_superdicts is False) metagenome_ko_superdict : dictionary of dictionary of dictionaries dictionary mapping metagenome name to its KOfam hits dictionary @@ -5840,7 +6320,7 @@ def estimate_metabolism_from_enzymes_txt(self): def init_hits_for_pangenome(self, gene_cluster_list: list): """This function loads enzyme annotations from the pangenome for use by downstream metabolism estimation. - + For each gene cluster, it takes the most common function from each annotation source relevant to the modules. 
PARAMETERS @@ -5853,7 +6333,7 @@ def init_hits_for_pangenome(self, gene_cluster_list: list): enzyme_cluster_split_contig : list (enzyme_accession, gene_cluster_id, split, contig) tuples in which split and contig are both NAs """ - + pan_super = PanSuperclass(self.args) pan_super.init_gene_clusters(gene_cluster_ids_to_focus = gene_cluster_list) pan_super.init_gene_clusters_functions_summary_dict(source_list = self.annotation_sources_to_use, gene_clusters_of_interest = gene_cluster_list) @@ -5873,7 +6353,7 @@ def init_hits_for_pangenome(self, gene_cluster_list: list): def estimate_metabolism_for_pangenome_bins(self, enzyme_cluster_split_contig, cluster_collection): """Estimates metabolism individually on each bin in a pangenome. - + PARAMETERS ========== enzyme_cluster_split_contig : list @@ -5882,7 +6362,7 @@ def estimate_metabolism_for_pangenome_bins(self, enzyme_cluster_split_contig, cl cluster_collection : dictionary maps bin names in the collection to the list of gene clusters in each bin """ - + gc_bins_metabolism_superdict = {} gc_bins_ko_superdict = {} num_bins = len(cluster_collection) @@ -5913,7 +6393,7 @@ def estimate_metabolism_for_pangenome_bins(self, enzyme_cluster_split_contig, cl self.progress.end() - return gc_bins_metabolism_superdict, gc_bins_ko_superdict + return gc_bins_metabolism_superdict, gc_bins_ko_superdict def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=None, return_superdicts=False, @@ -9564,3 +10044,281 @@ def module_definition_to_enzyme_accessions(mod_definition): acc_list = re.split(r'\s+', mod_definition) return acc_list + + +def _download_worker( + input_queue: mp.Queue, + output_queue: mp.Queue, + max_num_tries: int = 100, + wait_secs: float = 10.0 +) -> None: + """ + Multiprocessing worker to download files from a queue. + + Parameters + ========== + input_queue : multiprocessing.Queue + Queue of length-two iterables of the URL and local path for each file to download. 
def download_org_pathway_image_files(
    pathway_name: str,
    data_dir: str,
    kegg_rest_api_get: str = 'http://rest.kegg.jp/get'
) -> Tuple[str, str]:
    """
    Download the map image and the KGML file of an organism-specific pathway.

    Parameters
    ==========
    pathway_name : str
        Pathway ID made of two parts: 3 leading characters specific to the organism, such as
        'eco' for E. coli, followed by 5 digits identifying the pathway, such as '00010'.

    data_dir : str
        Path to the KEGG data directory set up by anvi'o. The files are written to its
        'png/1x/org' and 'kgml/1x/org' subdirectories.

    kegg_rest_api_get : str, 'http://rest.kegg.jp/get'
        KEGG API URL for downloading files.

    Returns
    =======
    Tuple[str, str]
        Filepaths of the downloaded PNG image and KGML XML file, in that order.
    """
    base_url = f'{kegg_rest_api_get}/{pathway_name}'
    png_path = os.path.join(data_dir, 'png', '1x', 'org', f'{pathway_name}.png')
    kgml_path = os.path.join(data_dir, 'kgml', '1x', 'org', f'{pathway_name}.xml')

    # The image is fetched first, then the KGML file.
    for file_url, file_path in (
        (f'{base_url}/image', png_path),
        (f'{base_url}/kgml', kgml_path),
    ):
        utils.download_file(file_url, file_path)

    return (png_path, kgml_path)
def _download_file_with_retries(
    url: str,
    path: str,
    max_num_tries: int,
    wait_secs: float
) -> int:
    """
    Download one file, retrying after failures, and return a status code.

    Parameters
    ==========
    url : str
        URL of the file to download.

    path : str
        Local path the file is downloaded to.

    max_num_tries : int
        Number of failed attempts tolerated. The download is abandoned once the number of
        failures exceeds this value.

    wait_secs : float
        Number of seconds to wait after a failed attempt before trying again.

    Returns
    =======
    int
        1 if the file was downloaded. 2 if a ConfigError mentioning 'HTTP Error 404' was raised,
        which is treated as the file being unavailable and is not retried. 3 if the download was
        abandoned after too many failures.
    """
    num_tries = 0
    while True:
        try:
            utils.download_file(url, path)
            return 1
        except ConnectionResetError:
            pass
        except ConfigError as e:
            if 'HTTP Error 404' in str(e):
                return 2
        # Any other failure counts as an attempt that may be retried.
        num_tries += 1
        if num_tries > max_num_tries:
            return 3
        time.sleep(wait_secs)


def _download_pathway_image_files_worker(
    input_queue: mp.Queue,
    output_queue: mp.Queue,
    max_num_tries: int = 100,
    wait_secs: float = 10.0
) -> None:
    """
    Multiprocessing worker to download pathway maps and associated KGML files given a pathway ID.

    The worker loops forever, blocking on the input queue between jobs. It never returns on its
    own, so the parent process is expected to stop it. Exactly one item is put in the output
    queue per item taken from the input queue.

    Parameters
    ==========
    input_queue : multiprocessing.Queue
        Queue of input dictionaries with string values under the keys 'pathway_number' (the
        pathway ID number, without the 'map', 'ko', 'ec', or 'rn' prefix), 'url_stem' (URL to
        which '/<pathway ID>/<file type>' is appended), and 'data_dir' (data directory).

        The data directory must contain subdirectories 'png' and 'kgml', within each of which
        are subdirectories '1x' and '2x'. Within 'png/1x' are 5 directories, 'map', 'ko', 'ec',
        'rn', and 'org'. Within 'png/2x' is one directory, 'map'. Within 'kgml/1x' and 'kgml/2x'
        are 4 directories, 'ko', 'ec', 'rn', and 'org'.

    output_queue : multiprocessing.Queue
        Queue of output dictionaries with keys 'png_1x_map', 'png_2x_map', 'png_1x_ko',
        'png_1x_ec', 'png_1x_rn', 'kgml_ko', 'kgml_ec', and 'kgml_rn'. Each value is a length-2
        list of 1) the target download filepath and 2) an integer status:

        0: No attempt was made because none was needed, e.g., 'ko', 'ec', and 'rn' map images
           are not downloaded for non-global maps, and no map image is downloaded if a
           connection error prevented a KGML file download.
        1: The file downloaded successfully.
        2: The file was unavailable for download, e.g., there is no RN KGML file for the
           pathway.
        3: Connection errors prevented the download.
        4: No attempt was made because a requisite file is unavailable, e.g., a map image is
           not downloaded if it has no reference KGML file associated with it.

    max_num_tries : int, 100
        Number of failed attempts tolerated per file. A download is abandoned once the number
        of failures exceeds this value.

    wait_secs : float, 10.0
        Number of seconds to wait after a failed attempt before trying again.

    Returns
    =======
    None
    """
    while True:
        job = input_queue.get()
        pathway_number: str = job['pathway_number']
        url: str = job['url_stem']
        data_dir: str = job['data_dir']

        png_1x_map_url = f'{url}/map{pathway_number}/image'
        png_2x_map_url = f'{url}/map{pathway_number}/image2x'
        png_1x_ko_url = f'{url}/ko{pathway_number}/image'
        png_1x_ec_url = f'{url}/ec{pathway_number}/image'
        png_1x_rn_url = f'{url}/rn{pathway_number}/image'
        kgml_ko_url = f'{url}/ko{pathway_number}/kgml'
        kgml_ec_url = f'{url}/ec{pathway_number}/kgml'
        kgml_rn_url = f'{url}/rn{pathway_number}/kgml'

        png_1x_map_path = os.path.join(data_dir, 'png', '1x', 'map', f'map{pathway_number}.png')
        png_2x_map_path = os.path.join(data_dir, 'png', '2x', 'map', f'map{pathway_number}.png')
        png_1x_ko_path = os.path.join(data_dir, 'png', '1x', 'ko', f'ko{pathway_number}.png')
        png_1x_ec_path = os.path.join(data_dir, 'png', '1x', 'ec', f'ec{pathway_number}.png')
        png_1x_rn_path = os.path.join(data_dir, 'png', '1x', 'rn', f'rn{pathway_number}.png')
        kgml_ko_path = os.path.join(data_dir, 'kgml', '1x', 'ko', f'ko{pathway_number}.xml')
        kgml_ec_path = os.path.join(data_dir, 'kgml', '1x', 'ec', f'ec{pathway_number}.xml')
        kgml_rn_path = os.path.join(data_dir, 'kgml', '1x', 'rn', f'rn{pathway_number}.xml')

        # Each value is [target filepath, integer status]. Every status starts at 0.
        output: Dict[str, list] = {
            'png_1x_map': [png_1x_map_path, 0],
            'png_2x_map': [png_2x_map_path, 0],
            'png_1x_ko': [png_1x_ko_path, 0],
            'png_1x_ec': [png_1x_ec_path, 0],
            'png_1x_rn': [png_1x_rn_path, 0],
            'kgml_ko': [kgml_ko_path, 0],
            'kgml_ec': [kgml_ec_path, 0],
            'kgml_rn': [kgml_rn_path, 0]
        }

        is_global_map = bool(re.match(GLOBAL_MAP_ID_PATTERN, pathway_number))

        # First try to download KGML files for the pathway. Map images are only downloaded if
        # there is at least 1 KGML file associated with the pathway.
        for key, kgml_url, kgml_path in (
            ('kgml_ko', kgml_ko_url, kgml_ko_path),
            ('kgml_ec', kgml_ec_url, kgml_ec_path),
            ('kgml_rn', kgml_rn_url, kgml_rn_path)
        ):
            output[key][1] = _download_file_with_retries(
                kgml_url, kgml_path, max_num_tries, wait_secs
            )
        kgml_statuses = [output[key][1] for key in ('kgml_ko', 'kgml_ec', 'kgml_rn')]

        if 3 in kgml_statuses:
            # Connection errors prevented at least 1 of the KO, EC, or RN KGML files from being
            # downloaded, so it remains unknown if these files are actually available for the
            # pathway map. Image statuses are left at 0.
            output_queue.put(output)
            continue

        if all(status == 2 for status in kgml_statuses):
            # No KO, EC, and RN KGML files are available for the pathway map. For instance,
            # this is the case for drug maps with KEGG IDs starting with 'map07', such as
            # 'map07011', 'Penicillins'.
            output['png_1x_map'][1] = 4
            output['png_2x_map'][1] = 4
            if is_global_map:
                output['png_1x_ko'][1] = 4
                output['png_1x_ec'][1] = 4
                output['png_1x_rn'][1] = 4
            output_queue.put(output)
            continue

        # At least 1 KGML file is available, so the 1x and 2x "map" images are downloaded.
        dl_items = [
            ('png_1x_map', png_1x_map_url, png_1x_map_path),
            ('png_2x_map', png_2x_map_url, png_2x_map_path)
        ]
        if is_global_map:
            # A "ko", "ec", or "rn" image of a global map is only downloaded if the KGML file
            # of the same type is available.
            for org, image_url, image_path in (
                ('ko', png_1x_ko_url, png_1x_ko_path),
                ('ec', png_1x_ec_url, png_1x_ec_path),
                ('rn', png_1x_rn_url, png_1x_rn_path)
            ):
                kgml_status = output[f'kgml_{org}'][1]
                if kgml_status == 1:
                    dl_items.append((f'png_1x_{org}', image_url, image_path))
                elif kgml_status == 2:
                    output[f'png_1x_{org}'][1] = 4

        for key, image_url, image_path in dl_items:
            output[key][1] = _download_file_with_retries(
                image_url, image_path, max_num_tries, wait_secs
            )
        output_queue.put(output)
Literal, Tuple, Union + +import anvio.kegg as kegg +import anvio.kgml as kgml +import anvio.dbinfo as dbinfo +import anvio.terminal as terminal +import anvio.reactionnetwork as rn +import anvio.filesnpaths as filesnpaths + +from anvio.errors import ConfigError +from anvio.genomestorage import GenomeStorage +from anvio.dbops import ContigsDatabase, PanSuperclass +from anvio import FORCE_OVERWRITE, QUIET, __version__ as VERSION + + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2024, the Meren Lab (http://merenlab.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = VERSION +__maintainer__ = "Samuel Miller" +__email__ = "samuelmiller10@gmail.com" +__status__ = "Development" + + +# The colors of qualitative and repeating colormaps are sampled in order, whereas other colormaps, +# including sequential colormaps, are sampled evenly. +qualitative_colormaps: List[str] = [ + 'Pastel1', + 'Pastel2', + 'Paired', + 'Accent', + 'Dark2', + 'Set1', + 'Set2', + 'Set3', + 'tab10', + 'tab20', + 'tab20b', + 'tab20c' +] +repeating_colormaps: List[str] = [ + 'flag', + 'prism' +] + +class Mapper: + """ + Make KEGG pathway maps incorporating data sourced from anvi'o databases. + + Attributes + ========== + kegg_context : anvio.kegg.KeggContext + This contains anvi'o KEGG database attributes, such as filepaths. + + available_pathway_numbers : List[str] + ID numbers of all pathways set up with PNG and KGML files in the KEGG data directory. + + pathway_names : Dict[str, str] + The names of all KEGG pathways, including those without files in the KEGG data directory. + Keys are pathway ID numbers and values are pathway names. + + rn_constructor : anvio.reactionnetwork.Constructor + Used for loading reaction networks from anvi'o databases. + + xml_ops : anvio.kgml.XMLOps + Used for loading KGML files as pathway objects. + + overwrite_output : bool + If True, methods in this class overwrite existing output files. 
def __init__(
    self,
    kegg_dir: str = None,
    overwrite_output: bool = FORCE_OVERWRITE,
    name_files: bool = False,
    run: terminal.Run = terminal.Run(),
    progress: terminal.Progress = terminal.Progress(),
    quiet: bool = QUIET
) -> None:
    """
    Set up access to the KEGG data needed for drawing pathway maps.

    Parameters
    ==========
    kegg_dir : str, None
        Directory containing an anvi'o KEGG database. The default argument of None expects the
        KEGG database to be set up in the default directory used by the program
        anvi-setup-kegg-data.

    overwrite_output : bool, anvio.FORCE_OVERWRITE
        If True, methods in this class overwrite existing output files.

    name_files : bool, False
        Include the pathway name along with the number in output map file names.

    run : anvio.terminal.Run, anvio.terminal.Run()
        This object prints run information to the terminal.

    progress : anvio.terminal.Progress, anvio.terminal.Progress()
        This object prints transient progress information to the terminal.

    quiet : bool, anvio.QUIET
        If True, run and progress information is not printed to the terminal.

    Raises
    ======
    ConfigError
        If the table of KGML availability per map image does not exist in the KEGG data
        directory.
    """
    self.kegg_context = kegg.KeggContext(Namespace(kegg_data_dir=kegg_dir))

    kgml_table_path = self.kegg_context.kegg_map_image_kgml_file
    if not os.path.exists(kgml_table_path):
        raise ConfigError(
            "One of the key files required by KEGG pathway maps is missing in your active "
            "anvi'o installation. If your KEGG data are not stored at the default KEGG data "
            "location, include that path using the 'kegg_dir' argument. Otherwise, please "
            "consider using the program `anvi-setup-kegg-data` to set up the latest KEGG data "
            "that includes the necessary files for KEGG pathway maps."
        )

    # A pathway is available if the table records at least one KO, EC, or RN KGML file for it.
    # The pathway number is the last 5 characters of the table's index value.
    kgml_table = pd.read_csv(kgml_table_path, sep='\t', index_col=0)
    self.available_pathway_numbers: List[str] = [
        row.Index[-5:] for row in kgml_table.itertuples() if row.KO + row.EC + row.RN != 0
    ]

    # The pathway list file has two columns, ID and name. The first 3 characters of the ID are
    # dropped to leave the pathway number.
    pathway_list = pd.read_csv(
        self.kegg_context.kegg_pathway_list_file, sep='\t', header=None
    )
    self.pathway_names: Dict[str, str] = {
        pathway_id[3:]: pathway_name
        for pathway_id, pathway_name in pathway_list.itertuples(index=False)
    }

    self.rn_constructor = rn.Constructor(kegg_dir=self.kegg_context.kegg_data_dir)

    self.xml_ops = kgml.XMLOps()
    self.drawer = kgml.Drawer(kegg_dir=self.kegg_context.kegg_data_dir)

    self.name_files = name_files
    self.overwrite_output = overwrite_output
    self.run = run
    self.progress = progress
    self.quiet = self._quiet = quiet
def map_contigs_database_kos(
    self,
    contigs_db: str,
    output_dir: str,
    pathway_numbers: Iterable[str] = None,
    color_hexcode: str = '#2ca02c',
    draw_maps_lacking_kos: bool = False
) -> Dict[str, bool]:
    """
    Draw pathway maps, highlighting KOs present in the contigs database.

    Parameters
    ==========
    contigs_db : str
        File path to a contigs database containing KO annotations.

    output_dir : str
        Path to the output directory in which pathway map PDF files are drawn. The directory is
        created if it does not exist.

    pathway_numbers : Iterable[str], None
        Regex patterns to match the ID numbers of the drawn pathway maps. The default of None
        draws all available pathway maps in the KEGG data directory.

    color_hexcode : str, '#2ca02c'
        This is the color, by default green, for reactions containing contigs database KOs.
        Alternatively to a color hex code, the string, 'original', can be provided to use the
        original color scheme of the reference map. In global and overview maps, KOs are
        represented in reaction lines. The foreground color of lines is set. In standard maps,
        KOs are represented in boxes, the background color of which is set, or lines.

    draw_maps_lacking_kos : bool, False
        If False, by default, only draw maps containing any of the KOs in the contigs database.
        If True, draw maps regardless, meaning that nothing may be colored.

    Returns
    =======
    Dict[str, bool]
        Keys are pathway numbers. Values are True if the map was drawn, False if the map was not
        drawn because it did not contain any of the select KOs and 'draw_maps_lacking_kos' was
        False.
    """
    # Retrieve the IDs of all KO annotations in the contigs database.
    self.progress.new("Loading KO data from the contigs database")
    try:
        self.progress.update("...")

        # Both checks run before the database is opened, so a bad input fails early.
        self._check_contigs_db(contigs_db)
        self._check_contigs_db_ko_annotation(contigs_db)

        cdb = ContigsDatabase(contigs_db)
        ko_ids = cdb.db.get_single_column_from_table(
            'gene_functions',
            'accession',
            unique=True,
            where_clause='source = "KOfam"'
        )
    finally:
        # Close the progress display even when a check or the query raises, so that the error
        # is not reported underneath a progress message that never ended.
        self.progress.end()

    drawn = self._map_kos_fixed_colors(
        ko_ids,
        output_dir,
        pathway_numbers=pathway_numbers,
        color_hexcode=color_hexcode,
        draw_maps_lacking_kos=draw_maps_lacking_kos
    )
    # An empty or missing result counts as zero maps drawn.
    count = sum(drawn.values()) if drawn else 0
    self.run.info("Number of maps drawn", count)

    return drawn
+ + pathway_numbers : Iterable[str], None + Regex patterns to match the ID numbers of the drawn pathway maps. The default of None + draws all available pathway maps in the KEGG data directory. + + color_hexcode : str, '#2ca02c' + This is the color, by default green, for reactions containing KOs in the genome. + Alternatively to a color hex code, the string, 'original', can be provided to use the + original color scheme of the reference map. In global and overview maps, KOs are + represented in reaction lines. The foreground color of lines is set. In standard maps, + KOs are represented in boxes, the background color of which is set, or lines. + + draw_maps_lacking_kos : bool, False + If False, by default, only draw maps containing any of the KOs in the genome. If True, + draw maps regardless, meaning that nothing may be colored. + + Returns + ======= + Dict[str, bool] + Keys are pathway numbers. Values are True if the map was drawn, False if the map was not + drawn because it did not contain any of the select KOs and 'draw_maps_lacking_kos' was + False. + """ + # Retrieve the IDs of all KO annotations for the genome. 
def map_contigs_databases_kos(
    self,
    contigs_dbs: Iterable[str],
    output_dir: str,
    pathway_numbers: Iterable[str] = None,
    draw_contigs_db_files: Union[Iterable[str], bool] = False,
    draw_grid: Union[Iterable[str], bool] = False,
    colormap: Union[bool, str, mcolors.Colormap] = True,
    colormap_limits: Tuple[float, float] = None,
    colormap_scheme: Literal['by_count', 'by_database'] = None,
    reverse_overlay: bool = False,
    color_hexcode: str = '#2ca02c',
    colorbar: bool = True,
    draw_maps_lacking_kos: bool = False
) -> Dict[Literal['unified', 'individual', 'grid'], Dict]:
    """
    Draw pathway maps, highlighting KOs across contigs databases (representing, for example,
    genomes or metagenomes).

    A reaction on a map can correspond to one or more KOs, and a KO can annotate one or more
    sequences in a contigs database. In global and overview maps, reaction lines are colored.
    In standard maps, reaction boxes or lines are colored.

    Parameters
    ==========
    contigs_dbs : Iterable[str]
        File paths to contigs databases containing KO annotations. Databases should have
        different project names, by which they are uniquely identified.

    output_dir : str
        Path to the output directory in which pathway map PDF files are drawn. The directory is
        created if it does not exist.

    pathway_numbers : Iterable[str], None
        Regex patterns to match the ID numbers of the drawn pathway maps. The default of None
        draws all available pathway maps in the KEGG data directory.

    draw_contigs_db_files : Union[Iterable[str], bool], False
        Draw pathway maps for each contigs database if not False. If True, draw maps for all of
        the contigs databases. Alternatively, the project names of a subset of contigs databases
        can be provided.

    draw_grid : Union[Iterable[str], bool], False
        If not False, draw a grid for each pathway map showing both the unified map of input
        contigs databases and a map for each contigs database, facilitating identification of
        the contigs databases containing reactions highlighted in the unified map. If True,
        include all of the contigs databases in the grid. Alternatively, the project names of a
        subset of contigs databases can be provided.

    colormap : Union[bool, str, matplotlib.colors.Colormap], True
        Reactions are dynamically colored to reflect the contigs databases involving the
        reaction, unless the argument value is False. False overrides dynamic coloring via a
        colormap using the argument provided to 'color_hexcode', so that reactions represented
        by KOs in contigs databases are assigned predetermined colors.

        The default argument value of True automatically assigns a colormap given the colormap
        scheme (see the 'colormap_scheme' argument). The scheme, 'by_count', uses the sequential
        colormap, 'plasma_r', by default; it spans yellow (fewer databases) to blue-violet (more
        databases). This accentuates reactions that are shared rather than unshared across
        databases. In contrast, a colormap spanning dark to light, such as 'plasma', is better
        for drawing attention to unshared reactions. The scheme, 'by_database', uses the
        qualitative colormap, 'tab10', by default; it contains distinct colors suitable for
        clearly differentiating the different databases containing reactions.

        The name of a Matplotlib Colormap or a Colormap object itself can also be provided to be
        used in lieu of the default. See the following webpage for named colormaps:
        https://matplotlib.org/stable/users/explain/colors/colormaps.html#classes-of-colormaps

    colormap_limits : Tuple[float, float], None
        Limit the fraction of the colormap used in dynamically selecting colors. The first value
        is the lower cutoff and the second value is the upper cutoff, e.g., (0.2, 0.9) limits
        color selection to 70% of the colormap, trimming the bottom 20% and top 10%. By default,
        for the colormap scheme, 'by_count', the colormap is 'plasma_r', and the limits are set
        to (0.1, 0.9). By default, for the scheme, 'by_database', the colormap is qualitative
        ('tab10'), and limits are set to (0.0, 1.0).

    colormap_scheme : Literal['by_count', 'by_database'], None
        There are two ways of dynamically coloring reactions by inclusion in contigs databases:
        by count or by specific database or combination of databases. Given the default argument
        value of None, with 4 or more databases, reactions are colored by count, and with 2 or
        3, by database. In coloring by count, the colormap should be sequential, such that the
        color of a reaction changes 'smoothly' with the count. In contrast, coloring by database
        means reaction color is determined by membership in a database or combination of
        databases, so each possibility should have a distinct color from a qualitative colormap.

    reverse_overlay : bool, False
        By default, with False, reactions in more contigs databases are drawn on top of those in
        fewer databases. With True, the opposite applies; especially in global maps with a
        non-default colormap spanning dark to light, this accentuates unshared rather than
        shared parts of a pathway.

    color_hexcode : str, '#2ca02c'
        This is the color, by default green, for reactions containing contigs database KOs.
        Alternatively to a color hex code, the string, 'original', can be provided to use the
        original color scheme of the reference map. The 'colormap' argument must be False for
        this argument to be used, overriding dynamic coloring based on database membership with
        static coloring based on presence/absence in any database.

    colorbar : bool, True
        If True and coloring by database membership, save a colorbar legend to the file,
        'colorbar.pdf', in the output directory.

    draw_maps_lacking_kos : bool, False
        If False, by default, only draw maps containing any of the select KOs. If True, draw
        maps regardless, meaning that nothing may be colored.

    Returns
    =======
    Dict[Literal['unified', 'individual', 'grid'], Dict]
        Keys in the outer dictionary are different types of files that can be drawn. 'unified'
        maps show data from all contigs databases. 'individual' maps show data from individual
        contigs databases. 'grid' images show both unified and individual maps. 'unified' and
        'grid' values are Dict[str, bool], where keys are pathway numbers, and values are True
        if the map was drawn, False if the map was not drawn because it did not contain any of
        the select KOs and 'draw_maps_lacking_kos' was False. 'individual' values are Dict[str,
        Dict[str, bool]], where keys in the outer dictionary are contigs database project names,
        keys in the inner dictionary are pathway numbers, and values in the inner dictionary are
        True if the map was drawn, False if the map was not drawn because it did not contain any
        of the select KOs and 'draw_maps_lacking_kos' was False. The dictionary is returned on
        every exit path; entries for file types that were not requested are left empty.
    """
    # This method is similar to map_pan_database_kos, and almost identical after KOs are loaded.

    # The databases are counted and iterated several times below, so materialize the argument
    # once. This also lets callers pass a one-shot iterable such as a generator.
    contigs_dbs = list(contigs_dbs)

    # Set the colormap scheme.
    if colormap is False:
        scheme = 'static'
    elif colormap_scheme is None:
        scheme = 'by_database' if len(contigs_dbs) < 4 else 'by_count'
    elif colormap_scheme in ('by_count', 'by_database'):
        scheme = colormap_scheme
    else:
        raise AssertionError(
            f"'colormap_scheme' must be 'by_count', 'by_database', or None, not "
            f"'{colormap_scheme}'."
        )

    # Set the colormap.
    if colormap is True:
        if scheme == 'by_count':
            cmap = plt.colormaps['plasma_r']
            if colormap_limits is None:
                colormap_limits = (0.1, 0.9)
        elif scheme == 'by_database':
            cmap = plt.colormaps['tab10']
            if colormap_limits is None:
                colormap_limits = (0.0, 1.0)
        else:
            raise AssertionError
    elif colormap is False:
        cmap = None
    elif isinstance(colormap, str):
        cmap = plt.colormaps[colormap]
        if colormap_limits is None:
            colormap_limits = (0.0, 1.0)
    elif isinstance(colormap, mcolors.Colormap):
        cmap = colormap
        if colormap_limits is None:
            colormap_limits = (0.0, 1.0)
    else:
        raise AssertionError(
            "'colormap' must be a bool, the name of a colormap, or a Colormap object."
        )

    # Set how the colormap is sampled. This is decided before trimming, because trimming
    # replaces the colormap with a new object that has a different name.
    if cmap is None:
        sampling = None
    elif cmap.name in qualitative_colormaps + repeating_colormaps:
        sampling = 'in_order'
    else:
        sampling = 'even'

    # Trim the colormap.
    if cmap is not None and colormap_limits is not None and colormap_limits != (0.0, 1.0):
        assert 0.0 <= colormap_limits[0] <= colormap_limits[1] <= 1.0
        cmap = mcolors.LinearSegmentedColormap.from_list(
            f'trunc({cmap.name},{colormap_limits[0]:.2f},{colormap_limits[1]:.2f})',
            cmap(range(
                int(colormap_limits[0] * cmap.N), math.ceil(colormap_limits[1] * cmap.N)
            ))
        )

    self.progress.new("Loading KO data from contigs databases")
    try:
        self.progress.update("...")

        self._check_contigs_dbs(contigs_dbs)
        self._check_contigs_dbs_ko_annotation(contigs_dbs)

        # Load contigs database metadata. Keys are project names, values are file paths.
        project_names: Dict[str, str] = {}
        for contigs_db in contigs_dbs:
            contigs_db_info = dbinfo.ContigsDBInfo(contigs_db)
            self_table = contigs_db_info.get_self_table()

            annotation_sources = self_table['gene_function_sources']
            assert annotation_sources is not None and 'KOfam' in annotation_sources.split(',')

            project_name = self_table['project_name']
            assert project_name not in project_names
            project_names[project_name] = contigs_db

        # Find which contigs databases contain each KO.
        ko_dbs: Dict[str, List[str]] = {}
        for project_name, contigs_db in project_names.items():
            cdb = ContigsDatabase(contigs_db)
            for ko_id in cdb.db.get_single_column_from_table(
                'gene_functions',
                'accession',
                unique=True,
                where_clause='source = "KOfam"'
            ):
                ko_dbs.setdefault(ko_id, []).append(project_name)
    finally:
        # Close the progress display even if a check or query fails.
        self.progress.end()

    # Find the numeric IDs of the maps to draw.
    pathway_numbers = self._find_maps(output_dir, 'kos', patterns=pathway_numbers)

    filesnpaths.gen_output_directory(output_dir, progress=self.progress, run=self.run)

    drawn: Dict[Literal['unified', 'individual', 'grid'], Dict] = {
        'unified': {},
        'individual': {},
        'grid': {}
    }

    self.progress.new("Drawing 'unified' map incorporating data from all contigs databases")
    exceeds_colors: Tuple[int, int] = None
    if scheme == 'static':
        # Draw unified maps of all contigs databases with a static reaction color.
        for pathway_number in pathway_numbers:
            self.progress.update(pathway_number)
            if color_hexcode == 'original':
                drawn['unified'][pathway_number] = self._draw_map_kos_original_color(
                    pathway_number,
                    ko_dbs,
                    output_dir,
                    draw_map_lacking_kos=draw_maps_lacking_kos
                )
            else:
                drawn['unified'][pathway_number] = self._draw_map_kos_single_color(
                    pathway_number,
                    ko_dbs,
                    color_hexcode,
                    output_dir,
                    draw_map_lacking_kos=draw_maps_lacking_kos
                )
    else:
        # Draw unified maps with dynamic coloring by number of contigs databases.
        assert cmap is not None
        color_priority: Dict[str, float] = {}
        if scheme == 'by_count':
            # Sample the colormap for colors representing each possible number of contigs
            # databases. Lower color values correspond to smaller numbers of databases.
            if sampling == 'in_order':
                if len(contigs_dbs) == 1:
                    sample_points = range(1, 2)
                else:
                    sample_points = range(len(contigs_dbs))
            elif sampling == 'even':
                if len(contigs_dbs) == 1:
                    sample_points = np.linspace(1, 1, 1)
                else:
                    sample_points = np.linspace(0, 1, len(contigs_dbs))
            else:
                raise AssertionError

            if len(contigs_dbs) > cmap.N:
                exceeds_colors = (cmap.N, len(contigs_dbs))

            for sample_point in sample_points:
                if reverse_overlay:
                    color_priority[mcolors.rgb2hex(cmap(sample_point))] = 1 - sample_point
                else:
                    color_priority[mcolors.rgb2hex(cmap(sample_point))] = sample_point
            db_combos = None
        elif scheme == 'by_database':
            # Sample the colormap for colors representing the different contigs databases and
            # their combinations. Lower color values correspond to smaller numbers of databases.
            db_combos = []
            for db_count in range(1, len(contigs_dbs) + 1):
                db_combos += list(combinations(project_names, db_count))

            if sampling == 'in_order':
                sample_points = range(len(db_combos))
            elif sampling == 'even':
                sample_points = np.linspace(0, 1, len(db_combos))
            else:
                raise AssertionError

            if len(db_combos) > cmap.N:
                exceeds_colors = (cmap.N, len(db_combos))

            for sample_point in sample_points:
                if reverse_overlay:
                    color_priority[
                        mcolors.rgb2hex(cmap(sample_point))
                    ] = 1 - sample_point / cmap.N
                else:
                    color_priority[
                        mcolors.rgb2hex(cmap(sample_point))
                    ] = (sample_point + 1) / cmap.N
        else:
            raise AssertionError

        if colorbar:
            # Draw a colorbar in a separate file.
            _draw_colorbar = self._draw_colorbar
            if scheme == 'by_count':
                _draw_colorbar = functools.partial(
                    _draw_colorbar,
                    color_labels=range(1, len(contigs_dbs) + 1),
                    label='database count'
                )
            elif scheme == 'by_database':
                _draw_colorbar = functools.partial(
                    _draw_colorbar,
                    color_labels=[', '.join(db_combo) for db_combo in db_combos],
                    label='databases'
                )
            _draw_colorbar(
                color_priority, os.path.join(output_dir, 'colorbar.pdf')
            )

        for pathway_number in pathway_numbers:
            self.progress.update(pathway_number)
            drawn['unified'][pathway_number] = self._draw_map_kos_membership(
                pathway_number,
                ko_dbs,
                color_priority,
                output_dir,
                cmap,
                source_combos=db_combos,
                draw_map_lacking_kos=draw_maps_lacking_kos
            )
    self.progress.end()

    if exceeds_colors:
        self.run.warning(
            f"There were fewer distinct colors available in the colormap ({exceeds_colors[0]}) "
            f"than were needed ({exceeds_colors[1]}), so some colors were repeated in use."
        )

    if draw_contigs_db_files is False and draw_grid is False:
        count = sum(drawn['unified'].values()) if drawn['unified'] else 0
        self.run.info("Number of maps drawn", count)
        # Return the documented dictionary rather than None.
        return drawn

    # Determine the individual database maps to draw.
    if draw_contigs_db_files == True:
        draw_files_project_names = list(project_names)
    elif draw_contigs_db_files == False:
        draw_files_project_names = []
    else:
        draw_files_project_names = list(draw_contigs_db_files)
        for project_name in draw_files_project_names:
            assert project_name in project_names
    # Remove duplicate names while preserving their order.
    draw_files_project_names = list(dict.fromkeys(draw_files_project_names))

    # Determine the map grids to draw.
    if draw_grid == True:
        draw_grid_project_names = list(project_names)
    elif draw_grid == False:
        draw_grid_project_names = []
    else:
        draw_grid_project_names = list(draw_grid)
        for project_name in draw_grid_project_names:
            assert project_name in project_names
    draw_grid_project_names = list(dict.fromkeys(draw_grid_project_names))

    # Maps are needed for databases requested as files and for databases shown in grids.
    draw_project_names = list(
        dict.fromkeys(draw_files_project_names + draw_grid_project_names)
    )

    # Draw individual database maps needed as final outputs or for grids.
    for project_name in draw_project_names:
        self.progress.new(f"Drawing maps for contigs database '{project_name}'")
        self.progress.update("...")
        # Silence the nested call, and always restore the real reporters afterwards so that an
        # error in the nested call does not leave this object permanently quiet.
        progress = self.progress
        run = self.run
        self.progress = terminal.Progress(verbose=False)
        self.run = terminal.Run(verbose=False)
        try:
            drawn['individual'][project_name] = self.map_contigs_database_kos(
                project_names[project_name],
                os.path.join(output_dir, project_name),
                pathway_numbers=pathway_numbers,
                color_hexcode=color_hexcode,
                draw_maps_lacking_kos=draw_maps_lacking_kos
            )
        finally:
            self.progress = progress
            self.run = run
        self.progress.end()

    if draw_grid == False:
        count = sum(drawn['unified'].values()) if drawn['unified'] else 0
        self.run.info(
            "Number of 'unified' maps drawn incorporating data from all contigs databases",
            count
        )
        if not drawn['individual']:
            count = 0
        else:
            count = sum([sum(d.values()) if d else 0 for d in drawn['individual'].values()])
        self.run.info("Number of maps drawn for individual contigs databases", count)
        # Return the documented dictionary rather than None.
        return drawn

    self.progress.new("Drawing map grid")
    self.progress.update("...")

    # Draw empty maps needed to fill in grids.
    paths_to_remove: List[str] = []
    if not draw_maps_lacking_kos:
        # Make a new dictionary with outer keys being pathway numbers, inner dictionaries
        # indicating which maps were drawn per contigs database.
        drawn_pathway_number: Dict[str, Dict[str, bool]] = {}
        for project_name, drawn_project_name in drawn['individual'].items():
            for pathway_number, drawn_map in drawn_project_name.items():
                drawn_pathway_number.setdefault(pathway_number, {})[project_name] = drawn_map

        # Draw empty maps as needed, for pathways with some but not all maps drawn.
        progress = self.progress
        run = self.run
        self.progress = terminal.Progress(verbose=False)
        self.run = terminal.Run(verbose=False)
        try:
            for pathway_number, drawn_project_name in drawn_pathway_number.items():
                if set(drawn_project_name.values()) != set([True, False]):
                    continue
                for project_name, drawn_map in drawn_project_name.items():
                    if drawn_map:
                        continue
                    self.map_contigs_database_kos(
                        project_names[project_name],
                        os.path.join(output_dir, project_name),
                        pathway_numbers=[pathway_number],
                        color_hexcode=color_hexcode,
                        draw_maps_lacking_kos=True
                    )
                    if self.name_files:
                        pathway_name = '_' + self._get_filename_pathway_name(pathway_number)
                    else:
                        pathway_name = ''
                    # These filler maps only exist for the grid and are deleted afterwards.
                    paths_to_remove.append(os.path.join(
                        output_dir, project_name, f'kos_{pathway_number}{pathway_name}.pdf'
                    ))
        finally:
            self.progress = progress
            self.run = run

    # Draw map grids.
    grid_dir = os.path.join(output_dir, 'grid')
    filesnpaths.gen_output_directory(grid_dir, progress=self.progress, run=self.run)
    for pathway_number in pathway_numbers:
        self.progress.update(pathway_number)
        if self.name_files:
            pathway_name = '_' + self._get_filename_pathway_name(pathway_number)
        else:
            pathway_name = ''
        # The unified, individual and grid files for a pathway share one file name.
        map_filename = f'kos_{pathway_number}{pathway_name}.pdf'

        unified_map_path = os.path.join(output_dir, map_filename)
        if not os.path.exists(unified_map_path):
            continue
        in_paths = [unified_map_path]
        labels = ['all']

        # Only the page shape is needed, so close the document as soon as it is read.
        with fitz.open(unified_map_path) as pdf_doc:
            page = pdf_doc.load_page(0)
            input_aspect_ratio = page.rect.width / page.rect.height
        landscape = input_aspect_ratio > 1

        for project_name in draw_grid_project_names:
            individual_map_path = os.path.join(output_dir, project_name, map_filename)
            if not os.path.exists(individual_map_path):
                # A grid is only made when every requested database has a map.
                break
            in_paths.append(individual_map_path)
            labels.append(project_name)
        else:
            out_path = os.path.join(grid_dir, map_filename)
            self.make_grid(in_paths, out_path, labels=labels, landscape=landscape)
            drawn['grid'][pathway_number] = True
    self.progress.end()

    # Remove individual database maps that were only needed for map grids.
    for path in paths_to_remove:
        os.remove(path)
    for project_name in set(draw_project_names).difference(set(draw_files_project_names)):
        shutil.rmtree(os.path.join(output_dir, project_name))
        drawn['individual'].pop(project_name)

    count = sum(drawn['unified'].values()) if drawn['unified'] else 0
    self.run.info(
        "Number of 'unified' maps drawn incorporating data from all contigs databases",
        count
    )
    if draw_contigs_db_files:
        if not drawn['individual']:
            count = 0
        else:
            count = sum([sum(d.values()) if d else 0 for d in drawn['individual'].values()])
        self.run.info("Number of maps drawn for individual contigs databases", count)
    count = sum(drawn['grid'].values()) if drawn['grid'] else 0
    self.run.info("Number of map grids drawn", count)

    return drawn
In global and overview maps, reaction lines are colored. In standard maps, + reaction boxes or lines are colored. + + Parameters + ========== + pan_db : str + File path to a pangenomic database. If a reaction network was stored in the database, + then consensus KOs are determined using the consensus_threshold and discard_ties + parameters stored as database metadata unless explicitly given here as arguments. These + parameters are only stored in the database when a reaction network is stored. + + genomes_storage_db : str + Path to the genomes storage database associated with the pan database. This contains + KO annotations. + + output_dir : str + Path to the output directory in which pathway map PDF files are drawn. The directory is + created if it does not exist. + + pathway_numbers : Iterable[str], None + Regex patterns to match the ID numbers of the drawn pathway maps. The default of None + draws all available pathway maps in the KEGG data directory. + + draw_genome_files : Union[Iterable[str], bool], False + Draw pathway maps for genomes of the pangenome if not False. If True, draw maps for all + of the genomes. Alternatively, the names of a subset of genomes can be provided. + + draw_grid : Union[Iterable[str], bool], False + If not False, draw a grid for each pathway map showing both the pangenomic map and a map + for each genome of the pangenome, facilitating identification of the genomes containing + reactions highlighted in the pangenomic map. If True, include all of the genomes in the + grid. Alternatively, the names of a subset of genomes can be provided. + + colormap : Union[str, matplotlib.colors.Colormap, None], 'plasma_r' + Reactions are dynamically colored to reflect the number of genomes involving the + reaction, unless the argument value is None. None overrides dynamic coloring via a + colormap using the argument provided to 'color_hexcode', so that reactions in the + pangenome are assigned predetermined colors. 
+ + Here is how a reaction is assigned a genome count. A reaction element in a map can + contain one or more KOs. Find corresponding consensus KOs from the anvi'o pangenomic + database. Each consensus KO is assigned to one or more gene clusters. Counted genomes + have one or more genes in gene clusters with these consensus KOs. + + This argument can be either the name of a built-in matplotlib colormap or a + Colormap object itself. The default sequential colormap, 'plasma_r', spans yellow (fewer + genomes) to blue-violet (more genomes). This accentuates reactions that are shared + rather than unshared across genomes. A colormap spanning dark (fewer genomes) to light + (more genomes), such as 'plasma', is better for drawing attention to unshared reactions. + + See the following webpage for named colormaps: + https://matplotlib.org/stable/users/explain/colors/colormaps.html#classes-of-colormaps + + colormap_limits : Tuple[float, float], None + Limit the fraction of the colormap used in dynamically selecting colors. The first value + is the lower cutoff and the second value is the upper cutoff, e.g., (0.2, 0.9) limits + color selection to 70% of the colormap, trimming the bottom 20% and top 10%. The default + limits with the default colormap, 'plasma_r', are set to (0.1, 0.9). + + reverse_overlay : bool, False + By default, with False, reactions in more genomes are drawn on top of those in fewer + genomes. With True, the opposite applies; especially in global maps with a non-default + colormap spanning dark to light, this accentuates unshared rather than shared parts of a + pathway. + + color_hexcode : str, '#2ca02c' + This is the color, by default green, for reactions containing consensus KOs from the pan + database. Alternatively to a color hex code, the string, 'original', can be provided to + use the original color scheme of the reference map.
The 'colormap' argument must be + None for this argument to be used, overriding dynamic coloring based on quantitative + data with static coloring based on presence/absence in the pangenome. + + colorbar : bool, True + If True and coloring by number of genomes, save a colorbar legend to the file, + 'colorbar.pdf', in the output directory. + + draw_maps_lacking_kos : bool, False + If False, by default, only draw maps containing any of the select KOs. If True, draw + maps regardless, meaning that nothing may be colored. + + consensus_threshold : float, None + With a value of None, if a reaction network was stored in the pan database, then the + consensus_threshold metavalue that was also stored in the database is used to find + consensus KOs. If a reaction network was not stored, then with a value of None, the KO + annotation most frequent in a gene cluster is assigned to the cluster itself. If a + numerical value is provided (must be on [0, 1]), at least this proportion of genes in + the cluster must have the most frequent annotation for the cluster to be annotated. + + discard_ties : bool, None + With a value of None, if a reaction network was stored in the pan database, then the + discard_ties metavalue that was also stored in the database is used to find consensus + KOs. If a reaction network was not stored, then with a value of None, discard_ties + assumes a value of False. A value of True means that if multiple KO annotations are most + frequent among genes in a cluster, then a consensus KO is not assigned to the cluster + itself, whereas a value of False would cause one of the most frequent KOs to be + arbitrarily chosen. + + Returns + ======= + Dict[Literal['unified', 'individual', 'grid'], Dict] + Keys in the outer dictionary are different types of files that can be drawn. 'unified' + maps show data from all genomes. 'individual' maps show data from individual genomes. + 'grid' images show both unified and individual maps.
'unified' and 'grid' values are + Dict[str, bool], where keys are pathway numbers, and values are True if the map was + drawn, False if the map was not drawn because it did not contain any of the select KOs + and 'draw_maps_lacking_kos' was False. 'individual' values are Dict[str, Dict[str, + bool]], where keys in the outer dictionary are genome names, keys in the inner + dictionary are pathway numbers, and values in the inner dictionary are True if the map + was drawn, False if the map was not drawn because it did not contain any of the select + KOs and 'draw_maps_lacking_kos' was False. + """ + # This method is similar to map_contigs_databases_kos, and almost identical after KOs are + # loaded. + if isinstance(colormap, str): + assert colormap in mpl.colormaps() + + self._check_pan_db(pan_db) + self._check_genomes_storage_db(genomes_storage_db) + self._check_genomes_storage_ko_annotation(genomes_storage_db) + + # Load pan database metadata. + pan_db_info = dbinfo.PanDBInfo(pan_db) + self_table = pan_db_info.get_self_table() + + # Parameterize how consensus KOs are found. + if consensus_threshold is None: + consensus_threshold = self_table['reaction_network_consensus_threshold'] + if consensus_threshold is not None: + consensus_threshold = float(consensus_threshold) + assert 0 <= consensus_threshold <= 1 + self.run.info_single( + "No consensus threshold was explicitly specified for consensus KO assignment " + f"to gene clusters, but there was a value of '{consensus_threshold}' stored in " + "the pan database from reaction network construction, so this was used. 
(The " + "default if this were not the case is 0, or no threshold.)" + ) + + if discard_ties is None: + discard_ties = self_table['reaction_network_discard_ties'] + if discard_ties is None: + discard_ties = False + else: + discard_ties = bool(int(discard_ties)) + self.run.info_single( + "It was not explicitly specified whether to discard ties in consensus KO " + f"assignment to gene clusters, but there was a value of '{discard_ties}' " + "stored in the pan database from reaction network construction, so this was " + "used. (The default if this were not the case is False, or do not discard " + "ties.)" + ) + + # Find consensus KOs from the loaded pan database. + self.progress.new("Loading consensus KO data from pan database") + self.progress.update("...") + progress = self.progress + self.progress = terminal.Progress(verbose=False) + run = self.run + self.run = terminal.Run(verbose=False) + args = Namespace() + args.pan_db = pan_db + args.genomes_storage = genomes_storage_db + args.consensus_threshold = consensus_threshold + args.discard_ties = discard_ties + pan_super = PanSuperclass(args, r=self.run, p=self.progress) + pan_super.init_gene_clusters() + pan_super.init_gene_clusters_functions() + pan_super.init_gene_clusters_functions_summary_dict() + gene_clusters: Dict[str, Dict[str, List[int]]] = pan_super.gene_clusters + gene_clusters_functions_summary_dict: Dict = pan_super.gene_clusters_functions_summary_dict + + consensus_cluster_ids: List[str] = [] + consensus_ko_ids: List[str] = [] + for cluster_id, gene_cluster_functions_data in gene_clusters_functions_summary_dict.items(): + gene_cluster_ko_data = gene_cluster_functions_data['KOfam'] + if gene_cluster_ko_data == {'function': None, 'accession': None}: + continue + consensus_cluster_ids.append(cluster_id) + consensus_ko_ids.append(gene_cluster_ko_data['accession']) + self.progress = progress + self.run = run + self.progress.end() + + # Find the numeric IDs of the maps to draw. 
+ pathway_numbers = self._find_maps(output_dir, 'kos', patterns=pathway_numbers) + + filesnpaths.gen_output_directory(output_dir, progress=self.progress, run=self.run) + + genome_names = self_table['external_genome_names'].split(',') + + drawn: Dict[Literal['unified', 'individual', 'grid'], Dict] = { + 'unified': {}, + 'individual': {}, + 'grid': {} + } + + self.progress.new("Drawing 'unified' map incorporating data from all genomes") + exceeds_colors: Tuple[int, int] = None + if colormap is None: + # Draw pangenomic maps with a static reaction color. + for pathway_number in pathway_numbers: + if color_hexcode == 'original': + drawn['unified'][pathway_number] = self._draw_map_kos_original_color( + pathway_number, + set(consensus_ko_ids), + output_dir, + draw_map_lacking_kos=draw_maps_lacking_kos + ) + else: + drawn['unified'][pathway_number] = self._draw_map_kos_single_color( + pathway_number, + set(consensus_ko_ids), + color_hexcode, + output_dir, + draw_map_lacking_kos=draw_maps_lacking_kos + ) + cmap = None + sampling = None + else: + # Draw pangenomic maps with dynamic coloring by number of genomes. + if isinstance(colormap, str): + cmap = plt.colormaps[colormap] + if colormap_limits is None: + colormap_limits = (0.1, 0.9) + else: + cmap = colormap + + # Set how the colormap is sampled. + if cmap.name in qualitative_colormaps + repeating_colormaps: + sampling = 'in_order' + else: + sampling = 'even' + + # Trim the colormap. + if cmap is not None and colormap_limits is not None and colormap_limits != (0.0, 1.0): + assert 0.0 <= colormap_limits[0] <= colormap_limits[1] <= 1.0 + cmap = mcolors.LinearSegmentedColormap.from_list( + f'trunc({cmap.name},{colormap_limits[0]:.2f},{colormap_limits[1]:.2f})', + cmap(range( + int(colormap_limits[0] * cmap.N), math.ceil(colormap_limits[1] * cmap.N) + )) + ) + + # For each consensus KO -- which can annotate more than one gene cluster -- find which + # genomes contribute genes to clusters represented by the KO. 
+ ko_genomes: Dict[str, List[str]] = {} + for cluster_id, ko_id in zip(consensus_cluster_ids, consensus_ko_ids): + for genome_name, gcids in gene_clusters[cluster_id].items(): + if not gcids: + continue + try: + ko_genomes[ko_id].append(genome_name) + except KeyError: + ko_genomes[ko_id] = [genome_name] + for ko_id, ko_genome_names in ko_genomes.items(): + ko_genomes[ko_id] = list(set(ko_genome_names)) + + # Sample the colormap for colors representing each possible number of genomes. Lower + # color values correspond to smaller numbers of genomes. + if sampling == 'in_order': + if len(genome_names) == 1: + sample_points = range(1, 2) + else: + sample_points = range(len(genome_names)) + elif sampling == 'even': + if len(genome_names) == 1: + sample_points = np.linspace(1, 1, 1) + else: + sample_points = np.linspace(0, 1, len(genome_names)) + else: + raise AssertionError + + if len(genome_names) > cmap.N: + exceeds_colors = (cmap.N, len(genome_names)) + + color_priority: Dict[str, float] = {} + for sample_point in sample_points: + if reverse_overlay: + color_priority[mcolors.rgb2hex(cmap(sample_point))] = 1 - sample_point + else: + color_priority[mcolors.rgb2hex(cmap(sample_point))] = sample_point + + if colorbar: + self._draw_colorbar( + color_priority, + os.path.join(output_dir, 'colorbar.pdf'), + color_labels=range(1, len(genome_names) + 1), + label='genomes' + ) + for pathway_number in pathway_numbers: + self.progress.update(pathway_number) + drawn['unified'][pathway_number] = self._draw_map_kos_membership( + pathway_number, + ko_genomes, + color_priority, + output_dir, + cmap, + draw_map_lacking_kos=draw_maps_lacking_kos + ) + self.progress.end() + + if exceeds_colors: + self.run.warning( + f"There were fewer distinct colors available in the colormap ({exceeds_colors[0]}) " + f"than were needed ({exceeds_colors[1]}), so some colors were repeated in use."
+ ) + + if draw_genome_files is False and draw_grid is False: + count = sum(drawn['unified'].values()) if drawn['unified'] else 0 + self.run.info("Number of maps drawn", count) + return + + # Determine the individual genome maps to draw. + if draw_genome_files == True: + draw_files_genome_names = genome_names + elif draw_genome_files == False: + draw_files_genome_names = [] + else: + for genome_name in draw_genome_files: + assert genome_name in genome_names + draw_files_genome_names = draw_genome_files + seen = set() + draw_files_genome_names = [ + genome_name for genome_name in list(draw_files_genome_names) + if not (genome_name in seen or seen.add(genome_name)) + ] + + # Determine the map grids to draw. + if draw_grid == True: + draw_grid_genome_names = genome_names + elif draw_grid == False: + draw_grid_genome_names = [] + else: + for genome_name in draw_grid: + assert genome_name in genome_names + draw_grid_genome_names = draw_grid + seen = set() + draw_grid_genome_names = [ + genome_name for genome_name in list(draw_grid_genome_names) + if not (genome_name in seen or seen.add(genome_name)) + ] + + seen = set() + draw_genome_names = [ + genome_name for genome_name in draw_files_genome_names + draw_grid_genome_names + if not (genome_name in seen or seen.add(genome_name)) + ] + + # Draw individual genome maps needed as final outputs or for grids. 
+ for genome_name in draw_genome_names: + self.progress.new(f"Drawing maps for genome '{genome_name}'") + self.progress.update("...") + progress = self.progress + self.progress = terminal.Progress(verbose=False) + run = self.run + self.run = terminal.Run(verbose=False) + drawn['individual'][genome_name] = self.map_genomes_storage_genome_kos( + genomes_storage_db, + genome_name, + os.path.join(output_dir, genome_name), + pathway_numbers=pathway_numbers, + color_hexcode=color_hexcode, + draw_maps_lacking_kos=draw_maps_lacking_kos + ) + self.progress = progress + self.run = run + self.progress.end() + + if draw_grid == False: + count = sum(drawn['unified'].values()) if drawn['unified'] else 0 + self.run.info( + "Number of 'unified' maps drawn incorporating data from all genomes", + count + ) + if not drawn['individual']: + count = 0 + else: + count = sum([sum(d.values()) if d else 0 for d in drawn['individual'].values()]) + self.run.info("Number of maps drawn for individual genomes", count) + return + + self.progress.new("Drawing map grid") + self.progress.update("...") + + # Draw empty maps needed to fill in grids. + paths_to_remove: List[str] = [] + if not draw_maps_lacking_kos: + # Make a new dictionary with outer keys being pathway numbers, inner dictionaries + # indicating which maps were drawn per genome. + drawn_pathway_number: Dict[str, Dict[str, bool]] = {} + for genome_name, drawn_genome_name in drawn['individual'].items(): + for pathway_number, drawn_map in drawn_genome_name.items(): + try: + drawn_pathway_number[pathway_number][genome_name] = drawn_map + except KeyError: + drawn_pathway_number[pathway_number] = {genome_name: drawn_map} + + # Draw empty maps as needed, for pathways with some but not all maps drawn. 
+ progress = self.progress + self.progress = terminal.Progress(verbose=False) + run = self.run + self.run = terminal.Run(verbose=False) + for pathway_number, drawn_genome_name in drawn_pathway_number.items(): + if set(drawn_genome_name.values()) != set([True, False]): + continue + for genome_name, drawn_map in drawn_genome_name.items(): + if drawn_map: + continue + self.map_genomes_storage_genome_kos( + genomes_storage_db, + genome_name, + os.path.join(output_dir, genome_name), + pathway_numbers=[pathway_number], + color_hexcode=color_hexcode, + draw_maps_lacking_kos=True + ) + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + paths_to_remove.append(os.path.join( + output_dir, genome_name, f'kos_{pathway_number}{pathway_name}.pdf' + )) + self.progress = progress + self.run = run + + # Draw map grids. + grid_dir = os.path.join(output_dir, 'grid') + filesnpaths.gen_output_directory(grid_dir, progress=self.progress, run=self.run) + for pathway_number in pathway_numbers: + self.progress.update(pathway_number) + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + unified_map_path = os.path.join(output_dir, f'kos_{pathway_number}{pathway_name}.pdf') + if not os.path.exists(unified_map_path): + continue + in_paths = [unified_map_path] + labels = ['pangenome'] + + pdf_doc = fitz.open(in_paths[0]) + page = pdf_doc.load_page(0) + input_aspect_ratio = page.rect.width / page.rect.height + landscape = True if input_aspect_ratio > 1 else False + + for genome_name in draw_grid_genome_names: + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + individual_map_path = os.path.join( + output_dir, genome_name, f'kos_{pathway_number}{pathway_name}.pdf' + ) + if not os.path.exists(individual_map_path): + break + in_paths.append(individual_map_path) + labels.append(genome_name) + 
else: + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + out_path = os.path.join(grid_dir, f'kos_{pathway_number}{pathway_name}.pdf') + self.make_grid(in_paths, out_path, labels=labels, landscape=landscape) + drawn['grid'][pathway_number] = True + self.progress.end() + + # Remove individual genome maps that were only needed for map grids. + for path in paths_to_remove: + os.remove(path) + for genome_name in set(draw_genome_names).difference(set(draw_files_genome_names)): + shutil.rmtree(os.path.join(output_dir, genome_name)) + drawn['individual'].pop(genome_name) + + count = sum(drawn['unified'].values()) if drawn['unified'] else 0 + self.run.info( + "Number of 'unified' maps drawn incorporating data from all genomes", + count + ) + if draw_genome_files: + if not drawn['individual']: + count = 0 + else: + count = sum([sum(d.values()) if d else 0 for d in drawn['individual'].values()]) + self.run.info("Number of maps drawn for individual genomes", count) + count = sum(drawn['grid'].values()) if drawn['grid'] else 0 + self.run.info("Number of map grids drawn", count) + + return drawn + + def _map_kos_fixed_colors( + self, + ko_ids: Iterable[str], + output_dir: str, + pathway_numbers: List[str] = None, + color_hexcode: str = '#2ca02c', + draw_maps_lacking_kos: bool = False + ) -> Dict[str, bool]: + """ + Draw pathway maps, highlighting reactions containing select KOs in either a single color + provided by a hex code or the colors originally used in the reference map. + + Parameters + ========== + ko_ids : Iterable[str] + KO IDs to be highlighted in the maps. + + output_dir : str + Path to the output directory in which pathway map PDF files are drawn. The directory is + created if it does not exist. + + pathway_numbers : Iterable[str], None + Regex patterns to match the ID numbers of the drawn pathway maps. The default of None + draws all available pathway maps in the KEGG data directory. 
+ + color_hexcode : str, '#2ca02c' + This is the color, by default green, for reactions containing provided KOs. + Alternatively to a color hex code, the string, 'original', can be provided to use the + original color scheme of the reference map. In global maps, KOs are represented in + reaction lines, and in overview maps, KOs are represented in reaction arrows. The + foreground color of the lines and arrows is set. In standard maps, KOs are represented + in boxes, the background color of which is set. + + draw_maps_lacking_kos : bool, False + If False, by default, only draw maps containing any of the select KOs. If True, draw + maps regardless, meaning that nothing may be colored. + + Returns + ======= + Dict[str, bool] + Keys are pathway numbers. Values are True if the map was drawn, False if the map was not + drawn because it did not contain any of the select KOs and 'draw_maps_lacking_kos' was + False. + """ + # Find the numeric IDs of the maps to draw. + pathway_numbers = self._find_maps(output_dir, 'kos', patterns=pathway_numbers) + + filesnpaths.gen_output_directory(output_dir, progress=self.progress, run=self.run) + + # Draw maps. + self.progress.new("Drawing map") + drawn: Dict[str, bool] = {} + for pathway_number in pathway_numbers: + self.progress.update(pathway_number) + if color_hexcode == 'original': + drawn[pathway_number] = self._draw_map_kos_original_color( + pathway_number, + ko_ids, + output_dir, + draw_map_lacking_kos=draw_maps_lacking_kos + ) + else: + drawn[pathway_number] = self._draw_map_kos_single_color( + pathway_number, + ko_ids, + color_hexcode, + output_dir, + draw_map_lacking_kos=draw_maps_lacking_kos + ) + self.progress.end() + + return drawn + + @staticmethod + def _check_contigs_db(contigs_db: str) -> None: + """ + Check the validity of an expected contigs database. + + Parameters + ========== + contigs_db : str + File path to an expected contigs database. 
+ """ + if not os.path.exists(contigs_db): + raise ConfigError( + f"There was no file at the following expected contigs database path: '{contigs_db}'" + ) + + contigs_db_info = dbinfo.ContigsDBInfo(contigs_db, dont_raise=True, expecting='contigs') + if contigs_db_info is None: + raise ConfigError( + "The file at the following expected contigs database path is not a contigs " + f"database: '{contigs_db}'" + ) + + @staticmethod + def _check_contigs_db_ko_annotation(contigs_db: str) -> None: + """ + Check that a contigs database was annotated with KOs. + + Parameters + ========== + contigs_db : str + File path to a contigs database. + """ + contigs_db_info = dbinfo.ContigsDBInfo(contigs_db, expecting='contigs') + if 'KOfam' not in contigs_db_info.get_functional_annotation_sources(): + raise ConfigError( + f"The contigs database, '{contigs_db}', was never annotated with KOs. This can be " + "rectified by running `anvi-run-kegg-kofams` on the database." + ) + + @staticmethod + def _check_genomes_storage_db(genomes_storage_db: str) -> None: + """ + Check the validity of an expected genomes storage database. + + Parameters + ========== + genomes_storage_db : str + File path to an expected genomes storage database. + """ + if not os.path.exists(genomes_storage_db): + raise ConfigError( + "There was no file at the following expected genomes storage database path: " + f"'{genomes_storage_db}'" + ) + + gsdb_info = dbinfo.GenomeStorageDBInfo( + genomes_storage_db, dont_raise=True, expecting='genomestorage' + ) + if gsdb_info is None: + raise ConfigError( + "The file at the following expected genomes storage database path is not a genomes " + f"storage database: '{genomes_storage_db}'" + ) + + @staticmethod + def _check_genomes_storage_ko_annotation(genomes_storage_db: str) -> None: + """ + Check that a genomes storage database was annotated with KOs. + + Parameters + ========== + genomes_storage_db : str + File path to a genomes storage database. 
+ """ + gsdb_info = dbinfo.GenomeStorageDBInfo(genomes_storage_db, expecting='genomestorage') + if 'KOfam' not in gsdb_info.get_functional_annotation_sources(): + raise ConfigError( + f"The genomes storage database, '{genomes_storage_db}', was never annotated with " + "KOs. The genomes storage should be remade with annotated genomes, which can be " + "rectified by running `anvi-run-kegg-kofams` on the genome databases." + ) + + @staticmethod + def _check_contigs_dbs(contigs_dbs: Iterable[str]) -> None: + """ + Check the validity of expected contigs databases. + + Parameters + ========== + contigs_dbs : Iterable[str] + File paths to expected contigs databases. + """ + invalid_paths: List[str] = [] + invalid_filetypes: List[str] = [] + for contigs_db in contigs_dbs: + if not os.path.exists(contigs_db): + invalid_paths.append(contigs_db) + if invalid_paths: + continue + + contigs_db_info = dbinfo.ContigsDBInfo(contigs_db, dont_raise=True, expecting='contigs') + if contigs_db_info is None: + invalid_filetypes.append(contigs_db) + if invalid_filetypes: + continue + + if invalid_paths: + paths = ', '.join([f'{path}' for path in invalid_paths]) + raise ConfigError( + f"There were no files at the following expected contigs database paths: {paths}" + ) + + if invalid_filetypes: + paths = ', '.join([f'{path}' for path in invalid_filetypes]) + raise ConfigError( + "The files at the following expected contigs database paths are not contigs " + f"databases: {paths}" + ) + + @staticmethod + def _check_contigs_dbs_ko_annotation(contigs_dbs: Iterable[str]) -> None: + unannotated: List[str] = [] + for contigs_db in contigs_dbs: + contigs_db_info = dbinfo.ContigsDBInfo(contigs_db, expecting='contigs') + if 'KOfam' not in contigs_db_info.get_functional_annotation_sources(): + unannotated.append(contigs_db) + if unannotated: + continue + + if unannotated: + paths = ', '.join([f'{path}' for path in unannotated]) + raise ConfigError( + "The following contigs databases were never 
annotated with KOs, but this can be " + f"rectified by running `anvi-run-kegg-kofams` on them: {paths}" + ) + + @staticmethod + def _check_pan_db(pan_db: str) -> None: + """ + Check the validity of an expected pan database. + + Parameters + ========== + pan_db : str + File path to an expected pan database. + """ + if not os.path.exists(pan_db): + raise ConfigError( + f"There was no file at the following expected pan database path: '{pan_db}'" + ) + + pan_db_info = dbinfo.PanDBInfo(pan_db, dont_raise=True, expecting='pan') + if pan_db_info is None: + raise ConfigError( + "The file at the following expected pan database path is not a pan database: " + f"'{pan_db}'" + ) + + def _find_maps(self, output_dir: str, prefix: str, patterns: List[str] = None) -> List[str]: + """ + Find the numeric IDs of maps to draw given the file prefix, checking that the map can be + drawn in the target output directory. + + Parameters + ========== + output_dir : str + Path to the output directory in which pathway map PDF files are drawn. The directory is + created if it does not exist. + + prefix : str + Output filenames are formatted as {prefix}_{pathway_number}.pdf or + {prefix}_{pathway_number}_{pathway_name}.pdf. + + patterns : List[str], None + Regex patterns of pathway numbers, which are five digits. + """ + if patterns is None: + pathway_numbers = self.available_pathway_numbers + else: + pathway_numbers = self._get_pathway_numbers_from_patterns(patterns) + + if not self.overwrite_output: + for pathway_number in pathway_numbers: + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + out_path = os.path.join(output_dir, f'{prefix}_{pathway_number}{pathway_name}.pdf') + if os.path.exists(out_path): + raise ConfigError( + f"Output files would be overwritten in the output directory, {output_dir}. " + "Either delete the contents of the directory, or use the option to " + "overwrite output destinations."
+ ) + + return pathway_numbers + + def _get_pathway_numbers_from_patterns(self, patterns: Iterable[str]) -> List[str]: + """ + Among pathways available in the KEGG data directory, get those with ID numbers matching the + given regex patterns. + + Parameters + ========== + patterns : Iterable[str] + Regex patterns of pathway numbers, which are five digits. + + Returns + ======= + List[str] + Pathway numbers matching the regex patterns. + """ + pathway_numbers: List[str] = [] + for pattern in patterns: + for available_pathway_number in self.available_pathway_numbers: + if re.match(pattern, available_pathway_number): + pathway_numbers.append(available_pathway_number) + + # Maintain the order of pathway numbers recovered from patterns. + seen = set() + return [ + pathway_number for pathway_number in pathway_numbers + if not (pathway_number in seen or seen.add(pathway_number)) + ] + + def _draw_map_kos_single_color( + self, + pathway_number: str, + ko_ids: Iterable[str], + color_hexcode: str, + output_dir: str, + draw_map_lacking_kos: bool = False + ) -> bool: + """ + Draw a pathway map, highlighting reactions containing select KOs in a single color. + + Parameters + ========== + pathway_number : str, None + Numeric ID of the map to draw. + + ko_ids : Iterable[str] + Select KOs, any of which in the map are colored. + + color_hexcode : str + This is the color, by default green, for reactions containing provided KOs. A reaction + on a map can correspond to one or more KOs, and a KO can annotate one or more sequences + in a contigs database. In global and overview maps, reaction lines are colored. In + standard maps, reaction boxes or lines are colored. + + output_dir : str + Path to an existing output directory in which map PDF files are drawn. + + draw_map_lacking_kos : bool, False + If False, by default, only draw the map if it contains any of the select KOs. If True, + draw the map regardless, meaning that nothing may be highlighted. 
+ + Returns + ======= + bool + True if the map was drawn, False if the map was not drawn because it did not contain any + of the select KOs and 'draw_map_lacking_kos' was False. + """ + pathway = self._get_pathway(pathway_number) + + select_entries = pathway.get_entries(kegg_ids=ko_ids) + if not select_entries and not draw_map_lacking_kos: + return False + + # Set the color of Graphics elements for reactions containing select KOs. For other Graphics + # elements, change the 'fgcolor' attribute to a nonsense value of '0' to ensure that the + # elements with the prioritized color can be distinguished from other elements. Also, in + # overview and standard maps, widen lines from the base map default of 1.0. + all_entries = pathway.get_entries(entry_type='ortholog') + select_uuids = [entry.uuid for entry in select_entries] + for entry in all_entries: + if entry.uuid in select_uuids: + for uuid in entry.children['graphics']: + graphics: kgml.Graphics = pathway.uuid_element_lookup[uuid] + if pathway.is_global_map: + assert graphics.type == 'line' + graphics.fgcolor = color_hexcode + graphics.bgcolor = '#FFFFFF' + elif pathway.is_overview_map: + assert graphics.type == 'line' + graphics.fgcolor = color_hexcode + graphics.bgcolor = '#FFFFFF' + graphics.width = 5.0 + else: + if graphics.type == 'rectangle': + graphics.fgcolor = '#000000' + graphics.bgcolor = color_hexcode + elif graphics.type == 'line': + graphics.fgcolor = color_hexcode + graphics.bgcolor = '#FFFFFF' + graphics.width = 5.0 + else: + raise AssertionError( + "Ortholog entries are assumed to have Graphics elements of type " + "'rectangle' or 'line', not the encountered type, " + f"'{graphics.type}'." + ) + else: + for uuid in entry.children['graphics']: + graphics: kgml.Graphics = pathway.uuid_element_lookup[uuid] + graphics.fgcolor = '0' + + # Set the color priority so that the colored reactions are prioritized for display on top. + # Recolor "unprioritized" reactions to a background color. 
In global and overview maps, + # recolor circles to reflect the colors of prioritized reactions involving the compounds. + color_priority: Dict[str, Dict[str, Dict[Tuple[str, str], float]]] = {} + if pathway.is_global_map: + color_priority['ortholog'] = {'line': {(color_hexcode, '#FFFFFF'): 1.0}} + recolor_unprioritized_entries = 'g' + color_associated_compounds = 'high' + elif pathway.is_overview_map: + color_priority['ortholog'] = {'line': {(color_hexcode, '#FFFFFF'): 1.0}} + recolor_unprioritized_entries = 'w' + color_associated_compounds = 'high' + else: + color_priority['ortholog'] = { + 'rectangle': {('#000000', color_hexcode): 1.0}, + 'line': {(color_hexcode, '#FFFFFF'): 1.0} + } + recolor_unprioritized_entries = 'w' + color_associated_compounds = None + pathway.set_color_priority( + color_priority, + recolor_unprioritized_entries=recolor_unprioritized_entries, + color_associated_compounds=color_associated_compounds + ) + + # Draw the map. + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + out_path = os.path.join(output_dir, f'kos_{pathway_number}{pathway_name}.pdf') + if os.path.exists(out_path) and self.overwrite_output: + os.remove(out_path) + else: + filesnpaths.is_output_file_writable(out_path, ok_if_exists=False) + self.drawer.draw_map(pathway, out_path) + return True + + def _draw_map_kos_original_color( + self, + pathway_number: str, + ko_ids: Iterable[str], + output_dir: str, + draw_map_lacking_kos: bool = False + ) -> bool: + """ + Draw a pathway map, highlighting reactions containing select KOs in the color or colors + originally used in the reference map. + + Parameters + ========== + pathway_number : str, None + Numeric ID of the map to draw. + + ko_ids : Iterable[str] + Select KOs, any of which in the map are colored. + + output_dir : str + Path to an existing output directory in which map PDF files are drawn. 
+ + draw_map_lacking_kos : bool, False + If False, by default, only draw the map if it contains any of the select KOs. If True, + draw the map regardless, meaning that nothing may be highlighted. + + Returns + ======= + bool + True if the map was drawn, False if the map was not drawn because it did not contain any + of the select KOs and 'draw_map_lacking_kos' was False. + """ + pathway = self._get_pathway(pathway_number) + + select_entries = pathway.get_entries(kegg_ids=ko_ids) + if not select_entries and not draw_map_lacking_kos: + return False + + # Set "secondary" colors of ortholog Graphics elements for reactions containing select KOs: + # white background color of lines or black foreground text of rectangles. For other Graphics + # elements, change the 'fgcolor' attribute to a nonsense value to ensure that the elements + # with prioritized colors can be distinguished from other elements. Also, in overview and + # standard maps, widen lines from the base map default of 1.0. + all_entries = pathway.get_entries(entry_type='ortholog') + select_uuids = [entry.uuid for entry in select_entries] + prioritized_colors: Dict[str, List[Tuple[str, str]]] = {} + for entry in all_entries: + if entry.uuid in select_uuids: + for uuid in entry.children['graphics']: + graphics: kgml.Graphics = pathway.uuid_element_lookup[uuid] + if pathway.is_global_map: + assert graphics.type == 'line' + graphics.bgcolor = '#FFFFFF' + elif pathway.is_overview_map: + assert graphics.type == 'line' + graphics.bgcolor = '#FFFFFF' + graphics.width = 5.0 + else: + if graphics.type == 'rectangle': + graphics.fgcolor = '#000000' + elif graphics.type == 'line': + graphics.bgcolor = '#FFFFFF' + graphics.width = 5.0 + else: + raise AssertionError( + "Ortholog entries are assumed to have Graphics elements of type " + "'rectangle' or 'line', not the encountered type, " + f"'{graphics.type}'." 
+ ) + try: + graphics_type_prioritized_colors = prioritized_colors[graphics.type] + except: + prioritized_colors[graphics.type] = graphics_type_prioritized_colors = [] + graphics_type_prioritized_colors.append((graphics.fgcolor, graphics.bgcolor)) + else: + for uuid in entry.children['graphics']: + graphics: kgml.Graphics = pathway.uuid_element_lookup[uuid] + graphics.fgcolor = '0' + + # By default, global maps but not overview and standard maps display reaction graphics in + # more than one color. Give higher priority to reaction entries that are encountered later + # (occur further down in the KGML file), and would thus be rendered above earlier reactions. + color_priority: Dict[str, Dict[str, Dict[Tuple[str, str], float]]] = {'ortholog': {}} + for graphics_type, graphics_type_prioritized_colors in prioritized_colors.items(): + seen = set() + unique_prioritized_colors = [ + colors for colors in graphics_type_prioritized_colors + if not (colors in seen or seen.add(colors)) + ] + priorities = np.linspace(0, 1, len(unique_prioritized_colors) + 1)[1: ] + graphics_type_color_priority = { + colors: priority for colors, priority in zip(unique_prioritized_colors, priorities) + } + color_priority['ortholog'][graphics_type] = graphics_type_color_priority + + # Recolor "unprioritized" reactions to a background color. In global and overview maps, + # recolor circles to reflect the colors of prioritized reactions involving the compounds. + if pathway.is_global_map: + recolor_unprioritized_entries = 'g' + color_associated_compounds = 'high' + elif pathway.is_overview_map: + recolor_unprioritized_entries = 'w' + color_associated_compounds = 'high' + else: + recolor_unprioritized_entries = 'w' + color_associated_compounds = None + pathway.set_color_priority( + color_priority, + recolor_unprioritized_entries=recolor_unprioritized_entries, + color_associated_compounds=color_associated_compounds + ) + + # Draw the map. 
+ if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + out_path = os.path.join(output_dir, f'kos_{pathway_number}{pathway_name}.pdf') + if os.path.exists(out_path) and self.overwrite_output: + os.remove(out_path) + else: + filesnpaths.is_output_file_writable(out_path, ok_if_exists=False) + self.drawer.draw_map(pathway, out_path) + return True + + def _draw_map_kos_membership( + self, + pathway_number: str, + ko_membership: Dict[str, List[str]], + color_priority: Dict[str, float], + output_dir: str, + colormap: mcolors.Colormap, + source_combos: List[Tuple[str]] = None, + draw_map_lacking_kos: bool = False + ) -> bool: + """ + Draw a pathway map, coloring reactions by their membership in sources. + + For a pangenome, reactions are colored by genomes containing consensus KOs in the reaction. + For contigs databases, reactions are colored by databases containing KOs in the reaction. By + default, with 'source_combos' being None, coloring reflects the count of genomes or + databases rather than actual genome or database membership. + + In global and overview maps, compounds involved in colored reactions are given the color of + the reaction with the highest priority. + + Parameters + ========== + pathway_number : str + Numeric ID of the map to draw. + + ko_membership : Dict[str, List[str]] + Keys are KO IDs. Values are lists of "sources:" genome names or project names of contigs + databases. + + A KO can annotate more than one gene cluster in a pangenome; a list contains the names + of genomes contributing genes to clusters represented by the KO. + + color_priority : Dict[str, float] + Keys are color hex codes. If 'by_count' is True, there should be one color for each + possible number of genomes or databases. If 'by_count' is False, there should be one + color for each individual genome or database and combination thereof. Values are + priorities. 
KOs with higher priority colors are drawn over KOs with lower priority + colors. + + output_dir : str + Path to an existing output directory in which map PDF files are drawn. + + colormap : matplotlib.colors.Colormap + This colormap is used to interpolate the colors of compounds involved in reactions with + color-prioritized KOs. Colors in the color_priority arguments should be drawn from this + colormap. + + source_combos : List[Tuple[str]], None + With the default argument value of None, reactions are colored by number of pangenomic + genomes or contigs databases containing the reaction. A list of "source combination" + tuples can be provided instead to color explicitly by genome or database membership. + Tuples should consist of source names (genome names or database project names) and their + combinations, e.g., [('A', ), ('B', ), ('C', ), ('A', 'B'), ('A', 'C'), ('B', 'C'), + ('A', 'B', 'C')]. + + draw_map_lacking_kos : bool, False + If False, by default, only draw the map if it contains any of the select KOs. If True, + draw the map regardless, meaning that nothing may be highlighted. + + Returns + ======= + bool + True if the map was drawn, False if the map was not drawn because it did not contain any + of the select KOs and 'draw_map_lacking_kos' was False. + """ + pathway = self._get_pathway(pathway_number) + + combo_lookup: Dict[Tuple[str], Tuple[str]] = {} + if source_combos is not None: + for combo in source_combos: + combo_lookup[tuple(sorted(combo))] = combo + + entries = pathway.get_entries(kegg_ids=ko_membership) + if not entries and not draw_map_lacking_kos: + return False + + # Change the colors of the KO graphics. A reaction Entry can represent multiple KOs. Also, + # in overview and standard maps, widen lines from the base map default of 1.0. 
+ color_hexcodes = list(color_priority) + for entry in entries: + source_names = [] + for kegg_name in entry.name.split(): + split_kegg_name = kegg_name.split(':') + kegg_id = split_kegg_name[1] + try: + source_names += ko_membership[kegg_id] + except KeyError: + continue + assert len(source_names) + + if source_combos is None: + color_hexcode = color_hexcodes[len(set(source_names)) - 1] + else: + source_combo = combo_lookup[tuple(sorted(set(source_names)))] + color_hexcode = color_hexcodes[source_combos.index(source_combo)] + for uuid in entry.children['graphics']: + graphics: kgml.Graphics = pathway.uuid_element_lookup[uuid] + if pathway.is_global_map: + assert graphics.type == 'line' + graphics.fgcolor = color_hexcode + graphics.bgcolor = '#FFFFFF' + elif pathway.is_overview_map: + assert graphics.type == 'line' + graphics.fgcolor = color_hexcode + graphics.bgcolor = '#FFFFFF' + graphics.width = 5.0 + else: + if graphics.type == 'rectangle': + graphics.fgcolor = '#000000' + graphics.bgcolor = color_hexcode + elif graphics.type == 'line': + graphics.fgcolor = color_hexcode + graphics.bgcolor = '#FFFFFF' + graphics.width = 5.0 + else: + raise AssertionError( + "Ortholog entries are assumed to have Graphics elements of type " + f"'rectangle' or 'line', not the encountered type, '{graphics.type}'." + ) + + # Set the color priorities of entries for proper overlaying in the image. Recolor + # "unprioritized" KO graphics to a background color. In global and overview maps, recolor + # circles to reflect the colors of prioritized reactions involving the compounds. 
+ ortholog_color_priority: Dict[str, Dict[Tuple[str, str], float]] = {} + if pathway.is_global_map: + ortholog_color_priority['line'] = line_color_priority = {} + for color_hexcode, priority in color_priority.items(): + line_color_priority[(color_hexcode, '#FFFFFF')] = priority + pathway.set_color_priority( + {'ortholog': ortholog_color_priority}, + recolor_unprioritized_entries='g', + color_associated_compounds='high', + colormap=colormap + ) + elif pathway.is_overview_map: + ortholog_color_priority['line'] = line_color_priority = {} + for color_hexcode, priority in color_priority.items(): + line_color_priority[(color_hexcode, '#FFFFFF')] = priority + pathway.set_color_priority( + {'ortholog': ortholog_color_priority}, + recolor_unprioritized_entries='w', + color_associated_compounds='high', + colormap=colormap + ) + else: + ortholog_color_priority['rectangle'] = rectangle_color_priority = {} + ortholog_color_priority['line'] = line_color_priority = {} + for color_hexcode, priority in color_priority.items(): + rectangle_color_priority[('#000000', color_hexcode)] = priority + line_color_priority[(color_hexcode, '#FFFFFF')] = priority + pathway.set_color_priority( + {'ortholog': ortholog_color_priority}, + recolor_unprioritized_entries='w' + ) + + # Draw the map. + if self.name_files: + pathway_name = '_' + self._get_filename_pathway_name(pathway_number) + else: + pathway_name = '' + out_path = os.path.join(output_dir, f'kos_{pathway_number}{pathway_name}.pdf') + if os.path.exists(out_path) and self.overwrite_output: + os.remove(out_path) + else: + filesnpaths.is_output_file_writable(out_path, ok_if_exists=False) + self.drawer.draw_map(pathway, out_path) + return True + + def _get_pathway(self, pathway_number: str) -> kgml.Pathway: + """ + Get a Pathway object for the KGML file used in drawing a pathway map. + + Parameters + ========== + pathway_number : str + Numeric ID of the map to draw. 
+ + Returns + ======= + kgml.Pathway + Representation of the KGML file as an object. + """ + # KOs correspond to arrows rather than boxes in global and overview maps. + is_global_map = False + is_overview_map = False + if re.match(kegg.GLOBAL_MAP_ID_PATTERN, pathway_number): + is_global_map = True + elif re.match(kegg.OVERVIEW_MAP_ID_PATTERN, pathway_number): + is_overview_map = True + + # A 1x resolution global 'KO' image is used as the base of the drawing, whereas a 2x + # overview or standard 'map' image is used as the base. The global 'KO' image grays out + # all reaction arrows that are not annotated by KO ID. Select the KGML file accordingly. + if is_global_map: + kgml_path = os.path.join( + self.kegg_context.kgml_1x_ko_dir, f'ko{pathway_number}.xml' + ) + else: + kgml_path = os.path.join( + self.kegg_context.kgml_2x_ko_dir, f'ko{pathway_number}.xml' + ) + pathway = self.xml_ops.load(kgml_path) + + return pathway + + def _get_filename_pathway_name(self, pathway_number: str) -> str: + """ + Format the pathway name corresponding to the number to include in file paths. + + Replace all non-alphanumeric characters except parentheses, brackets, and curly braces with + underscores. Replace multiple consecutive underscores with a single underscore. Strip + leading and trailing underscores. + + Parameters + ========== + pathway_number : str + Numeric ID of a pathway map. + + Returns + ======= + str + Altered version of the pathway name. + """ + try: + pathway_name = self.pathway_names[pathway_number] + except KeyError: + raise ConfigError( + f"The pathway number, '{pathway_number}', is not recognized in the table of KEGG " + "pathway names set up in the KEGG data directory, which can be found here: " + f"'{self.kegg_context.kegg_pathway_list_file}'." 
+ ) + + altered = re.sub(r'[^a-zA-Z0-9()\[\]\{\}]', '_', pathway_name) + altered = re.sub(r'_+', '_', altered) + altered = altered.strip('_') + + return altered + + def _draw_colorbar( + self, + colors: Iterable, + out_path: str, + color_labels: Iterable[str] = None, + label: str = None + ) -> None: + """ + Save a standalone colorbar to a file. + + Parameters + ========== + colors : Iterable + Sequence of Matplotlib color specifications for matplotlib.colors.ListedColormap color + parameter. + + out_path : str + Path to PDF output file. + + color_labels : Iterable[str], None + Labels corresponding to each color. + + label : str, None + Overall colorbar label. + """ + if color_labels is not None: + assert len(colors) == len(color_labels) + + fig, ax = plt.subplots(figsize=(1, 6)) + + cmap = mcolors.ListedColormap(colors) + norm = mcolors.BoundaryNorm(boundaries=range(len(colors) + 1), ncolors=len(colors)) + + cb = plt.colorbar( + plt.cm.ScalarMappable(norm=norm, cmap=cmap), + cax=ax, + orientation='vertical' + ) + + # Don't show tick marks. + cb.ax.tick_params(size=0) + + if color_labels: + # Calculate appropriate font size of tick labels based on color segment height. 
def make_grid(
    self,
    in_paths: Iterable[str],
    out_path: str,
    labels: Iterable[str] = None,
    landscape: bool = False,
    margin: float = 10.0
) -> None:
    """
    Write a single-page PDF containing a grid of the first pages of input PDF files.

    The page has US letter dimensions. The number of grid columns is the ceiling of the square
    root of the number of inputs, and cells are filled row by row. Each input page is scaled to
    fit its cell, maintaining aspect ratio, and is centered in the cell.

    Parameters
    ==========
    in_paths : Iterable[str]
        Paths to input PDFs. Only the first page of each is placed in the grid.

    out_path : str
        Path to output PDF.

    labels : Iterable[str], None
        Labels corresponding to input files, in the same order. Each label is inserted at the
        upper left corner of the corresponding image.

    landscape : bool, False
        Page layout is portrait if False, landscape if True.

    margin : float, 10.0
        Minimum space between cells. The label font size is 0.8 times the margin.

    Raises
    ======
    ConfigError
        If no input paths are provided.
    """
    # The arguments are measured and indexed, so materialize them in case they are iterators.
    in_paths = list(in_paths)
    labels = [] if labels is None else list(labels)

    if not in_paths:
        # Without this check, the row calculation below would divide by zero columns.
        raise ConfigError(
            "No input PDF files were provided to 'make_grid', so there is nothing to arrange "
            "in a grid."
        )
    if labels:
        assert len(in_paths) == len(labels)

    # Find the number of rows and columns in the grid.
    cols = math.ceil(math.sqrt(len(in_paths)))
    rows = math.ceil(len(in_paths) / cols)

    # Find the width and height of each cell.
    width, height = fitz.paper_size('letter-l' if landscape else 'letter')
    cell_width = (width - (cols + 1) * margin) / cols
    cell_height = (height - (rows + 1) * margin) / rows

    fontsize = margin * 0.8

    # Input documents are kept open until the output is saved, then everything is closed, even
    # if an error occurs along the way.
    input_docs = []
    output_doc = fitz.open()
    try:
        output_page = output_doc.new_page(width=width, height=height)

        # Loop through input PDF files, placing them in the grid.
        for i, pdf_path in enumerate(in_paths):
            pdf_doc = fitz.open(pdf_path)
            input_docs.append(pdf_doc)
            page = pdf_doc.load_page(0)

            # Calculate the position of the cell in the grid.
            row, col = divmod(i, cols)
            x = margin + col * (cell_width + margin)
            y = margin + row * (cell_height + margin)

            # Resize the input page to the cell by the longest dimension, maintaining aspect
            # ratio.
            input_aspect_ratio = page.rect.width / page.rect.height
            if input_aspect_ratio > 1:
                draw_width = cell_width
                draw_height = cell_width / input_aspect_ratio
            else:
                draw_height = cell_height
                draw_width = cell_height * input_aspect_ratio

            # If the resized shorter side still exceeds the cell size, resize by that side.
            if draw_width > cell_width:
                draw_width = cell_width
                draw_height = cell_width / input_aspect_ratio
            if draw_height > cell_height:
                draw_height = cell_height
                draw_width = cell_height * input_aspect_ratio

            # Center the image in the cell.
            draw_x = x + (cell_width - draw_width) / 2
            draw_y = y + (cell_height - draw_height) / 2

            # Place the input page.
            rect = fitz.Rect(draw_x, draw_y, draw_x + draw_width, draw_y + draw_height)
            output_page.show_pdf_page(rect, pdf_doc, 0)

            if labels:
                output_page.insert_text((draw_x, draw_y), labels[i], fontsize=fontsize)

        output_doc.save(out_path)
    finally:
        for input_doc in input_docs:
            input_doc.close()
        output_doc.close()
class Element:
    """
    Base representation of an XML element from a KGML file.

    Attributes
    ==========
    uuid : str
        Unique ID assigned at instantiation, which can be used to look up child elements in a
        Pathway object.
    """
    # Subclass names are the same as the capitalized tag attribute.
    tag: str
    # Element attributes are required or not, according to the KGML schema.
    attribute_required: Dict[str, bool]

    def __init__(self) -> None:
        # A random (version 4) UUID is stored in its string form.
        element_id = uuid.uuid4()
        self.uuid = str(element_id)
def __init__(self) -> None:
    """
    Initialize an empty pathway: all XML attributes and file metadata are None, and the child
    and lookup containers are empty.
    """
    # XML attributes of the pathway element (str or None).
    self.name = self.org = self.number = None
    self.title = self.image = self.link = None

    # XML metadata of the KGML file from which the pathway was loaded (str or None).
    self.xml_declaration = self.xml_doctype = self.xml_comment = None

    # Each possible subelement tag starts with its own empty list of subelement UUIDs.
    self.children: Dict[str, List[str]] = {}
    for tag in self.subelement_tags:
        self.children[tag] = []

    self.uuid_element_lookup: Dict[str, Element] = {}
    self.kegg_id_element_lookup: Dict[str, List[Element]] = {}

    self._is_global_map: bool = None
    self._is_overview_map: bool = None

    self.color_priority: Dict[str, Dict[str, Dict[Tuple[str, str], float]]] = {}

    # The parent class initializer is called last, as before.
    super().__init__()
The lowest priority entries + are always those without fg/bg colors defined in the color_priority attribute; these entries + are placed first in the children attribute and KGML files, and they are rendered in the + background of the map and thus can be overlaid by higher priority entries. + + Parameters + ========== + new_color_priority : Dict[str, Dict[str, Dict[Tuple[str, str], float]]] + This dictionary is the basis of the color_priority attribute. + + It has the same structure as the color_priority attribute. Outermost dict keys are Entry + types, any of the possible values of the type attribute of the Entry class, e.g., + 'ortholog' and 'compound'. Middle dict keys are Graphics types, any of the possible + values of the type attribute of the Graphics class, e.g., 'rectangle' and 'line'. Inner + dict keys are length-2 tuples of fgcolor and bgcolor hex codes, respectively. Inner dict + values are non-negative numbers indicating the priority of Entry Graphics with the given + foreground and background colors: higher numbers indicate higher priority colors. + + What is actually used to set color_priority is a deep copy of the argument in which + fg/bg color combinations (entries in each inner dict) are reordered by priority value + ascending, so that the lowest priority colors appear first in each inner dict. The order + of Entry and Graphics types (outer and middle dict entries) do not not change: so if, + for example, 'ortholog' appears before 'compound' in the outermost dict keys, then + ortholog entries will occur before compound entries in the KGML file, and compounds can + be drawn over orthologs. + + recolor_unprioritized_entries : Union[str, Dict[str, Dict[str, Tuple[str, str]]]], False + Recolor unprioritized entries, either automatically with a string argument, or with a + custom dictionary argument for fine-tuning foreground and background colors by Entry + type. The valid string arguments for automatic recoloring are 'w' and 'g'. 
+ + It is assumed that global maps contain reaction lines and compound circles, so automatic + recoloring is tailored to ortholog Entry line Graphics and compound Entry circle + Graphics. 'w' erases unprioritized lines and circles by coloring them entirely white. + 'g' colors them a light gray (#E0E0E0), consistent with other "unidentified" reactions + in the base map. + + It is assumed that overview maps contain reaction lines (drawn as arrows) and compound + circles. Unlike global maps, 'w' colors unprioritized arrows black, consistent with + other "unidentified" reactions in the base map. 'w' colors unprioritized circles white + (with a black border). 'g' colors arrows and circles light gray. + + It is assumed that standard maps contain ortholog boxes or lines and compound circles. + 'w' colors unprioritized boxes white, with black text; lines black; and circles white. + 'g' colors unprioritized boxes light gray, with black text; lines light gray; and + circles light gray. + + A custom dictionary argument can be used to set fg/bg colors in detail. Outer dict keys + are Entry types, e.g., 'ortholog', 'compound'. Inner dict keys are Graphics types, e.g., + 'rectangle', 'line'. Inner dict values are length-2 tuples of color hex codes for fg and + bg colors, respectively. This is shown in the following example, which sets the fg + (text) color of unprioritized ortholog Entry rectangle Graphics to dark gray and the bg + to light gray; the fg color of unprioritized ortholog Entry line Graphics to black and + the bg to white; and the fg (border) of unprioritized compounds to black and the bg to + white. + { + 'ortholog': { + 'rectangle': ('#A9A9A9', '#E0E0E0'), + 'line': ('#000000', '#FFFFFF') + }, + 'compound': { + 'circle': ('#000000', '#FFFFFF') + } + } + + color_associated_compounds : Literal['high', 'low', 'average'], None + Automatically set the background color of compound entries based on the color priority + of ortholog entries involving the compounds. 
By default, compounds are circles, and + orthologs are lines on global/overview maps and boxes or lines on standard maps. + + An argument of 'high' or 'low' sets the compound background color to the bg color of the + ortholog with the highest or lowest priority fg/bg color combination. 'average' sets the + bg color to the average bg color of orthologs with prioritized colors -- unprioritized + orthologs are not taken into account. 'average' should only be used if priority values + are normalized to the interval [0, 1] and can be converted to a color given by the + colormap argument. The average priority value of the orthologs with prioritized colors + is mapped to a bg color for compound Entry circle Graphics. + + Automatically colored compound entries are added to the color_priority attribute, and + Entry elements in the children attribute are reordered accordingly. Compound entries + that are already in the color_priority attribute are exempt from recoloring and given + higher priority than automatically recolored compound entries. + + colormap : matplotlib.colors.Colormap, None + If 'average' is used as the color_associated_compounds argument, a colormap must be + provided to map averaged priority values on the interval [0, 1] to a background color + for compound Entry circle Graphics. + """ + # Check that new_color_priority only contains positive priority values. + for new_entry_color_priority in new_color_priority.values(): + for new_graphics_color_priority in new_entry_color_priority.values(): + for priority in new_graphics_color_priority.values(): + assert priority >= 0 + + # Make the color_priority attribute dict, reordering colors from lowest to highest priority. 
+ color_priority = {} + for entry_type, new_entry_color_priority in new_color_priority.items(): + color_priority[entry_type] = entry_type_color_priority = {} + for graphics_type, new_graphics_color_priority in new_entry_color_priority.items(): + entry_type_color_priority[graphics_type] = graphics_type_color_priority = {} + for colors, priority in sorted( + new_graphics_color_priority.items(), key=lambda item: item[1] + ): + graphics_type_color_priority[colors] = priority + self.color_priority = color_priority + + # Reorder Entry elements in the children attribute from lowest to highest priority. + unprioritized_entry_uuids = self.order_entries_by_color_priority() + + if recolor_unprioritized_entries: + if isinstance(recolor_unprioritized_entries, str): + assert recolor_unprioritized_entries in ('w', 'g') + + # Recolor orthologs. + if recolor_unprioritized_entries == 'w': + if self.is_overview_map: + color_hex_code = '#000000' + else: + color_hex_code = '#FFFFFF' + elif recolor_unprioritized_entries == 'g': + color_hex_code = '#E0E0E0' + self.recolor_unprioritized_ortholog_entries( + unprioritized_entry_uuids, color_hex_code + ) + + # Recolor compounds. + if recolor_unprioritized_entries == 'w': + color_hex_code = '#FFFFFF' + elif recolor_unprioritized_entries == 'g': + color_hex_code = '#E0E0E0' + self.recolor_unprioritized_compound_entries( + unprioritized_entry_uuids, color_hex_code + ) + else: + self.recolor_unprioritized_entries( + unprioritized_entry_uuids, recolor_unprioritized_entries + ) + + if color_associated_compounds is None: + return + self.color_associated_compounds(color_associated_compounds, colormap=colormap) + + # Reorder compound entries in the children attribute according to color priority. + unprioritized_entry_uuids = self.order_entries_by_color_priority() + + # Recolor compounds that have not been assigned a color priority. 
+ if recolor_unprioritized_entries: + if isinstance(recolor_unprioritized_entries, str): + if recolor_unprioritized_entries == 'w': + color_hex_code = '#FFFFFF' + elif recolor_unprioritized_entries == 'g': + color_hex_code = '#E0E0E0' + self.recolor_unprioritized_compound_entries( + unprioritized_entry_uuids, color_hex_code + ) + else: + self.recolor_unprioritized_entries( + unprioritized_entry_uuids, recolor_unprioritized_entries + ) + + def order_entries_by_color_priority(self) -> List[str]: + """ + Reorder Entry (e.g., 'ortholog' and 'compound') UUIDs by color priority in the children + attribute of the Pathway object. This determines how entries are ordered in KGML files and + rendered in maps. + + Returns + ======= + List[str] + UUIDs of Entry elements without a color priority. + """ + # Entries have different types ('ortholog', 'compound', etc.). Group entries into two + # classes. "Qualifying" entries have types in the color priority outermost dict. Other + # entries do not have types in the dict and are assigned the lowest nominal priority of + # -1.0. No effort is made to sort these entries in any way. + reordered_entry_uuids: List[str] = [] + unprioritized_entry_uuids: List[str] = [] + qualifying_entry_uuids: Dict[str, List[str]] = { + entry_type: [] for entry_type in self.color_priority + } + for entry_uuid in self.children['entry']: + entry: Entry = self.uuid_element_lookup[entry_uuid] + if entry.type in self.color_priority: + qualifying_entry_uuids[entry.type].append(entry_uuid) + else: + reordered_entry_uuids.append(entry_uuid) + unprioritized_entry_uuids.append(entry_uuid) + + # Sort "qualifying" entries. Loop through each Entry type in the color priority dict. + for entry_type, entry_type_color_priority in self.color_priority.items(): + # Retrieve each Entry object of the type. Its priority is determined from fg and bg + # colors. 
+ type_qualifying_entry_uuids = qualifying_entry_uuids[entry_type] + type_priority_entry_uuids: Dict[float, List[str]] = {} + for entry_uuid in type_qualifying_entry_uuids: + entry: Entry = self.uuid_element_lookup[entry_uuid] + + # Ensure that all of the Entry Graphics elements are of the same type and have the + # same fg and bg colors. + graphics_types: List[str] = [] + fgcolors: List[str] = [] + bgcolors: List[str] = [] + for graphics_uuid in entry.children['graphics']: + graphics_element: Graphics = self.uuid_element_lookup[graphics_uuid] + graphics_types.append(graphics_element.type) + fgcolors.append(graphics_element.fgcolor) + bgcolors.append(graphics_element.bgcolor) + if len(set(graphics_types)) != 1: + graphics_type_message = ', '.join([f"'{gt}'" for gt in graphics_types]) + raise AssertionError( + f"The Graphics elements for the Entry with UUID '{entry_uuid}' do not " + "have the same type, which is required for ordering entries based on " + f"color. Graphics have types: {graphics_type_message}" + ) + if len(set(fgcolors)) != 1 or len(set(bgcolors)) != 1: + raise AssertionError( + f"The Graphics elements in the Entry with UUID '{entry_uuid}' do not " + "have consistent foreground and background colors, which is required " + "for ordering entries based on color." + ) + + graphics_type = graphics_types[0] + try: + priority = entry_type_color_priority[graphics_type][(fgcolors[0], bgcolors[0])] + except KeyError: + # The Entry does not have colors in the priority dictionary. + priority = -1.0 + + try: + type_priority_entry_uuids[priority].append(entry_uuid) + except KeyError: + type_priority_entry_uuids[priority] = [entry_uuid] + + # Add the reordered UUIDs of the Entry type to the new list of Entry UUIDs and to the + # dict mapping priority values to UUIDs of entries of all types. 
def recolor_unprioritized_ortholog_entries(
    self,
    unprioritized_entry_uuids: List[str],
    color_hex_code: str
) -> None:
    """
    Recolor orthologs without a color priority.

    Ortholog entries are expected to have Graphics elements of type 'line' in global and
    overview maps and type 'rectangle' or 'line' in standard maps. The color is applied to the
    foreground of a line, and the background is made white. The color is applied to the
    background of a rectangle, and the foreground (text) is made black.

    The work is delegated to `recolor_unprioritized_entries`, which only touches entries of type
    'ortholog' among the given UUIDs because that is the only Entry type in the color dictionary
    passed to it.

    Parameters
    ==========
    unprioritized_entry_uuids : List[str]
        List of UUIDs of all entries without a color priority.

    color_hex_code : str
        Hex code of the color for ortholog graphics.

    Returns
    =======
    None
    """
    # Lines are colored through their foreground in every kind of map; the background of a line
    # is always white. (Standard maps previously received a black line background, which
    # disagreed with both the documented behavior and the global/overview branch.)
    line_colors = (color_hex_code, '#FFFFFF')

    if self.is_global_map or self.is_overview_map:
        # Global and overview maps draw orthologs exclusively as lines.
        ortholog_colors = {'line': line_colors}
    else:
        # Standard maps draw orthologs as boxes (black text on colored fill) or as lines.
        ortholog_colors = {'rectangle': ('#000000', color_hex_code), 'line': line_colors}

    self.recolor_unprioritized_entries(unprioritized_entry_uuids, {'ortholog': ortholog_colors})
def recolor_unprioritized_entries(
    self,
    unprioritized_entry_uuids: List[str],
    type_colors: Dict[str, Dict[str, Tuple[str, str]]]
) -> None:
    """
    Assign new foreground/background colors, chosen by Entry type and Graphics type, to entries
    that lack a color priority.

    Parameters
    ==========
    unprioritized_entry_uuids : List[str]
        List of UUIDs of all entries without a color priority.

    type_colors : Dict[str, Dict[str, Tuple[str, str]]]
        Outer dictionary keys are Entry types, e.g., 'ortholog', 'compound'. Inner dict keys are
        Graphics types, e.g., 'rectangle', 'line', 'circle'. Inner dict values are length-2
        tuples of foreground and background color hex codes, respectively. The following example
        sets the fg (text) color of unprioritized ortholog rectangles to dark gray and the bg to
        light gray; the fg color of unprioritized ortholog lines to black and the bg to white;
        and the fg (border) of unprioritized compound circles to black and the bg to white.
        {
            'ortholog': {
                'rectangle': ('#A9A9A9', '#E0E0E0'),
                'line': ('#000000', '#FFFFFF')
            },
            'compound': {
                'circle': ('#000000', '#FFFFFF')
            }
        }

    Raises
    ======
    ConfigError
        If a requested (fg, bg) combination is already a key of the color priority dictionary
        for the same Entry type and Graphics type.
    """
    # Validation pass: a requested color pair must not coincide with a prioritized color pair,
    # or recolored entries would become indistinguishable from prioritized ones.
    prioritized = self.color_priority
    for requested_entry_type, requested_graphics_colors in type_colors.items():
        if requested_entry_type not in prioritized:
            continue
        prioritized_for_type = prioritized[requested_entry_type]
        for requested_graphics_type, fg_bg_pair in requested_graphics_colors.items():
            if requested_graphics_type not in prioritized_for_type:
                continue
            if fg_bg_pair in prioritized_for_type[requested_graphics_type]:
                raise ConfigError(
                    "Unprioritized entry graphics cannot be assigned the same combination of "
                    "foreground and background colors as prioritized entries of the same entry "
                    "and graphics types."
                )

    # Recoloring pass: entries and graphics whose types are absent from `type_colors` are left
    # untouched.
    lookup = self.uuid_element_lookup
    for uuid in unprioritized_entry_uuids:
        unprioritized_entry: Entry = lookup[uuid]
        if unprioritized_entry.type not in type_colors:
            continue
        colors_by_graphics_type = type_colors[unprioritized_entry.type]
        for child_uuid in unprioritized_entry.children['graphics']:
            graphics_element: Graphics = lookup[child_uuid]
            if graphics_element.type not in colors_by_graphics_type:
                continue
            new_fgcolor, new_bgcolor = colors_by_graphics_type[graphics_element.type]
            graphics_element.fgcolor = new_fgcolor
            graphics_element.bgcolor = new_bgcolor
'average' should only be used if priority values + are normalized to the interval [0, 1] and can be converted to a color given by the + colormap argument. The average priority value of the orthologs with prioritized colors + is mapped to a color for the compound Entry. + + Compound entries that are already in the color_priority attribute are exempt from + recoloring and given higher priority than recolored compound entries. + + colormap : matplotlib.colors.Colormap, None + If 'average' is used as an argument to transfer, a colormap must be provided to map + averaged priority values on the interval [0, 1] to a background color for compound + entries. + """ + # Make Reaction elements searchable by name (KEGG IDs). Reaction elements link Compound + # elements to ortholog Entry elements. + name_reaction: Dict[str, Reaction] = {} + for entry_uuid in self.children['reaction']: + reaction: Reaction = self.uuid_element_lookup[entry_uuid] + name_reaction[reaction.name] = reaction + + # For each compound Entry with associated color-prioritized ortholog entries, record the + # colors and priorities of these entries. + compound_uuid_color_priorities: Dict[str, List[Tuple[str, float]]] = {} + # Loop through each ortholog Entry. + for entry_uuid in self.children['entry']: + entry: Entry = self.uuid_element_lookup[entry_uuid] + if entry.type != 'ortholog': + continue + + # Ensure that all of the ortholog Entry Graphics elements have the same fg/bg colors. 
+ graphics_types: List[str] = [] + fgcolors: List[str] = [] + bgcolors: List[str] = [] + for graphics_uuid in entry.children['graphics']: + graphics: Graphics = self.uuid_element_lookup[graphics_uuid] + graphics_types.append(graphics.type) + fgcolors.append(graphics.fgcolor) + bgcolors.append(graphics.bgcolor) + if len(set(graphics_types)) != 1: + graphics_type_message = ', '.join([f"'{gt}'" for gt in graphics_types]) + raise AssertionError( + f"The Graphics elements for the Entry with UUID '{entry_uuid}' do not " + "have the same type, which is required for ordering entries based on " + f"color. Graphics have types: {graphics_type_message}" + ) + if len(set(fgcolors)) != 1 or len(set(bgcolors)) != 1: + raise AssertionError( + "The Graphics elements in the ortholog Entry with the following UUID do not " + "have consistent foreground and background colors, which is required for " + f"ordering entries based on color: {entry_uuid}" + ) + + graphics_type = graphics_types[0] + fgcolor = fgcolors[0] + bgcolor = bgcolors[0] + try: + priority = self.color_priority['ortholog'][graphics_type][(fgcolor, bgcolor)] + except KeyError: + # Unprioritized ortholog entries do not affect the color of associated compounds. + continue + + reaction_name = entry.reaction + if reaction_name is None: + # The ortholog is not associated with a reaction. + continue + + try: + reaction = name_reaction[reaction_name] + except KeyError: + # No Reaction element is present with the name of the ortholog reaction. 
+ continue + + if graphics.type == 'line': + ortholog_color = fgcolor + else: + ortholog_color = bgcolor + + for substrate_uuid in reaction.children['substrate']: + substrate: Substrate = self.uuid_element_lookup[substrate_uuid] + split_substrate_names = [ + split_name.split(':') for split_name in substrate.name.split() + ] + for split_name in split_substrate_names: + for compound_entry in self.kegg_id_element_lookup[split_name[1]]: + compound_entry: Entry + compound_uuid = compound_entry.uuid + try: + compound_uuid_color_priorities[compound_uuid].append( + (ortholog_color, priority) + ) + except KeyError: + compound_uuid_color_priorities[compound_uuid] = [ + (ortholog_color, priority) + ] + + for product_uuid in reaction.children['product']: + product: Product = self.uuid_element_lookup[product_uuid] + split_product_names = [split_name.split(':') for split_name in product.name.split()] + for split_name in split_product_names: + for compound_entry in self.kegg_id_element_lookup[split_name[1]]: + compound_entry: Entry + compound_uuid = compound_entry.uuid + try: + compound_uuid_color_priorities[compound_uuid].append( + (ortholog_color, priority) + ) + except KeyError: + compound_uuid_color_priorities[compound_uuid] = [ + (ortholog_color, priority) + ] + + # Make compound entries searchable by ID, which should be a unique pathway element ID. + id_compound_entry: Dict[str, Entry] = {} + for entry_uuid in self.children['entry']: + entry: Entry = self.uuid_element_lookup[entry_uuid] + if entry.type != 'compound': + continue + id_compound_entry[entry.id] = entry + + # Define functions for finding compound Entry color. 
+ def _get_high_color(color_priorities: List[Tuple[str, float]]) -> Tuple[str, float]: + return sorted(color_priorities, key=lambda t: -t[1])[0] + + def _get_low_color(color_priorities: List[Tuple[str, float]]) -> Tuple[str, float]: + return sorted(color_priorities, key=lambda t: t[1])[0] + + def _get_average_color(color_priorities: List[Tuple[str, float]]) -> Tuple[str, float]: + priority = np.mean([t[1] for t in color_priorities]) + color = rgb2hex(colormap(priority)) + return color, priority + + if transfer == 'high': + get_color_priority = _get_high_color + elif transfer == 'low': + get_color_priority = _get_low_color + elif transfer == 'average': + get_color_priority = _get_average_color + else: + raise AssertionError + + # Set compound Entry color. + for compound_uuid, color_priorities in compound_uuid_color_priorities.items(): + compound: Union[Substrate, Product] = self.uuid_element_lookup[compound_uuid] + compound_entry: Entry = id_compound_entry[compound.id] + + # Get all of the Graphics elements for the Entry. + graphics_elements: List[Graphics] = [] + for graphics_uuid in compound_entry.children['graphics']: + graphics_elements.append(self.uuid_element_lookup[graphics_uuid]) + + set_color = True + for graphics in graphics_elements: + try: + # The compound Entry has already been assigned a color priority, so don't + # recolor it automatically. + self.color_priority['compound']['circle'][(graphics.fgcolor, graphics.bgcolor)] + set_color = False + except KeyError: + continue + if not set_color: + continue + + compound_color, compound_priority = get_color_priority(color_priorities) + # Set the color of each Graphics element. + for graphics in graphics_elements: + graphics.bgcolor = compound_color + if self.is_global_map: + graphics.fgcolor = compound_color + + # Record the compound Element color priority. 
def get_entries(
    self,
    entry_type: str = None,
    kegg_ids: Iterable[str] = None,
    expect_kegg_ids: bool = False
) -> List['Entry']:
    """
    Retrieve Entry elements of the pathway, either all of them (optionally restricted to one
    Entry type) or those registered under given KEGG IDs.

    Parameters
    ==========
    entry_type : str, None
        The type of Entry to return. By default entries of all types are returned. Permitted
        Entry types are given by the types attribute of the Entry class. This argument cannot be
        combined with 'kegg_ids'.

        The box Entry types (line Entry types in global and overview maps) depend on the map
        name prefix: maps starting with 'ko' have box/line entries of type 'ortholog', 'ec' have
        type 'enzyme', and 'rn' have type 'reaction'. NOTE(review): organism-specific maps were
        previously documented here as having type 'ko', which is not one of Entry.types;
        presumably 'gene' is meant -- confirm against the KGML specification.

    kegg_ids : Iterable[str], None
        If KEGG IDs are provided, then only elements stored under these IDs in the pathway's
        KEGG ID lookup are returned, in the order of the IDs. With the default argument of None,
        all entries (of the requested type) are returned. KEGG IDs should not be the full ID
        found in the Entry name attribute, but the part after the colon. For example, instead of
        'ko:K01080', 'cpd:C12144', and 'path:map00604', which is how they appear in the KGML
        file, use 'K01080', 'C12144', and 'map00604'.

    expect_kegg_ids : bool, False
        If KEGG IDs are provided and this argument is True, then an exception is raised if any
        of them are not found in the pathway.

    Returns
    =======
    List[Entry]
        A list of entry element objects contained in the pathway.

    Raises
    ======
    ValueError
        If 'expect_kegg_ids' is True and at least one of the 'kegg_ids' is not found.
    """
    if entry_type is not None:
        assert entry_type in Entry.types
    if kegg_ids is not None:
        assert entry_type is None

    if kegg_ids is None:
        # No ID filter: walk the pathway's entry children in their stored order.
        candidates = (self.uuid_element_lookup[uuid] for uuid in self.children['entry'])
        return [
            candidate for candidate in candidates
            if entry_type is None or candidate.type == entry_type
        ]

    found: List[Entry] = []
    absent_kegg_ids: List[str] = []
    for kegg_id in kegg_ids:
        if kegg_id in self.kegg_id_element_lookup:
            found.extend(self.kegg_id_element_lookup[kegg_id])
        else:
            absent_kegg_ids.append(kegg_id)

    if expect_kegg_ids and absent_kegg_ids:
        raise ValueError(
            "The following 'kegg_ids' that were provided are not found among entries in the "
            f"pathway: {', '.join(absent_kegg_ids)}"
        )
    return found
class Entry(Element):
    """
    KGML 'entry' element, which holds the information on one node of the pathway.

    Attributes
    ==========
    types : Tuple[str]
        Class variable listing the possible entry types.

    subelement_tags : Tuple[str]
        Class variable listing the possible subelement tags.

    id : str, None
        ID of the entry, unique within the map.

    name : str, None
        KEGG ID(s) represented by the element.

    type : str, None
        Entry type, expected to be one of `types`.

    reaction : str, None
        KEGG ID(s) of reaction(s) represented by the element.

    link : str, None
        URL of entry information.

    children : Dict[str, List[str]]
        Keys are subelement tags, values are lists of subelement UUIDs.
    """
    tag = 'entry'
    # Maps each XML attribute of the element to whether it is required.
    attribute_required = {
        'id': True,
        'name': True,
        'type': True,
        'reaction': False,
        'link': False
    }
    types: Tuple[str] = (
        'ortholog',
        'enzyme',
        'reaction',
        'gene',
        'group',
        'compound',
        'map',
        'brite',
        'other'
    )
    subelement_tags: Tuple[str] = (
        'graphics',
        'component'
    )

    def __init__(self) -> None:
        # Every XML attribute starts out unset.
        self.id: str = None
        self.name: str = None
        self.type: str = None
        self.reaction: str = None
        self.link: str = None

        # One initially empty list of child UUIDs per permitted subelement tag.
        self.children: Dict[str, List[str]] = {
            subelement_tag: [] for subelement_tag in self.subelement_tags
        }

        super().__init__()
class Component(Element):
    """
    KGML 'component' element, a subelement that only applies to "group"-type entry elements
    representing a complex.

    Attributes
    ==========
    id : str, None
        ID of the component element, unique within the map.
    """
    tag = 'component'
    # The single XML attribute of the element, which is required.
    attribute_required = {'id': True}

    def __init__(self) -> None:
        # The attribute is unset until the element is loaded or populated.
        self.id: str = None

        super().__init__()
class Subtype(Element):
    """
    KGML 'subtype' element, which gives more detailed information about its parent relation.

    Attributes
    ==========
    names : Tuple[str]
        Class variable listing the possible names of relation subcategories.

    name : str, None
        Name of the subcategory of relation.

    value : str, None
        Information on the subcategory of relation.
    """
    tag = 'subtype'
    # Both XML attributes of the element are required.
    attribute_required = {
        'name': True,
        'value': True
    }
    names: Tuple[str] = (
        'compound',
        'hidden compound',
        'activation',
        'inhibition',
        'expression',
        'repression',
        'indirect effect',
        'state change',
        'binding/association',
        'dissociation',
        'missing information',
        'phosphorylation',
        'dephosphorylation',
        'glycosylation',
        'ubiquitination',
        'methylation'
    )

    def __init__(self) -> None:
        # Attributes are unset until the element is loaded or populated.
        self.name: str = None
        self.value: str = None

        super().__init__()
class Substrate(Element):
    """
    KGML 'substrate' element, a substrate node within a parent reaction element.

    Attributes
    ==========
    subelement_tags : Tuple[str]
        Class variable listing the possible subelement tags.

    id : str, None
        ID of the substrate, unique within the map, corresponding to a compound entry.

    name : str, None
        KEGG ID of the compound.

    children : Dict[str, List[str]]
        Keys are subelement tags, values are lists of subelement UUIDs.
    """
    tag = 'substrate'
    # Both XML attributes of the element are required.
    attribute_required = {
        'id': True,
        'name': True
    }
    subelement_tags: Tuple[str] = (
        'alt',
    )

    def __init__(self) -> None:
        # Attributes are unset until the element is loaded or populated.
        self.id: str = None
        self.name: str = None

        # One initially empty list of child UUIDs per permitted subelement tag.
        self.children: Dict[str, List[str]] = {
            subelement_tag: [] for subelement_tag in self.subelement_tags
        }

        super().__init__()
class XMLOps:
    """
    This class loads KGML (XML) files into memory in an object-oriented framework, and converts
    KGML objects into a properly formatted string that can be written to an XML file.

    Attributes
    ==========
    subelement_indentation_increment : int
        Class variable setting the indentation increment of subelements relative to parents in an
        output KGML XML file. The value of 4 spaces is that used in KGML reference files.

    attribute_indentations : Dict[Tuple[str, str, str], int]
        Class variable setting the absolute indentation of element attributes placed on new lines
        in an output KGML XML file. Keys are tuples of element tag, name of the attribute before
        the line break, and name of the attribute after the line break; values are the number of
        spaces. The attributes placed on new lines and numbers of spaces are those used in KGML
        reference files.
    """
    subelement_indentation_increment: int = 4
    attribute_indentations: Dict[Tuple[str, str, str], int] = {
        ('pathway', 'number', 'title'): 9,
        ('pathway', 'title', 'image'): 9,
        ('pathway', 'image', 'link'): 9,
        ('entry', 'reaction', 'link'): 8,
        ('entry', 'type', 'link'): 8,
        ('graphics', 'bgcolor', 'type'): 13
    }

    def __init__(self) -> None:
        pass

    def load(self, kgml_filepath: str) -> 'Pathway':
        """
        Load a KGML file as element objects.

        The first three lines of the file are stored verbatim on the returned pathway object as
        the XML declaration, doctype, and comment, respectively, so the file is expected to
        contain at least three lines.

        Parameters
        ==========
        kgml_filepath : str
            Path to a KGML file.

        Returns
        =======
        Pathway
            KGML pathway element object containing all data from the file via subelements.
        """
        assert os.path.exists(kgml_filepath)

        with open(kgml_filepath, 'rb') as file:
            kgml_bytes = file.read()
        root = ET.fromstring(kgml_bytes)
        assert root.tag == Pathway.tag

        pathway: Pathway = self.load_element(root)
        # The metadata lines precede the root element and are not part of the parsed tree, so
        # they are taken directly from the raw file contents.
        pathway.xml_declaration, pathway.xml_doctype, pathway.xml_comment = [
            line.decode('utf-8') for line in kgml_bytes.split(b'\n')[: 3]
        ]

        return pathway

    def load_element(self, xml_element: ET.Element, pathway: 'Pathway' = None) -> 'Element':
        """
        Load a KGML element object representing an XML element from a KGML file.

        Subelements are loaded recursively. Every element other than the pathway itself is
        registered in the pathway's UUID lookup, and elements with KEGG IDs in their names are
        also registered in the pathway's KEGG ID lookup.

        Parameters
        ==========
        xml_element : xml.etree.ElementTree.Element
            XML element loaded from KGML file.

        pathway : Pathway
            The pathway object containing the loaded KGML element, None if the element being
            loaded is the pathway itself.

        Returns
        =======
        Element
            Object representing KGML element.

        Raises
        ======
        AssertionError
            If a required attribute is missing from the XML element, or if a subelement has a
            tag that is not permitted for the element.
        """
        # The element class is found in this module by capitalizing the tag, e.g., 'entry' is
        # represented by the class, Entry.
        kgml_element_class = globals()[xml_element.tag.capitalize()]
        kgml_element: Element = kgml_element_class()

        # Consider each possible attribute of the KGML element.
        for attribute, is_required in kgml_element.attribute_required.items():
            try:
                value = xml_element.attrib[attribute]
            except KeyError:
                if not is_required:
                    # The optional attribute was not present.
                    continue
                # The required attribute was not present.
                error_message = "".join(
                    f" '{a}': '{v}'" for a, v in xml_element.attrib.items()
                )
                raise AssertionError(
                    "An XML element was encountered that should but does not contain an "
                    f"attribute, '{attribute}'. Here is a list of the element's attributes "
                    f"read from the KGML file:{error_message}")

            # Convert certain non-string values stored in KGML element objects.
            if kgml_element.tag == 'graphics':
                if attribute in ('x', 'y', 'width', 'height'):
                    value = float(value)
                elif attribute == 'coords':
                    value = tuple([int(coord) for coord in value.split(',')])

            setattr(kgml_element, attribute, value)

        # Recursively load subelements. When the pathway itself is being loaded, it is the
        # container passed down to all of its descendants.
        container = kgml_element if pathway is None else pathway
        for xml_subelement in xml_element:
            assert xml_subelement.tag in kgml_element.subelement_tags
            kgml_subelement = self.load_element(xml_subelement, pathway=container)
            kgml_element.children[kgml_subelement.tag].append(kgml_subelement.uuid)

        if pathway is None:
            return kgml_element

        # Map unique IDs to element objects.
        pathway.uuid_element_lookup[kgml_element.uuid] = kgml_element

        # Map KEGG IDs to element objects; IDs are not necessarily unique to objects. Note that
        # the first part of the KEGG ID as it appears in the KGML file is stripped. For example,
        # 'ko:K01080', 'cpd:C12144', and 'path:map00604' become 'K01080', 'C12144', and
        # 'map00604' in the dictionary keys.
        if kgml_element.tag in (
            'pathway',
            'entry',
            'reaction',
            'substrate',
            'product',
            'alt'
        ):
            for kegg_name in kgml_element.name.split():
                # Some Entry names are "undefined" and lack the 'prefix:ID' form.
                split_kegg_name = kegg_name.split(':')
                if len(split_kegg_name) != 2:
                    continue
                kegg_id = split_kegg_name[1]
                try:
                    pathway.kegg_id_element_lookup[kegg_id].append(kgml_element)
                except KeyError:
                    pathway.kegg_id_element_lookup[kegg_id] = [kgml_element]

        return kgml_element

    def write(self, pathway: 'Pathway', output_filepath: str) -> None:
        """
        Write a KGML object representation as a formatted KGML (XML) file.

        Parameters
        ==========
        pathway : Pathway
            KGML pathway element object.

        output_filepath : str
            Path to KGML (XML) output file.

        Returns
        =======
        None
        """
        assert is_output_file_writable(output_filepath)
        # The metadata lines are decoded as UTF-8 when a file is loaded, so the same encoding is
        # used explicitly when writing rather than relying on the locale default.
        with open(output_filepath, 'w', encoding='utf-8') as file:
            file.write(self.get_str(pathway))

    def get_str(self, pathway: 'Pathway') -> str:
        """
        Convert a KGML object representation to a formatted KGML XML string.

        Parameters
        ==========
        pathway : Pathway
            KGML pathway element object.

        Returns
        =======
        str
            Formatted XML string ready to write as a KGML file.
        """
        assert pathway.tag == Pathway.tag
        tree = self.get_tree(pathway, str_values=True)

        output_str = ''

        # Record KGML XML metadata, stored in the pathway object. Note that special HTML
        # characters (&, >, <) are not escaped, as these were not observed in the metadata of
        # KGML reference files. Each stored line is checked for its expected opening and then
        # emitted verbatim.
        xml_declaration = pathway.xml_declaration
        if xml_declaration is not None:
            assert xml_declaration[: 6] == '<?xml '
            output_str += f'{xml_declaration}\n'

        xml_doctype = pathway.xml_doctype
        if xml_doctype is not None:
            assert xml_doctype[: 10] == '<!DOCTYPE '
            output_str += f'{xml_doctype}\n'

        xml_comment = pathway.xml_comment
        if xml_comment is not None:
            assert xml_comment[: 5] == '<!-- '
            output_str += f'{xml_comment}\n'

        output_str += self.get_indented_str(tree.getroot())

        return output_str

    def get_tree(self, pathway: 'Pathway', str_values: bool = False) -> ET.ElementTree:
        """
        Convert a KGML object representation to an XML tree.

        Parameters
        ==========
        pathway : Pathway
            KGML pathway element object.

        str_values : bool, False
            If True, convert non-string attribute values stored in KGML element objects to
            strings like those in KGML reference files.

        Returns
        =======
        xml.etree.ElementTree.ElementTree
            XML representation of KGML elements, with the root pathway element.
        """
        root = self.get_element(pathway, str_values=str_values)
        tree = ET.ElementTree(root)
        return tree

    def get_element(
        self,
        kgml_element: 'Element',
        pathway: 'Pathway' = None,
        str_values: bool = False
    ) -> ET.Element:
        """
        Convert a KGML element object representation to an XML element.

        Parameters
        ==========
        kgml_element : Element
            KGML element object to convert into an XML element.

        pathway : Pathway
            Pathway object containing the KGML element, needed if the element is not a pathway
            element.

        str_values : bool, False
            If True, convert non-string attribute values stored in the KGML element object, and
            in all of its subelements, to strings like those in KGML reference files.

        Returns
        =======
        xml.etree.ElementTree.Element
            XML representation of the KGML element.

        Raises
        ======
        ValueError
            If 'pathway' is not given and 'kgml_element' is not itself a pathway element.
        """
        if pathway is None:
            if kgml_element.tag != 'pathway':
                raise ValueError(
                    "The 'pathway' element containing 'kgml_element' must be given as an argument."
                )
            pathway = kgml_element

        xml_element = ET.Element(kgml_element.tag)

        for attribute in kgml_element.attribute_required:
            # The KGML element object should have a value of each possible attribute, even those
            # not required, for which the default value is None.
            value = getattr(kgml_element, attribute)
            if value is None:
                continue
            if str_values and kgml_element.tag == 'graphics':
                if attribute in ('x', 'y', 'width', 'height'):
                    value = str(round(value))
                elif attribute == 'coords':
                    value = ','.join([str(round(coord)) for coord in value])
            xml_element.attrib[attribute] = value

        if not hasattr(kgml_element, 'children'):
            return xml_element

        # Recursively add XML subelements. The 'str_values' option must be handed down, since
        # the attributes needing conversion belong to graphics elements, which are always
        # subelements.
        for subelement_uuids in kgml_element.children.values():
            for uuid in subelement_uuids:
                kgml_subelement = pathway.uuid_element_lookup[uuid]
                xml_subelement = self.get_element(
                    kgml_subelement, pathway=pathway, str_values=str_values
                )
                xml_element.append(xml_subelement)

        return xml_element

    def get_indented_str(self, xml_element: ET.Element, tag_indentation: int = 0) -> str:
        """
        Convert a KGML XML element to a string with formatting, including indentation, that is
        consistent with reference KGML files.

        Parameters
        ==========
        xml_element : xml.etree.ElementTree.Element
            XML element to format, along with its subelements.

        tag_indentation : int, 0
            Indentation of the element tag, which, by default, is 0 for the root element, and
            increments by the class variable, `subelement_indentation_increment`, in the
            recursive calls to this method for subelements.

        Returns
        =======
        str
            The formatted string for the element, ready to be written to a KGML file.

        Raises
        ======
        AssertionError
            If the element has no attributes, has an attribute value that is not a str, float,
            or tuple, has a string attribute value containing '&', or contains text.
        """
        tag = xml_element.tag
        attributes = xml_element.attrib
        children = list(xml_element)
        tag_indent = " " * tag_indentation

        if not attributes:
            raise AssertionError(
                "It is assumed that all KGML XML elements have attributes, but an element with the "
                f"tag, '{tag}', did not have any."
            )

        indented_output = f'{tag_indent}<{tag}'
        # No attribute precedes the first one, so it can never match a line break rule and is
        # always placed on the same line as the tag.
        prev_attr = None
        for attr, value in attributes.items():
            if isinstance(value, str):
                # ">" and "<" are encountered in attribute values and are represented by HTML
                # escape characters in KGML files.
                assert '&' not in value
                v = value.replace(">", "&gt;").replace("<", "&lt;")
            elif isinstance(value, float):
                v = str(round(value))
            elif isinstance(value, tuple):
                v = ','.join([str(round(coord)) for coord in value])
            else:
                raise AssertionError(
                    f"The attribute, '{attr}', had a value, '{value}', of unrecognized type "
                    f"'{type(value)}'."
                )

            attr_indentation = self.attribute_indentations.get((tag, prev_attr, attr))
            if attr_indentation is None:
                indented_output += f' {attr}="{v}"'
            else:
                indented_output += f'\n{" " * attr_indentation}{attr}="{v}"'
            prev_attr = attr

        if children:
            indented_output += '>\n'
            for child in children:
                indented_output += self.get_indented_str(
                    child, tag_indentation=tag_indentation + self.subelement_indentation_increment
                )
            # End tag
            indented_output += f'{tag_indent}</{tag}>\n'
        else:
            # Tags without possible subelements, plus substrate and product, which have possible
            # alt subelement, which is never present in reference files.
            if tag in ('graphics', 'component', 'subtype', 'substrate', 'product', 'alt'):
                # Self-closing tag
                indented_output += '/>\n'
            else:
                indented_output += '>\n'
                # End tag
                indented_output += f'{tag_indent}</{tag}>\n'

        if xml_element.text:
            raise AssertionError(
                "It is assumed that KGML XML elements do not contain 'text', but an element with "
                f"the tag, '{tag}', contained the text, '{xml_element.text}'."
            )

        return indented_output
+ + xml_ops : XMLOps + Loads KGML files. + + overwrite_output : bool + If True, methods in this class overwrite existing output files. + + run : anvio.terminal.Run + This object prints run information to the terminal. + + progress : anvio.terminal.Progress + This object prints transient progress information to the terminal. + + non_reactant_transparency : float, 1.0 + This controls the transparency, or alpha, of the background color of compound circles + rendered from KGML for non-reactants, or compounds that don't participate in reactions. This + value is used to set the attribute of Bio.Graphics.KGML_vis.KGMLCanvas, which is 0.3 by + default, which allows the color of the circle in the underlying base map to bleed through, + which is probably not desirable. + """ + def __init__( + self, + kegg_dir: str = None, + overwrite_output: bool = FORCE_OVERWRITE, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress() + ) -> None: + """ + Parameters + ========== + kegg_dir : str, None + Directory containing an anvi'o KEGG database. The default argument of None expects the + KEGG database to be set up in the default directory used by the program + anvi-setup-kegg-data. + + overwrite_output : bool, anvio.FORCE_OVERWRITE + If True, methods in this class overwrite existing output files. + + run : anvio.terminal.Run, anvio.terminal.Run() + This object prints run information to the terminal. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object prints transient progress information to the terminal. 
+ """ + args = Namespace() + args.kegg_data_dir = kegg_dir + self.kegg_context = kegg.KeggContext(args) + + self.xml_ops = XMLOps() + + self.overwrite_output = overwrite_output + self.run = run + self.progress = progress + + self.non_reactant_transparency = 1.0 + + def draw_map( + self, + pathway: Pathway, + output_filepath: str, + map_filepath: str = None, + use_org_map: bool = False, + **kwargs + ) -> None: + """ + Draw a pathway map with KGML data as a PDF file. + + Parameters + ========== + pathway : Pathway + Object representation of a KGML file. + + output_filepath : str + Path to PDF output file containing the pathway map. + + map_filepath : str, None + Path to pathway map image file to use as the base image of the output file. If None, + then a PNG image file is automatically sought in the KEGG data directory, and it is + assumed that the KGML data is scaled to fit the image size; here is more information on + the files that are sought. + + For a standard or overview (not global) map, a 2x resolution 'map' file is sought. If + the org attribute of the pathway object is an organism code and the argument, + use_org_map, is True, then a 1x resolution organism-specific file is sought. + + For a global map, a 1x resolution file is sought; it is assumed the KGML data is scaled + to fit this image size. The org attribute of the pathway object is used to seek the + corresponding file, i.e., a 'ko' pathway containing reactions with KO IDs results in the + reference 'ko' file being sought, whereas an 'ec' pathway containing reactions with EC + number IDs corresponds to the reference 'ec' file. If the org attribute is an organism + code and the argument, use_org_map, is True, then an organism-specific image file is + sought, and if the argument is False, then a 'ko' file is sought. + + use_org_map : bool, False + If True and the org attribute of the pathway object is an organism code, such as 'eco' + for E. 
coli, then an organism-specific 1x resolution file is used if available locally + in the KEGG directory or online for download to that directory. If False and the org + attribute is an organism code, then the 1x 'ko' file is used. + + **kwargs + Valid kwargs are arguments to a biopython.Bio.Graphics.KGML_vis.KGMLCanvas object. These + control what is displayed on the map from the KGML file. + + Arguments include the following. See the KGMLCanvas class definition in the source code + for a full list. + https://github.com/biopython/biopython/blob/master/Bio/Graphics/KGML_vis.py + + import_imagemap : bool + By default True. Setting to False prevents the base map image from being rendered + beneath KGML graphics, which is especially useful for decluttering global maps. + + label_compounds : bool + By default, Drawer sets to False to reduce clutter. Setting to True displays KEGG + COMPOUND IDs. + + label_orthologs : bool + By default, Drawer sets to False for global and overview maps to reduce clutter next + to reaction arrows and to True for standard maps. Setting to True displays KO IDs. + + label_reaction_entries : bool + By default, Drawer sets to False to reduce clutter. Setting to True displays KEGG + REACTION IDs. + + fontname : str + KGML label font name, with the default being Helvetica. + + fontsize : float + KGML label font size. Drawer sets the default to 9 for 1x resolution maps and 18 + for 2x, if the map base image is not provided explicitly by map_filepath. If it is + provided explicitly, then the default is 9, erring on the side of fitting the text + in an ortholog box on a standard 1x map. + """ + is_output_file_writable(output_filepath, ok_if_exists=self.overwrite_output) + + # These canvas parameters apply to both standard and global/overview maps. 
+ if kwargs.get('import_imagemap') is None: + kwargs['import_imagemap'] = True + if kwargs.get('label_compounds') is None: + kwargs['label_compounds'] = False + if map_filepath is not None and kwargs.get('fontsize') is None: + kwargs['fontsize'] = 9 + + if pathway.is_global_map: + self._draw_global_map( + pathway, + output_filepath, + map_filepath=map_filepath, + use_org_map=use_org_map, + **kwargs + ) + elif pathway.is_overview_map: + self._draw_overview_map( + pathway, + output_filepath, + map_filepath=map_filepath, + use_org_map=use_org_map, + **kwargs + ) + else: + self._draw_standard_map( + pathway, + output_filepath, + map_filepath=map_filepath, + use_org_map=use_org_map, + **kwargs + ) + + def _draw_global_map( + self, + pathway: Pathway, + output_filepath: str, + map_filepath: str = None, + use_org_map: bool = False, + **kwargs + ) -> None: + """ + Draw a global pathway map with KGML data as a PDF file. + + Parameters + ========== + pathway : Pathway + Object representation of a KGML file. + + output_filepath : str + Path to PDF output file containing the pathway map. + + map_filepath : str, None + Path to pathway map image file to use as the base image of the output file. If None, + then a 1x resolution PNG file stored in the KEGG directory is used; it is assumed the + KGML data is scaled to fit this image size. The org attribute of the pathway object is + used to seek the corresponding PNG file, i.e., a 'ko' pathway containing reactions with + KO IDs results in the reference 'ko' map being sought, whereas an 'ec' pathway + containing reactions with EC number IDs corresponds to the reference 'ec' map. + + use_org_map : bool, False + If True and the org attribute of the pathway object is an organism code, such as 'eco' + for E. coli, then an organism-specific 1x resolution file is used if available locally + in the KEGG directory or available online for download to that directory. 
If False and + the org attribute is an organism code, then the 1x 'ko' file is used. + + **kwargs + Valid kwargs are arguments to a biopython.Bio.Graphics.KGML_vis.KGMLCanvas object. + These control what is displayed on the map from the KGML file. + """ + if kwargs.get('label_orthologs') is None: + kwargs['label_orthologs'] = False + if kwargs.get('label_reaction_entries') is None: + kwargs['label_reaction_entries'] = False + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 9 + + bio_pathway = KGML_parser.read(StringIO(self.xml_ops.get_str(pathway))) + + if map_filepath is None: + if pathway.org == 'ko': + map_filepath = os.path.join( + self.kegg_context.png_1x_ko_dir, f'{pathway.org}{pathway.number}.png' + ) + elif pathway.org == 'ec': + map_filepath = os.path.join( + self.kegg_context.png_1x_ec_dir, f'ec{pathway.number}.png' + ) + elif pathway.org == 'rn': + map_filepath = os.path.join( + self.kegg_context.png_1x_rn_dir, f'rn{pathway.number}.png' + ) + elif use_org_map: + map_filepath = os.path.join( + self.kegg_context.png_1x_org_dir, f'{pathway.org}{pathway.number}.png' + ) + else: + map_filepath = os.path.join( + self.kegg_context.png_1x_ko_dir, f'ko{pathway.number}.png' + ) + else: + assert not use_org_map + is_file_exists(map_filepath) + + if use_org_map and not is_file_exists(map_filepath, dont_raise=True): + kegg.download_org_pathway_image_files(f'{pathway.org}{pathway.number}', self.kegg_dir) + + bio_pathway.image = map_filepath + + canvas = KGMLCanvas(bio_pathway, **kwargs) + canvas.non_reactant_transparency = self.non_reactant_transparency + canvas.draw(output_filepath) + + def _draw_overview_map( + self, + pathway: Pathway, + output_filepath: str, + map_filepath: str = None, + use_org_map: bool = False, + **kwargs + ) -> None: + """ + Draw an overview pathway map with KGML data as a PDF file. + + Parameters + ========== + pathway : Pathway + Object representation of a KGML file. 
+ + output_filepath : str + Path to PDF output file containing the pathway map. + + map_filepath : str, None + Path to pathway map image file to use as the base image of the output file. If None, + then a 2x resolution 'map' PNG file stored in the KEGG directory is used; it is assumed + the KGML data is scaled to fit this image size. + + use_org_map : bool, False + If True and the org attribute of the pathway object is an organism code, such as 'eco' + for E. coli, then an organism-specific 1x resolution file is used if available locally + in the KEGG directory or available online for download to that directory. If False and + the org attribute is an organism code, then the 2x 'ko' file is used. + + **kwargs + Valid kwargs are arguments to a biopython.Bio.Graphics.KGML_vis.KGMLCanvas object. + These control what is displayed on the map from the KGML file. + """ + if kwargs.get('label_orthologs') is None: + kwargs['label_orthologs'] = False + if kwargs.get('label_reaction_entries') is None: + kwargs['label_reaction_entries'] = False + + bio_pathway = KGML_parser.read(StringIO(self.xml_ops.get_str(pathway))) + + if map_filepath is None: + if use_org_map: + map_filepath = os.path.join( + self.kegg_context.png_1x_org_dir, f'{pathway.org}{pathway.number}.png' + ) + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 9 + else: + map_dir = self.kegg_context.png_2x_map_dir + map_filepath = os.path.join( + self.kegg_context.png_2x_map_dir, f'map{pathway.number}.png' + ) + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 18 + else: + assert not use_org_map + is_file_exists(map_filepath) + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 9 + + if use_org_map and not is_file_exists(map_filepath, dont_raise=True): + kegg.download_org_pathway_image_files(f'{pathway.org}{pathway.number}', self.kegg_dir) + + bio_pathway.image = map_filepath + + canvas = KGMLCanvas(bio_pathway, **kwargs) + canvas.non_reactant_transparency = self.non_reactant_transparency + 
canvas.draw(output_filepath) + + def _draw_standard_map( + self, + pathway: Pathway, + output_filepath: str, + map_filepath: str = None, + use_org_map: bool = False, + **kwargs + ) -> None: + """ + Draw a standard (not global/overview) pathway map with KGML data as a PDF file. + + Parameters + ========== + pathway : Pathway + Object representation of a KGML file. + + output_filepath : str + Path to PDF output file containing the pathway map. + + map_filepath : str, None + Path to pathway map image file to use as the base image of the output file. If None, + then the 2x resolution 'map' PNG file stored in the KEGG directory is used; it is + assumed the KGML data is scaled to fit this image size. + + use_org_map : bool, False + If True and the org attribute of the pathway object is an organism code, such as 'eco' + for E. coli, then an organism-specific 1x resolution file is used if available locally + in the KEGG directory or available online for download to that directory. If False and + the org attribute is an organism code, then the 2x 'ko' file is used. + + **kwargs + Valid kwargs are arguments to a biopython.Bio.Graphics.KGML_vis.KGMLCanvas object. + These control what is displayed on the map from the KGML file. 
+ """ + bio_pathway = KGML_parser.read(StringIO(self.xml_ops.get_str(pathway))) + + if map_filepath is None: + if use_org_map: + map_filepath = os.path.join( + self.kegg_context.png_1x_org_dir, f'{pathway.org}{pathway.number}.png' + ) + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 9 + else: + map_filepath = os.path.join( + self.kegg_context.png_2x_map_dir, f'map{pathway.number}.png' + ) + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 18 + else: + assert not use_org_map + is_file_exists(map_filepath) + if kwargs.get('fontsize') is None: + kwargs['fontsize'] = 9 + + if use_org_map and not is_file_exists(map_filepath, dont_raise=True): + kegg.download_org_pathway_image_files(f'{pathway.org}{pathway.number}', self.kegg_dir) + + bio_pathway.image = map_filepath + + canvas = KGMLCanvas(bio_pathway, **kwargs) + canvas.non_reactant_transparency = self.non_reactant_transparency + canvas.draw(output_filepath) + +class Tester: + """ + Tests KGML operations. + + Attributes + ========== + xml_ops : XMLOps() + Loads KMGL files. + + run : anvio.terminal.Run + This object prints run information to the terminal. + + progress : anvio.terminal.Progress + This object prints transient progress information to the terminal. + """ + def __init__( + self, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress(), + ) -> None: + """ + Parameters + ========== + run : anvio.terminal.Run, anvio.terminal.Run() + This object prints run information to the terminal. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object prints transient progress information to the terminal. 
+ """ + self.xml_ops = XMLOps() + + self.run = run + self.progress = progress + + def load_all_anvio_kgml_files(self, kegg_dirpath: str = None) -> None: + """ + Load each KGML file within a superdirectory formatted like an anvi'o KEGG directory into + memory as a KGML pathway object, and test that the object can be converted back to a string + equivalent to the contents of the file. + + Parameters + ========== + kegg_dirpath : str, None + A directory of KEGG files like that installed by `anvi-setup-kegg-data`. By default, the + default anvi'o KEGG installation location is sought. + + Returns + ======= + None + """ + args = Namespace() + if kegg_dirpath is not None: + args.kegg_data_dir = kegg_dirpath + kegg_context = kegg.KeggContext(args) + for dirname in os.listdir(kegg_context.map_image_kgml_dir): + dirpath = os.path.join(kegg_context.map_image_kgml_dir, dirname) + if not os.path.isdir(dirpath): + continue + self.load_kgml_files_in_dir(dirpath) + + def load_kgml_files_in_dir(self, dirpath: str) -> None: + """ + Load each KGML file in a single directory into memory as a KGML pathway object, and test + that the object can be converted back to a string equivalent to the contents of the file. + + Parameters + ========== + dirpath : str + Path to a directory in which to look for KGML files, assumed to have a '.xml' extension. 
+ + Returns + ======= + None + """ + filepaths: List[str] = [] + for filename in os.listdir(dirpath): + filepath = os.path.join(dirpath, filename) + if not os.path.isfile(filepath) or not os.path.splitext(filepath)[1] == '.xml': + continue + filepaths.append(filepath) + self.progress.new( + f"Testing KGML files in {os.path.dirname(dirpath)}", + progress_total_items=len(filepaths) + ) + for filepath in filepaths: + self.progress.update("...", increment=True) + self.load_kgml_file(filepath) + self.progress.end() + self.run.info_single(f"Tested {len(filepaths)} KGML files in '{dirpath}'") + + def load_kgml_file(self, filepath: str, buffer: int = 100) -> None: + """ + Load a KGML file into memory as a KGML pathway object, and test that the object can be + converted back to a string equivalent to the contents of the file. + + Parameters + ========== + filepath : str + Path to a KGML file. + + buffer : int, 100 + If an inconsistency is found between the string representing the contents of the KGML + file and the string representing a KGML file reconstructed from the loaded pathway + object, then display the text where these strings diverge, including the number of + characters given by `buffer` around this point. + + Returns + ======= + None + """ + pathway = self.xml_ops.load(filepath) + reconstructed_xml_str = self.xml_ops.get_str(pathway) + with open(filepath) as file: + xml_str = file.read() + if xml_str != reconstructed_xml_str: + for i in range(1, min(len(xml_str), len(reconstructed_xml_str))): + if xml_str[: i] == reconstructed_xml_str[: i]: + continue + error_message = ( + "Anvi'o loaded a KGML file as a kgml.Pathway object. It then tried to convert " + "the object back into a string equivalent to the text of the KGML file, and " + "failed. Here is the area of the KGML text where an inconsistency was " + "detected. 
First, the file text is displayed, and then the text reconstructed " + "from the object is displayed.\n" + ) + error_message += xml_str[max(0, i - buffer): min(i + buffer, len(xml_str))] + "\n" + error_message += reconstructed_xml_str[ + max(0, i - buffer): min(i + buffer, len(reconstructed_xml_str)) + ] + raise AssertionError(error_message) diff --git a/anvio/migrations/contigs/v23_to_v24.py b/anvio/migrations/contigs/v23_to_v24.py new file mode 100644 index 0000000000..81e5288817 --- /dev/null +++ b/anvio/migrations/contigs/v23_to_v24.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import os +import sys +import argparse +import pandas as pd + +import anvio.db as db +import anvio.utils as utils +import anvio.terminal as terminal + +from anvio.errors import ConfigError +from anvio.reactionnetwork import ModelSEEDDatabase + +current_version, next_version = [x[1:] for x in __name__.split('_to_')] + +reaction_network_kegg_table_name = 'reaction_network_kegg' +reaction_network_kegg_table_structure = ['kegg_id', 'name', 'modules', 'pathways', 'brite_categorization'] +reaction_network_kegg_table_types = [ 'text' , 'text', 'text' , 'text' , 'text' ] + +reaction_network_metabolites_table_structure = ['modelseed_compound_id', 'modelseed_compound_name', 'kegg_aliases', 'formula', 'charge' , 'smiles'] +reaction_network_metabolites_table_types = [ 'text' , 'text' , 'text' , 'text' , 'numeric', 'text' ] + +run = terminal.Run() +progress = terminal.Progress() + +def migrate(db_path): + if db_path is None: + raise ConfigError("No database path is given.") + + utils.is_contigs_db(db_path) + + contigs_db = db.DB(db_path, None, ignore_version=True) + if str(contigs_db.get_version()) != current_version: + raise ConfigError( + f"The version of the provided contigs database is {contigs_db.get_version}, not the " + f"required version, {current_version}, so this script cannot upgrade the database." 
+ ) + + progress.new("Creating a new table for KEGG KO information in a reaction network") + progress.update("...") + # To be on the safe side, remove any KEGG table that may already exist. + try: + contigs_db.drop_table(reaction_network_kegg_table_name) + except: + pass + + contigs_db.create_table( + reaction_network_kegg_table_name, + reaction_network_kegg_table_structure, + reaction_network_kegg_table_types + ) + progress.end() + + added_smiles_strings = add_smiles_column(contigs_db) + + progress.new("Renaming reaction network tables") + progress.update("...") + # To be on the safe side, remove any tables with the new names that may already exist. + try: + contigs_db.drop_table('reaction_network_reactions') + contigs_db.drop_table('reaction_network_metabolites') + except: + pass + + contigs_db._exec('ALTER TABLE gene_function_reactions RENAME TO reaction_network_reactions') + contigs_db._exec('ALTER TABLE gene_function_metabolites RENAME TO reaction_network_metabolites') + progress.end() + + progress.new("Updating version") + progress.update("...") + contigs_db.remove_meta_key_value_pair('version') + contigs_db.set_version(next_version) + + contigs_db.disconnect() + progress.end() + + if added_smiles_strings: + smiles_message = "SMILES string structural data was added to the existing reaction network." + else: + smiles_message = ( + "A new column for storage of SMILES string structural data was added to the " + "metabolites table." + ) + message = ( + f"Congratulations! Your contigs database is now version {next_version}. An empty table has " + "been added to improve the functionality of reaction networks, particularly their " + "portability and reproducibility. The two existing tables for storing reaction networks " + f"have been renamed for the sake of clarity. 
{smiles_message} " + ) + run.info_single(message, nl_after=1, nl_before=1, mc='green') + +def add_smiles_column(contigs_db: db.DB) -> bool: + """ + Add a SMILES string column to the reaction network metabolites table. + + Parameters + ========== + contigs_db : db.DB + Database being migrated. + + Returns + ======= + bool + True if SMILES strings could be retrieved from a ModelSEED Biochemistry reference database + and added. False if only an empty column was added. + """ + modelseed_db_available = check_modelseed_database(contigs_db) + + progress.new("Adding metabolite SMILES string column") + progress.update("...") + + contigs_db._exec('ALTER TABLE gene_function_metabolites ADD smiles text') + + if not modelseed_db_available: + progress.end() + return False + + metabolites_table = contigs_db.get_table_as_dataframe('gene_function_metabolites') + modelseed_table = ModelSEEDDatabase().compounds_table + smiles_strings = [] + for row in metabolites_table.itertuples(): + smiles = modelseed_table.loc[row.modelseed_compound_id]['smiles'] + if pd.isna(smiles): + smiles_strings.append('') + else: + smiles_strings.append(smiles) + metabolites_table['smiles'] = smiles_strings + + contigs_db.drop_table('gene_function_metabolites') + contigs_db.create_table( + 'gene_function_metabolites', + reaction_network_metabolites_table_structure, + reaction_network_metabolites_table_types + ) + contigs_db.insert_rows_from_dataframe('gene_function_metabolites', metabolites_table) + progress.end() + + return True + +def check_modelseed_database(contigs_db: db.DB) -> bool: + """ + Check if the contigs database contains a reaction network that was constructed with a ModelSEED + Biochemistry database installed at the default anvi'o location. + + Parameters + ========== + contigs_db : db.DB + Database being migrated. + + Returns + ======= + bool + True if the contigs database contains a reaction network constructed with a ModelSEED + database installed at the default anvi'o location. 
+ """ + network_sha: str = contigs_db.get_meta_value('reaction_network_modelseed_database_sha') + if not network_sha: + return False + + sha_txt_path = os.path.join(ModelSEEDDatabase.default_dir, 'sha.txt') + compounds_db_path = os.path.join(ModelSEEDDatabase.default_dir, 'compounds.tsv') + if not os.path.isfile(sha_txt_path) or not os.path.isfile(compounds_db_path): + run.warning( + "A ModelSEED Biochemistry database was not found to be set up in the default anvi'o " + f"directory, '{ModelSEEDDatabase.default_dir}', preventing this script from adding " + "SMILES string structural data to the table of reaction network metabolites in the " + "contigs database. The reaction network can be reconstructed and overwritten to store " + "SMILES strings." + ) + return False + + with open(sha_txt_path) as sha_txt: + ref_sha = sha_txt.read() + if network_sha != ref_sha: + run.warning( + f"The ID ('{network_sha}') of the ModelSEED Biochemistry database used to build the " + f"reaction network stored in the contigs database does not match the ID ('{ref_sha}') " + "of the reference database set up in the default anvi'o directory, indicating that the " + "network was built with a different version of ModelSEED. This script therefore cannot " + "add SMILES string structural data to the table of reaction network metabolites in the " + "contigs database. The reaction network can be reconstructed and overwritten to store " + "SMILES strings." 
+ ) + return False + + return True + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='A simple script to upgrade CONTIGS.db from version %s to version %s' % (current_version, next_version)) + parser.add_argument('contigs_db', metavar = 'CONTIGS_DB', help = 'Contigs database at version %s' % current_version) + args, unknown = parser.parse_known_args() + + try: + migrate(args.contigs_db) + except ConfigError as e: + print(e) + sys.exit(-1) diff --git a/anvio/migrations/pan/v18_to_v19.py b/anvio/migrations/pan/v18_to_v19.py new file mode 100644 index 0000000000..e41f2f83e1 --- /dev/null +++ b/anvio/migrations/pan/v18_to_v19.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys +import argparse + +import anvio.db as db +import anvio.terminal as terminal + +from anvio.errors import ConfigError + +current_version, next_version = [x[1:] for x in __name__.split('_to_')] + +pan_reaction_network_kegg_table_name = 'pan_reaction_network_kegg' +pan_reaction_network_kegg_table_structure = ['kegg_id', 'name', 'modules', 'pathways', 'brite_categorization'] +pan_reaction_network_kegg_table_types = [ 'text' , 'text', 'text' , 'text' , 'text' ] + +run = terminal.Run() +progress = terminal.Progress() + +def migrate(db_path): + if db_path is None: + raise ConfigError("No database path is given.") + + pan_db = db.DB(db_path, None, ignore_version=True) + if str(pan_db.get_version()) != current_version: + raise ConfigError( + f"The version of the provided contigs database is {pan_db.get_version}, not the " + f"required version, {current_version}, so this script cannot upgrade the database." + ) + + progress.new("Migrating") + progress.update("Creating a new table for KEGG KO information in a reaction network") + + # To be on the safe side, remove any KEGG table that may already exist. 
+ try: + pan_db.drop_table(pan_reaction_network_kegg_table_name) + except: + pass + + pan_db.create_table( + pan_reaction_network_kegg_table_name, + pan_reaction_network_kegg_table_structure, + pan_reaction_network_kegg_table_types + ) + + progress.update("Renaming other reaction network tables") + + # To be on the safe side, remove any tables with the new names that may already exist. + try: + pan_db.drop_table('pan_reaction_network_reactions') + pan_db.drop_table('pan_reaction_network_metabolites') + except: + pass + + pan_db._exec( + 'ALTER TABLE gene_cluster_function_reactions RENAME TO pan_reaction_network_reactions' + ) + pan_db._exec( + 'ALTER TABLE gene_cluster_function_metabolites RENAME TO pan_reaction_network_metabolites' + ) + + progress.update("Updating version") + pan_db.remove_meta_key_value_pair('version') + pan_db.set_version(next_version) + + progress.update("Committing changes") + pan_db.disconnect() + + progress.end() + + message = ( + "Congratulations! Your pan database is now version 19. An empty table has been added to " + "improve the functionality of reaction networks, particularly their portability and " + "reproducibility. The two existing tables for storing reaction networks have also been " + "renamed for the sake of clarity." 
+ ) + run.info_single(message, nl_after=1, nl_before=1, mc='green') + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='A simple script to upgrade the pan database from version %s to version %s' % (current_version, next_version)) + parser.add_argument('pan_db', metavar = 'PAN_DB', help = "An anvi'o pan database of version %s" % current_version) + args, unknown = parser.parse_known_args() + + try: + migrate(args.pan_db) + except ConfigError as e: + print(e) + sys.exit(-1) diff --git a/anvio/migrations/pan/v19_to_v20.py b/anvio/migrations/pan/v19_to_v20.py new file mode 100644 index 0000000000..a730d60bd5 --- /dev/null +++ b/anvio/migrations/pan/v19_to_v20.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import os +import sys +import argparse +import pandas as pd + +import anvio.db as db +import anvio.terminal as terminal + +from anvio.errors import ConfigError +from anvio.reactionnetwork import ModelSEEDDatabase + +current_version, next_version = [x[1:] for x in __name__.split('_to_')] + +pan_reaction_network_metabolites_table_structure = ['modelseed_compound_id', 'modelseed_compound_name', 'kegg_aliases', 'formula', 'charge' , 'smiles'] +pan_reaction_network_metabolites_table_types = [ 'text' , 'text' , 'text' , 'text' , 'numeric', 'text' ] + +run = terminal.Run() +progress = terminal.Progress() + +def migrate(db_path): + if db_path is None: + raise ConfigError("No database path is given.") + + pan_db = db.DB(db_path, None, ignore_version=True) + if str(pan_db.get_version()) != current_version: + raise ConfigError( + f"The version of the provided pan database is {pan_db.get_version}, not the required " + f"version, {current_version}, so this script cannot upgrade the database." 
+ ) + + added_smiles_strings = add_smiles_column(pan_db) + + progress.new("Updating version") + progress.update("...") + pan_db.remove_meta_key_value_pair('version') + pan_db.set_version(next_version) + pan_db.disconnect() + progress.end() + + if added_smiles_strings: + smiles_message = ( + "SMILES string compound structural data was added to the existing reaction network." + ) + else: + smiles_message = ( + "A new column for storage of SMILES string compound structural data was added to the " + "metabolites table." + ) + message = ( + f"Congratulations! Your pan database is now version {next_version}. {smiles_message}" + ) + run.info_single(message, nl_after=1, nl_before=1, mc='green') + +def add_smiles_column(pan_db: db.DB) -> bool: + """ + Add a SMILES string column to the pan reaction network metabolites table. + + Parameters + ========== + pan_db : db.DB + Database being migrated. + + Returns + ======= + bool + True if SMILES strings could be retrieved from a ModelSEED Biochemistry reference database + and added. False if only an empty column was added. 
+ """ + modelseed_db_available = check_modelseed_database(pan_db) + + metabolites_table = pan_db.get_table_as_dataframe('pan_reaction_network_metabolites', error_if_no_data=False) + + if 'smiles' in metabolites_table.columns: + pan_db._exec('ALTER TABLE pan_reaction_network_metabolites DROP smiles') + run.warning("An existing column by the name of 'smiles' was dropped.") + + progress.new("Adding metabolite SMILES string column") + progress.update("...") + pan_db._exec('ALTER TABLE pan_reaction_network_metabolites ADD smiles text') + + if not modelseed_db_available: + progress.end() + return False + + modelseed_table = ModelSEEDDatabase().compounds_table + smiles_strings = [] + for row in metabolites_table.itertuples(): + smiles = modelseed_table.loc[row.modelseed_compound_id]['smiles'] + if pd.isna(smiles): + smiles_strings.append('') + else: + smiles_strings.append(smiles) + metabolites_table['smiles'] = smiles_strings + + pan_db.drop_table('pan_reaction_network_metabolites') + pan_db.create_table( + 'pan_reaction_network_metabolites', + pan_reaction_network_metabolites_table_structure, + pan_reaction_network_metabolites_table_types + ) + pan_db.insert_rows_from_dataframe('pan_reaction_network_metabolites', metabolites_table) + progress.end() + + return True + +def check_modelseed_database(pan_db: db.DB) -> bool: + """ + Check if the pan database contains a reaction network that was constructed with a ModelSEED + Biochemistry database installed at the default anvi'o location. + + Parameters + ========== + pan_db : db.DB + Database being migrated. + + Returns + ======= + bool + True if the pan database contains a reaction network constructed with a ModelSEED database + installed at the default anvi'o location. 
+ """ + network_sha: str = pan_db.get_meta_value('reaction_network_modelseed_database_sha') + if not network_sha: + return False + + sha_txt_path = os.path.join(ModelSEEDDatabase.default_dir, 'sha.txt') + compounds_db_path = os.path.join(ModelSEEDDatabase.default_dir, 'compounds.tsv') + if not os.path.isfile(sha_txt_path) or not os.path.isfile(compounds_db_path): + run.warning( + "A ModelSEED Biochemistry database was not found to be set up in the default anvi'o " + f"directory, '{ModelSEEDDatabase.default_dir}', preventing this script from adding " + "SMILES string structural data to the table of reaction network metabolites in the " + "pan database. The reaction network can be reconstructed and overwritten to store " + "SMILES strings." + ) + return False + + with open(sha_txt_path) as sha_txt: + ref_sha = sha_txt.read() + if network_sha != ref_sha: + run.warning( + f"The ID ('{network_sha}') of the ModelSEED Biochemistry database used to build the " + f"reaction network stored in the pan database does not match the ID ('{ref_sha}') of " + "the reference database set up in the default anvi'o directory, indicating that the " + "network was built with a different version of ModelSEED. This script therefore cannot " + "add SMILES string structural data to the table of reaction network metabolites in the " + "pan database. The reaction network can be reconstructed and overwritten to store " + "SMILES strings." 
+ ) + return False + + return True + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='A simple script to upgrade the pan database from version %s to version %s' % (current_version, next_version)) + parser.add_argument('pan_db', metavar = 'PAN_DB', help = "An anvi'o pan database of version %s" % current_version) + args, unknown = parser.parse_known_args() + + try: + migrate(args.pan_db) + except ConfigError as e: + print(e) + sys.exit(-1) diff --git a/anvio/migrations/pan/v20_to_v21.py b/anvio/migrations/pan/v20_to_v21.py new file mode 100644 index 0000000000..188621cbe6 --- /dev/null +++ b/anvio/migrations/pan/v20_to_v21.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys +import argparse + +import anvio.dbinfo as dbinfo +import anvio.terminal as terminal + +from anvio.errors import ConfigError + +run = terminal.Run() +progress = terminal.Progress() + +current_version, next_version = [x[1:] for x in __name__.split('_to_')] + + +def migrate(db_path): + if db_path is None: + raise ConfigError("No database path is given.") + + pan_db_info = dbinfo.PanDBInfo(db_path) + if str(pan_db_info.version) != current_version: + raise ConfigError( + f"The version of the provided pan database is {pan_db_info.version}, not the required " + f"version, {current_version}, so this script cannot upgrade the database." 
+ ) + + pan_db = pan_db_info.load_db() + + progress.new("Migrating") + progress.update("Updating the self table with two variables if not already there") + + threshold_added = False + if 'reaction_network_consensus_threshold' not in pan_db_info.get_self_table(): + pan_db.set_meta_value('reaction_network_consensus_threshold', None) + threshold_added = True + + discard_ties_added = False + if 'reaction_network_discard_ties' not in pan_db_info.get_self_table(): + pan_db.set_meta_value('reaction_network_discard_ties', None) + discard_ties_added = True + + progress.update("Updating version") + pan_db.remove_meta_key_value_pair('version') + pan_db.set_version(next_version) + + progress.update("Committing changes") + pan_db.disconnect() + + progress.end() + + if threshold_added and discard_ties_added: + change_message = ( + "Two placeholder variables were added to the self table. These are filled in when a " + "reaction network is generated via `anvi-reaction-network`." + ) + elif threshold_added and not discard_ties_added: + change_message = ( + "A placeholder variable, 'reaction_network_consensus_threshold', was added to the self " + "table. Strangely, the variable, 'reaction_network_discard_ties', which goes " + "hand-in-hand with the other variable, was already present, and since we trust that " + "you know what's up, we left it alone. These variables are filled in when a reaction " + "network is generated via `anvi-reaction-network`." + ) + elif discard_ties_added and not threshold_added: + change_message = ( + "A placeholder variable, 'reaction_network_discard_ties', was added to the self table. " + "Strangely, the variable, 'reaction_network_consensus_threshold', which goes " + "hand-in-hand with the other variable, was already present, and since we trust that " + "you know what's up, we left it alone. These variables are filled in when a reaction " + "network is generated via `anvi-reaction-network`." 
+ ) + else: + change_message = ( + "The variables, 'reaction_network_consensus_threshold' and " + "'reaction_network_discard_ties', were already found in the self table, likely because " + "you have run `anvi-reaction-network`, so we left them alone and didn't change " + "anything in the database besides the version number." + ) + message = f"Done! Your pan database is now version 21. This wasn't a biggie. {change_message}" + run.info_single(message, nl_after=1, nl_before=1, mc='green') + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='A simple script to upgrade the pan database from version %s to version %s' % (current_version, next_version)) + parser.add_argument('pan_db', metavar = 'PAN_DB', help = "An anvi'o pan database of version %s" % current_version) + args, unknown = parser.parse_known_args() + + try: + migrate(args.pan_db) + except ConfigError as e: + print(e) + sys.exit(-1) diff --git a/anvio/panops.py b/anvio/panops.py index 49c9e4202e..3126dcb7bd 100644 --- a/anvio/panops.py +++ b/anvio/panops.py @@ -153,6 +153,8 @@ def generate_pan_db(self): 'reaction_network_ko_annotations_hash': None, 'reaction_network_kegg_database_release': None, 'reaction_network_modelseed_database_sha': None, + 'reaction_network_consensus_threshold': None, + 'reaction_network_discard_ties': None, 'description': self.description if self.description else '_No description is provided_', } diff --git a/anvio/programs.py b/anvio/programs.py index 83ce2ccb69..932ac82ed3 100644 --- a/anvio/programs.py +++ b/anvio/programs.py @@ -189,7 +189,7 @@ def __init__(self, args, r=terminal.Run(), p=terminal.Progress()): "Probably there is a typo or something :/") - def init_programs(self, okay_if_no_meta=False, quiet=False): + def init_programs(self, okay_if_no_meta=False, always_include_those_with_docs=True, quiet=False): """Initializes the `self.programs` dictionary.""" num_all_programs = len(self.all_program_filepaths) @@ -223,14 +223,30 @@ def init_programs(self, 
okay_if_no_meta=False, quiet=False): else: programs_without_usage_info.add(program.name) - if not (program.meta_info['provides']['value'] or program.meta_info['requires']['value']) and not okay_if_no_meta: + keep_program = True + if not (program.meta_info['provides']['value'] or program.meta_info['requires']['value']): + # if we are here, it means the program is missing both provides AND requires statements. + # If the user hasn't set `okay_if_no_meta=True`, we're going to get rid of them, and + # will NOT include them in `self.programs` + if not okay_if_no_meta: + keep_program = False + + # BUT, there are programs that have no provides/requires statements, such as anvi-self-test, + # but have a usage statement under docs already, we may want to keep them in the list + # regardless. so here we test that: + if program.name in programs_with_usage_info and always_include_those_with_docs: + keep_program = True + + if keep_program: + # include the program in our final list + self.programs[program.name] = program + else: + # forget all about it try: programs_with_usage_info.remove(program.name) programs_without_usage_info.remove(program.name) except: pass - else: - self.programs[program.name] = program self.progress.end() @@ -258,28 +274,26 @@ def init_programs(self, okay_if_no_meta=False, quiet=False): f"an entry in the authors YAML file: {', '.join(programs_with_unknown_authors)}.") # report missing provides/requires information - if anvio.DEBUG: - self.run.info_single("Of %d programs found, %d did not contain PROVIDES AND/OR REQUIRES " - "statements :/ This may be normal for some programs, but here is the " - "complete list of those that are missing __provides__ and __requires__ " - "tags in their code in case you see something you can complete: '%s'." 
% \ - (len(self.all_program_filepaths), - len(programs_without_provides_requires_info), - ', '.join(programs_without_provides_requires_info)), - nl_after=1, nl_before=1) + self.run.info_single("Of %d programs found, %d did not contain PROVIDES AND/OR REQUIRES " + "statements :/ This may be normal for some programs, but here is the " + "complete list of those that are missing __provides__ and __requires__ " + "tags in their code in case you see something you can complete: '%s'." % \ + (len(self.all_program_filepaths), + len(programs_without_provides_requires_info), + ', '.join(programs_without_provides_requires_info)), + nl_after=1, nl_before=1) # report missing provides/requires information - if anvio.DEBUG: - self.run.info_single("Of %d programs found, %d did not have any PROVIDES/REQUIRES statements. You can " - "help by adding usage information for programs by creating markdown " - "formatted files under the directory '%s'. Please see examples in anvi'o " - "codebase: https://github.com/merenlab/anvio/tree/master/anvio/docs. " - "Here is a complete list of programs that are missing usage statements: %s " % \ - (len(self.all_program_filepaths), - len(programs_without_provides_requires_info), - anvio.DOCS_PATH, - ', '.join(programs_without_provides_requires_info)), - nl_after=1, nl_before=1) + self.run.info_single("Of %d programs found, %d did not have any USAGE statements. You can " + "help by adding usage information for programs by creating markdown " + "formatted files under the directory '%s'. Please see examples in anvi'o " + "codebase: https://github.com/merenlab/anvio/tree/master/anvio/docs. 
" + "Here is a complete list of programs that are missing usage statements: %s " % \ + (len(self.all_program_filepaths), + len(programs_without_usage_info), + anvio.DOCS_PATH, + ', '.join(programs_without_usage_info)), + nl_after=1, nl_before=1) class Program: diff --git a/anvio/reactionnetwork.py b/anvio/reactionnetwork.py new file mode 100644 index 0000000000..2da9fd063e --- /dev/null +++ b/anvio/reactionnetwork.py @@ -0,0 +1,10637 @@ +# -*- coding: utf-8 +# pylint: disable=line-too-long +"""Generate, manipulate, and export metabolic reaction networks from gene annotations.""" + +from __future__ import annotations + +import os +import re +import glob +import json +import math +import time +import random +import shutil +import hashlib +import tarfile +import zipfile +import argparse +import tempfile +import fractions +import functools +import collections +import numpy as np +import pandas as pd +import multiprocessing as mp + +from copy import deepcopy +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any, Dict, List, Set, Tuple, Union, Iterable + +import anvio.kegg as kegg +import anvio.utils as utils +import anvio.dbinfo as dbinfo +import anvio.tables as tables +import anvio.terminal as terminal +import anvio.filesnpaths as filesnpaths + +from anvio.db import DB +from anvio.errors import ConfigError +from anvio import DEBUG, __file__ as ANVIO_PATH, __version__ as VERSION +from anvio.dbops import ContigsDatabase, PanDatabase, PanSuperclass, ProfileDatabase + + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2024, the Meren Lab (http://merenlab.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = VERSION +__maintainer__ = "Samuel Miller" +__email__ = "samuelmiller10@gmail.com" +__status__ = "Development" + + +run_quiet = terminal.Run(verbose=False) + +# Network statistics are stored in a dictionary of dictionaries.
Keys in the outer dictionary are +# "classes" of network statistics. Keys in the inner dictionary are statistics themselves. +GenomicNetworkStats = Dict[str, Dict[str, Any]] +PangenomicNetworkStats = Dict[str, Dict[str, Any]] + +RANDOM_SEED = 1066 + + +@dataclass +class ModelSEEDCompound: + """ + Representation of a chemical (a compound, element, or ions thereof) or a class of chemicals + (either abstract, like 'Cofactors' and 'Biomass', or defined, like 'Carboxylic acid' and + 'Polynucleotides'), with properties given by the ModelSEED Biochemistry database. + + Objects of this class are stored in the 'metabolites' attribute of a ReactionNetwork instance. + + Attributes + ========== + modelseed_id : str, None + The ModelSEED compound ID, formatted 'cpdXXXXX', where each X is a digit, e.g., 'cpd00001'. + + modelseed_name : str, None + Name of the ModelSEED compound, e.g., 'cpd00001' has the name, 'H2O'. When absent in the + database, assumes a value of None. + + kegg_aliases : Tuple[str], None + The KEGG COMPOUND IDs that are known to possibly alias the ModelSEED compound, according to + the ModelSEED database, e.g., 'cpd00001' has the aliases, ('C00001', 'C01328'). A KEGG + COMPOUND ID is formatted 'CXXXXX', where each X is a digit, e.g., 'C00001'. + + charge : int, None + The electrical charge of the ModelSEED compound, e.g., 'cpd00001' has charge 0. ModelSEED + compounds without a formula have a nominal charge of 10000000 in the database. + + formula : str, None + The formula of the ModelSEED compound, e.g., 'cpd00001' has the formula, 'H2O'. When absent + in the database, assumes a value of None. + + smiles : str, None + The SMILES string encoding the structure of the ModelSEED compound, e.g., 'cpd00001' has the + SMILES string, 'O'. When absent in the database, assumes a value of None. 
+ + abundances : Dict[str, float], dict() + Abundance profile data (from metabolomics, for instance) with each key being a sample name + and each value being the abundance of the ModelSEED compound in that sample. + """ + modelseed_id: str = None + modelseed_name: str = None + kegg_aliases: Tuple[str] = None + charge: int = None + formula: str = None + smiles: str = None + abundances: Dict[str, float] = field(default_factory=dict) + +@dataclass +class ModelSEEDReaction: + """ + Representation of a reaction, with properties given by the ModelSEED Biochemistry database. + + Objects of this class are stored in the 'reactions' attribute of a ReactionNetwork instance. + + Attributes + ========== + modelseed_id : str, None + The ModelSEED reaction ID, formatted 'rxnXXXXX', where each X is a digit, e.g., + 'rxn00001'. + + modelseed_name : str, None + Name of the reaction, e.g., 'rxn00001' has the name, 'diphosphate phosphohydrolase'. When + absent in the database, assumes a value of None. + + kegg_aliases : Tuple[str], None + The KEGG REACTION IDs that are known to possibly alias the ModelSEED reaction, according to + the ModelSEED database, e.g., 'rxn00001' has the aliases, ('R00004'). A KEGG REACTION ID is + formatted 'RXXXXX', where each X is a digit, e.g., 'R00001'. + + ec_number_aliases : Tuple[str], None + The EC numbers that are known to possibly alias the ModelSEED reaction, according to the + ModelSEED database, e.g., 'rxn00001' has the aliases, ('3.6.1.1'). + + compound_ids : Tuple[str], None + ModelSEED IDs of reactants and products involved in the reaction. For example, 'rxn00001' + involves the ModelSEED compounds, 'cpd00001', 'cpd00012', 'cpd00009', and 'cpd00067'. A + compound ID is formatted 'cpdXXXXX', where each X is a digit, e.g., 'cpd00001'. IDs can be + used to look up metabolite objects in the 'metabolites' attribute of the ReactionNetwork + containing the reaction. 
Each metabolite object has a corresponding stoichiometric reaction + coefficient in the reaction attribute, 'coefficients', and a corresponding cellular + compartment in the reaction attribute, 'compartments'. + + coefficients : Tuple[int], None + Integer stoichiometric reaction coefficients of reactants and products, with negative + coefficients indicating reactants and positive coefficients indicating products, e.g., + 'rxn00001' has the coefficients, (-1, -1, 2, 1). Each coefficient item has a corresponding + ModelSEED compound ID in the attribute, 'compound_ids', and a corresponding cellular + compartment in the attribute, 'compartments'. + + compartments : Tuple[str], None + Cellular compartments of reactants and products, with valid values being 'c' for 'cytosolic' + and 'e' for 'extracellular', e.g., 'rxn00001' involves the compartments, ('c', 'c', 'c', + 'c'). Each compartment item has a corresponding ModelSEED compound ID in the attribute, + 'compound_ids', and a corresponding stoichiometric reaction coefficient in the attribute, + 'coefficients'. + + reversibility : bool, None + Reaction reversibility, with True indicating the reaction is reversible and False indicating + the reaction is irreversible given the equation encoded in the attributes, 'compound_ids', + 'coefficients', and 'compartments'. For example, 'rxn00001' has a value of False. + """ + modelseed_id: str = None + modelseed_name: str = None + kegg_aliases: Tuple[str] = None + ec_number_aliases: Tuple[str] = None + compound_ids: Tuple[str] = None + coefficients: Tuple[int] = None + compartments: Tuple[str] = None + reversibility: bool = None + +@dataclass +class KO: + """ + Representation of a KEGG Ortholog (KO) in a reaction network. + + Objects of this class are stored in the 'kos' attribute of a ReactionNetwork instance. + + Attributes + ========== + id : str, None + KEGG ORTHOLOGY ID in the format, 'KXXXXX', where X is a digit, e.g., 'K00001'.
+ + name : str, None + Name of the KO, e.g., 'K00001' has the name, 'alcohol dehydrogenase [EC:1.1.1.1]'. + + module_ids : List[str], list() + IDs of KEGG modules containing the KO, which can be used to look up module objects in the + 'modules' attribute of the ReactionNetwork containing the KO. + + hierarchies : Dict[str, List[Tuple[str]]], dict() + Membership of the KO in BRITE hierarchies. Keys are hierarchy IDs. Values are lists of + tuples representing categorizations in the hierarchy. For example, 'K00844', hexokinase, is + classified multiple ways in the 'KEGG Orthology (KO)' hierarchy, 'ko00001', including '09100 + Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis / Gluconeogenesis + [PATH:ko00010]' and '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00051 Fructose + and mannose metabolism [PATH:ko00051]'. This hierarchy and these classifications would be + represented as follows: {'ko00001': [('09100 Metabolism', '09101 Carbohydrate metabolism', + '00010 Glycolysis / Gluconeogenesis [PATH:ko00010]'), ('09100 Metabolism', '09101 + Carbohydrate metabolism', '00051 Fructose and mannose metabolism [PATH:ko00051]'), ...], + ...}. Hierarchy IDs and categorization tuples can be used to look up category objects in the + 'categories' attribute of the ReactionNetwork containing the KO. + + pathway_ids : List[str], list() + IDs of KEGG pathways containing the KO, which can be used to look up pathway objects in the + 'pathways' attribute of the ReactionNetwork containing the KO. + + reaction_ids : List[str], list() + IDs of ModelSEED reactions associated with the KO via KEGG reaction and EC number + annotations of the KO. A ModelSEED reaction ID is formatted 'rxnXXXXX', where each X is a + digit, e.g., 'rxn00001'. ModelSEED reaction IDs can be used to look up reaction objects in + the 'reactions' attribute of the ReactionNetwork containing the KO.
+ + kegg_reaction_aliases : Dict[str, List[str]], dict() + KEGG reaction annotations of the KO that alias ModelSEED reactions. A KEGG REACTION ID is + formatted 'RXXXXX', where each X is a digit, e.g., 'R00001'. For example, KO 'K00003' has + two KEGG reaction annotations, both of which are associated with ModelSEED reactions via the + ModelSEED database: {'R01773': ['rxn01301', 'rxn27933'], 'R01775': ['rxn01302', + 'rxn27932']}. Note that a ModelSEED reaction may have more KEGG reaction aliases than those + annotating the KO: all known KEGG reaction aliases of the ModelSEED reaction in the + ModelSEED database are recorded in the 'kegg_aliases' attribute of a 'ModelSEEDReaction' + object. + + ec_number_aliases : Dict[str, List[str]], dict() + EC number annotations of the KO that alias ModelSEED reactions. For example, KO 'K00003' has + one EC number annotation, which is associated with ModelSEED reactions via the ModelSEED + database: {'1.1.1.3': ['rxn01301', 'rxn01302', 'rxn19904', 'rxn27931', 'rxn27932', + 'rxn27933', 'rxn33957']}. Note that a ModelSEED reaction may have more EC number aliases + than those annotating the KO: all known EC number aliases of the ModelSEED reaction in the + ModelSEED database are recorded in the 'ec_number_aliases' attribute of a + 'ModelSEEDReaction' object. + """ + id: str = None + name: str = None + module_ids: List[str] = field(default_factory=list) + hierarchies: Dict[str, List[Tuple[str]]] = field(default_factory=dict) + pathway_ids: List[str] = field(default_factory=list) + reaction_ids: List[str] = field(default_factory=list) + kegg_reaction_aliases: Dict[str, List[str]] = field(default_factory=dict) + ec_number_aliases: Dict[str, List[str]] = field(default_factory=dict) + +@dataclass +class KEGGModule: + """ + Representation of a KEGG module with KOs in a reaction network. + + Objects of this class are stored in the 'modules' attribute of a ReactionNetwork instance. 
+ + Attributes + ========== + id : str, None + KEGG MODULE ID in the format, 'MXXXXX', where X is a digit, e.g., 'M00001'. + + name : str, None + Name of the module, e.g., 'M00001' has the name, 'Glycolysis (Embden-Meyerhof pathway), + glucose => pyruvate'. + + ko_ids : List[str], list() + IDs of reaction network KOs that are in the module, which can be used to look up KO objects + in the 'kos' attribute of the ReactionNetwork containing the module. To reiterate, this does + not include KOs in the module that are not in the reaction network. + + pathway_ids : List[str], list() + IDs of KEGG pathways containing the module, which can be used to look up KEGG pathway + objects in the 'pathways' attribute of the ReactionNetwork containing the module. + """ + id: str = None + name: str = None + ko_ids: List[str] = field(default_factory=list) + pathway_ids: List[str] = field(default_factory=list) + +@dataclass +class KEGGPathway: + """ + Representation of a KEGG pathway with KOs in a reaction network. + + Objects of this class are stored in the 'pathways' attribute of a ReactionNetwork instance. + + Attributes + ========== + id : str, None + The KEGG PATHWAY ID in the format, 'XXXXX', where X is a digit, e.g., '00010' represents + 'Glycolysis / Gluconeogenesis', and corresponds to the reference pathway map, 'map00010'. + + name : str, None + Name of the pathway, e.g., 'Glycolysis / Gluconeogenesis' for pathway ID '00010'. + + categorization : Tuple[str], None + Certain pathways are equivalent to bottommost categories in the KEGG BRITE hierarchy, + 'ko00001', e.g., '00010 Glycolysis / Gluconeogenesis [PATH:ko00010]', which is represented + in the categorization tuple, ('09100 Metabolism', '09101 Carbohydrate metabolism', '00010 + Glycolysis / Gluconeogenesis [PATH:ko00010]').
The categorization tuple can be used to + retrieve the category object from the 'categories' attribute of the ReactionNetwork + containing the pathway, e.g., `category = network.categories['ko00001'][('09100 Metabolism', + '09101 Carbohydrate metabolism', '00010 Glycolysis / Gluconeogenesis [PATH:ko00010]')]` + + ko_ids : List[str], list() + IDs of reaction network KOs that are in the pathway, which can be used to look up KO objects in the + 'kos' attribute of the ReactionNetwork containing the pathway. To reiterate, this does not + include KOs in the pathway that are not in the reaction network. + + module_ids : List[str], list() + IDs of modules in the pathway that contain reaction network KOs. Module IDs can be used to + look up module objects in the 'modules' attribute of the ReactionNetwork containing the + pathway. To reiterate, this does not include modules in the pathway that do not contain KOs + in the reaction network. + """ + id: str = None + name: str = None + categorization: Tuple[str] = None + ko_ids: List[str] = field(default_factory=list) + module_ids: List[str] = field(default_factory=list) + +@dataclass +class BRITEHierarchy: + """ + Representation of a KEGG BRITE hierarchy with KOs in a reaction network. + + Objects of this class are stored in the 'hierarchies' attribute of a ReactionNetwork instance. + + Attributes + ========== + id : str, None + BRITE hierarchy ID in the format, 'koXXXXX' or 'brXXXXX', where X is a digit, e.g., + 'ko00001'. Currently, given the anvi'o KEGG data setup, the ReactionNetwork will only + contain hierarchies with IDs in the 'koXXXXX' format. + + name : str, None + Name of the hierarchy, e.g., 'ko00001' has the name, 'KEGG Orthology (KO)'. + + categorizations : List[Tuple[str]], list() + Categorizations of reaction network KOs in the hierarchy. To reiterate, this does not + include categories that do not contain KOs in the reaction network. Categories at each level + receive their own entries. 
For example, 'K00844', hexokinase, is classified multiple ways in + the 'KEGG Orthology (KO)' hierarchy, 'ko00001', including '09100 Metabolism >>> 09101 + Carbohydrate metabolism >>> 00010 Glycolysis / Gluconeogenesis [PATH:00010]' and '09100 + Metabolism >>> 09101 Carbohydrate metabolism >>> 00051 Fructose and mannose metabolism + [PATH:00051]'. These categorizations would yield four entries like the following: [('09100 + Metabolism', ), ('09100 Metabolism', '09101 Carbohydrate metabolism'), ('09100 Metabolism', + '09101 Carbohydrate metabolism', '00010 Glycolysis / Gluconeogenesis [PATH:00010]'), ('09100 + Metabolism', '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism + [PATH:00051]')]. Each categorization tuple can be used to retrieve the corresponding + category object from the 'categories' attribute of the ReactionNetwork containing the + hierarchy, e.g., `category = network.categories['ko00001'][('09100 Metabolism', '09101 + Carbohydrate metabolism')]` + + ko_ids : List[str], list() + IDs of reaction network KOs in the hierarchy, which can be used to look up KO objects in the + 'kos' attribute of the ReactionNetwork containing the hierarchy. To reiterate, this does not + include KOs in the hierarchy that are not in the reaction network. + """ + id: str = None + name: str = None + categorizations: List[Tuple[str]] = field(default_factory=list) + ko_ids: List[str] = field(default_factory=list) + +@dataclass +class BRITECategory: + """ + Representation of a KEGG BRITE hierarchy category with KOs in a reaction network. + + Objects of this class are stored in the 'categories' attribute of a ReactionNetwork instance. + + Attributes + ========== + id : str + Unique ID for the category comprising the hierarchy ID and the hierarchical categorization. + The following example demonstrates ID format. 
In the 'KEGG Orthology (KO)' hierarchy, + 'ko00001', there is a category, '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> + 00010 Glycolysis / Gluconeogenesis [PATH:00010]'. This yields the ID, 'ko00001: 09100 + Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis / Gluconeogenesis + [PATH:00010]'. + + name : str + Name of the category. These need not be unique in a hierarchy. For example, there are + multiple categories called 'Small subunit' and 'Large subunit' in the 'Ribosome' hierarchy. + + hierarchy_id : str, None + ID of the BRITE hierarchy containing the category, which can be used to look up the + hierarchy object in the 'hierarchies' attribute of the ReactionNetwork containing the + category. + + subcategory_names : List[str], list() + The names of encompassed categories containing KOs in the reaction network. This is an empty + list if there are no categories lower in the hierarchy. For example, the category, + 'Polyketide synthase (PKS) >>> Modular type I PKS' in the hierarchy, 'ko01008' encompasses + the categories, 'cis-AT PKS' and 'trans-AT PKS'. Objects representing these subcategories + can be looked up in the 'categories' attribute of the ReactionNetwork containing the + category, e.g., `cis_category = network.categories['ko01008'][('Polyketide synthase (PKS)', + 'Modular type I PKS', 'cis-AT PKS')]` and `trans_category = network.categories['ko01008'] + [('Polyketide synthase (PKS)', 'Modular type I PKS', 'trans-AT PKS')]` + + pathway_id : str, None + Certain bottommost categories in the hierarchy, 'ko00001', are equivalent to KEGG pathways, + e.g., '00010 Glycolysis / Gluconeogenesis [PATH:ko00010]'. This attribute encodes any + equivalent pathway ID, which can be used to look up the pathway object using the 'pathways' + attribute of the ReactionNetwork containing the category. 
+ + ko_ids : List[str], list() + IDs of reaction network KOs in the category (and all subcategories), which can be used to + look up KO objects in the 'kos' attribute of the ReactionNetwork containing the category. To + reiterate, this does not include KOs in the category that are not in the reaction network. + """ + id: str = None + name: str = None + hierarchy_id: str = None + subcategory_names: List[str] = field(default_factory=list) + pathway_id: str = None + ko_ids: List[str] = field(default_factory=list) + +@dataclass +class Gene: + """ + Representation of a gene in a genomic reaction network. + + Objects of this class are stored in the 'genes' attribute of a GenomicNetwork instance. + + Attributes + ========== + gcid : int, None + The gene callers ID, or unique anvi'o identifier, of the gene: a non-negative integer. + + ko_ids : List[str], list() + IDs of KOs annotating the gene, which can be used to look up KO objects in the 'kos' + attribute of the GenomicNetwork containing the gene. + + e_values : Dict[str, float], dict() + E-values express the strength of KO-gene associations. Keys are KO IDs; values are + non-negative numbers. + + protein_id : str, None + ID of the protein expressed by the gene. The protein is used for storing abundance data, + from proteomics, for instance. + """ + gcid: int = None + ko_ids: List[str] = field(default_factory=list) + e_values: Dict[str, float] = field(default_factory=dict) + protein_id: str = None + +@dataclass +class Protein: + """ + This object stores protein abundance data (from proteomics, for instance) in a reaction network. + + Objects of this class are stored in the 'proteins' attribute of a GenomicNetwork instance. + + Attributes + ========== + id : int, None + The unique anvi'o ID for the protein: a non-negative integer. + + gcids : List[int], list() + Anvi'o gene callers IDs of genes that can express the protein. These can be used to look up
These can be used to look up + gene objects in the 'genes' attribute of the GenomicNetwork containing the gene. + + abundances : Dict[str, float], dict() + Protein abundance profile data with each key being a sample name and each value being the + abundance of the protein expressed by the gene in that sample. + """ + id: int = None + gcids: List[int] = field(default_factory=list) + abundances: Dict[str, float] = field(default_factory=dict) + +@dataclass +class GeneCluster: + """ + Representation of a gene cluster in a pangenomic reaction network. + + Objects of this class are stored in the 'gene_clusters' attribute of a PangenomicNetwork + instance. + + Attributes + ========== + gene_cluster_id : int, None + The unique anvi'o ID for the gene cluster: a non-negative integer. + + genomes : List[str], [] + The names of the genomes contributing the genes in the cluster. + + ko_id : str, None + ID of the consensus KO among the genes in the cluster, which can be used to look up the KO + object in the 'kos' attribute of the PangenomicNetwork containing the gene cluster. + (Consensus KOs can be found from a pangenome by the anvi'o method, + 'dbops.PanSuperclass.get_gene_cluster_function_summary'.) Note that the individual gene KO + annotations underlying the consensus annotation are not tracked. + """ + gene_cluster_id: int = None + genomes: List[str] = field(default_factory=list) + ko_id: str = None + +class ReactionNetwork: + """ + A reaction network predicted from KEGG KO and ModelSEED annotations. + + A reaction network need not be fully connected: it is not guaranteed that there exists a path + through the network from one arbitrary reaction to another. + + Attributes + ========== + kos : Dict[str, KO], dict() + KOs in the network, with keys being KO IDs. + + modules : Dict[str, KEGGModule], dict() + KEGG modules containing KOs in the network, with keys being module IDs. 
+ + pathways : Dict[str, KEGGPathway], dict() + KEGG pathways containing KOs in the network, with keys being pathway IDs. + + hierarchies : Dict[str, BRITEHierarchy], dict() + KEGG BRITE hierarchies containing KOs in the network, with keys being hierarchy IDs. + + categories : Dict[str, Dict[Tuple[str], Tuple[BRITECategory]]], dict() + KEGG BRITE hierarchy categories containing KOs in the network. Keys are hierarchy IDs. + Values are dictionary representations of categorizations in the hierarchy. Categories at + each level receive their own entries. For example, 'K00844', hexokinase, is classified + multiple ways in the 'KEGG Orthology (KO)' hierarchy, 'ko00001', including '09100 + Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis / Gluconeogenesis + [PATH:00010]' and '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00051 Fructose + and mannose metabolism [PATH:00051]'. These categorizations would yield entries like the + following: {'ko00001': {('09100 Metabolism', ): (, ), ('09100 + Metabolism', '09101 Carbohydrate metabolism'): (, + ), ('09100 Metabolism', '09101 Carbohydrate metabolism', + '00010 Glycolysis / Gluconeogenesis [PATH:00010]'): (, + , ), ('09100 Metabolism', + '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism [PATH:00051]'): + (, , )}} + + reactions : Dict[str, ModelSEEDReaction], dict() + ModelSEED reactions in the network, with keys being reaction IDs. + + metabolites : Dict[str, ModelSEEDCompound], dict() + ModelSEED compounds in the network, with keys being metabolite IDs. + + kegg_modelseed_aliases : Dict[str, List[str]], dict() + This maps KEGG REACTION IDs associated with KOs in the network to ModelSEED reactions + aliased by the KEGG reaction. KO-associated KEGG reactions that do not alias ModelSEED + reactions are not included. 
+ + ec_number_modelseed_aliases : Dict[str, List[str]], dict() + This maps EC numbers associated with KOs in the network to ModelSEED reactions aliased by + the EC number. KO-associated EC numbers that do not alias ModelSEED reactions are not + included. + + modelseed_kegg_aliases : Dict[str, List[str]], dict() + This maps the IDs of ModelSEED reactions in the network to lists of KEGG REACTION IDs that + are associated with KOs in the network and alias the ModelSEED reaction. + + modelseed_ec_number_aliases : Dict[str, List[str]], dict() + This maps the IDs of ModelSEED reactions in the network to lists of EC numbers that are + associated with KOs in the network and alias the ModelSEED reaction. + + run : anvio.terminal.Run, anvio.terminal.Run() + This object prints run information to the terminal. This attribute is assigned the argument + of the same name upon initialization. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object prints transient progress information to the terminal. This attribute is + assigned the argument of the same name upon initialization. + + verbose : bool, True + Report more information to the terminal if True. + """ + def __init__( + self, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress(), + verbose: bool = True + ) -> None: + """ + Parameters + ========== + run : anvio.terminal.Run, anvio.terminal.Run() + This object sets the 'run' attribute, which prints run information to the terminal. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object sets the 'progress' attribute, which prints transient progress information + to the terminal. + + verbose : bool, True + This sets the 'verbose' attribute, causing more information to be reported to the + terminal if True. 
+ + Returns + ======= + None + """ + self.kos: Dict[str, KO] = {} + self.modules: Dict[str, KEGGModule] = {} + self.pathways: Dict[str, KEGGPathway] = {} + self.hierarchies: Dict[str, BRITEHierarchy] = {} + self.categories: Dict[str, Dict[Tuple[str], Tuple[BRITECategory]]] = {} + self.reactions: Dict[str, ModelSEEDReaction] = {} + self.metabolites: Dict[str, ModelSEEDCompound] = {} + # The following dictionaries map reaction aliases in the network: as in, not all known + # aliases, but only those sourced from KOs and contributing ModelSEEDReaction objects. + self.kegg_modelseed_aliases: Dict[str, List[str]] = {} + self.ec_number_modelseed_aliases: Dict[str, List[str]] = {} + self.modelseed_kegg_aliases: Dict[str, List[str]] = {} + self.modelseed_ec_number_aliases: Dict[str, List[str]] = {} + + self.run = run + self.progress = progress + self.verbose = verbose + + def remove_missing_objective_metabolites(self, objective_dict: Dict) -> None: + """ + Remove metabolites from a biomass objective dictionary that are not produced or consumed by + any reactions in the network. + + Parameters + ========== + objective_dict : dict + Biomass objective in COBRApy JSON format, like that returned by the method, + 'JSONStructure.get_e_coli_core_objective'. + + Returns + ======= + None + """ + objective_metabolites: Dict = objective_dict['metabolites'] + missing_metabolite_ids = [] + if 'original_metabolite_ids' in objective_dict['notes']: + # The E. coli objective had metabolite BiGG IDs, which were replaced with KEGG COMPOUND + # IDs, and the original BiGG IDs were recorded in the 'notes' section of the objective. 
+ missing_original_metabolite_ids = [] + objective_original_metabolites: Dict = objective_dict['notes'][ + 'original_metabolite_ids' + ] + for metabolite_id, original_metabolite_id in zip( + objective_metabolites, objective_original_metabolites + ): + if metabolite_id[:-2] not in self.metabolites: + # The metabolite (removing localization substring) is not in the network. + missing_metabolite_ids.append(metabolite_id) + missing_original_metabolite_ids.append(original_metabolite_id) + for original_metabolite_id in missing_original_metabolite_ids: + objective_original_metabolites.pop(original_metabolite_id) + else: + for metabolite_id in objective_metabolites: + if metabolite_id[:-2] not in self.metabolites: + # The metabolite (removing localization substring) is not in the network. + missing_metabolite_ids.append(metabolite_id) + for metabolite_id in missing_metabolite_ids: + objective_metabolites.pop(metabolite_id) + + if not self.verbose: + return + + if 'original_metabolite_ids' in objective_dict['notes']: + id_string = "" + for original_id, modelseed_id in zip( + missing_original_metabolite_ids, missing_metabolite_ids + ): + id_string += f"{original_id} ({modelseed_id}), " + id_string = id_string[:-2] + self.run.info_single( + "The following metabolites were removed from the biomass objective, with the " + f"original IDs aliasing the ModelSEED compound IDs in parentheses: {id_string}" + ) + else: + self.run.info_single( + "The following metabolites, given by their ModelSEED compound IDs, were removed " + f"from the biomass objective: {', '.join(missing_metabolite_ids)}" + ) + + def _write_remove_metabolites_without_formula_output( + self, + output_path : str, + removed: Dict[str, List] + ) -> None: + """ + Parameters + ========== + output_path : str + Write tab-delimited files of metabolites, reactions, KOs, KEGG modules, KEGG pathways, + KEGG BRITE hierarchies, and KEGG BRITE hierarchy categories removed from the network to + file locations based on the 
provided path. For example, if the argument, 'removed.tsv', + is provided, then the following files will be written: 'removed-metabolites.tsv', + 'removed-reactions.tsv', 'removed-kos.tsv', 'removed-modules.tsv', + 'removed-pathways.tsv', 'removed-hierarchies.tsv', and 'removed-categories.tsv'. + + removed : Dict[str, List] + Data removed from the network. The dictionary looks like the following for a genomic + network. (For a pangenomic network, the last gene entry is replaced by a gene cluster + entry, 'gene_cluster': [].) + { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [] + } + """ + # Record the reactions removed as a consequence of involving formulaless metabolites, and + # record the formulaless metabolites involved in removed reactions. + removed_metabolites: List[ModelSEEDCompound] = removed['metabolite'] + removed_metabolite_ids: List[str] = [ + metabolite.modelseed_id for metabolite in removed_metabolites + ] + metabolite_removed_reactions: Dict[str, List[str]] = {} + reaction_removed_metabolites: Dict[str, List[str]] = {} + removed_reactions: List[ModelSEEDReaction] = removed['reaction'] + for reaction in removed_reactions: + reaction_removed_metabolites[reaction.modelseed_id] = metabolite_ids = [] + for compound_id in reaction.compound_ids: + if compound_id in removed_metabolite_ids: + try: + metabolite_removed_reactions[compound_id].append(reaction.modelseed_id) + except KeyError: + metabolite_removed_reactions[compound_id] = [reaction.modelseed_id] + metabolite_ids.append(compound_id) + + metabolite_table = [] + for metabolite in removed_metabolites: + row = [] + row.append(metabolite.modelseed_id) + row.append(metabolite.modelseed_name) + row.append(metabolite.formula) + try: + # The metabolite did not have a formula. 
+ removed_reaction_ids = metabolite_removed_reactions[metabolite.modelseed_id] + except KeyError: + # The metabolite had a formula but was removed as a consequence of all the reactions + # involving the metabolite being removed due to them containing formulaless + # metabolites: the metabolite did not cause any reactions to be removed. + row.append("") + continue + # The set accounts for the theoretical possibility that a compound is present on both + # sides of the reaction equation and thus the reaction is recorded multiple times. + row.append(", ".join(sorted(set(removed_reaction_ids)))) + + reaction_table = [] + for reaction in removed_reactions: + row = [] + row.append(reaction.modelseed_id) + row.append(reaction.modelseed_name) + # The set accounts for the theoretical possibility that a compound is present on both + # sides of the reaction equation and thus is recorded multiple times. + row.append( + ", ".join(set(reaction_removed_metabolites[reaction.modelseed_id])) + ) + row.append(", ".join(reaction.compound_ids)) + row.append(get_chemical_equation(reaction)) + reaction_table.append(row) + + ko_table = [] + removed_kos: List[KO] = removed['ko'] + for ko in removed_kos: + row = [] + row.append(ko.id) + row.append(ko.name) + row.append(", ".join(ko.reaction_ids)) + ko_table.append(row) + + module_table = [] + removed_modules: List[KEGGModule] = removed['module'] + for module in removed_modules: + row = [] + row.append(module.id) + row.append(module.name) + row.append(", ".join(module.ko_ids)) + module_table.append(row) + + pathway_table = [] + removed_pathways: List[KEGGPathway] = removed['pathway'] + for pathway in removed_pathways: + row = [] + row.append(pathway.id) + row.append(pathway.name) + row.append(", ".join(pathway.ko_ids)) + row.append(", ".join(pathway.module_ids)) + pathway_table.append(row) + + hierarchy_table = [] + removed_hierarchies: List[BRITEHierarchy] = removed['hierarchy'] + removed_hierarchy_names: Dict[str, str] = {} + for hierarchy 
in removed_hierarchies: + row = [] + row.append(hierarchy.id) + row.append(hierarchy.name) + row.append(", ".join(hierarchy.ko_ids)) + pathway_table.append(row) + removed_hierarchy_names[hierarchy.id] = hierarchy.name + + category_table = [] + removed_categories: List[BRITECategory] = removed['category'] + for category in removed_categories: + row = [] + row.append(category.hierarchy_id) + try: + row.append(self.hierarchies[category.hierarchy_id].name) + except KeyError: + row.append(removed_hierarchy_names[category.hierarchy_id]) + row.append(category.id[len(category.hierarchy_id) + 2:]) + row.append(", ".join(category.ko_ids)) + category_table.append(row) + + path_basename, path_extension = os.path.splitext(output_path) + metabolite_path = f"{path_basename}-metabolites{path_extension}" + reaction_path = f"{path_basename}-reactions{path_extension}" + ko_path = f"{path_basename}-kos{path_extension}" + module_path = f"{path_basename}-modules{path_extension}" + pathway_path = f"{path_basename}-pathways{path_extension}" + hierarchy_path = f"{path_basename}-hierarchies{path_extension}" + category_path = f"{path_basename}-categories{path_extension}" + + pd.DataFrame( + metabolite_table, + columns=[ + "ModelSEED compound ID", + "ModelSEED compound name", + "Formula", + "Removed reaction ModelSEED IDs" + ] + ).to_csv(metabolite_path, sep='\t', index=False) + + pd.DataFrame( + reaction_table, + columns=[ + "ModelSEED reaction ID", + "ModelSEED reaction name", + "Removed ModelSEED compound IDs", + "Reaction ModelSEED compound IDs", + "Equation" + ] + ).to_csv(reaction_path, sep='\t', index=False) + + pd.DataFrame( + ko_table, + columns=[ + "KO ID", + "KO name", + "KO ModelSEED reaction IDs" + ] + ).to_csv(ko_path, sep='\t', index=False) + + pd.DataFrame( + module_table, + columns=[ + "KEGG module ID", + "KEGG module name", + "Module KOs" + ] + ).to_csv(module_path, sep='\t', index=False) + + pd.DataFrame( + pathway_table, + columns=[ + "KEGG pathway ID", + "KEGG pathway 
name", + "Pathway KOs", + "Pathway modules" + ] + ).to_csv(pathway_path, sep='\t', index=False) + + pd.DataFrame( + hierarchy_table, + columns=[ + "KEGG BRITE hierarchy ID", + "KEGG BRITE hierarchy name", + "Hierarchy KOs" + ] + ).to_csv(hierarchy_path, sep='\t', index=False) + + pd.DataFrame( + category_table, + columns=[ + "KEGG BRITE hierarchy ID", + "KEGG BRITE hierarchy name", + "KEGG BRITE hierarchy categorization", + "Category KOs" + ] + ).to_csv(category_path, sep='\t', index=False) + + def _purge_metabolites(self, metabolites_to_remove: Iterable[str]) -> Dict[str, List]: + """ + Remove any trace of the given metabolites from the network. + + Reactions involving the metabolite are also purged from the network. KOs that are only + associated with removed reactions are purged. In genomic networks, genes that are only + associated with removed KOs are purged. In pangenomic networks, gene clusters assigned + removed KOs are purged. KEGG modules, pathways, BRITE hierarchies, and BRITE hierarchy + categories only associated with purged KOs are removed. + + Removal of reactions involving the metabolite can also result in other metabolites being + being removed from the network, those that exclusively participate in these reactions. + + Parameters + ========== + metabolites_to_remove : Iterable[str] + ModelSEED compound IDs to remove. + + Returns + ======= + dict + This dictionary contains data removed from the network. + + The dictionary examples below are for a genomic network. For a pangenomic network, the + gene entry is replaced by the gene cluster entry, 'gene_cluster': [] or 'gene_cluster': []. The examples show protein entries as if the genomic + network has been annotated with protein abundances; these are absent for genomic + networks lacking protein annotations and for pangenomic networks. + + If this method is NOT called from the method, '_purge_reactions', then the dictionary + will look like the following. 
+ { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [], + 'protein': [] + } + + If this method is called from the method, '_purge_reactions', then the dictionary will + look like the following. + { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [], + 'protein': [] + } + + If no metabolites are removed from the network, then the dictionary will look like the + following regardless of calling method. + { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [], + 'protein': [] + } + """ + metabolites_to_remove = set(metabolites_to_remove) + removed_metabolites: List[ModelSEEDCompound] = [] + for compound_id in metabolites_to_remove: + try: + removed_metabolites.append(self.metabolites.pop(compound_id)) + except KeyError: + # This can occur for two reasons. First, the metabolite from 'metabolites_to_remove' + # could not be in the network. + + # Second, this can occur when removing other "unintended" metabolites from the + # network. '_purge_metabolites' was first called with metabolites of interest, then + # '_purge_reactions' was called from within the method the remove reactions + # involving the metabolites of interest, and then '_purge_metabolites' was called + # again from within '_purge_reactions' to remove other metabolites exclusively found + # in the removed reactions. In this last call of '_purge_metabolites', the + # 'metabolites_to_remove' also include the metabolites of interest that were already + # removed from 'self.metabolites' in the original '_purge_metabolites' call. This + # KeyError occurs when trying to remove those already-removed metabolites. 
+ pass + removed_metabolite_ids = [metabolite.modelseed_id for metabolite in removed_metabolites] + + if not removed_metabolites: + removed = { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [] + } + if isinstance(self, GenomicNetwork): + removed['gene'] = [] + if self.proteins: + removed['protein'] = [] + elif isinstance(self, PangenomicNetwork): + removed['gene_cluster'] = [] + else: + raise AssertionError + return removed + + # Purge reactions from the record that involve removed metabolites. + reactions_to_remove: List[str] = [] + for reaction_id, reaction in self.reactions.items(): + for compound_id in reaction.compound_ids: + if compound_id in removed_metabolite_ids: + reactions_to_remove.append(reaction_id) + break + + removed = {'metabolite': removed_metabolites} + if reactions_to_remove: + removed_cascading_up = self._purge_reactions(reactions_to_remove) + # There may be other metabolites exclusively involved in the removed reactions; these + # metabolites were therefore also removed. + removed['metabolite'] = removed_metabolites + removed_cascading_up.pop('metabolite') + else: + # This method must have been called from the method, '_purge_reactions', because the + # reactions containing the metabolites were already removed from the network. 
+ removed_cascading_up = { + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [] + } + if isinstance(self, GenomicNetwork): + removed_cascading_up['gene'] = [] + if self.proteins: + removed_cascading_up['protein'] = [] + elif isinstance(self, PangenomicNetwork): + removed_cascading_up['gene_cluster'] = [] + else: + raise AssertionError + removed.update(removed_cascading_up) + return removed + + def _purge_reactions(self, reactions_to_remove: Iterable[str]) -> Dict[str, List]: + """ + Remove any trace of the given reactions from the network. + + Metabolites that exclusively participate in removed reactions are purged. KOs that are only + associated with removed reactions are purged. In genomic networks, genes that are only + associated with removed KOs are purged. In pangenomic networks, gene clusters assigned + removed KOs are purged. KEGG modules, pathways, BRITE hierarchies, and BRITE hierarchy + categories only associated with purged KOs are removed. + + Parameters + ========== + reactions_to_remove : Iterable[str] + ModelSEED reaction IDs to remove. + + Returns + ======= + dict + This dictionary contains data removed from the network. + + The dictionary examples below are for a genomic network. For a pangenomic network, the + gene entry is replaced by the gene cluster entry, 'gene_cluster': [] or 'gene_cluster': []. The examples show protein entries as if the genomic + network has been annotated with protein abundances; these are absent for genomic + networks lacking protein annotations and for pangenomic networks. + + If this method is NOT called from the method, '_purge_metabolites', or the method, + '_purge_kos', then the dictionary will look like the following. 
+ { + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [], + 'protein': [] + } + + If this method is called from the method, '_purge_metabolites', then the dictionary will + look like the following. + { + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [], + 'protein': [] + } + + If this method is called from the method, '_purge_kos', then the dictionary will look + like the following. + { + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [ Dict[str, List]: + """ + Remove any trace of the given KOs from the network. + + If KEGG modules, pathways, BRITE hierarchies, or BRITE hierarchy categories are provided, + remove any trace of the KOs that are in these from the network. + + Reactions and metabolites that are only associated with removed KOs are purged. In genomic + networks, genes that are only associated with removed KOs are purged. In pangenomic + networks, gene clusters assigned removed KOs are purged. KEGG modules, pathways, BRITE + hierarchies, and BRITE hierarchy categories only associated with purged KOs are removed. + + Parameters + ========== + kos_to_remove : Iterable[str], None + KO IDs to remove. + + modules_to_remove : Iterable[str], None + KEGG module IDs to remove, equivalent to giving the KOs in the modules to the argument, + 'kos_to_remove'. + + pathways_to_remove : Iterable[str], None + KEGG pathway IDs to remove, equivalent to giving the KOs in the pathways to the + argument, 'kos_to_remove'. + + hierarchies_to_remove : Iterable[str], None + KEGG BRITE hierarchy IDs to remove, equivalent to giving the KOs in the hierarchies to + the argument, 'kos_to_remove'. 
+ + categories_to_remove : Dict[str, List[Tuple[str]]], None + KEGG BRITE hierarchy categories to remove, equivalent to giving the KOs in the + categories to the argument, 'kos_to_remove'. The dictionary argument is keyed by BRITE + hierarchy ID and has values that list category tuples. For example, to purge KOs from + the network contained in the 'ko00001' 'KEGG Orthology (KO)' hierarchy categories, + '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis / + Gluconeogenesis [PATH:ko00010]' and '09100 Metabolism >>> 09101 Carbohydrate + metabolism >>> 00051 Fructose and mannose metabolism [PATH:ko00051]', the dictionary + argument would need to look like the following: {'ko00001': [('09100 Metabolism', '09101 + Carbohydrate metabolism', '00010 Glycolysis / Gluconeogenesis'), ('09100 Metabolism', + '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism + [PATH:ko00051]')]} + + Returns + ======= + dict + This dictionary contains data removed from the network. + + The dictionary examples below are for a genomic network. For a pangenomic network, the + gene entry is replaced by the gene cluster entry, 'gene_cluster': [] or 'gene_cluster': []. The examples show protein entries as if the genomic + network has been annotated with protein abundances; these are absent for genomic + networks lacking protein annotations and for pangenomic networks. + + If this method is NOT called from the method, '_purge_reactions', or the method, + '_purge_genes', then the dictionary will look like the following. + { + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [], + 'gene': [], + 'protein': [] + } + + If this method is called from the method, '_purge_reactions', then the dictionary will + look like the following. 
+ { + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [], + 'gene': [], + 'protein': [] + } + + If this method is called from the method, '_purge_genes', then the dictionary will look + like the following. + { + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [], + 'gene': [], + 'protein': [] + } + + If no KOs are removed from the network, then the dictionary will look like the following + regardless of calling method. + { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene': [], + 'protein': [] + } + """ + assert ( + kos_to_remove or + modules_to_remove or + pathways_to_remove or + hierarchies_to_remove or + categories_to_remove + ) + + if kos_to_remove is None: + kos_to_remove: List[str] = [] + else: + kos_to_remove = list(set(kos_to_remove)) + if modules_to_remove is None: + modules_to_remove: List[str] = [] + else: + modules_to_remove = list(set(modules_to_remove)) + if pathways_to_remove is None: + pathways_to_remove: List[str] = [] + else: + pathways_to_remove = list(set(pathways_to_remove)) + if hierarchies_to_remove is None: + hierarchies_to_remove: List[str] = [] + else: + hierarchies_to_remove = list(set(hierarchies_to_remove)) + if categories_to_remove is None: + categories_to_remove: Dict[str, List[Tuple[str]]] = {} + + # Get the KOs to remove from requested modules, pathways, hierarchies, and hierarchy + # categories. + for module_id in modules_to_remove: + try: + module = self.modules[module_id] + except KeyError: + # The requested module is not in the network. 
+ continue + kos_to_remove += module.ko_ids + for pathway_id in pathways_to_remove: + try: + pathway = self.pathways[pathway_id] + except KeyError: + # The requested pathway is not in the network. + continue + kos_to_remove += pathway.ko_ids + for hierarchy_id in hierarchies_to_remove: + try: + hierarchy = self.hierarchies[hierarchy_id] + except KeyError: + # The requested hierarchy is not in the network. + continue + kos_to_remove += hierarchy.ko_ids + for hierarchy_id, categorizations in categories_to_remove.items(): + try: + hierarchy_categorizations = self.categories[hierarchy_id] + except KeyError: + # The requested hierarchy is not in the network. + continue + for categorization in categorizations: + try: + categories = hierarchy_categorizations[categorization] + except KeyError: + # The requested category is not in the network. + continue + category = categories[-1] + kos_to_remove += category.ko_ids + + # Remove requested KOs from the network. + kos_to_remove = set(kos_to_remove) + removed_kos: List[KO] = [] + for ko_id in kos_to_remove: + try: + removed_kos.append(self.kos.pop(ko_id)) + except KeyError: + # This occurs when the original method called is '_purge_kos', followed by + # '_purge_genes' or '_purge_gene_clusters', followed by this method again -- + # 'removed_kos' will be empty. Alternatively, this occurs if the KO in + # 'kos_to_remove' is not in the network. + pass + removed_ko_ids = [ko.id for ko in removed_kos] + + if not removed_kos: + removed = { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [] + } + if isinstance(self, GenomicNetwork): + removed['gene'] = [] + elif isinstance(self, PangenomicNetwork): + removed['gene_cluster'] = [] + else: + raise AssertionError + return removed + + # Remove modules and pathways from the network that exclusively contain removed KOs. 
+ removed_modules: List[KEGGModule] = [] + removed_pathways: List[KEGGPathway] = [] + removed_hierarchies: List[BRITEHierarchy] = [] + removed_categories: List[BRITECategory] = [] + for removed_ko in removed_kos: + removed_ko_id = removed_ko.id + + for module_id in removed_ko.module_ids: + module = self.modules[module_id] + module.ko_ids.remove(removed_ko_id) + if not module.ko_ids: + self.modules.pop(module_id) + # Remove obsolete module references from pathways. + for pathway_id in module.pathway_ids: + pathway = self.pathways[pathway_id] + pathway.module_ids.remove(module_id) + removed_modules.append(module) + + for pathway_id in removed_ko.pathway_ids: + pathway = self.pathways[pathway_id] + pathway.ko_ids.remove(removed_ko_id) + if not pathway.ko_ids: + self.pathways.pop(pathway_id) + # Remove obsolete pathway references from modules. + for module_id in pathway.module_ids: + module = self.modules[module_id] + module.pathway_ids.remove(pathway_id) + removed_pathways.append(pathway) + + for hierarchy_id, categorizations in removed_ko.hierarchies.items(): + hierarchy = self.hierarchies[hierarchy_id] + hierarchy.ko_ids.remove(removed_ko_id) + if not hierarchy.ko_ids: + self.hierarchies.pop(hierarchy_id) + network_categorizations = self.categories.pop(hierarchy_id) + removed_hierarchies.append(hierarchy) + # Record categories removed along with the hierarchy. + for categorization in hierarchy.categorizations: + removed_categories.append(network_categorizations[categorization][-1]) + continue + + network_categorizations = self.categories[hierarchy_id] + for categorization in categorizations: + categories = network_categorizations[categorization] + supercategory = None + for depth, category in enumerate(categories, 1): + try: + category.ko_ids.remove(removed_ko_id) + except ValueError: + # The KO has already been removed from the supercategory, as it was + # already encountered in another subcategory. 
+ supercategory = category + continue + if not category.ko_ids: + focus_categorization = categorization[:depth] + # Remove obsolete category references from the hierarchy. + hierarchy.categorizations.remove(focus_categorization) + # Remove obsolete categories from the network. + network_categorizations.pop(focus_categorization) + # Remove obsolete category references from the supercategory if the + # supercategory also hasn't been removed. + if supercategory: + supercategory.subcategory_names.remove(category.name) + supercategory = None + removed_categories.append(category) + continue + supercategory = category + + if 'ko00001' in self.hierarchies: + network_categorizations = self.categories['ko00001'] + for pathway in removed_pathways: + if pathway.categorization is not None: + assert pathway.categorization not in network_categorizations + for category in removed_categories: + if category.pathway_id is not None: + assert category.pathway_id not in self.pathways + + # Purge reactions from the network that are exclusive to removed KOs. + reactions_to_remove: List[str] = [] + for ko in removed_kos: + for reaction_id in ko.reaction_ids: + reactions_to_remove.append(reaction_id) + reactions_to_remove = list(set(reactions_to_remove)) + for ko in self.kos.values(): + reactions_to_spare: List[int] = [] + for reaction_id in ko.reaction_ids: + for idx, reaction_id_to_remove in enumerate(reactions_to_remove): + if reaction_id == reaction_id_to_remove: + # The reaction is associated with a retained KO, so do not remove the + # reaction. 
+ reactions_to_spare.append(idx) + for idx in sorted(reactions_to_spare, reverse=True): + reactions_to_remove.pop(idx) + if reactions_to_remove: + removed_cascading_down = self._purge_reactions(reactions_to_remove) + for key in ['ko', 'module', 'pathway', 'hierarchy', 'category']: + removed_cascading_down.pop(key) + if isinstance(self, GenomicNetwork): + removed_cascading_down.pop('gene') + elif isinstance(self, PangenomicNetwork): + removed_cascading_down.pop('gene_cluster') + else: + raise AssertionError + else: + # This method must have been called "cascading up" from the method, '_purge_reactions', + # because the reactions that are only associated with the removed KOs were already + # removed from the network. + removed_cascading_down = { + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [] + } + + if isinstance(self, GenomicNetwork): + # Purge genes from the network that are are only associated with removed KOs. + genes_to_remove: List[str] = [] + for gcid, gene in self.genes.items(): + gene_kos_to_remove: List[str] = [] + for ko_id in gene.ko_ids: + if ko_id in removed_ko_ids: + gene_kos_to_remove.append(ko_id) + if len(gene_kos_to_remove) == len(gene.ko_ids): + # All KOs matching the gene were removed, so remove it as well. + genes_to_remove.append(gcid) + continue + for ko_id in gene_kos_to_remove: + gene.ko_ids.remove(ko_id) + gene.e_values.pop(ko_id) + # If this method was called from '_purge_genes', then the genes that are only associated + # with KOs removed here were already removed from the network, and 'genes_to_remove' + # would be empty. In contrast, if this method was not called from '_purge_genes', but + # zero genes are only associated with KOs removed here, then 'genes_to_remove' would + # likewise be empty. 
+ if genes_to_remove: + removed_cascading_up = self._purge_genes(genes_to_remove) + for key in [ + 'ko', + 'module', + 'pathway', + 'hierarchy', + 'category', + 'reaction', + 'kegg_reaction', + 'ec_number', + 'metabolite' + ]: + removed_cascading_up.pop(key) + else: + removed_cascading_up = {'gene': []} + elif isinstance(self, PangenomicNetwork): + gene_clusters_to_remove: List[str] = [] + for gene_cluster_id, cluster in self.gene_clusters.items(): + if cluster.ko_id in removed_ko_ids: + gene_clusters_to_remove.append(gene_cluster_id) + # If this method was called from 'purge_gene_clusters' then the gene clusters that are only + # associated with KOs removed here were already removed from the network, and + # 'gene_clusters_to_remove' would be empty. + if gene_clusters_to_remove: + removed_cascading_up = self._purge_gene_clusters(gene_clusters_to_remove) + for key in [ + 'ko', + 'module', + 'pathway', + 'hierarchy', + 'category', + 'reaction', + 'kegg_reaction', + 'ec_number', + 'metabolite' + ]: + removed_cascading_up.pop(key) + else: + removed_cascading_up = {'gene_cluster': []} + else: + raise AssertionError + + removed = { + 'ko': removed_kos, + 'module': removed_modules, + 'pathway': removed_pathways, + 'hierarchy': removed_hierarchies, + 'category': removed_categories + } + removed.update(removed_cascading_down) + removed.update(removed_cascading_up) + return removed + + def _subset_network_by_kos( + self, + ko_ids: Iterable[str], + subnetwork: ReactionNetwork = None, + inclusive: bool = False + ) -> ReactionNetwork: + """ + Subset the network by KOs. + + Parameters + ========== + ko_ids : Iterable[str] + KO IDs to subset. + + subnetwork : ReactionNetwork, None + This network under construction is provided when the KOs being added to the network + annotate already subsetted genes or gene clusters. + + inclusive : bool, False + This option applies to genomic and not pangenomic networks. 
If True, "inclusive" + subsetting applies a "Midas touch" where all items in the network that are however + associated with requested KOs are "turned to gold" and included in the subsetted + network. In default "exclusive" subsetting, a gene added to the subsetted network due to + references to requested KOs will be missing its references to any other unrequested KOs + in the source network. + + Returns + ======= + ReactionNetwork + If a 'subnetwork' argument is provided, then that network is returned after + modification. Otherwise, a new subsetted reaction network is returned. + """ + if isinstance(self, GenomicNetwork): + if subnetwork is None: + subnetwork = GenomicNetwork() + # Signify that genes annotated by subsetted KOs are to be added to the network. + subset_referencing_genes = True + else: + assert isinstance(subnetwork, GenomicNetwork) + # Signify that the KOs being added to the network annotate subsetted genes that were + # already added to the network. + subset_referencing_genes = False + elif isinstance(self, PangenomicNetwork): + if subnetwork is None: + subnetwork = PangenomicNetwork() + # Signify that gene clusters annotated by subsetted KOs are to be added to the + # network. + subset_referencing_gene_clusters = True + else: + assert isinstance(subnetwork, PangenomicNetwork) + # Signify that the KOs being added to the network annotate subsetted gene clusters + # that were already added to the network. + subset_referencing_gene_clusters = False + else: + raise AssertionError + + for ko_id in ko_ids: + try: + ko = self.kos[ko_id] + except KeyError: + # This occurs if the requested KO ID is not in the source network. + continue + + # Copy reactions annotating the KO to the subsetted network. 
def _subset_ko_classifications(self, subnetwork: ReactionNetwork, ko: KO) -> None:
    """
    Add KEGG modules, pathways, and BRITE hierarchies/categories that contain the subsetted KO
    to the subsetted network.

    Subsetted module, pathway, hierarchy, and category objects only reference subsetted KOs and
    not other KOs also classified in them which are not subsetted.

    The method is idempotent with respect to the KO: IDs and names are only appended to
    reference lists when not already present, so a KO requested twice, or a supercategory
    shared by several categorizations of the same KO, is never listed more than once.

    Parameters
    ==========
    subnetwork : ReactionNetwork
        Subsetted reaction network under construction. It is modified in place.

    ko : KO
        KO copied from the source network to the subsetted network.

    Returns
    =======
    None
    """
    ko_id = ko.id

    for module_id in ko.module_ids:
        module = self.modules[module_id]
        try:
            # The module was already added to the subnetwork via another KO.
            subnetwork_module = subnetwork.modules[module_id]
        except KeyError:
            subnetwork_module = KEGGModule(
                id=module_id,
                name=module.name,
                ko_ids=[ko_id]
            )
            subnetwork.modules[module_id] = subnetwork_module
        else:
            if ko_id not in subnetwork_module.ko_ids:
                subnetwork_module.ko_ids.append(ko_id)

        # Only link pathways to the module that also contain the KO, skipping pathways that
        # were already linked via another KO.
        for pathway_id in module.pathway_ids:
            if pathway_id not in ko.pathway_ids:
                continue
            if pathway_id not in subnetwork_module.pathway_ids:
                subnetwork_module.pathway_ids.append(pathway_id)

    for pathway_id in ko.pathway_ids:
        pathway = self.pathways[pathway_id]
        try:
            # The pathway was already added to the subnetwork via another KO.
            subnetwork_pathway = subnetwork.pathways[pathway_id]
        except KeyError:
            subnetwork_pathway = KEGGPathway(
                id=pathway_id,
                name=pathway.name,
                categorization=pathway.categorization,
                ko_ids=[ko_id]
            )
            subnetwork.pathways[pathway_id] = subnetwork_pathway
        else:
            if ko_id not in subnetwork_pathway.ko_ids:
                subnetwork_pathway.ko_ids.append(ko_id)

        # Only link modules to the pathway that also contain the KO, skipping modules that
        # were already linked via another KO.
        for module_id in pathway.module_ids:
            if module_id not in ko.module_ids:
                continue
            if module_id not in subnetwork_pathway.module_ids:
                subnetwork_pathway.module_ids.append(module_id)

    for hierarchy_id, categorizations in ko.hierarchies.items():
        try:
            # The hierarchy was already added to the subnetwork via another KO.
            subnetwork_hierarchy = subnetwork.hierarchies[hierarchy_id]
        except KeyError:
            hierarchy = self.hierarchies[hierarchy_id]
            # The new hierarchy is created already containing the KO, so the KO ID must not be
            # appended a second time.
            subnetwork_hierarchy = BRITEHierarchy(
                id=hierarchy_id,
                name=hierarchy.name,
                ko_ids=[ko_id]
            )
            subnetwork.hierarchies[hierarchy_id] = subnetwork_hierarchy
            subnetwork_hierarchy_categorizations = {}
            subnetwork.categories[hierarchy_id] = subnetwork_hierarchy_categorizations
        else:
            subnetwork_hierarchy_categorizations = subnetwork.categories[hierarchy_id]
            if ko_id not in subnetwork_hierarchy.ko_ids:
                subnetwork_hierarchy.ko_ids.append(ko_id)

        network_hierarchy_categorizations = self.categories[hierarchy_id]
        for categorization in categorizations:
            # Every level of the categorization is visited even if it was already added via
            # another KO, because the current KO must still be recorded in each category.
            categories = network_hierarchy_categorizations[categorization]
            subnetwork_categories = []
            supercategory = None
            for depth, category in enumerate(categories, 1):
                focus_categorization = categorization[:depth]
                try:
                    # The (super)category was already added to the subsetted network.
                    subnetwork_category = subnetwork_hierarchy_categorizations[
                        focus_categorization
                    ][-1]
                except KeyError:
                    subnetwork_category = BRITECategory(
                        id=category.id,
                        name=category.name,
                        hierarchy_id=hierarchy_id,
                        pathway_id=category.pathway_id,
                        ko_ids=[ko_id]
                    )
                    subnetwork_categories.append(subnetwork_category)
                    # Add the category to the subnetwork and register it with its hierarchy
                    # and its supercategory, mirroring the references that are undone when
                    # categories are purged from a network.
                    subnetwork_hierarchy_categorizations[focus_categorization] = tuple(
                        subnetwork_categories
                    )
                    if focus_categorization not in subnetwork_hierarchy.categorizations:
                        subnetwork_hierarchy.categorizations.append(focus_categorization)
                    if (
                        supercategory is not None and
                        category.name not in supercategory.subcategory_names
                    ):
                        supercategory.subcategory_names.append(category.name)
                else:
                    if ko_id not in subnetwork_category.ko_ids:
                        subnetwork_category.ko_ids.append(ko_id)
                    subnetwork_categories.append(subnetwork_category)
                supercategory = subnetwork_category
def _subset_reaction(self, subnetwork: ReactionNetwork, reaction: ModelSEEDReaction) -> None:
    """
    Add a reaction to a subsetted network along with metabolites involved in the reaction.

    The method is idempotent: adding the same reaction more than once (for example, because it
    is shared by several subsetted KOs) leaves a single entry per alias rather than appending
    duplicate KEGG reaction, EC number, or ModelSEED reaction IDs to the alias maps.

    Parameters
    ==========
    subnetwork : ReactionNetwork
        Subsetted reaction network under construction. It is modified in place.

    reaction : ModelSEEDReaction
        Reaction object from the source network to be added to the subnetwork.

    Returns
    =======
    None
    """
    reaction_id = reaction.modelseed_id
    subnetwork.reactions[reaction_id] = deepcopy(reaction)

    # Copy metabolites involved in the reaction to the subnetwork.
    for compound_id in reaction.compound_ids:
        if compound_id not in subnetwork.metabolites:
            subnetwork.metabolites[compound_id] = deepcopy(self.metabolites[compound_id])

    # Add KEGG reaction and EC number aliases of the reaction to the subsetted network. An
    # entry is created for the reaction even if it has no aliases of a given type.
    kegg_aliases = subnetwork.modelseed_kegg_aliases.setdefault(reaction_id, [])
    for kegg_id in reaction.kegg_aliases:
        if kegg_id not in kegg_aliases:
            kegg_aliases.append(kegg_id)

    ec_number_aliases = subnetwork.modelseed_ec_number_aliases.setdefault(reaction_id, [])
    for ec_number in reaction.ec_number_aliases:
        if ec_number not in ec_number_aliases:
            ec_number_aliases.append(ec_number)

    # Record the reverse mappings from each alias to the ModelSEED reaction.
    for kegg_id in reaction.kegg_aliases:
        aliased_reaction_ids = subnetwork.kegg_modelseed_aliases.setdefault(kegg_id, [])
        if reaction_id not in aliased_reaction_ids:
            aliased_reaction_ids.append(reaction_id)

    for ec_number in reaction.ec_number_aliases:
        aliased_reaction_ids = subnetwork.ec_number_modelseed_aliases.setdefault(ec_number, [])
        if reaction_id not in aliased_reaction_ids:
            aliased_reaction_ids.append(reaction_id)
+ + Returns + ======= + None + """ + if isinstance(self, GenomicNetwork): + assert isinstance(subnetwork, GenomicNetwork) + elif isinstance(self, PangenomicNetwork): + assert isinstance(subnetwork, PangenomicNetwork) + else: + raise AssertionError + + for ko_id, ko in self.kos.items(): + # Check all KOs in the source network for subsetted reactions. + subsetted_reaction_ids: List[str] = [] + for reaction_id in ko.reaction_ids: + if reaction_id in subnetwork.reactions: + # The KO is annotated by the subsetted reaction. + subsetted_reaction_ids.append(reaction_id) + if not subsetted_reaction_ids: + # The KO is not annotated by any subsetted reactions. + continue + + if inclusive: + # Copy the KO, including all its references, to the subsetted network. + subnetwork.kos[ko_id] = deepcopy(ko) + + # Add "unrequested" reactions associated with the KO to the subsetted network if not + # already added. + for reaction_id in ko.reaction_ids: + if reaction_id in subsetted_reaction_ids: + continue + self._subset_reaction(subnetwork, self.reactions[reaction_id]) + + self._subset_ko_classifications(subnetwork, ko) + + continue + + # Subsetting is exclusive, not inclusive. Add the KO only with references to subsetted + # reactions. 
+ subnetwork_ko = KO( + id=ko_id, + name=ko.name, + module_ids=ko.module_ids.copy(), + hierarchies=deepcopy(ko.hierarchies), + pathway_ids=ko.pathway_ids.copy() + ) + + for reaction_id in subsetted_reaction_ids: + subnetwork_ko.reaction_ids.append(reaction_id) + + for kegg_id, modelseed_reaction_ids in ko.kegg_reaction_aliases.items(): + for reaction_id in modelseed_reaction_ids: + if reaction_id not in subsetted_reaction_ids: + continue + try: + subnetwork_ko.kegg_reaction_aliases[kegg_id].append(reaction_id) + except KeyError: + subnetwork_ko.kegg_reaction_aliases[kegg_id] = [reaction_id] + + for ec_number, modelseed_reaction_ids in ko.ec_number_aliases.items(): + for reaction_id in modelseed_reaction_ids: + if reaction_id not in subsetted_reaction_ids: + continue + try: + subnetwork_ko.ec_number_aliases[ec_number].append(reaction_id) + except KeyError: + subnetwork_ko.ec_number_aliases[ec_number] = [reaction_id] + + subnetwork.kos[ko_id] = subnetwork_ko + + self._subset_ko_classifications(subnetwork, ko) + + if isinstance(self, GenomicNetwork): + # Copy genes that are annotated with the added KOs to the subsetted network. + self._subset_genes_via_kos(subnetwork, inclusive=inclusive) + elif isinstance(self, PangenomicNetwork): + # Copy gene clusters that are annotated with the added KOs to the subsetted network. + self._subset_gene_clusters_via_kos(subnetwork) + + def _subset_network_by_metabolites( + self, + compound_ids: Iterable[str], + inclusive: bool = False + ) -> ReactionNetwork: + """ + Subset the network by metabolites. + + Parameters + ========== + compound_ids : Iterable[str] + ModelSEED compound IDs to subset. + + inclusive : bool, False + If True, "inclusive" subsetting applies a "Midas touch" where all items in the network + that are however associated with requested metabolites are "turned to gold" and included + in the subsetted network. 
def _merge_network(self, network: ReactionNetwork) -> ReactionNetwork:
    """
    This method is used in the process of merging the network with another network to produce a
    merged network, and contains steps common to different types of network: merge the
    attributes of the networks BESIDES genes (in a GenomicNetwork) / gene clusters (in a
    PangenomicNetwork) and protein abundances (which can only be stored in a GenomicNetwork).

    Neither source network is modified: everything placed in the merged network is a copy.

    Parameters
    ==========
    network : ReactionNetwork
        The other reaction network being merged.

    Returns
    =======
    merged_network : ReactionNetwork
        A merged reaction network to be completed in the calling method.

    Raises
    ======
    AssertionError
        If 'network' is neither a GenomicNetwork nor a PangenomicNetwork, or if the two
        networks disagree on the categorization of a shared pathway or contain dangling
        module/pathway cross-references.
    """
    def union(first, second):
        # Sorted, duplicate-free combination of two collections of IDs.
        return sorted(set(first).union(second))

    if isinstance(network, GenomicNetwork):
        merged_network = GenomicNetwork()
    elif isinstance(network, PangenomicNetwork):
        merged_network = PangenomicNetwork()
    else:
        raise AssertionError

    merged_network.metabolites = deepcopy(self.metabolites)
    merged_network.reactions = deepcopy(self.reactions)
    merged_network.kos = deepcopy(self.kos)
    merged_network.modules = deepcopy(self.modules)
    merged_network.pathways = deepcopy(self.pathways)
    merged_network.hierarchies = deepcopy(self.hierarchies)
    merged_network.categories = deepcopy(self.categories)
    merged_network.kegg_modelseed_aliases = deepcopy(self.kegg_modelseed_aliases)
    merged_network.ec_number_modelseed_aliases = deepcopy(self.ec_number_modelseed_aliases)
    merged_network.modelseed_kegg_aliases = deepcopy(self.modelseed_kegg_aliases)
    merged_network.modelseed_ec_number_aliases = deepcopy(self.modelseed_ec_number_aliases)

    # Copy unique metabolites from the second network. Assume objects representing the same
    # metabolite in both networks have identical attributes.
    for compound_id, metabolite in network.metabolites.items():
        if compound_id not in merged_network.metabolites:
            merged_network.metabolites[compound_id] = deepcopy(metabolite)

    # Copy unique reactions from the second network. Assume objects representing the same
    # reaction in both networks have identical attributes.
    for reaction_id, reaction in network.reactions.items():
        if reaction_id not in merged_network.reactions:
            merged_network.reactions[reaction_id] = deepcopy(reaction)

    # Reconcile reaction ID aliases, which can differ between the networks depending on the KO
    # sources of the reactions. The four alias maps are all reconciled the same way.
    for alias_attribute in (
        'kegg_modelseed_aliases',
        'ec_number_modelseed_aliases',
        'modelseed_kegg_aliases',
        'modelseed_ec_number_aliases'
    ):
        merged_aliases = getattr(merged_network, alias_attribute)
        for alias_key, aliased_ids in getattr(network, alias_attribute).items():
            if alias_key in merged_aliases:
                merged_aliases[alias_key] = union(aliased_ids, merged_aliases[alias_key])
            else:
                merged_aliases[alias_key] = aliased_ids.copy()

    # Copy KOs from the second network. These can have different reaction annotations, so take
    # the union of the reactions associated with the same KO. Assume KOs with the same ID have
    # the same name.
    for ko_id, ko in network.kos.items():
        try:
            merged_ko = merged_network.kos[ko_id]
        except KeyError:
            merged_network.kos[ko_id] = deepcopy(ko)
            continue

        # The KOs should be classified in the same modules, pathways, and hierarchy categories,
        # unless the networks are derived from different reference database versions.
        merged_ko.module_ids = union(ko.module_ids, merged_ko.module_ids)
        merged_ko.pathway_ids = union(ko.pathway_ids, merged_ko.pathway_ids)
        for hierarchy_id, categorizations in ko.hierarchies.items():
            if hierarchy_id in merged_ko.hierarchies:
                merged_ko.hierarchies[hierarchy_id] = union(
                    categorizations, merged_ko.hierarchies[hierarchy_id]
                )
            else:
                merged_ko.hierarchies[hierarchy_id] = categorizations.copy()

        merged_ko.reaction_ids = union(ko.reaction_ids, merged_ko.reaction_ids)

        # Reconcile the KO's aliases with those of the KO from the second network. The
        # aliases of the second network's KO are iterated: iterating the merged KO's own
        # aliases would never bring in anything from the second network.
        for kegg_reaction_id, modelseed_reaction_ids in ko.kegg_reaction_aliases.items():
            if kegg_reaction_id in merged_ko.kegg_reaction_aliases:
                merged_ko.kegg_reaction_aliases[kegg_reaction_id] = union(
                    merged_ko.kegg_reaction_aliases[kegg_reaction_id], modelseed_reaction_ids
                )
            else:
                merged_ko.kegg_reaction_aliases[
                    kegg_reaction_id
                ] = modelseed_reaction_ids.copy()

        for ec_number, modelseed_reaction_ids in ko.ec_number_aliases.items():
            if ec_number in merged_ko.ec_number_aliases:
                merged_ko.ec_number_aliases[ec_number] = union(
                    merged_ko.ec_number_aliases[ec_number], modelseed_reaction_ids
                )
            else:
                merged_ko.ec_number_aliases[ec_number] = modelseed_reaction_ids.copy()

    # Copy modules from the second network. Modules from the two networks can contain different
    # KOs.
    module_pathway_ids = []
    for module_id, module in network.modules.items():
        try:
            merged_module = merged_network.modules[module_id]
        except KeyError:
            merged_network.modules[module_id] = deepcopy(module)
            continue

        merged_module.ko_ids = union(module.ko_ids, merged_module.ko_ids)
        merged_module.pathway_ids = union(module.pathway_ids, merged_module.pathway_ids)
        module_pathway_ids += merged_module.pathway_ids

    # Copy pathways from the second network. Pathways from the two networks can contain
    # different KOs.
    pathway_module_ids = []
    for pathway_id, pathway in network.pathways.items():
        try:
            merged_pathway = merged_network.pathways[pathway_id]
        except KeyError:
            merged_network.pathways[pathway_id] = deepcopy(pathway)
            continue

        merged_pathway.ko_ids = union(pathway.ko_ids, merged_pathway.ko_ids)
        merged_pathway.module_ids = union(pathway.module_ids, merged_pathway.module_ids)
        pathway_module_ids += merged_pathway.module_ids
        assert pathway.categorization == merged_pathway.categorization

    # Sanity check: reconciled cross-references must point at objects in the merged network.
    for pathway_id in module_pathway_ids:
        assert pathway_id in merged_network.pathways
    for module_id in pathway_module_ids:
        assert module_id in merged_network.modules

    # Copy hierarchies from the second network. Hierarchies from the two networks can contain
    # different categories due to different KOs. Assume hierarchies with the same ID have the
    # same name.
    for hierarchy_id, hierarchy in network.hierarchies.items():
        try:
            merged_hierarchy = merged_network.hierarchies[hierarchy_id]
        except KeyError:
            merged_network.hierarchies[hierarchy_id] = deepcopy(hierarchy)
            continue

        merged_hierarchy.categorizations = union(
            hierarchy.categorizations, merged_hierarchy.categorizations
        )
        merged_hierarchy.ko_ids = union(hierarchy.ko_ids, merged_hierarchy.ko_ids)

    # Copy hierarchy categories from the second network.
    for hierarchy_id, categorizations in network.categories.items():
        try:
            merged_hierarchy_categorizations = merged_network.categories[hierarchy_id]
        except KeyError:
            merged_network.categories[hierarchy_id] = deepcopy(categorizations)
            continue

        for categorization, categories in categorizations.items():
            if categorization in merged_hierarchy_categorizations:
                # Reconcile the subcategories and KOs contained in the category from the two
                # networks.
                category = categories[-1]
                merged_category = merged_hierarchy_categorizations[categorization][-1]
                merged_category.subcategory_names = union(
                    category.subcategory_names, merged_category.subcategory_names
                )
                merged_category.ko_ids = union(category.ko_ids, merged_category.ko_ids)
                continue

            # Copy the category and supercategories that have not already been copied.
            copied_categories = []
            for depth, category in enumerate(categories, 1):
                focus_categorization = categorization[:depth]
                try:
                    # The supercategory has already been copied. Each entry maps to a tuple
                    # of categories from the top level down, so the supercategory itself is
                    # the last item of the tuple.
                    copied_category = merged_hierarchy_categorizations[
                        focus_categorization
                    ][-1]
                except KeyError:
                    copied_category = deepcopy(category)
                    copied_categories.append(copied_category)
                    merged_hierarchy_categorizations[focus_categorization] = tuple(
                        copied_categories
                    )
                else:
                    copied_categories.append(copied_category)

    return merged_network
Keys in the outer + dictionary are "classes" of network statistics. Keys in the inner dictionary are + the names of the statistics themselves. + + Returns + ======= + None + """ + self.progress.new("Counting KO biological classifications") + self.progress.update("...") + stats['KO biological classification'] = stats_group = {} + + ko_in_module_count = 0 + ko_in_pathway_count = 0 + for ko in self.kos.values(): + if ko.module_ids: + ko_in_module_count += 1 + if ko.pathway_ids: + ko_in_pathway_count += 1 + + module_ko_counts = [] + for module in self.modules.values(): + module_ko_counts.append(len(module.ko_ids)) + + pathway_ko_counts = [] + for pathway in self.pathways.values(): + pathway_ko_counts.append(len(pathway.ko_ids)) + + all_level_category_count = 0 + low_level_category_count = 0 + for categorizations in self.categories.values(): + for categories in categorizations.values(): + all_level_category_count += 1 + category = categories[-1] + if not category.subcategory_names: + low_level_category_count += 1 + + stats_group['KEGG modules in network'] = len(self.modules) + stats_group['KOs in modules'] = ko_in_module_count + stats_group['Mean KOs per module'] = round(np.mean(module_ko_counts), 1) + stats_group['Max KOs per module'] = max(module_ko_counts) + stats_group['KEGG pathways in network'] = len(self.pathways) + stats_group['KOs in pathways'] = ko_in_pathway_count + stats_group['Mean KOs per pathway'] = round(np.mean(pathway_ko_counts), 1) + stats_group['Max KOs per pathway'] = max(pathway_ko_counts) + stats_group['KEGG BRITE hierarchies in network'] = len(self.hierarchies) + stats_group['All-level hierarchy categories in network'] = all_level_category_count + stats_group['Low-level hierarchy categories in network'] = low_level_category_count + + self.progress.end() + + self.progress.new("Counting reactions and KO sources") + self.progress.update("...") + stats['Reactions and KO sources'] = stats_group = {} + + stats_group['Reactions in network'] = 
len(self.reactions) + reaction_counts = [] + for ko in self.kos.values(): + reaction_counts.append(len(ko.reaction_ids)) + stats_group['Mean reactions per KO'] = round(np.mean(reaction_counts), 1) + stats_group['Stdev reactions per KO'] = round(np.std(reaction_counts), 1) + stats_group['Max reactions per KO'] = max(reaction_counts) + + self.progress.end() + + self.progress.new("Counting reactions from each alias source") + self.progress.update("...") + stats['Reaction alias sources'] = stats_group = {} + + kegg_aliased_modelseed_reaction_ids = [] + for modelseed_reaction_id, kegg_reaction_ids in self.modelseed_kegg_aliases.items(): + if len(kegg_reaction_ids) > 0: + kegg_aliased_modelseed_reaction_ids.append(modelseed_reaction_id) + ec_number_aliased_modelseed_reaction_ids = [] + for modelseed_reaction_id, ec_numbers in self.modelseed_ec_number_aliases.items(): + if len(ec_numbers) > 0: + ec_number_aliased_modelseed_reaction_ids.append(modelseed_reaction_id) + kegg_reaction_source_count = len(kegg_aliased_modelseed_reaction_ids) + ec_number_source_count = len(ec_number_aliased_modelseed_reaction_ids) + both_source_count = len( + set(kegg_aliased_modelseed_reaction_ids).intersection( + set(ec_number_aliased_modelseed_reaction_ids) + ) + ) + stats_group['Reactions aliased by KEGG reaction'] = kegg_reaction_source_count + stats_group['Reactions aliased by EC number'] = ec_number_source_count + stats_group['Rxns aliased by both KEGG rxn & EC number'] = both_source_count + stats_group['Reactions aliased only by KEGG reaction'] = ( + kegg_reaction_source_count - both_source_count + ) + stats_group['Reactions aliased only by EC number'] = ( + ec_number_source_count - both_source_count + ) + + stats_group['KEGG reactions contributing to network'] = len(self.kegg_modelseed_aliases) + reaction_counts = [] + for modelseed_reaction_ids in self.kegg_modelseed_aliases.values(): + reaction_counts.append(len(modelseed_reaction_ids)) + stats_group['Mean reactions per KEGG 
reaction'] = round(np.mean(reaction_counts), 1) + stats_group['Stdev reactions per KEGG reaction'] = round(np.std(reaction_counts), 1) + stats_group['Max reactions per KEGG reaction'] = max(reaction_counts) + + stats_group['EC numbers contributing to network'] = len(self.ec_number_modelseed_aliases) + reaction_counts = [] + for modelseed_reaction_ids in self.ec_number_modelseed_aliases.values(): + reaction_counts.append(len(modelseed_reaction_ids)) + stats_group['Mean reactions per EC number'] = round(np.mean(reaction_counts), 1) + stats_group['Stdev reactions per EC number'] = round(np.std(reaction_counts), 1) + stats_group['Max reactions per EC number'] = max(reaction_counts) + + self.progress.end() + + self.progress.new("Counting reactions and metabolites by property") + self.progress.update("...") + stats['Reaction and metabolite properties'] = stats_group = {} + + reversible_count = 0 + irreversible_count = 0 + cytoplasmic_compound_ids = [] + extracellular_compound_ids = [] + consumed_compound_ids = [] + produced_compound_ids = [] + compound_reaction_counts = {} + for reaction in self.reactions.values(): + if reaction.reversibility: + reversible_count += 1 + else: + irreversible_count += 1 + encountered_compound_ids = [] + for compartment, coefficient, compound_id in zip( + reaction.compartments, reaction.coefficients, reaction.compound_ids + ): + if compartment == 'c': + cytoplasmic_compound_ids.append(compound_id) + else: + extracellular_compound_ids.append(compound_id) + if reaction.reversibility: + consumed_compound_ids.append(compound_id) + produced_compound_ids.append(compound_id) + elif coefficient < 0: + consumed_compound_ids.append(compound_id) + else: + produced_compound_ids.append(compound_id) + if compound_id not in encountered_compound_ids: + try: + compound_reaction_counts[compound_id] += 1 + except KeyError: + compound_reaction_counts[compound_id] = 1 + stats_group['Reversible reactions'] = reversible_count + stats_group['Irreversible 
def _print_common_overview_statistics(
    self,
    stats: Union[GenomicNetworkStats, PangenomicNetworkStats]
) -> None:
    """
    Print overview statistics that are the same for both genomic and pangenomic networks.

    Each section is introduced by a heading and followed by one line per statistic, in a fixed
    order. Every listed statistic must be present in 'stats'.

    Parameters
    ==========
    stats : Union[GenomicNetworkStats, PangenomicNetworkStats]
        Network statistics are stored in a dictionary of dictionaries. Keys in the outer
        dictionary are "classes" of network statistics. Keys in the inner dictionary are
        the names of the statistics themselves.

    Returns
    =======
    None
    """
    # Sections that each report one whole group of statistics: heading, heading keyword
    # arguments, name of the group in 'stats', and names of the statistics in print order.
    whole_group_sections = (
        (
            "KO biological classification",
            {},
            'KO biological classification',
            (
                'KEGG modules in network',
                'KOs in modules',
                'Mean KOs per module',
                'Max KOs per module',
                'KEGG pathways in network',
                'KOs in pathways',
                'Mean KOs per pathway',
                'Max KOs per pathway',
                'KEGG BRITE hierarchies in network',
                'All-level hierarchy categories in network',
                'Low-level hierarchy categories in network',
            ),
        ),
        (
            "ModelSEED reactions in network and KO sources",
            {'nl_before': 1},
            'Reactions and KO sources',
            (
                'Reactions in network',
                'Mean reactions per KO',
                'Stdev reactions per KO',
                'Max reactions per KO',
            ),
        ),
        (
            "Reaction alias source comparison",
            {'nl_before': 1},
            'Reaction alias sources',
            (
                'Reactions aliased by KEGG reaction',
                'Reactions aliased by EC number',
                'Rxns aliased by both KEGG rxn & EC number',
                'Reactions aliased only by KEGG reaction',
                'Reactions aliased only by EC number',
                'KEGG reactions contributing to network',
                'Mean reactions per KEGG reaction',
                'Stdev reactions per KEGG reaction',
                'Max reactions per KEGG reaction',
                'EC numbers contributing to network',
                'Mean reactions per EC number',
                'Stdev reactions per EC number',
                'Max reactions per EC number',
            ),
        ),
    )
    for heading, heading_kwargs, group_name, stat_names in whole_group_sections:
        self.run.info_single(heading, **heading_kwargs)
        group = stats[group_name]
        for stat_name in stat_names:
            self.run.info(stat_name, group[stat_name])

    # The last group of statistics is split across three printed sections.
    property_stats = stats['Reaction and metabolite properties']
    property_sections = (
        (
            "Reaction reversibility",
            (
                'Reversible reactions',
                'Irreversible reactions',
            ),
        ),
        (
            "Metabolites and localization",
            (
                'Metabolites in network',
                'Cytoplasmic metabolites',
                'Extracellular metabolites',
                'Exclusively cytoplasmic metabolites',
                'Exclusively extracellular metabolites',
                'Cytoplasmic/extracellular metabolites',
            ),
        ),
        (
            "Metabolite consumption and production",
            (
                'Consumed metabolites',
                'Produced metabolites',
                'Both consumed & produced metabolites',
                'Exclusively consumed metabolites',
                'Exclusively produced metabolites',
                'Metabolites consumed or produced by 1 rxn',
                'Metabolites consumed or produced by 2 rxns',
                'Metabolites consumed or produced by 3+ rxns',
            ),
        ),
    )
    for heading, stat_names in property_sections:
        self.run.info_single(heading, nl_before=1)
        for stat_name in stat_names:
            self.run.info(stat_name, property_stats[stat_name])
    print()
def write_overview_statistics(
    self,
    stats_file: str,
    stats: Union[GenomicNetworkStats, PangenomicNetworkStats] = None
) -> None:
    """
    Write a tab-delimited file of overview statistics for the metabolic network.

    The table has one row per statistic and three columns: 'Group', 'Statistic', and 'Value'.

    Parameters
    ==========
    stats_file : str
        Path to output tab-delimited file of overview statistics.

    stats : Union[GenomicNetworkStats, PangenomicNetworkStats], None
        With the default value of None, network statistics will be calculated and written to
        file. Alternatively, provided network statistics will be written to file without
        calculating anew.

    Returns
    =======
    None
    """
    if not stats:
        # Subclasses must have a method, 'get_overview_statistics'.
        stats = self.get_overview_statistics()

    filesnpaths.is_output_file_writable(stats_file)

    # Flatten the dictionary of dictionaries into rows of group, statistic, and value.
    rows = [
        [group_name, stat_name, stat_value]
        for group_name, group in stats.items()
        for stat_name, stat_value in group.items()
    ]
    stats_table = pd.DataFrame(rows, columns=['Group', 'Statistic', 'Value'])
    stats_table.to_csv(stats_file, sep='\t', index=False)

    self.run.info("Metabolic network statistics output file", stats_file)
These categorizations would yield entries like the + following: {'ko00001': {('09100 Metabolism', ): (, ), ('09100 + Metabolism', '09101 Carbohydrate metabolism'): (, + ), ('09100 Metabolism', '09101 Carbohydrate metabolism', + '00010 Glycolysis / Gluconeogenesis [PATH:00010]'): (, + , ), ('09100 Metabolism', + '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism [PATH:00051]'): + (, , )}} + + reactions : Dict[str, ModelSEEDReaction], dict() + ModelSEED reactions in the network, with keys being reaction IDs. + + metabolites : Dict[str, ModelSEEDCompound], dict() + ModelSEED compounds in the network, with keys being metabolite IDs. + + kegg_modelseed_aliases : Dict[str, List[str]], dict() + This maps KEGG REACTION IDs associated with KOs in the network to ModelSEED reactions + aliased by the KEGG reaction. KO-associated KEGG reactions that do not alias ModelSEED + reactions are not included. + + ec_number_modelseed_aliases : Dict[str, List[str]], dict() + This maps EC numbers associated with KOs in the network to ModelSEED reactions aliased by + the EC number. KO-associated EC numbers that do not alias ModelSEED reactions are not + included. + + modelseed_kegg_aliases : Dict[str, List[str]], dict() + This maps the IDs of ModelSEED reactions in the network to lists of KEGG REACTION IDs that + are associated with KOs in the network and alias the ModelSEED reaction. + + modelseed_ec_number_aliases : Dict[str, List[str]], dict() + This maps the IDs of ModelSEED reactions in the network to lists of EC numbers that are + associated with KOs in the network and alias the ModelSEED reaction. + + contigs_db_source_path : str, None + Path to the contigs database from which the network was built. + + profile_db_source_path : str, None + Path to the profile database from which protein and metabolite abundance data was loaded. + + genes : Dict[int, Gene], dict() + This maps gene callers IDs to object representations of genes in the network. 
def __init__(
    self,
    run: terminal.Run = terminal.Run(),
    progress: terminal.Progress = terminal.Progress(),
    verbose: bool = True
) -> None:
    """
    Set up a genomic network with no source databases, genes, or proteins recorded.

    Parameters
    ==========
    run : anvio.terminal.Run, anvio.terminal.Run()
        This object sets the 'run' attribute, which prints run information to the terminal.

    progress : anvio.terminal.Progress, anvio.terminal.Progress()
        This object sets the 'progress' attribute, which prints transient progress information
        to the terminal.

    verbose : bool, True
        This sets the 'verbose' attribute, causing more information to be reported to the
        terminal if True.

    Returns
    =======
    None
    """
    # The parent class is given the terminal objects and verbosity setting.
    super().__init__(run=run, progress=progress, verbose=verbose)
    # Paths to the databases behind the network start out unset.
    self.contigs_db_source_path: str = None
    self.profile_db_source_path: str = None
    # Genes are keyed by gene callers ID and proteins by protein ID.
    self.genes: Dict[int, Gene] = {}
    self.proteins: Dict[int, Protein] = {}
def remove_metabolites_without_formula(self, output_path: str = None) -> None:
    """
    Remove metabolites without a formula in the ModelSEED database from the network.

    Other items can be removed from the network by association: reactions that involve a
    formulaless metabolite; other metabolites with formulas that are exclusive to such
    reactions; KOs predicted to exclusively catalyze such reactions; and genes exclusively
    annotated with such KOs. Removed metabolites with a formula are reported alongside
    formulaless metabolites to the optional output table of removed metabolites.

    Parameters
    ==========
    output_path : str, None
        If not None, write tab-delimited files of metabolites, reactions, KOs, KEGG modules,
        KEGG pathways, KEGG BRITE hierarchies, KEGG BRITE hierarchy categories, and genes
        removed from the network to file locations based on the provided path. For example, if
        the argument, 'removed.tsv', is provided, then the following files will be written:
        'removed-metabolites.tsv', 'removed-reactions.tsv', 'removed-kos.tsv',
        'removed-modules.tsv', 'removed-pathways.tsv', 'removed-hierarchies.tsv',
        'removed-categories.tsv', and 'removed-genes.tsv'.

    Returns
    =======
    None
    """
    if self.verbose:
        self.progress.new("Removing metabolites without a formula in the network")
        self.progress.update("...")

    if output_path:
        # Derive one output path per type of removed item and check each before changing the
        # network.
        root, extension = os.path.splitext(output_path)
        item_paths = {
            suffix: f"{root}-{suffix}{extension}"
            for suffix in (
                'metabolites',
                'reactions',
                'kos',
                'modules',
                'pathways',
                'hierarchies',
                'categories',
                'genes',
            )
        }
        for item_path in item_paths.values():
            filesnpaths.is_output_file_writable(item_path)

    # ModelSEED compounds without a formula have a formula value of None in the network object.
    formulaless_compound_ids: List[str] = [
        compound_id
        for compound_id, metabolite in self.metabolites.items()
        if metabolite.formula is None
    ]
    removed = self._purge_metabolites(formulaless_compound_ids)

    if self.verbose:
        self.progress.end()
        for label, item_type in (
            ("Removed metabolites", 'metabolite'),
            ("Removed reactions", 'reaction'),
            ("Removed KOs", 'ko'),
            ("Removed KEGG modules", 'module'),
            ("Removed KEGG pathways", 'pathway'),
            ("Removed KEGG BRITE hierarchies", 'hierarchy'),
            ("Removed KEGG BRITE hierarchy categories", 'category'),
            ("Removed genes", 'gene'),
        ):
            self.run.info(label, len(removed[item_type]))

    if not output_path:
        return

    if self.verbose:
        self.progress.new("Writing output files of removed network items")
        self.progress.update("...")

    # One row per removed gene: its gene callers ID and the KOs that annotated it.
    gene_rows = [[gene.gcid, ", ".join(gene.ko_ids)] for gene in removed['gene']]

    # Tables of the other removed items are written by a separate method.
    self._write_remove_metabolites_without_formula_output(output_path, removed)

    gene_table = pd.DataFrame(gene_rows, columns=["Gene callers ID", "KO IDs"])
    gene_table.to_csv(item_paths['genes'], sep='\t', index=False)

    if self.verbose:
        self.progress.end()
        for label, suffix in (
            ("Table of removed metabolites", 'metabolites'),
            ("Table of removed reactions", 'reactions'),
            ("Table of removed KOs", 'kos'),
            ("Table of removed KEGG modules", 'modules'),
            ("Table of removed KEGG pathways", 'pathways'),
            ("Table of removed KEGG BRITE hierarchies", 'hierarchies'),
            ("Table of removed KEGG BRITE hierarchy categories", 'categories'),
            ("Table of removed genes", 'genes'),
        ):
            self.run.info(label, item_paths[suffix])
def prune(
    self,
    genes_to_remove: Union[int, Iterable[int]] = None,
    proteins_to_remove: Union[str, Iterable[str]] = None,
    kos_to_remove: Union[str, Iterable[str]] = None,
    modules_to_remove: Union[str, Iterable[str]] = None,
    pathways_to_remove: Union[str, Iterable[str]] = None,
    hierarchies_to_remove: Union[str, Iterable[str]] = None,
    categories_to_remove: Dict[str, List[Tuple[str]]] = None,
    reactions_to_remove: Union[str, Iterable[str]] = None,
    metabolites_to_remove: Union[str, Iterable[str]] = None
) -> Dict[str, List]:
    """
    Prune items from the metabolic network.

    Pruning modifies the network in situ: use the network 'copy' method as needed to create a
    backup of the network.

    If requested genes, proteins, KOs, KEGG modules, KEGG pathways, KEGG BRITE hierarchies, KEGG
    BRITE hierarchy categories, reactions, or metabolites are not present in the network, no
    error is raised.

    Network items (e.g., genes, KOs, reactions, and metabolites) that are exclusively associated
    with requested items are also removed from the network. Example: Consider a KO that is
    requested to be removed from the network. The KO is associated with two reactions. The first
    reaction is exclusive to the KO and thus is also removed, whereas the second reaction is
    also associated with another retained KO and thus is retained in the network. The first
    reaction involves four metabolites, and two are exclusive to the reaction: these are also
    removed from the network. The KO is the only KO annotating a certain gene but one of two KOs
    annotating another gene: the former but not the latter gene is removed from the network
    along with the KO. Note that KO annotations of genes and reaction annotations of KOs can be
    selected to the exclusion of others. In the example, the latter gene is left with one KO.

    Parameters
    ==========
    genes_to_remove : Union[int, Iterable[int]], None
        Gene callers ID(s) to remove. A single ID can be given as any integer, including 0.

    proteins_to_remove : Union[str, Iterable[str]], None
        Protein ID(s) to remove if the network has been annotated with protein abundances,
        equivalent to giving the genes encoding the protein(s) to the argument,
        'genes_to_remove'.

    kos_to_remove : Union[str, Iterable[str]], None
        KO ID(s) to remove.

    modules_to_remove : Union[str, Iterable[str]], None
        KEGG module ID(s) to remove, with the effect of giving the KOs in the module(s) to the
        argument, 'kos_to_remove'. This does not remove other module annotations of these KOs
        from the network that also annotate other KOs.

    pathways_to_remove : Union[str, Iterable[str]], None
        KEGG pathway ID(s) to remove, with the effect of giving the KOs in the pathway(s) to the
        argument, 'kos_to_remove'. This does not remove other pathway annotations of these KOs
        from the network that also annotate other KOs.

    hierarchies_to_remove : Union[str, Iterable[str]], None
        KEGG BRITE hierarchy (or hierarchies) to remove, with the effect of giving the KOs in
        the hierarchy to the argument, 'kos_to_remove'. This does not remove other hierarchy
        annotations of these KOs from the network that also annotate other KOs.

    categories_to_remove : Dict[str, List[Tuple[str]]], None
        KEGG BRITE hierarchy categories to remove, with the effect of giving the KOs in the
        categories to the argument, 'kos_to_remove'. This does not remove other category
        annotations of these KOs from the network that also annotate other KOs. The dictionary
        argument is keyed by BRITE hierarchy ID and has values that list category tuples. For
        example, to remove KOs from the network contained in the 'ko00001' 'KEGG Orthology (KO)'
        hierarchy categories, '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00010
        Glycolysis / Gluconeogenesis [PATH:ko00010]' and '09100 Metabolism >>> 09101
        Carbohydrate metabolism >>> 00051 Fructose and mannose metabolism [PATH:ko00051]', the
        dictionary argument would need to look like the following: {'ko00001': [('09100
        Metabolism', '09101 Carbohydrate metabolism', '00010 Glycolysis / Gluconeogenesis
        [PATH:ko00010]'), ('09100 Metabolism', '09101 Carbohydrate metabolism', '00051 Fructose
        and mannose metabolism [PATH:ko00051]')]}

    reactions_to_remove : Union[str, Iterable[str]], None
        ModelSEED reaction ID(s) to remove.

    metabolites_to_remove : Union[str, Iterable[str]], None
        ModelSEED compound ID(s) to remove.

    Returns
    =======
    dict
        This dictionary contains data removed from the network.

        The dictionary has the following format. It shows protein entries as if the network has
        been annotated with protein abundances; these are absent for genomic networks lacking
        protein annotations.
        {
            'gene': [],
            'protein': [],
            'ko': [],
            'module': [],
            'pathway': [],
            'hierarchy': [],
            'category': [],
            'reaction': [],
            'kegg_reaction': [],
            'ec_number': [],
            'metabolite': []
        }

    Raises
    ======
    AssertionError
        Nothing was requested to be removed.
    """
    import numbers

    # Wrap single IDs in lists BEFORE checking that something was requested: gene callers ID 0
    # is a valid request but is falsy as a bare integer. 'isinstance' is used so that integer
    # and string subclasses, and integer types registered as 'numbers.Integral' (such as numpy
    # integers), are also recognized as single IDs rather than iterated.
    if isinstance(genes_to_remove, numbers.Integral):
        genes_to_remove = [genes_to_remove]
    if isinstance(proteins_to_remove, str):
        proteins_to_remove = [proteins_to_remove]
    if isinstance(kos_to_remove, str):
        kos_to_remove = [kos_to_remove]
    if isinstance(modules_to_remove, str):
        modules_to_remove = [modules_to_remove]
    if isinstance(pathways_to_remove, str):
        pathways_to_remove = [pathways_to_remove]
    if isinstance(hierarchies_to_remove, str):
        hierarchies_to_remove = [hierarchies_to_remove]
    if isinstance(reactions_to_remove, str):
        reactions_to_remove = [reactions_to_remove]
    if isinstance(metabolites_to_remove, str):
        metabolites_to_remove = [metabolites_to_remove]

    assert (
        genes_to_remove or
        proteins_to_remove or
        kos_to_remove or
        modules_to_remove or
        pathways_to_remove or
        hierarchies_to_remove or
        categories_to_remove or
        reactions_to_remove or
        metabolites_to_remove
    ), "At least one item must be requested to be removed from the network."

    removed: Dict[str, List] = {
        'gene': [],
        'protein': [],
        'ko': [],
        'module': [],
        'pathway': [],
        'hierarchy': [],
        'category': [],
        'reaction': [],
        'kegg_reaction': [],
        'ec_number': [],
        'metabolite': []
    }
    if not self.proteins:
        # Protein entries are only reported for networks annotated with protein abundances.
        removed.pop('protein')

    # Each purge reports what it removed by item type; the reports are accumulated.
    if genes_to_remove or proteins_to_remove:
        for item_type, removed_items in self._purge_genes(
            genes_to_remove=genes_to_remove,
            proteins_to_remove=proteins_to_remove
        ).items():
            removed[item_type] += removed_items

    if (
        kos_to_remove or
        modules_to_remove or
        pathways_to_remove or
        hierarchies_to_remove or
        categories_to_remove
    ):
        for item_type, removed_items in self._purge_kos(
            kos_to_remove=kos_to_remove,
            modules_to_remove=modules_to_remove,
            pathways_to_remove=pathways_to_remove,
            hierarchies_to_remove=hierarchies_to_remove,
            categories_to_remove=categories_to_remove
        ).items():
            removed[item_type] += removed_items

    if reactions_to_remove:
        for item_type, removed_items in self._purge_reactions(reactions_to_remove).items():
            removed[item_type] += removed_items

    if metabolites_to_remove:
        for item_type, removed_items in self._purge_metabolites(metabolites_to_remove).items():
            removed[item_type] += removed_items

    return removed
def _purge_genes(
    self,
    genes_to_remove: Iterable[int] = None,
    proteins_to_remove: Iterable[str] = None
) -> Dict[str, List]:
    """
    Remove any trace of the given genes from the network.

    If proteins are provided, remove any trace of the genes that encode these from the network.

    KOs, reactions, and metabolites that are only associated with removed genes are purged. KEGG
    modules, pathways, BRITE hierarchies, and BRITE hierarchy categories only associated with
    purged KOs are removed. In a network annotated with protein abundances, proteins that are
    only associated with removed genes are also removed.

    The arguments are not modified.

    Parameters
    ==========
    genes_to_remove : Iterable[int], None
        Gene callers IDs to remove.

    proteins_to_remove : Iterable[str], None
        Protein IDs to remove if the network has been annotated with protein abundances,
        equivalent to giving the genes encoding the proteins to the argument, 'genes_to_remove'.

    Returns
    =======
    dict
        This dictionary contains data removed from the network.

        The dictionary has the following keys, each with a list value that is empty if nothing
        of the type was removed. The 'protein' entry is shown as if the network has been
        annotated with protein abundances; it is absent for genomic networks lacking protein
        annotations.
        {
            'gene': [],
            'protein': [],
            'ko': [],
            'module': [],
            'pathway': [],
            'hierarchy': [],
            'category': [],
            'reaction': [],
            'kegg_reaction': [],
            'ec_number': [],
            'metabolite': []
        }

        If no KOs are exclusive to the removed genes, which is expected when this method is
        called from the method, '_purge_kos', then all entries besides 'gene' and 'protein' are
        empty lists.
    """
    assert genes_to_remove or proteins_to_remove

    # Collect requested gene callers IDs in a new set so that the caller's argument is not
    # modified and any kind of iterable (list, tuple, set, ...) is accepted.
    requested_gcids = set(genes_to_remove) if genes_to_remove is not None else set()

    # Get genes to remove from requested proteins.
    for protein_id in (proteins_to_remove if proteins_to_remove is not None else ()):
        try:
            protein = self.proteins[protein_id]
        except KeyError:
            # The requested protein ID is not in the network.
            continue
        requested_gcids.update(protein.gcids)

    # Remove requested genes from the network, ignoring requested genes that are not in the
    # network.
    removed_genes: List[Gene] = []
    removed_gcids = set()
    for gcid in requested_gcids:
        try:
            removed_genes.append(self.genes.pop(gcid))
        except KeyError:
            continue
        removed_gcids.add(gcid)

    if not removed_genes:
        removed = {
            'metabolite': [],
            'reaction': [],
            'kegg_reaction': [],
            'ec_number': [],
            'ko': [],
            'module': [],
            'pathway': [],
            'hierarchy': [],
            'category': [],
            'gene': []
        }
        if self.proteins:
            removed['protein'] = []
        return removed

    # Purge KOs from the network that are exclusive to removed genes: start from all KOs of the
    # removed genes and spare those still associated with a retained gene.
    kos_to_remove = {ko_id for gene in removed_genes for ko_id in gene.ko_ids}
    for gene in self.genes.values():
        if not kos_to_remove:
            break
        kos_to_remove.difference_update(gene.ko_ids)
    if kos_to_remove:
        removed_cascading_down = self._purge_kos(sorted(kos_to_remove))
        removed_cascading_down.pop('gene')
    else:
        # This method must have been called from the method, '_purge_kos', because the KOs that
        # are only associated with the removed genes were already removed from the network.
        removed_cascading_down = {
            'ko': [],
            'module': [],
            'pathway': [],
            'hierarchy': [],
            'category': [],
            'reaction': [],
            'kegg_reaction': [],
            'ec_number': [],
            'metabolite': []
        }

    if not self.proteins:
        removed = {'gene': removed_genes}
        removed.update(removed_cascading_down)
        return removed

    # Purge protein abundance annotations from the network that are exclusive to removed genes.
    # A protein is retained if any of the genes encoding it was not removed.
    exclusive_protein_ids: List[str] = [
        protein_id
        for protein_id, protein in self.proteins.items()
        if all(gcid in removed_gcids for gcid in protein.gcids)
    ]
    removed_proteins: List[Protein] = [
        self.proteins.pop(protein_id) for protein_id in exclusive_protein_ids
    ]

    # NOTE(review): '_purge_kos' is not visible here. If its report carries its own 'protein'
    # entry, keep it alongside the proteins removed above rather than letting the update below
    # overwrite them -- confirm against '_purge_kos'.
    cascaded_proteins = removed_cascading_down.pop('protein', [])
    removed = {'gene': removed_genes, 'protein': cascaded_proteins + removed_proteins}
    removed.update(removed_cascading_down)
    return removed
def subset_network(
    self,
    genes_to_subset: Union[int, Iterable[int]] = None,
    proteins_to_subset: Union[str, Iterable[str]] = None,
    kos_to_subset: Union[str, Iterable[str]] = None,
    modules_to_subset: Union[str, Iterable[str]] = None,
    pathways_to_subset: Union[str, Iterable[str]] = None,
    hierarchies_to_subset: Union[str, Iterable[str]] = None,
    categories_to_subset: Dict[str, List[Tuple[str]]] = None,
    reactions_to_subset: Union[str, Iterable[str]] = None,
    metabolites_to_subset: Union[str, Iterable[str]] = None,
    inclusive: bool = False
) -> GenomicNetwork:
    """
    Subset a smaller network from the metabolic network.

    If requested genes, proteins, KOs, KEGG modules, KEGG pathways, KEGG BRITE hierarchies, KEGG
    BRITE hierarchy categories, reactions, or metabolites are not present in the network, no
    error is raised: the absent items are ignored.

    Every ID argument accepts either a single ID or an iterable of IDs. Arguments are copied
    before use, so containers passed by the caller are never modified.

    Subsetted items are not represented by the same objects as in the source network, i.e., new
    gene, KO, reaction, metabolite, and other objects are created and added to the subsetted
    network.

    Network items (e.g., genes, KOs, reactions, and metabolites) that are associated with
    requested items (e.g., genes in the network that reference requested KOs; metabolites
    referenced by requested reactions) are added to the subsetted network.

    The choice of "inclusive" or, by default, "exclusive" subsetting determines which associated
    items are included in the subsetted network. In exclusive subsetting, a gene added to the
    subsetted network due to references to requested KOs will be missing its references to any
    other unrequested KOs in the source network. Likewise, genes and KOs that are added to the
    subsetted network due to references to requested reactions will be missing references to any
    other unrequested reactions. In other words, certain KO and reaction annotations can be
    selected to the exclusion of others, e.g., a KO encoding two reactions can be restricted to
    encode one requested reaction in the subsetted network; a KO encoding multiple reactions can
    be restricted to encode only those reactions involving requested metabolites.

    "Inclusive" subsetting applies a "Midas touch" where all items in the network that are
    however associated with requested KOs, reactions, and metabolites are "turned to gold" and
    included in the subsetted network. A gene added to the subsetted network due to references
    to requested KOs will still include all of its other references to unrequested KOs in the
    source network. Likewise, KOs and genes that are added to the subsetted network due to
    references to requested reactions and metabolites will include all their other references to
    unrequested reactions and metabolites. Inclusive subsetting precludes the emendation of gene
    KO annotations and KO reaction annotations.

    Parameters
    ==========
    genes_to_subset : Union[int, Iterable[int]], None
        Gene callers ID(s) to subset.

    proteins_to_subset : Union[str, Iterable[str]], None
        Protein ID(s) to subset if the network has been annotated with protein abundances,
        equivalent to giving the genes encoding the protein(s) to the argument,
        'genes_to_subset'.

    kos_to_subset : Union[str, Iterable[str]], None
        KO ID(s) to subset.

    modules_to_subset : Union[str, Iterable[str]], None
        KEGG module ID(s) to subset, with the effect of giving the KOs in the module(s) to the
        argument, 'kos_to_subset'. This does not exclude other module annotations of these KOs
        from the network.

    pathways_to_subset : Union[str, Iterable[str]], None
        KEGG pathway ID(s) to subset, with the effect of giving the KOs in the pathway(s) to the
        argument, 'kos_to_subset'. This does not exclude other pathway annotations of these KOs
        from the network.

    hierarchies_to_subset : Union[str, Iterable[str]], None
        KEGG BRITE hierarchy (or hierarchies) to subset, with the effect of giving the KOs in
        the hierarchy to the argument, 'kos_to_subset'. This does not exclude other hierarchy
        annotations of these KOs from the network.

    categories_to_subset : Dict[str, List[Tuple[str]]], None
        KEGG BRITE hierarchy categories to subset, with the effect of giving the KOs in the
        categories to the argument, 'kos_to_subset'. This does not exclude other category
        annotations of these KOs from the network. The dictionary argument is keyed by BRITE
        hierarchy ID and has values that list category tuples. For example, to subset KOs from
        the network contained in the 'ko00001' 'KEGG Orthology (KO)' hierarchy categories,
        '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis /
        Gluconeogenesis [PATH:ko00010]' and '09100 Metabolism >>> 09101 Carbohydrate
        metabolism >>> 00051 Fructose and mannose metabolism [PATH:ko00051]', the dictionary
        argument would need to look like the following: {'ko00001': [('09100 Metabolism', '09101
        Carbohydrate metabolism', '00010 Glycolysis / Gluconeogenesis [PATH:ko00010]'), ('09100
        Metabolism', '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism
        [PATH:ko00051]')]}

    reactions_to_subset : Union[str, Iterable[str]], None
        ModelSEED reaction ID(s) to subset.

    metabolites_to_subset : Union[str, Iterable[str]], None
        ModelSEED compound ID(s) to subset.

    inclusive : bool, False
        If True, "inclusive" subsetting applies a "Midas touch" where all items in the network
        that are however associated with requested KOs, reactions, and metabolites are "turned
        to gold" and included in the subsetted network. In default "exclusive" subsetting, a
        gene added to the subsetted network due to references to requested KOs will be missing
        its references to any other unrequested KOs in the source network; KOs and genes that
        are added to the subsetted network due to references to requested reactions and
        metabolites will be missing references to any other unrequested reactions and
        metabolites.

    Returns
    =======
    GenomicNetwork
        New subsetted reaction network. None is returned when the requests resolve to nothing
        to subset, e.g., when only proteins, modules, pathways, hierarchies, or categories
        absent from the network were requested.

    Raises
    ======
    AssertionError
        If no item of any type is requested.
    """
    def as_list(items) -> list:
        # Normalize a request into a fresh list: None -> [], a lone ID -> [ID], and an iterable
        # of IDs -> a copy. Copying protects the caller's container from the in-place
        # extensions below; wrapping scalars prevents a string ID from being iterated character
        # by character and an integer ID from raising a TypeError.
        if items is None:
            return []
        if isinstance(items, (str, int)):
            return [items]
        return list(items)

    genes_to_subset = as_list(genes_to_subset)
    proteins_to_subset = as_list(proteins_to_subset)
    kos_to_subset = as_list(kos_to_subset)
    modules_to_subset = as_list(modules_to_subset)
    pathways_to_subset = as_list(pathways_to_subset)
    hierarchies_to_subset = as_list(hierarchies_to_subset)
    categories_to_subset = {} if categories_to_subset is None else dict(categories_to_subset)
    reactions_to_subset = as_list(reactions_to_subset)
    metabolites_to_subset = as_list(metabolites_to_subset)

    # The check is made on the normalized requests so that the valid gene callers ID, 0, given
    # as a lone integer, counts as a request.
    assert (
        genes_to_subset or
        proteins_to_subset or
        kos_to_subset or
        modules_to_subset or
        pathways_to_subset or
        hierarchies_to_subset or
        categories_to_subset or
        reactions_to_subset or
        metabolites_to_subset
    ), "At least one type of item must be requested to subset the network."

    # Get genes to subset from requested proteins.
    for protein_id in proteins_to_subset:
        try:
            protein = self.proteins[protein_id]
        except KeyError:
            # The requested protein ID is not in the network.
            continue
        genes_to_subset += protein.gcids

    # Get KOs to subset from requested modules, pathways, hierarchies, and hierarchy categories.
    for requested_ids, classifications in (
        (modules_to_subset, self.modules),
        (pathways_to_subset, self.pathways),
        (hierarchies_to_subset, self.hierarchies)
    ):
        for classification_id in requested_ids:
            try:
                classification = classifications[classification_id]
            except KeyError:
                # The requested module, pathway, or hierarchy is not in the network.
                continue
            kos_to_subset += classification.ko_ids
    for hierarchy_id, categorizations in categories_to_subset.items():
        try:
            hierarchy_categorizations = self.categories[hierarchy_id]
        except KeyError:
            # The hierarchy of the requested categories is not in the network.
            continue
        for categorization in categorizations:
            try:
                # Categorizations are looked up as tuples, so a categorization given as a list
                # is coerced rather than rejected as unhashable.
                categories = hierarchy_categorizations[tuple(categorization)]
            except KeyError:
                # The requested category is not in the network.
                continue
            # The last category in the tuple is the most specific one that was requested.
            kos_to_subset += categories[-1].ko_ids
    kos_to_subset = set(kos_to_subset)

    # Sequentially subset the network for each type of request. Upon generating two subsetted
    # networks from two types of request, merge the networks into a single subsetted network;
    # repeat.
    merged_subnetwork = None
    for items_to_subset, subset_network_method in (
        (genes_to_subset, self._subset_network_by_genes),
        (kos_to_subset, functools.partial(self._subset_network_by_kos, inclusive=inclusive)),
        (reactions_to_subset, functools.partial(
            self._subset_network_by_reactions, inclusive=inclusive
        )),
        (metabolites_to_subset, functools.partial(
            self._subset_network_by_metabolites, inclusive=inclusive
        ))
    ):
        if not items_to_subset:
            continue

        subnetwork = subset_network_method(items_to_subset)

        if merged_subnetwork is None:
            merged_subnetwork = subnetwork
        else:
            merged_subnetwork = merged_subnetwork.merge_network(subnetwork)

    return merged_subnetwork
+ """ + subnetwork = GenomicNetwork() + + for gcid in gcids: + try: + gene = self.genes[gcid] + except KeyError: + # This occurs if the requested gene callers ID is not in the source network. + continue + + # Subset KOs annotating the gene. + self._subset_network_by_kos(gene.ko_ids, subnetwork=subnetwork) + + subnetwork.genes[gcid] = deepcopy(gene) + + # Only include protein abundances of subsetted genes, ignoring references to unsubsetted + # genes not encoding the protein. + if gene.protein_id is not None: + try: + subnetwork_protein = subnetwork.proteins[gene.protein_id] + except KeyError: + protein = self.proteins[gene.protein_id] + subnetwork_protein = Protein(id=protein.id, abundances=protein.abundances) + subnetwork.proteins[gene.protein_id] = subnetwork_protein + subnetwork_protein.gcids.append(gcid) + + return subnetwork + + def _subset_genes_via_kos( + self, + subnetwork: GenomicNetwork, + inclusive: bool = False + ) -> None: + """ + Add genes that are annotated with subsetted KOs to the subsetted network. + + These gene objects only reference subsetted KOs and not other KOs that also annotate the + gene but which are not subsetted. + + Parameters + ========== + subnetwork : GenomicNetwork + The subsetted reaction network under construction. + + inclusive : bool, False + If True, "inclusive" subsetting applies a "Midas touch" where all items in the network + that are however associated with requested KOs are "turned to gold" and included in the + subsetted network. In default "exclusive" subsetting, a gene added to the subsetted + network due to references to requested KOs will be missing its references to any other + unrequested KOs in the source network. + + Returns + ======= + None + """ + for gcid, gene in self.genes.items(): + # Check all genes in the source network for subsetted KOs. + subsetted_ko_ids: List[str] = [] + for ko_id in gene.ko_ids: + if ko_id in subnetwork.kos: + # The gene is annotated by the subsetted KO. 
+ subsetted_ko_ids.append(ko_id) + if not subsetted_ko_ids: + # The gene is not annotated by any subsetted KOs. + continue + + if inclusive: + # Copy the gene, including all its references, to the subsetted network. + subnetwork.genes[gcid] = deepcopy(gene) + + # Only include protein abundances of subsetted genes, ignoring references to + # unsubsetted genes not encoding the protein. + if gene.protein_id is not None: + try: + subnetwork_protein = subnetwork.proteins[gene.protein_id] + except KeyError: + protein = self.proteins[gene.protein_id] + subnetwork_protein = Protein(id=protein.id, abundances=protein.abundances) + subnetwork.proteins[gene.protein_id] = subnetwork_protein + subnetwork_protein.gcids.append(gcid) + + # Add "unrequested" KOs associated with the gene to the subsetted network if not + # already added. + for ko_id in gene.ko_ids: + if ko_id in subsetted_ko_ids: + continue + ko = self.kos[ko_id] + subnetwork.kos[ko_id] = deepcopy(ko) + + # Add reactions associated with the unrequested KO to the subsetted network if + # not already added. + for reaction_id in ko.reaction_ids: + if reaction_id in subnetwork.reactions: + continue + reaction = self.reactions[reaction_id] + self._subset_reaction(subnetwork, reaction) + + self._subset_ko_classifications(subnetwork, ko) + + continue + + # Subsetting is exclusive, not inclusive. Add the gene only with references to subsetted + # KOs. 
+ subnetwork_gene = Gene(gcid=gcid, protein_id=gene.protein_id) + subnetwork.genes[gcid] = subnetwork_gene + + if gene.protein_id is not None: + try: + subnetwork_protein = subnetwork.proteins[gene.protein_id] + except KeyError: + protein = self.proteins[gene.protein_id] + subnetwork_protein = Protein(id=protein.id, abundances=protein.abundances) + subnetwork.proteins[gene.protein_id] = subnetwork_protein + subnetwork_protein.gcids.append(gcid) + + for ko_id in subsetted_ko_ids: + subnetwork_gene.ko_ids.append(ko_id) + subnetwork_gene.e_values[ko_id] = gene.e_values[ko_id] + + def merge_network(self, network: GenomicNetwork) -> GenomicNetwork: + """ + Merge the genomic reaction network with another genomic reaction network derived from the + same contigs database. + + The purpose of the network is to combine different, but potentially overlapping, subnetworks + from the same genome. + + Each network can contain different genes, KOs, and reactions/metabolites. Merging + nonredundantly incorporates all of this data as new objects in the new network. + + Objects representing genes, KOs, KEGG modules, KEGG pathways, BRITE hierarchies, and + hierarchy categories in both networks can have different sets of references: genes can be + annotated by different KOs; KOs can be annotated by different reactions; depending on the + KOs in each network, different modules, pathways, and hierarchies/categories can be present. + + Other object attributes should be consistent between the networks. For instance, the same + ModelSEED reactions and metabolites in both networks should have identical attributes. The + same gene-KO should have the same e-values. If applicable, both networks should have been + annotated with the same protein and metabolite abundance data. + + Parameters + ========== + network : GenomicNetwork + The other genomic reaction network being merged. + + Returns + ======= + GenomicNetwork + The merged genomic reaction network. 
def get_overview_statistics(
    self,
    precomputed_counts: Dict[str, int] = None
) -> GenomicNetworkStats:
    """
    Calculate overview statistics for the genomic metabolic network.

    Parameters
    ==========
    precomputed_counts : Dict[str, int], None
        To spare additional computations that involve loading and parsing the contigs database,
        this dictionary must contain certain precomputed data: the key, 'total_genes', should
        have a value of the number of genes in the genome; the key, 'genes_assigned_kos', should
        have a value of the number of genes in the genome that are assigned KOs; the key,
        'kos_assigned_genes', should have a value of the number of unique KOs assigned to genes
        in the genome.

    Returns
    =======
    GenomicNetworkStats
        Network statistics are stored in a dictionary of dictionaries. Keys in the outer
        dictionary are "classes" of network statistics. Keys in the inner dictionary are
        statistics themselves. The genome-wide counts, 'Total gene calls in genome', 'Genes
        annotated with protein KOs', and 'Protein KOs annotating genes', are only present if
        they were precomputed or the network is associated with a contigs database.

    Raises
    ======
    ConfigError
        If 'precomputed_counts' is given with keys other than the three that are expected.
    """
    expected_keys = ['genes_assigned_kos', 'kos_assigned_genes', 'total_genes']
    if precomputed_counts is not None and sorted(precomputed_counts) != expected_keys:
        raise ConfigError(
            "The 'precomputed_counts' argument must be a dictionary only containing the keys, "
            "'total_genes', 'genes_assigned_kos', and 'kos_assigned_genes'."
        )

    stats: GenomicNetworkStats = {}

    self.progress.new("Counting genes and KOs")
    self.progress.update("...")
    stats['Gene and KO counts'] = stats_group = {}

    gene_count = None
    ko_annotated_gene_count = None
    annotating_ko_count = None
    if precomputed_counts:
        for key in ('total_genes', 'genes_assigned_kos', 'kos_assigned_genes'):
            assert type(precomputed_counts[key]) is int and precomputed_counts[key] >= 0
        gene_count = precomputed_counts['total_genes']
        ko_annotated_gene_count = precomputed_counts['genes_assigned_kos']
        annotating_ko_count = precomputed_counts['kos_assigned_genes']
    elif self.contigs_db_source_path:
        cdb = ContigsDatabase(self.contigs_db_source_path)
        try:
            gene_count = cdb.db.get_row_counts_from_table('genes_in_contigs')
            # KO IDs are stored in the 'accession' column of the gene functions table. Each
            # column read below must be requested here.
            gene_ko_id_table = cdb.db.get_table_as_dataframe(
                'gene_functions',
                where_clause='source = "KOfam"',
                columns_of_interest=['gene_callers_id', 'accession']
            )
            ko_annotated_gene_count = gene_ko_id_table['gene_callers_id'].nunique()
            annotating_ko_count = gene_ko_id_table['accession'].nunique()
        finally:
            # Release the database even if a query fails.
            cdb.disconnect()

    if gene_count is not None:
        stats_group['Total gene calls in genome'] = gene_count
    if ko_annotated_gene_count is not None:
        stats_group['Genes annotated with protein KOs'] = ko_annotated_gene_count
    stats_group['Genes in network'] = len(self.genes)
    if annotating_ko_count is not None:
        stats_group['Protein KOs annotating genes'] = annotating_ko_count
    stats_group['KOs in network'] = len(self.kos)
    self.progress.end()

    self._get_common_overview_statistics(stats)

    if precomputed_counts:
        return stats

    if not self.contigs_db_source_path:
        self.run.info_single(
            "Since the genomic network was not associated with a contigs database, the "
            "following statistics could not be calculated and were not reported to the output "
            "file: 'Total gene calls in genome', 'Genes annotated with protein KOs', and "
            "'Protein KOs annotating genes'."
        )

    return stats
def export_json(
    self,
    path: str,
    overwrite: bool = False,
    objective: str = None,
    remove_missing_objective_metabolites: bool = False,
    indent: int = 2,
    progress: terminal.Progress = terminal.Progress()
) -> None:
    """
    Export the network to a metabolic model file in JSON format.

    All information from the network is included in the JSON so that the file can be loaded as a
    GenomicNetwork object containing the same information.

    Parameters
    ==========
    path : str
        Output JSON file path.

    overwrite : bool, False
        Overwrite the JSON file if it already exists.

    objective : str, None
        An objective to use in the model, stored as the first entry in the JSON 'reactions'
        array. Currently, the only valid options are None and 'e_coli_core'.

        None means that no objective is added to the JSON, meaning that FBA cannot be performed
        on the model.

        'e_coli_core' is the biomass objective from the COBRApy example JSON file of E. coli
        "core" metabolism, 'e_coli_core.json'.

    remove_missing_objective_metabolites : bool, False
        If True, remove metabolites from the JSON objective that are not produced or consumed in
        the reaction network. FBA fails with metabolites outside the network.

    indent : int, 2
        Spaces of indentation per nesting level in JSON file.

    progress : anvio.terminal.Progress, anvio.terminal.Progress()
        Prints transient progress information to the terminal.

    Returns
    =======
    None

    Raises
    ======
    ConfigError
        If 'objective' is neither None nor a recognized objective name.
    """
    progress.new("Constructing JSON")
    progress.update("Setting up")
    filesnpaths.is_output_file_writable(path, ok_if_exists=overwrite)
    json_dict = JSONStructure.get()
    json_genes: List[Dict] = json_dict['genes']
    json_reactions: List[Dict] = json_dict['reactions']
    json_metabolites: List[Dict] = json_dict['metabolites']
    if objective == 'e_coli_core':
        objective_dict = JSONStructure.get_e_coli_core_objective()
        if remove_missing_objective_metabolites:
            self.remove_missing_objective_metabolites(objective_dict)
        json_reactions.append(objective_dict)
    elif objective is not None:
        raise ConfigError(
            f"Anvi'o does not recognize an objective with the name, '{objective}'."
        )

    progress.update("Genes")
    # Map each reaction ID to the IDs of the genes and to the KOs (keyed by KO ID) that encode
    # the reaction. Both are filled without repeats.
    reaction_genes: Dict[str, List[str]] = {}
    reaction_kos: Dict[str, Dict[str, KO]] = {}
    for gcid, gene in self.genes.items():
        gene_entry = JSONStructure.get_gene_entry()
        json_genes.append(gene_entry)
        gcid_str = str(gcid)
        gene_entry['id'] = gcid_str

        # Record KO IDs, annotation e-values, and KO classifications in the annotation section
        # of the gene entry.
        annotation = gene_entry['annotation']
        annotation['ko'] = annotation_kos = {}
        for ko_id in gene.ko_ids:
            annotation_kos[ko_id] = annotation_ko = {
                'e_value': str(gene.e_values[ko_id]),
                'modules': {},
                'pathways': {},
                'hierarchies': {}
            }

            # Record KEGG modules containing the KO, cross-referencing the KEGG pathways that
            # contain each module.
            ko = self.kos[ko_id]
            annotation_ko_modules = annotation_ko['modules']
            for module_id in ko.module_ids:
                module = self.modules[module_id]
                module_annotation = module.name
                if module.pathway_ids:
                    pathway_list = "".join(
                        f" {pathway_id}" for pathway_id in module.pathway_ids
                    )
                    module_annotation += f"[pathways:{pathway_list}]"
                annotation_ko_modules[module_id] = module_annotation

            # Record KEGG pathways containing the KO.
            annotation_ko_pathways = annotation_ko['pathways']
            for pathway_id in ko.pathway_ids:
                annotation_ko_pathways[pathway_id] = self.pathways[pathway_id].name

            # Record membership of the KO in KEGG BRITE hierarchies.
            annotation_ko_hierarchies: Dict[str, List[str]] = annotation_ko['hierarchies']
            for hierarchy_id, categorizations in ko.hierarchies.items():
                hierarchy_name = self.hierarchies[hierarchy_id].name
                annotation_ko_hierarchies[
                    f"{hierarchy_id}: {hierarchy_name}"
                ] = annotation_ko_categories = []
                hierarchy_categorizations = self.categories[hierarchy_id]
                for categorization in categorizations:
                    category = hierarchy_categorizations[categorization][-1]
                    # NOTE(review): presumably the category ID starts with the hierarchy ID
                    # followed by a two-character separator, which is stripped here -- confirm
                    # against the definition of BRITECategory.
                    annotation_ko_categories.append(category.id[len(hierarchy_id) + 2:])

            # Set up dictionaries needed to fill out reaction entries.
            for reaction_id in ko.reaction_ids:
                gene_ids = reaction_genes.setdefault(reaction_id, [])
                # Two KOs annotating the same gene can encode the same reaction. The IDs of a
                # gene are added consecutively, so a repeat can only be the last entry.
                if not gene_ids or gene_ids[-1] != gcid_str:
                    gene_ids.append(gcid_str)
                reaction_kos.setdefault(reaction_id, {})[ko_id] = ko

        if not self.proteins:
            continue

        # A protein section is added if the network has been annotated with protein
        # abundances.
        annotation['protein'] = annotation_protein = {
            'id': None,
            'abundances': {}
        }
        if gene.protein_id:
            # Record abundances of the protein encoded by the gene.
            protein_id = gene.protein_id
            protein = self.proteins[protein_id]
            annotation_protein['id'] = protein_id
            annotation_protein['abundances'].update(protein.abundances)

    progress.update("Reactions")
    compound_compartments: Dict[str, Set[str]] = {}
    for reaction_id, reaction in self.reactions.items():
        reaction_entry = JSONStructure.get_reaction_entry()
        json_reactions.append(reaction_entry)
        reaction_entry['id'] = reaction_id
        reaction_entry['name'] = reaction.modelseed_name
        metabolites = reaction_entry['metabolites']
        for compound_id, compartment, coefficient in zip(
            reaction.compound_ids, reaction.compartments, reaction.coefficients
        ):
            metabolites[f"{compound_id}_{compartment}"] = coefficient
            # The compartment code is added whole: 'set(compartment)' would instead create a
            # set of the characters of the code.
            compound_compartments.setdefault(compound_id, set()).add(compartment)
        if not reaction.reversibility:
            # By default, the reaction entry was set up to be reversible; here make it
            # irreversible.
            reaction_entry['lower_bound'] = 0.0
        # A reaction that is not encoded by any gene in the network gets an empty rule.
        reaction_entry['gene_reaction_rule'] = " or ".join(reaction_genes.get(reaction_id, []))

        notes = reaction_entry['notes']
        # Record gene KO annotations which aliased the reaction via KEGG REACTION or EC number.
        notes['ko'] = ko_notes = {}
        ko_kegg_aliases: Set[str] = set()
        ko_ec_number_aliases: Set[str] = set()
        for ko in reaction_kos.get(reaction_id, {}).values():
            try:
                kegg_aliases = ko.kegg_reaction_aliases[reaction_id]
            except KeyError:
                kegg_aliases = []
            try:
                ec_number_aliases = ko.ec_number_aliases[reaction_id]
            except KeyError:
                ec_number_aliases = []
            ko_notes[ko.id] = {'kegg.reaction': kegg_aliases, 'ec-code': ec_number_aliases}
            ko_kegg_aliases.update(kegg_aliases)
            ko_ec_number_aliases.update(ec_number_aliases)
        # Record other KEGG REACTION or EC number aliases of the reaction in the ModelSEED
        # database that did not happen to be associated with KO annotations. The aliases are
        # sorted so that the output file is reproducible.
        notes['other_aliases'] = {
            'kegg.reaction': sorted(set(reaction.kegg_aliases).difference(ko_kegg_aliases)),
            'ec-code': sorted(set(reaction.ec_number_aliases).difference(ko_ec_number_aliases))
        }

    progress.update("Metabolites")
    for compound_id, metabolite in self.metabolites.items():
        modelseed_compound_name = metabolite.modelseed_name
        charge = metabolite.charge
        formula = metabolite.formula
        kegg_compound_aliases = list(metabolite.kegg_aliases)
        # One metabolite entry is written per compartment in which the compound occurs.
        for compartment in compound_compartments[compound_id]:
            metabolite_entry = JSONStructure.get_metabolite_entry()
            json_metabolites.append(metabolite_entry)
            metabolite_entry['id'] = f"{compound_id}_{compartment}"
            metabolite_entry['name'] = modelseed_compound_name
            metabolite_entry['compartment'] = compartment
            # Compounds without a formula have a nominal charge of 10000000 in the ModelSEED
            # compounds database, which is replaced by None in the reaction network and 0 in the
            # JSON.
            metabolite_entry['charge'] = charge if charge is not None else 0
            metabolite_entry['formula'] = formula if formula is not None else ""
            metabolite_entry['annotation']['kegg.compound'] = kegg_compound_aliases

    progress.update("Saving")
    with open(path, 'w') as f:
        json.dump(json_dict, f, indent=indent)
    progress.end()
These categorizations would yield entries like the + following: {'ko00001': {('09100 Metabolism', ): (, ), ('09100 + Metabolism', '09101 Carbohydrate metabolism'): (, + ), ('09100 Metabolism', '09101 Carbohydrate metabolism', + '00010 Glycolysis / Gluconeogenesis [PATH:00010]'): (, + , ), ('09100 Metabolism', + '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism [PATH:00051]'): + (, , )}} + + reactions : Dict[str, ModelSEEDReaction], dict() + ModelSEED reactions in the network, with keys being reaction IDs. + + metabolites : Dict[str, ModelSEEDCompound], dict() + ModelSEED compounds in the network, with keys being metabolite IDs. + + kegg_modelseed_aliases : Dict[str, List[str]], dict() + This maps KEGG REACTION IDs associated with KOs in the network to ModelSEED reactions + aliased by the KEGG reaction. KO-associated KEGG reactions that do not alias ModelSEED + reactions are not included. + + ec_number_modelseed_aliases : Dict[str, List[str]], dict() + This maps EC numbers associated with KOs in the network to ModelSEED reactions aliased by + the EC number. KO-associated EC numbers that do not alias ModelSEED reactions are not + included. + + modelseed_kegg_aliases : Dict[str, List[str]], dict() + This maps the IDs of ModelSEED reactions in the network to lists of KEGG REACTION IDs that + are associated with KOs in the network and alias the ModelSEED reaction. + + modelseed_ec_number_aliases : Dict[str, List[str]], dict() + This maps the IDs of ModelSEED reactions in the network to lists of EC numbers that are + associated with KOs in the network and alias the ModelSEED reaction. + + pan_db_source_path : str, None + Path to the pan database from which the network was built. + + genomes_storage_db_source_path : str, None + Path to the genomes storage database from which the network was built. + + consensus_threshold : float, None + A parameter used in the selection of the gene cluster consensus KOs from which the network + was built. 
+ + discard_ties : bool, None + A parameter used in the selection of the gene cluster consensus KOs from which the network + was built. + + consistent_annotations : bool, None + A loaded network may be based on a set of gene KO annotations in the genomes storage + database that has since changed, in which case this attribute would be False. + + gene_clusters : Dict[str, GeneCluster], dict() + This maps the IDs of gene clusters in the network to object representations of the clusters. + """ + def __init__( + self, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress(), + verbose: bool = True + ) -> None: + """ + Parameters + ========== + run : anvio.terminal.Run, anvio.terminal.Run() + This object sets the 'run' attribute, which prints run information to the terminal. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object sets the 'progress' attribute, which prints transient progress information + to the terminal. + + verbose : bool, True + This sets the 'verbose' attribute, causing more information to be reported to the + terminal if True. + + Returns + ======= + None + """ + super().__init__(run=run, progress=progress, verbose=verbose) + self.pan_db_source_path: str = None + self.genomes_storage_db_source_path: str = None + self.consensus_threshold: float = None + self.discard_ties: bool = None + self.consistent_annotations: bool = None + self.gene_clusters: Dict[str, GeneCluster] = {} + + def remove_metabolites_without_formula(self, output_path: str = None) -> None: + """ + Remove metabolites without a formula in the ModelSEED database from the network. + + Other items can be removed from the network by association: reactions that involve a + formulaless metabolite; other metabolites with formulas that are exclusive to such + reactions; KOs predicted to exclusively catalyze such reactions; and gene clusters annotated + with such KOs. 
Removed metabolites with a formula are reported alongside formulaless + metabolites to the output table of removed metabolites. + + output_path : str, None + If not None, write tab-delimited files of metabolites, reactions, KOs, KEGG modules, + KEGG pathways, KEGG BRITE hierarchies, KEGG BRITE hierarchy categories, and gene + clusters removed from the network to file locations based on the provided path. For + example, if the argument, 'removed.tsv', is provided, then the following files will be + written: 'removed-metabolites.tsv', 'removed-reactions.tsv', 'removed-kos.tsv', + 'removed-modules.tsv', 'removed-pathways.tsv', 'removed-hierarchies.tsv', + 'removed-categories.tsv', and 'removed-gene-clusters.tsv'. + """ + if self.verbose: + self.progress.new("Removing metabolites without a formula in the network") + self.progress.update("...") + + if output_path: + path_basename, path_extension = os.path.splitext(output_path) + metabolite_path = f"{path_basename}-metabolites{path_extension}" + reaction_path = f"{path_basename}-reactions{path_extension}" + ko_path = f"{path_basename}-kos{path_extension}" + module_path = f"{path_basename}-modules{path_extension}" + pathway_path = f"{path_basename}-pathways{path_extension}" + hierarchy_path = f"{path_basename}-hierarchies{path_extension}" + category_path = f"{path_basename}-categories{path_extension}" + gene_cluster_path = f"{path_basename}-gene-clusters{path_extension}" + for path in ( + metabolite_path, + reaction_path, + ko_path, + module_path, + pathway_path, + hierarchy_path, + category_path, + gene_cluster_path + ): + filesnpaths.is_output_file_writable(path) + + metabolites_to_remove = [] + for compound_id, metabolite in self.metabolites.items(): + # ModelSEED compounds without a formula have a formula value of None in the network + # object. 
+ if metabolite.formula is None: + metabolites_to_remove.append(compound_id) + removed = self._purge_metabolites(metabolites_to_remove) + + if self.verbose: + self.run.info("Removed metabolites", len(removed['metabolite'])) + self.run.info("Removed reactions", len(removed['reaction'])) + self.run.info("Removed KOs", len(removed['ko'])) + self.run.info("Removed KEGG modules", len(removed['module'])) + self.run.info("Removed KEGG pathways", len(removed['pathway'])) + self.run.info("Removed KEGG BRITE hierarchies", len(removed['hierarchy'])) + self.run.info("Removed KEGG BRITE hierarchy categories", len(removed['category'])) + self.run.info("Removed gene clusters", len(removed['gene_cluster'])) + + if not output_path: + return + + if self.verbose: + self.progress.new("Writing output files of removed network items") + self.progress.update("...") + + gene_cluster_table = [] + for cluster in removed['gene_cluster']: + cluster: GeneCluster + row = [] + row.append(cluster.gene_cluster_id) + row.append(cluster.ko_id) + row.append(", ".join(cluster.genomes)) + gene_cluster_table.append(row) + + self._write_remove_metabolites_without_formula_output(output_path, removed) + + pd.DataFrame( + gene_cluster_table, + columns=[ + "Gene cluster ID", + "KO ID", + "Gene cluster genomes" + ] + ).to_csv(gene_cluster_path, sep='\t', index=False) + + if self.verbose: + self.run.info("Table of removed metabolites", metabolite_path) + self.run.info("Table of removed reactions", reaction_path) + self.run.info("Table of removed KOs", ko_path) + self.run.info("Table of removed KEGG modules", module_path) + self.run.info("Table of removed KEGG pathways", pathway_path) + self.run.info("Table of removed KEGG BRITE hierarchies", hierarchy_path) + self.run.info("Table of removed KEGG BRITE hierarchy categories", category_path) + self.run.info("Table of removed gene clusters", gene_cluster_path) + + def prune( + self, + gene_clusters_to_remove: Union[int, Iterable[int]] = None, + kos_to_remove: 
Union[str, Iterable[str]] = None, + modules_to_remove: Union[str, Iterable[str]] = None, + pathways_to_remove: Union[str, Iterable[str]] = None, + hierarchies_to_remove: Union[str, Iterable[str]] = None, + categories_to_remove: Dict[str, List[Tuple[str]]] = None, + reactions_to_remove: Union[str, Iterable[str]] = None, + metabolites_to_remove: Union[str, Iterable[str]] = None + ) -> Dict[str, List]: + """ + Prune items from the metabolic network. + + Pruning modifies the network in situ: use the network 'copy' method as needed to create a + backup of the network. + + If requested gene clusters, KOs, KEGG modules, KEGG pathways, KEGG BRITE hierarchies, KEGG + BRITE hierarchy categories, reactions, or metabolites are not present in the network, no + error is raised. + + Network items (e.g., gene clusters, KOs, reactions, and metabolites) that are exclusively + associated with requested items are also removed from the network. Example: Consider a KO + that is requested to be removed from the network. The KO is associated with two reactions. + The first reaction is exclusive to the KO and thus is also removed, whereas the second + reaction is also associated with another retained KO and thus is retained in the network. + The first reaction involves four metabolites, and two are exclusive to the reaction: these + are also removed from the network. Each gene cluster has a single consensus KO annotation, + so any gene clusters assigned this KO are removed from the network. Note that reaction + annotations of KOs can be selected to the exclusion of others. In the example, the latter + gene cluster is left with one KO. + + Parameters + ========== + gene_clusters_to_remove : Union[str, Iterable[int]], None + Gene cluster ID(s) to remove. + + kos_to_remove : Union[str, Iterable[str]], None + KO ID(s) to remove. 
+ + modules_to_remove : Union[str, Iterable[str]], None + KEGG module ID(s) to remove, with the effect of giving the KOs in the module(s) to the + argument, 'kos_to_remove'. This does not remove other module annotations of these KOs + that also annotate other KOs. + + pathways_to_remove : Union[str, Iterable[str]], None + KEGG pathway ID(s) to remove, with the effect of giving the KOs in the pathway(s) to the + argument, 'kos_to_remove'. This does not remove other pathway annotations of these KOs + that also annotate other KOs. + + hierarchies_to_remove : Union[str, Iterable[str]], None + KEGG BRITE hierarchy (or hierarchies) to remove, with the effect of giving the KOs in + the hierarchy to the argument, 'kos_to_remove'. This does not remove other hierarchy + annotations of these KOs that also annotate other KOs. + + categories_to_remove : Dict[str, List[Tuple[str]]], None + KEGG BRITE hierarchy categories to remove, with the effect of giving the KOs in the + categories to the argument, 'kos_to_remove'. This does not remove other category + annotations of these KOs that also annotate other KOs. The dictionary argument is keyed + by BRITE hierarchy ID and has values that list category tuples. For example, to remove + KOs contained in the 'ko00001' 'KEGG Orthology (KO)' hierarchy categories, '09100 + Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis / Gluconeogenesis + [PATH:ko00010]' and '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00051 + Fructose and mannose metabolism [PATH:ko00051]', the dictionary argument would need to + look like the following: {'ko00001': [('09100 Metabolism', '09101 Carbohydrate + metabolism', '00010 Glycolysis / Gluconeogenesis'), ('09100 Metabolism', '09101 + Carbohydrate metabolism', '00051 Fructose and mannose metabolism [PATH:ko00051]')]} + + reactions_to_remove : Union[str, Iterable[str]], None + ModelSEED reaction ID(s) to remove. 
+ + metabolites_to_remove : Union[str, Iterable[str]], None + ModelSEED compound ID(s) to remove. + + Returns + ======= + dict + This dictionary contains data removed from the network. + + The dictionary has the following format. It shows protein entries as if the network has + been annotated with protein abundances; these are absent for genomic networks lacking + protein annotations. + { + 'gene': [], + 'protein': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [] + } + """ + assert ( + gene_clusters_to_remove or + kos_to_remove or + modules_to_remove or + pathways_to_remove or + hierarchies_to_remove or + categories_to_remove or + reactions_to_remove or + metabolites_to_remove + ) + + removed: Dict[str, List] = { + 'gene_cluster': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [] + } + + if gene_clusters_to_remove: + for item_type, removed_items in self._purge_gene_clusters( + gene_clusters_to_remove=gene_clusters_to_remove + ).items(): + removed[item_type] += removed_items + + if ( + kos_to_remove or + modules_to_remove or + pathways_to_remove or + hierarchies_to_remove or + categories_to_remove + ): + for item_type, removed_items in self._purge_kos( + kos_to_remove=kos_to_remove, + modules_to_remove=modules_to_remove, + pathways_to_remove=pathways_to_remove, + hierarchies_to_remove=hierarchies_to_remove, + categories_to_remove=categories_to_remove + ).items(): + removed[item_type] += removed_items + + if reactions_to_remove: + for item_type, removed_items in self._purge_reactions(reactions_to_remove).items(): + removed[item_type] += removed_items + + if metabolites_to_remove: + for item_type, removed_items in self._purge_metabolites(metabolites_to_remove).items(): + removed[item_type] += removed_items + + return removed + + def 
_purge_gene_clusters(self, gene_clusters_to_remove: Iterable[str]) -> Dict[str, List]: + """ + Remove any trace of the given gene clusters from the network. + + KOs, reactions, and metabolites that are only associated with removed gene clusters are + purged. KEGG modules, pathways, BRITE hierarchies, and BRITE hierarchy categories only + associated with purged KOs are removed. + + Parameters + ========== + gene_clusters_to_remove : Iterable[str] + Gene cluster IDs to remove. + + Returns + ======= + dict + This dictionary contains data removed from the network. + + If this method is NOT called from the method, '_purge_kos', then the dictionary will + look like the following. + { + 'gene_cluster': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [] + } + + If this method is called from the method, '_purge_kos', then the dictionary will look + like the following. + { + 'gene_cluster': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [] + } + + If no gene clusters are removed from the network, then the dictionary will look like the + following regardless of calling method. + { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene_cluster': [] + } + """ + gene_clusters_to_remove = set(gene_clusters_to_remove) + removed_gene_clusters: List[GeneCluster] = [] + for gene_cluster_id in gene_clusters_to_remove: + try: + removed_gene_clusters.append(self.gene_clusters.pop(gene_cluster_id)) + except KeyError: + # This occurs if the cluster in 'gene_clusters_to_remove' is not in the network. 
+ pass + + if not removed_gene_clusters: + return { + 'metabolite': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'gene_cluster': [] + } + + # Purge KOs from the network that are exclusively assigned to removed gene clusters. + kos_to_remove: List[str] = [] + for cluster in removed_gene_clusters: + kos_to_remove.append(cluster.ko_id) + kos_to_remove = list(set(kos_to_remove)) + for gene_cluster in self.gene_clusters.values(): + kos_to_spare: List[str] = [] + if gene_cluster.ko_id in kos_to_remove: + # The KO is associated with a retained gene cluster, so do not remove the KO. + kos_to_spare.append(gene_cluster.ko_id) + for ko_id in kos_to_spare: + kos_to_remove.remove(ko_id) + if kos_to_remove: + removed_cascading_down = self._purge_kos(kos_to_remove) + removed_cascading_down.pop('gene_cluster') + else: + # This method must have been called from the method, '_purge_kos', because the KOs that + # are only associated with the removed gene clusters were already removed from the + # network. 
+ removed_cascading_down = { + 'ko': [], + 'module': [], + 'pathway': [], + 'hierarchy': [], + 'category': [], + 'reaction': [], + 'kegg_reaction': [], + 'ec_number': [], + 'metabolite': [] + } + + removed = {'gene_cluster': removed_gene_clusters} + removed.update(removed_cascading_down) + return removed + + def subset_network( + self, + gene_clusters_to_subset: Union[int, Iterable[int]] = None, + kos_to_subset: Union[str, Iterable[str]] = None, + modules_to_subset: Union[str, Iterable[str]] = None, + pathways_to_subset: Union[str, Iterable[str]] = None, + hierarchies_to_subset: Union[str, Iterable[str]] = None, + categories_to_subset: Dict[str, List[Tuple[str]]] = None, + reactions_to_subset: Union[str, Iterable[str]] = None, + metabolites_to_subset: Union[str, Iterable[str]] = None, + inclusive: bool = False + ) -> PangenomicNetwork: + """ + Subset a smaller network from the metabolic network. + + If requested gene clusters, KOs, KEGG modules, KEGG pathways, KEGG BRITE hierarchies, KEGG + BRITE hierarchy categories, reactions, or metabolites are not present in the network, no + error is raised. + + Subsetted items are not represented by the same objects as in the source network, i.e., new + gene cluster, KO, reaction, metabolite, and other objects are created and added to the + subsetted network. + + Network items (e.g., gene clusters, KOs, reactions, and metabolites) that are associated with + requested items (e.g., gene clusters in the network that reference requested KOs; metabolites + referenced by requested reactions) are added to the subsetted network. + + The choice of "inclusive" or, by default, "exclusive" subsetting determines which associated + items are included in the subsetted network. In exclusive subsetting, KOs (and by extension, + gene clusters assigned the KOs) that are added to the subsetted network due to references to + requested reactions will be missing references to any other unrequested reactions. 
In other + words, certain reaction annotations can be selected to the exclusion of others, e.g., a KO + encoding two reactions can be restricted to encode one requested reaction in the subsetted + network; a KO encoding multiple reactions can be restricted to encode only those reactions + involving requested metabolites. + + "Inclusive" subsetting applies a "Midas touch" where all items in the network that are + however associated with requested reactions and metabolites are "turned to gold" and + included in the subsetted network. KOs and gene clusters that are added to the subsetted + network due to references to requested reactions and metabolites will include all their + other references to unrequested reactions and metabolites. Inclusive subsetting precludes + the emendation of KO reaction annotations. + + Parameters + ========== + gene_clusters_to_subset : Union[int, Iterable[int]], None + Gene cluster ID(s) to subset. + + kos_to_subset : Union[str, Iterable[str]], None + KO ID(s) to subset. + + modules_to_subset : List[str], None + KEGG module ID(s) to subset, with the effect of giving the KOs in the module(s) to the + argument, 'kos_to_subset'. This does not exclude other module annotations of these KOs + from the network. + + pathways_to_subset : Union[str, Iterable[str]], None + KEGG pathway ID(s) to subset, with the effect of giving the KOs in the pathway(s) to the + argument, 'kos_to_subset'. This does not exclude other pathway annotations of these KOs + from the network. + + hierarchies_to_subset : Union[str, Iterable[str]], None + KEGG BRITE hierarchy (or hierarchies) to subset, with the effect of giving the KOs in + the hierarchy to the argument, 'kos_to_subset'. This does not exclude other hierarchy + annotations of these KOs from the network. + + categories_to_subset : Dict[str, List[Tuple[str]]], None + KEGG BRITE hierarchy categories to subset, with the effect of giving the KOs in the + categories to the argument, 'kos_to_subset'. 
This does not exclude other category + annotations of these KOs from the network. The dictionary argument is keyed by BRITE + hierarchy ID and has values that list category tuples. For example, to subset KOs from + the network contained in the 'ko00001' 'KEGG Orthology (KO)' hierarchy categories, + '09100 Metabolism >>> 09101 Carbohydrate metabolism >>> 00010 Glycolysis / + Gluconeogenesis [PATH:ko00010]' and '09100 Metabolism >>> 09101 Carbohydrate + metabolism >>> 00051 Fructose and mannose metabolism [PATH:ko00051]', the dictionary + argument would need to look like the following: {'ko00001': [('09100 Metabolism', '09101 + Carbohydrate metabolism', '00010 Glycolysis / Gluconeogenesis'), ('09100 Metabolism', + '09101 Carbohydrate metabolism', '00051 Fructose and mannose metabolism + [PATH:ko00051]')]} + + reactions_to_subset : Union[str, Iterable[str]], None + ModelSEED reaction ID(s) to subset. + + metabolites_to_subset : Union[str, Iterable[str]], None + ModelSEED compound ID(s) to subset. + + inclusive : bool, False + If True, "inclusive" subsetting applies a "Midas touch" where all items in the network + that are however associated with requested reactions and metabolites are "turned to + gold" and included in the subsetted network. In default "exclusive" subsetting, KOs and + gene clusters that are added to the subsetted network due to references to requested + reactions and metabolites will be missing references to any other unrequested reactions + and metabolites. + + Returns + ======= + PangenomicNetwork + New subsetted reaction network. 
+ """ + assert ( + gene_clusters_to_subset or + kos_to_subset or + modules_to_subset or + pathways_to_subset or + hierarchies_to_subset or + categories_to_subset or + reactions_to_subset or + metabolites_to_subset + ) + + if kos_to_subset is None: + kos_to_subset: List[str] = [] + else: + kos_to_subset = list(kos_to_subset) + if modules_to_subset is None: + modules_to_subset: List[str] = [] + if pathways_to_subset is None: + pathways_to_subset: List[str] = [] + if hierarchies_to_subset is None: + hierarchies_to_subset: List[str] = [] + if categories_to_subset is None: + categories_to_subset: Dict[str, List[Tuple[str]]] = {} + + # Get KOs to subset from requested modules, pathways, hierarchies, and hierarchy categories. + for module_id in modules_to_subset: + try: + module = self.modules[module_id] + except KeyError: + # The requested module is not in the network. + continue + kos_to_subset += module.ko_ids + for pathway_id in pathways_to_subset: + try: + pathway = self.pathways[pathway_id] + except KeyError: + # The requested pathway is not in the network. + continue + kos_to_subset += pathway.ko_ids + for hierarchy_id in hierarchies_to_subset: + try: + hierarchy = self.hierarchies[hierarchy_id] + except KeyError: + # The requested hierarchy is not in the network. + continue + kos_to_subset += hierarchy.ko_ids + for hierarchy_id, categorizations in categories_to_subset.items(): + try: + hierarchy_categorizations = self.categories[hierarchy_id] + except KeyError: + # The requested hierarchy is not in the network. + continue + for categorization in categorizations: + try: + categories = hierarchy_categorizations[categorization] + except KeyError: + # The requested category is not in the network. + continue + category = categories[-1] + kos_to_subset += category.ko_ids + kos_to_subset = set(kos_to_subset) + + # Sequentially subset the network for each type of request. 
Upon generating two subsetted + # networks from two types of request, merge the networks into a single subsetted network; + # repeat. + first_subnetwork = None + for items_to_subset, subset_network_method in ( + (gene_clusters_to_subset, self._subset_network_by_gene_clusters), + (kos_to_subset, functools.partial(self._subset_network_by_kos, inclusive=inclusive)), + (reactions_to_subset, functools.partial( + self._subset_network_by_reactions, inclusive=inclusive + )), + (metabolites_to_subset, functools.partial( + self._subset_network_by_metabolites, inclusive=inclusive + )) + ): + if not items_to_subset: + continue + + second_subnetwork = subset_network_method(items_to_subset) + + if first_subnetwork is None: + first_subnetwork = second_subnetwork + else: + first_subnetwork = first_subnetwork.merge_network(second_subnetwork) + + return first_subnetwork + + def _subset_network_by_gene_clusters( + self, + gene_cluster_ids: Iterable[int] + ) -> PangenomicNetwork: + """ + Subset the network by gene clusters with requested IDs. + + Parameters + ========== + gene_cluster_ids : Iterable[int] + Gene cluster IDs to subset. + + Returns + ======= + PangenomicNetwork + New subsetted reaction network. + """ + subnetwork = PangenomicNetwork() + + for gene_cluster_id in gene_cluster_ids: + try: + cluster = self.gene_clusters[gene_cluster_id] + except KeyError: + # This occurs if the requested gene cluster ID is not in the source network. + continue + + # Subset the consensus KO annotating the gene cluster. + self._subset_network_by_kos([cluster.ko_id], subnetwork=subnetwork) + + subnetwork.gene_clusters[gene_cluster_id] = deepcopy(cluster) + + return subnetwork + + def _subset_gene_clusters_via_kos(self, subnetwork: PangenomicNetwork) -> None: + """ + Add gene clusters that are annotated with subsetted KOs to the subsetted network. + + Parameters + ========== + subnetwork : PangenomicNetwork + The subsetted reaction network under construction. 
+ + Returns + ======= + None + """ + subsetted_ko_ids = list(subnetwork.kos) + for gene_cluster_id, cluster in self.gene_clusters.items(): + # Check all gene clusters in the source network for subsetted KOs. + if cluster.ko_id in subsetted_ko_ids: + # A gene cluster is annotated by the subsetted KO. + subnetwork.gene_clusters[gene_cluster_id] = deepcopy(cluster) + + def merge_network(self, network: PangenomicNetwork) -> PangenomicNetwork: + """ + Merge the pangenomic reaction network with another pangenomic reaction network derived from + the same pan database. + + The purpose of the network is to combine different, but potentially overlapping, subnetworks + from the same pangenome. + + Each network can contain different gene clusters, KOs, and reactions/metabolites. Merging + nonredundantly incorporates all of this data as new objects in the new network. + + Objects representing KOs in both networks can have different sets of references: KOs can be + annotated by different reactions. However, the same gene cluster in each network should have + the same consensus KO annotation. ModelSEED reactions and metabolites in both networks + should have identical attributes. + + Parameters + ========== + network : PangenomicNetwork + The other pangenomic reaction network being merged. + + Returns + ======= + PangenomicNetwork + The merged pangenomic reaction network. + """ + merged_network: PangenomicNetwork = self._merge_network(network) + + merged_network.gene_clusters = deepcopy(self.gene_clusters) + + # Copy gene clusters from the second network. Assume they have the same consensus KOs. 
+ merged_gene_clusters = merged_network.gene_clusters + for gene_cluster_id, cluster in network.gene_clusters.items(): + if gene_cluster_id not in merged_gene_clusters: + merged_gene_clusters[gene_cluster_id] = deepcopy(cluster) + + return merged_network + + def get_overview_statistics( + self, + precomputed_counts: Dict[str, int] = None + ) -> PangenomicNetworkStats: + """ + Calculate overview statistics for the pangenomic metabolic network. + + Parameters + ========== + precomputed_counts : Dict[str, int], None + To spare additional computations that involve loading and parsing databases, this + dictionary must contain certain precomputed data: the key, 'total_gene_clusters', should + have a value of the number of gene clusters in the pangenome; the key, + 'gene_clusters_assigned_ko', should have a value of the number of gene clusters in the + pangenome assigned a consensus KO (or None if 'self.consistent_annotations' is False); + the key, 'kos_assigned_gene_clusters', should have a value of the number of consensus + KOs assigned to gene clusters in the pangenome (or None if 'self.consistent_annotations' + is False). + + Returns + ======= + PangenomicNetworkStats + Network statistics are stored in a dictionary of dictionaries. Keys in the outer + dictionary are "classes" of network statistics. Keys in the inner dictionary are + statistics themselves. + """ + if ( + precomputed_counts is not None and + sorted(precomputed_counts) != [ + 'gene_clusters_assigned_ko', 'kos_assigned_gene_clusters', 'total_gene_clusters' + ] + ): + raise ConfigError( + "The 'precomputed_counts' argument must be a dictionary only containing the keys, " + "'total_gene_clusters', 'gene_clusters_assigned_ko', and " + "'kos_assigned_gene_clusters'." 
+ ) + + stats: PangenomicNetworkStats = {} + + self.progress.new("Counting gene clusters and KOs") + self.progress.update("...") + stats['Gene cluster and KO counts'] = stats_group = {} + + if precomputed_counts: + assert ( + type(precomputed_counts['total_gene_clusters']) is int and + precomputed_counts['total_gene_clusters'] >= 0 + ) + gene_cluster_count = precomputed_counts['total_gene_clusters'] + assert ( + precomputed_counts['gene_clusters_assigned_ko'] is None or + ( + type(precomputed_counts['gene_clusters_assigned_ko']) is int and + precomputed_counts['gene_clusters_assigned_ko'] >= 0 + ) + ) + ko_annotated_gene_cluster_count = precomputed_counts['gene_clusters_assigned_ko'] + assert ( + precomputed_counts['kos_assigned_gene_clusters'] is None or + ( + type(precomputed_counts['kos_assigned_gene_clusters']) is int and + precomputed_counts['kos_assigned_gene_clusters'] >= 0 + ) + ) + annotating_ko_count = precomputed_counts['kos_assigned_gene_clusters'] + assert not ( + (ko_annotated_gene_cluster_count is None and annotating_ko_count is not None) or + (ko_annotated_gene_cluster_count is not None and annotating_ko_count is None) + ) + else: + # One database cannot be available without the other. 
+ assert not ( + ( + self.pan_db_source_path is None and + self.genomes_storage_db_source_path is not None + ) or + ( + self.pan_db_source_path is not None and + self.genomes_storage_db_source_path is None + ) + ) + + if self.pan_db_source_path and self.genomes_storage_db_source_path: + pdb = PanDatabase(self.pan_db_source_path) + gene_cluster_count = pdb.meta['num_gene_clusters'] + pdb.disconnect() + else: + gene_cluster_count = None + + if ( + self.pan_db_source_path and + self.genomes_storage_db_source_path and + self.consistent_annotations is False + ): + args = argparse.Namespace() + args.genomes_storage = self.genomes_storage_db_source_path + args.consensus_threshold = self.consensus_threshold + args.discard_ties = self.discard_ties + pan_super = PanSuperclass(args, r=run_quiet) + pan_super.init_gene_clusters() + pan_super.init_gene_clusters_functions() + pan_super.init_gene_clusters_functions_summary_dict() + gene_clusters_functions_summary_dict: Dict = ( + pan_super.gene_clusters_functions_summary_dict + ) + ko_annotated_gene_cluster_count = 0 + ko_ids = [] + for gene_cluster_functions_data in gene_clusters_functions_summary_dict.values(): + gene_cluster_ko_data = gene_cluster_functions_data['KOfam'] + if gene_cluster_ko_data != {'function': None, 'accession': None}: + # A KO was assigned to the cluster. 
+ ko_annotated_gene_cluster_count += 1 + ko_ids.append(gene_cluster_ko_data['accession']) + annotating_ko_count = len(set(ko_ids)) + else: + ko_annotated_gene_cluster_count = None + annotating_ko_count = None + + if gene_cluster_count is not None: + stats_group['Total gene clusters in pangenome'] = gene_cluster_count + if ko_annotated_gene_cluster_count is not None: + stats_group['Gene clusters assigned protein KO'] = ko_annotated_gene_cluster_count + stats_group['Gene clusters in network'] = len(self.gene_clusters) + if annotating_ko_count is not None: + stats_group['Protein KOs assigned to gene clusters'] = annotating_ko_count + stats_group['KOs in network'] = len(self.kos) + self.progress.end() + + self._get_common_overview_statistics(stats) + + if precomputed_counts: + return stats + + if not (self.pan_db_source_path and self.genomes_storage_db_source_path): + self.run.info_single( + "Since the pangenomic network was not associated with a pan database and genomes " + "storage database, the following statistics could not be calculated and were not " + "reported to the output file: 'Total gene clusters in pangenome', 'Gene clusters " + "assigned protein KOs', and 'Protein KOs assigned to gene clusters'." + ) + elif self.consistent_annotations is False: + self.run.info_single( + "The network attribute, 'consistent_annotations', is False, which indicates that " + "the reaction network stored in the pan database was made from a different set of " + "KO gene annotations than is currently in the genomes storage database. Therefore, " + "the following statistics were not calculated and reported to the output file to " + "avoid potential inaccuracies: 'Gene clusters assigned protein KO' and 'Protein " + "KOs assigned to gene clusters'." + ) + + return stats + + def print_overview_statistics(self, stats: PangenomicNetworkStats = None) -> None: + """ + Print overview statistics for the genomic metabolic network. 
def export_json(
    self,
    path: str,
    overwrite: bool = False,
    objective: str = None,
    remove_missing_objective_metabolites: bool = False,
    record_genomes: Tuple[str] = ('gene cluster', 'reaction'),
    indent: int = 2,
    progress: terminal.Progress = terminal.Progress()
) -> None:
    """
    Export the network to a metabolic model file in JSON format.

    Entries in the "gene" section of this file represent gene clusters.

    All information from the network is included in the JSON so that the file can be loaded as a
    PangenomicNetwork object containing the same information.

    Parameters
    ==========
    path : str
        Output JSON file path.

    overwrite : bool, False
        Overwrite the JSON file if it already exists.

    objective : str, None
        An objective to use in the model, stored as the first entry in the JSON 'reactions'
        array. Currently, the only valid options are None and 'e_coli_core'.

        None means that no objective is added to the JSON, meaning that FBA cannot be performed
        on the model.

        'e_coli_core' is the biomass objective from the COBRApy example JSON file of E. coli
        "core" metabolism, 'e_coli_core.json'.

    remove_missing_objective_metabolites : bool, False
        If True, remove metabolites from the JSON objective that are not produced or consumed in
        the reaction network. FBA fails with metabolites outside the network.

    record_genomes : tuple, ('gene cluster', 'reaction')
        Record the genome membership of gene clusters in JSON entries. By default, genome names
        are recorded for gene clusters and reactions with the argument, ('gene cluster',
        'reaction'). To not record genomes at all, pass either an empty tuple or None. The
        following valid strings can be provided in a tuple in any combination: 'gene cluster',
        'reaction', and 'metabolite'. 'reaction' and 'metabolite' record the genomes predicted
        to encode enzymes associated with reactions and metabolites, respectively.

    indent : int, 2
        Spaces of indentation per nesting level in JSON file.

    progress : terminal.Progress, terminal.Progress()
        Prints transient progress information to the terminal.

    Returns
    =======
    None

    Raises
    ======
    ConfigError
        If 'record_genomes' contains an unrecognized item or 'objective' is not recognized.
    """
    if record_genomes is None:
        record_genomes = ()
    valid_items = ('gene cluster', 'reaction', 'metabolite')
    invalid_items = [item for item in record_genomes if item not in valid_items]
    if invalid_items:
        raise ConfigError(
            f"The following items in the 'record_genomes' argument are invalid: "
            f"{', '.join(invalid_items)}"
        )

    progress.new("Constructing JSON")
    progress.update("Setting up")
    filesnpaths.is_output_file_writable(path, ok_if_exists=overwrite)
    json_dict = JSONStructure.get()
    json_gene_clusters: List[Dict] = json_dict['genes']
    json_reactions: List[Dict] = json_dict['reactions']
    json_metabolites: List[Dict] = json_dict['metabolites']
    if objective == 'e_coli_core':
        objective_dict = JSONStructure.get_e_coli_core_objective()
        if remove_missing_objective_metabolites:
            self.remove_missing_objective_metabolites(objective_dict)
        # The objective is the first entry in the 'reactions' array.
        json_reactions.append(objective_dict)
    elif objective is not None:
        raise ConfigError(
            f"Anvi'o does not recognize an objective with the name, '{objective}'."
        )

    progress.update("Gene clusters")
    reaction_gene_clusters: Dict[str, List[str]] = {}
    reaction_kos: Dict[str, List[KO]] = {}
    # The following two dictionaries are only needed for recording the occurrence of reactions
    # and metabolites in genomes.
    reaction_genomes: Dict[str, List[str]] = {}
    metabolite_genomes: Dict[str, List[str]] = {}
    for cluster_id, gene_cluster in self.gene_clusters.items():
        gene_cluster_entry = JSONStructure.get_gene_entry()
        json_gene_clusters.append(gene_cluster_entry)
        cluster_id_str = str(cluster_id)
        gene_cluster_entry['id'] = cluster_id_str

        # Record the consensus KO ID and classifications in the annotation section of the gene
        # cluster entry.
        annotation = gene_cluster_entry['annotation']
        ko_id = gene_cluster.ko_id
        annotation['ko'] = annotation_ko = {
            'id': ko_id,
            'modules': {},
            'pathways': {},
            'hierarchies': {}
        }

        # Record KEGG modules containing the KO.
        ko = self.kos[ko_id]
        annotation_ko_modules = annotation_ko['modules']
        for module_id in ko.module_ids:
            module = self.modules[module_id]
            module_annotation = module.name
            if module.pathway_ids:
                # Cross-reference KEGG pathways containing the module.
                module_annotation += "[pathways:"
                for pathway_id in module.pathway_ids:
                    module_annotation += f" {pathway_id}"
                module_annotation += "]"
            annotation_ko_modules[module_id] = module_annotation

        # Record KEGG pathways containing the KO.
        annotation_ko_pathways = annotation_ko['pathways']
        for pathway_id in ko.pathway_ids:
            annotation_ko_pathways[pathway_id] = self.pathways[pathway_id].name

        # Record membership of the KO in KEGG BRITE hierarchies.
        annotation_ko_hierarchies: Dict[str, List[str]] = annotation_ko['hierarchies']
        for hierarchy_id, categorizations in ko.hierarchies.items():
            hierarchy_name = self.hierarchies[hierarchy_id].name
            annotation_ko_categories = []
            annotation_ko_hierarchies[
                f"{hierarchy_id}: {hierarchy_name}"
            ] = annotation_ko_categories
            hierarchy_categorizations = self.categories[hierarchy_id]
            for categorization in categorizations:
                # The last category of a categorization is the most specific one.
                category = hierarchy_categorizations[categorization][-1]
                # Drop the leading '<hierarchy ID>: ' from the category ID.
                annotation_ko_categories.append(category.id[len(hierarchy_id) + 2:])

        # Set up dictionaries needed to fill out reaction entries.
        for reaction_id in ko.reaction_ids:
            reaction_gene_clusters.setdefault(reaction_id, []).append(cluster_id_str)
            reaction_kos.setdefault(reaction_id, []).append(ko)

        if not record_genomes:
            continue

        genome_names = gene_cluster.genomes
        if 'gene cluster' in record_genomes:
            # Record the names of the genomes contributing to the gene cluster in the notes
            # section of the gene cluster entry. Store a copy so that the JSON entry never
            # shares a list with the network object.
            gene_cluster_entry['notes']['genomes'] = list(genome_names)
        if 'reaction' in record_genomes:
            for reaction_id in ko.reaction_ids:
                # Extend a list owned by this dictionary: assigning 'genome_names' itself and
                # later using '+=' would grow the gene cluster's own genome list in place.
                reaction_genomes.setdefault(reaction_id, []).extend(genome_names)
        if 'metabolite' in record_genomes:
            for reaction_id in ko.reaction_ids:
                reaction = self.reactions[reaction_id]
                for compartment, compound_id in zip(
                    reaction.compartments, reaction.compound_ids
                ):
                    entry_id = f"{compound_id}_{compartment}"
                    metabolite_genomes.setdefault(entry_id, []).extend(genome_names)

    progress.update("Reactions")
    compound_compartments: Dict[str, Set[str]] = {}
    for reaction_id, reaction in self.reactions.items():
        reaction_entry = JSONStructure.get_reaction_entry()
        json_reactions.append(reaction_entry)
        reaction_entry['id'] = reaction_id
        reaction_entry['name'] = reaction.modelseed_name
        metabolites = reaction_entry['metabolites']
        for compound_id, compartment, coefficient in zip(
            reaction.compound_ids, reaction.compartments, reaction.coefficients
        ):
            metabolites[f"{compound_id}_{compartment}"] = coefficient
            # Use a set containing the compartment string itself; 'set(compartment)' would
            # split the string into its characters.
            compound_compartments.setdefault(compound_id, set()).add(compartment)
        if not reaction.reversibility:
            # By default, the reaction entry was set up to be reversible; here make it
            # irreversible.
            reaction_entry['lower_bound'] = 0.0
        reaction_entry['gene_reaction_rule'] = " or ".join(reaction_gene_clusters[reaction_id])

        notes = reaction_entry['notes']
        # Record gene KO annotations which aliased the reaction via KEGG REACTION or EC number.
        notes['ko'] = ko_notes = {}
        ko_kegg_aliases = set()
        ko_ec_number_aliases = set()
        for ko in reaction_kos[reaction_id]:
            kegg_aliases = ko.kegg_reaction_aliases.get(reaction_id, [])
            ec_number_aliases = ko.ec_number_aliases.get(reaction_id, [])
            ko_notes[ko.id] = {'kegg.reaction': kegg_aliases, 'ec-code': ec_number_aliases}
            ko_kegg_aliases.update(kegg_aliases)
            ko_ec_number_aliases.update(ec_number_aliases)
        # Record other KEGG REACTION or EC number aliases of the reaction in the ModelSEED
        # database that did not happen to be associated with KO annotations.
        notes['other_aliases'] = {
            'kegg.reaction': list(set(reaction.kegg_aliases).difference(ko_kegg_aliases)),
            'ec-code': list(set(reaction.ec_number_aliases).difference(ko_ec_number_aliases))
        }
        if 'reaction' in record_genomes:
            notes['genomes'] = sorted(set(reaction_genomes[reaction_id]))

    progress.update("Metabolites")
    for compound_id, metabolite in self.metabolites.items():
        modelseed_compound_name = metabolite.modelseed_name
        charge = metabolite.charge
        formula = metabolite.formula
        kegg_compound_aliases = list(metabolite.kegg_aliases)
        # Sort compartments so that the order of metabolite entries is reproducible.
        for compartment in sorted(compound_compartments[compound_id]):
            metabolite_entry = JSONStructure.get_metabolite_entry()
            json_metabolites.append(metabolite_entry)
            entry_id = f"{compound_id}_{compartment}"
            metabolite_entry['id'] = entry_id
            metabolite_entry['name'] = modelseed_compound_name
            metabolite_entry['compartment'] = compartment
            # Compounds without a formula have a nominal charge of 10000000 in the ModelSEED
            # compounds database, which is replaced by None in the reaction network and 0 in the
            # JSON.
            metabolite_entry['charge'] = charge if charge is not None else 0
            metabolite_entry['formula'] = formula if formula is not None else ""
            metabolite_entry['annotation']['kegg.compound'] = kegg_compound_aliases
            if 'metabolite' in record_genomes:
                # Write to the notes of this metabolite entry, not to the notes of the last
                # reaction entry processed above.
                metabolite_entry['notes']['genomes'] = sorted(
                    set(metabolite_genomes[entry_id])
                )

    progress.update("Saving")
    with open(path, 'w') as f:
        json.dump(json_dict, f, indent=indent)
    progress.end()
class JSONStructure:
    """
    JSON structure of metabolic model file.

    Every method returns a freshly built dictionary, so callers can fill in the returned object
    without affecting later calls. The methods are static: they can be called on the class or on
    an instance.
    """
    @staticmethod
    def get() -> Dict:
        """Top-level file framework."""
        return {
            'metabolites': [],
            'reactions': [],
            'genes': [],
            'id': '',
            'compartments': {
                'c': 'cytosol',
                'e': 'extracellular space'
            },
            'version': '1'
        }

    @staticmethod
    def get_metabolite_entry() -> Dict:
        """Format of each object in the 'metabolites' array."""
        return {
            'id': '',
            'name': '',
            'compartment': '',
            'charge': 0, # placeholder: uncharged
            'formula': '',
            'notes': {},
            'annotation': {}
        }

    @staticmethod
    def get_reaction_entry() -> Dict:
        """Format of each object in the 'reactions' array."""
        return {
            'id': '',
            'name': '',
            'metabolites': {},
            # By default, make the reaction perfectly reversible.
            'lower_bound': -1000.0,
            'upper_bound': 1000.0,
            'gene_reaction_rule': '',
            'subsystem': '',
            'notes': {},
            'annotation': {}
        }

    @staticmethod
    def get_gene_entry() -> Dict:
        """Format of each object in the 'genes' array."""
        return {
            'id': '',
            'name': '',
            'notes': {},
            'annotation': {}
        }

    @staticmethod
    def get_e_coli_core_objective() -> Dict:
        """
        Biomass objective from the 'reactions' array in the COBRApy example JSON file,
        'e_coli_core.json', with KBase/ModelSEED compound IDs replacing BiGG metabolite IDs.

        The original BiGG metabolite IDs and coefficients are retained in the 'notes' section
        under 'original_metabolite_ids'.
        """
        return {
            'id': 'BIOMASS_Ecoli_core_w_GAM',
            'name': 'Biomass Objective Function with GAM',
            'metabolites': {
                'cpd00169_c': -1.496,
                'cpd00022_c': -3.7478,
                'cpd00008_c': 59.81,
                'cpd00024_c': 4.1182,
                'cpd00002_c': -59.81,
                'cpd00010_c': 3.7478,
                'cpd00236_c': -0.361,
                'cpd00072_c': -0.0709,
                'cpd00102_c': -0.129,
                'cpd00079_c': -0.205,
                'cpd00053_c': -0.2557,
                'cpd00023_c': -4.9414,
                'cpd00001_c': -59.81,
                'cpd00067_c': 59.81,
                'cpd00003_c': -3.547,
                'cpd00004_c': 3.547,
                'cpd00006_c': 13.0279,
                'cpd00005_c': -13.0279,
                'cpd00032_c': -1.7867,
                'cpd00061_c': -0.5191,
                'cpd00009_c': 59.81,
                'cpd00020_c': -2.8328,
                'cpd00101_c': -0.8977
            },
            'lower_bound': 0.0,
            'upper_bound': 1000.0,
            'gene_reaction_rule': '',
            'objective_coefficient': 1.0,
            'subsystem': 'Biomass and maintenance functions',
            'notes': {
                'original_bigg_ids': [
                    'Biomass_Ecoli_core_w_GAM'
                ],
                'original_metabolite_ids': {
                    '3pg_c': -1.496,
                    'accoa_c': -3.7478,
                    'adp_c': 59.81,
                    'akg_c': 4.1182,
                    'atp_c': -59.81,
                    'coa_c': 3.7478,
                    'e4p_c': -0.361,
                    'f6p_c': -0.0709,
                    'g3p_c': -0.129,
                    'g6p_c': -0.205,
                    'gln__L_c': -0.2557,
                    'glu__L_c': -4.9414,
                    'h2o_c': -59.81,
                    'h_c': 59.81,
                    'nad_c': -3.547,
                    'nadh_c': 3.547,
                    'nadp_c': 13.0279,
                    'nadph_c': -13.0279,
                    'oaa_c': -1.7867,
                    'pep_c': -0.5191,
                    'pi_c': 59.81,
                    'pyr_c': -2.8328,
                    'r5p_c': -0.8977
                }
            },
            'annotation': {
                'bigg.reaction': [
                    'BIOMASS_Ecoli_core_w_GAM'
                ],
                'metanetx.reaction': [
                    'MNXR96280'
                ],
                'sbo': 'SBO:0000629'
            }
        }
class KEGGData:
    """
    This object handles KEGG reference data used in reaction networks.

    Attributes
    ==========
    kegg_context : anvio.kegg.KeggContext
        This contains anvi'o KEGG database attributes, such as filepaths.

    modules_db : anvio.kegg.ModulesDatabase
        The anvi'o modules database from which KEGG data is loaded.

    modules_db_hash : str
        The unique identifier of the anvi'o modules database, unique to its contents.

    ko_data : Dict[str, Dict[str, Any]]
        This dictionary relates KO IDs to various data, as shown in the following schematic.
        ko_data = {
            <KO ID>:
                {
                    'EC': (<EC numbers>),
                    'RN': (<KEGG reaction IDs>),
                    'MOD': (<module IDs>),
                    'PTH': (<pathway IDs>),
                    'HIE': {
                        <hierarchy ID>: (
                            (<category names of one categorization>),
                            (<category names of another categorization>),
                            ...
                        ),
                        <another hierarchy ID>: (...),
                        ...
                    }
                },
            <another KO ID>: {...},
            ...
        }

    module_data : Dict[str, Dict[str, Any]]
        This dictionary relates module IDs to module names and pathways, as shown in the following
        schematic.
        module_data = {
            <module ID>:
                {
                    'NAME': <module name>,
                    'PTH': (<pathway IDs>)
                },
            <another module ID>: {...},
            ...
        }

    pathway_data : Dict[str, Dict[str, Any]]
        This dictionary relates pathway IDs to pathway names and equivalent categories in the BRITE
        hierarchy, 'ko00001', as shown in the following schematic.
        pathway_data = {
            <pathway ID>:
                {
                    'NAME': <pathway name>,
                    'CAT': (<category names of the equivalent categorization>)
                },
            <another pathway ID>: {...},
            ...
        }

    hierarchy_data : Dict[str, str]
        This dictionary relates BRITE hierarchy IDs to hierarchy names.
    """
    def __init__(self, kegg_dir: str = None) -> None:
        """
        Set up the KEGG data in attributes designed for reaction network construction.

        Parameters
        ==========
        kegg_dir : str, None
            Directory containing an anvi'o KEGG database. The default argument of None expects the
            KEGG database to be set up in the default directory used by the program
            `anvi-setup-kegg-data`.

        Raises
        ======
        ConfigError
            If expected KEGG binary relation files are missing.
        """
        args = argparse.Namespace()
        args.kegg_data_dir = kegg_dir
        self.kegg_context = kegg.KeggContext(args)

        missing_paths = self.check_for_binary_relation_files()
        if missing_paths:
            raise ConfigError(
                "Unfortunately, the KEGG database needs to be reinstalled, because the following "
                "KEGG binary relation files used in reaction networks were not found: "
                f"{', '.join(missing_paths)}"
            )

        utils.is_kegg_modules_db(self.kegg_context.kegg_modules_db_path)

        self.modules_db = kegg.ModulesDatabase(
            self.kegg_context.kegg_modules_db_path, argparse.Namespace(quiet=True)
        )

        self.ko_data: Dict[str, Dict[str, Any]] = {}
        self.module_data: Dict[str, Dict[str, Any]] = {}
        self.pathway_data: Dict[str, Dict[str, Any]] = {}
        self.hierarchy_data: Dict[str, str] = {}

        try:
            self.modules_db_hash = self.modules_db.db.get_meta_value('hash')

            self._load_ko_binary_relation_data()
            self._load_module_data()
            self._load_ko_module_data()
            self._load_ko_pathway_data()
            self._load_ko_hierarchy_data()
            self._load_pathway_data()
            self._load_hierarchy_data()
        finally:
            # Release the database connection even if loading fails partway through.
            self.modules_db.disconnect()

        # Add placeholder entries for missing KO data.
        for ko_dict in self.ko_data.values():
            for key in ('EC', 'RN', 'MOD', 'PTH'):
                ko_dict.setdefault(key, tuple())
            ko_dict.setdefault('HIE', {})

    def check_for_binary_relation_files(self) -> List[str]:
        """
        Check for expected binary relation files.

        Returns
        =======
        List[str]
            The paths of missing binary relation files not found at the expected locations.
        """
        missing_paths = []
        for file in self.kegg_context.kegg_binary_relation_files.values():
            path = os.path.join(self.kegg_context.binary_relation_data_dir, file)
            if not os.path.exists(path):
                missing_paths.append(path)

        return missing_paths

    def _load_ko_binary_relation_data(self) -> None:
        """
        Load KO binary relations to EC numbers and KEGG reactions into 'ko_data'.

        Returns
        =======
        None
        """
        ko_data = self.ko_data
        for binary_relation, file in self.kegg_context.kegg_binary_relation_files.items():
            binary_relation_path = os.path.join(self.kegg_context.binary_relation_data_dir, file)
            binary_relation_df = pd.read_csv(binary_relation_path, sep='\t', header=0).rename(
                {'#KO': 'KO'}, axis=1
            )

            # The second item of the binary relation key labels the related database.
            entry_label = binary_relation[1]
            if entry_label == 'EC':
                col_name = 'EC number'
            elif entry_label == 'RN':
                col_name = 'Reaction'
            else:
                raise AssertionError
            binary_relation_df = binary_relation_df.rename({col_name: entry_label}, axis=1)
            col_name = entry_label
            assert binary_relation_df.columns.tolist() == ['KO', col_name]

            prefix_length = len(f'[{entry_label}:')
            for line in binary_relation_df.itertuples():
                ko_id: str = line.KO
                ko_data_dict = ko_data.setdefault(ko_id, {})
                entry: str = getattr(line, col_name)
                # An entry has a format like '[EC:1.1.1.18 1.1.1.369]' or '[RN:R00842]'.
                ko_data_dict[col_name] = tuple(entry[prefix_length:-1].split())

    def _load_module_data(self) -> None:
        """
        Load module name and pathway membership into 'module_data'.

        Returns
        =======
        None
        """
        modules_names_table = self.modules_db.db.get_table_as_dataframe(
            'modules',
            where_clause='data_name="NAME"',
            columns_of_interest=['module', 'data_value']
        ).rename({'module': 'module_id', 'data_value': 'module_name'}, axis=1)

        modules_pathways_table = self.modules_db.db.get_table_as_dataframe(
            'modules',
            where_clause='data_name="PATHWAY"',
            columns_of_interest=['module', 'data_value']
        ).rename({'module': 'module_id', 'data_value': 'pathway_id'}, axis=1)

        module_data = self.module_data
        # The left merge keeps modules without pathways, which get a null pathway ID.
        for key, module_table in modules_names_table.merge(
            modules_pathways_table, how='left', on='module_id'
        ).groupby(['module_id', 'module_name']):
            module_id = key[0]
            module_name = key[1]
            module_data[module_id] = module_dict = {}
            module_dict['NAME'] = module_name
            pathway_id_tuple = tuple(sorted(module_table['pathway_id']))
            if pd.isna(pathway_id_tuple[0]):
                # The module is not part of any pathway.
                module_dict['PTH'] = tuple()
            else:
                module_dict['PTH'] = pathway_id_tuple

    def _load_ko_module_data(self) -> None:
        """
        Load KO classification within modules into 'ko_data'.

        Returns
        =======
        None
        """
        kos_modules_table = self.modules_db.db.get_table_as_dataframe(
            'modules',
            where_clause='data_name="ORTHOLOGY"',
            columns_of_interest=['module', 'data_value', 'data_definition']
        ).rename({'module': 'module_id', 'data_value': 'ko_id'}, axis=1)

        ko_id_pattern = re.compile(r'K\d{5}')
        for orthology_entry, ko_modules_table in kos_modules_table.groupby('ko_id'):
            if not ko_id_pattern.match(orthology_entry):
                # Screen for "orthology" entries that are validly formatted KO IDs. There are also
                # orthology entries for modules that are part of other modules.
                continue
            ko_id = orthology_entry

            ko_dict = self.ko_data.setdefault(ko_id, {})
            ko_dict['MOD'] = tuple(sorted(ko_modules_table['module_id']))

    def _load_ko_pathway_data(self) -> None:
        """
        Load KO classification within pathways into 'ko_data'.

        Only pathways that are equivalent to categories in the KO BRITE hierarchy, 'ko00001', are
        considered, because the complete KO membership of these pathways (maps) is accessible in the
        modules database given how BRITE hierarchy files are processed by 'anvi-setup-kegg-data'.
        This excludes global and overview metabolism maps, such as the global 'Metabolic pathways'
        and overview 'Degradation of aromatic compounds' maps. This also excludes maps corresponding
        to hierarchies with IDs starting 'br' rather than 'ko'. 'br' maps involve other data besides
        KOs, such as drugs, diseases, reactions, and compounds.

        Returns
        =======
        None
        """
        hierarchy_table = self.modules_db.db.get_table_as_dataframe(
            'brite_hierarchies',
            where_clause='hierarchy_accession="ko00001"',
            columns_of_interest=['ortholog_accession', 'categorization']
        ).rename({'hierarchy_accession': 'hierarchy_id', 'ortholog_accession': 'ko_id'}, axis=1)

        for ko_id, ko_table in hierarchy_table.groupby('ko_id'):
            ko_dict = self.ko_data.setdefault(ko_id, {})

            pathway_ids: List[str] = []
            for categorization in ko_table['categorization']:
                categorization: str
                category_name = categorization.split('>>>')[-1]
                # 'ko00001' categories that are equivalent to pathways have names formatted like
                # '00010 Glycolysis / Gluconeogenesis [PATH:ko00010]'. Both parts of the suffix
                # must match: other categories also end in ']', such as
                # '01000 Enzymes [BR:ko01000]'.
                if category_name[-15:-8] != ' [PATH:' or category_name[-1] != ']':
                    continue
                pathway_ids.append(f'map{category_name[-6:-1]}')
            ko_dict['PTH'] = tuple(pathway_ids)

    def _load_ko_hierarchy_data(self) -> None:
        """
        Load KO BRITE hierarchy membership into 'ko_data'.

        Only KO hierarchies with IDs starting 'ko' are considered given modules database setup.

        Returns
        =======
        None
        """
        hierarchies_table = self.modules_db.db.get_table_as_dataframe(
            'brite_hierarchies',
            columns_of_interest=['hierarchy_accession', 'ortholog_accession', 'categorization']
        ).rename({'hierarchy_accession': 'hierarchy_id', 'ortholog_accession': 'ko_id'}, axis=1)

        for ko_id, ko_table in hierarchies_table.groupby('ko_id'):
            ko_dict = self.ko_data.setdefault(ko_id, {})

            ko_hierarchies_dict: Dict[str, Tuple[Tuple[str]]] = {}
            for hierarchy_id, ko_hierarchy_table in ko_table.groupby('hierarchy_id'):
                # Each categorization string lists categories from general to specific,
                # separated by '>>>'.
                ko_hierarchies_dict[hierarchy_id] = tuple(
                    tuple(categorization.split('>>>'))
                    for categorization in ko_hierarchy_table['categorization']
                )
            ko_dict['HIE'] = ko_hierarchies_dict

    def _load_pathway_data(self) -> None:
        """
        Load pathway name and BRITE categorization into 'pathway_data'.

        Only pathways that are equivalent to categories in the KO BRITE hierarchy, 'ko00001', are
        considered.

        Returns
        =======
        None
        """
        categorizations = self.modules_db.db.get_single_column_from_table(
            'brite_hierarchies',
            'categorization',
            unique=True,
            where_clause='hierarchy_accession="ko00001"'
        )

        for categorization in categorizations:
            categorization: str
            categorization_tuple = tuple(categorization.split('>>>'))
            category_name = categorization_tuple[-1]
            # 'ko00001' categories that are equivalent to pathways have names formatted like '00010
            # Glycolysis / Gluconeogenesis [PATH:ko00010]'. Both parts of the suffix must match:
            # other categories also end in ']', such as '01000 Enzymes [BR:ko01000]'.
            if category_name[-15:-8] != ' [PATH:' or category_name[-1] != ']':
                continue
            pathway_id = f'map{category_name[-6:-1]}'
            assert category_name[:6] == f'{pathway_id[3:]} '
            pathway_dict: Dict[str, Any] = {}
            # Strip the leading '<number> ' and the whole 15-character ' [PATH:ko<number>]'
            # suffix, including the space that precedes the bracket.
            pathway_dict['NAME'] = category_name[6:-15]
            pathway_dict['CAT'] = categorization_tuple
            self.pathway_data[pathway_id] = pathway_dict

    def _load_hierarchy_data(self) -> None:
        """
        Load hierarchy names into 'hierarchy_data'.

        Returns
        =======
        None
        """
        hierarchies_table = self.modules_db.db.get_table_as_dataframe(
            'brite_hierarchies', columns_of_interest=['hierarchy_accession', 'hierarchy_name']
        ).rename({'hierarchy_accession': 'hierarchy_id'}, axis=1).drop_duplicates()

        for row in hierarchies_table.itertuples():
            self.hierarchy_data[row.hierarchy_id] = row.hierarchy_name
class KODatabase:
    """
    Representation of the KEGG KO database used in the construction of reaction networks.

    Unless an alternative directory is provided, the database is downloaded and set up in a
    default anvi'o data directory, and loaded from this directory in network construction.

    Attributes
    ==========
    default_dir : str
        Class attribute recording the default anvi'o directory of KO database files.

    expected_files : List[str]
        Class attribute listing the files that must exist in a set-up KO database directory.

    release : str
        The KEGG release read from the second line of 'ko_info.txt'.

    ko_table : pandas.core.frame.DataFrame
        The table loaded from 'ko_data.tsv', indexed by its first column.
    """
    default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/misc/KEGG/KO_REACTION_NETWORK')
    expected_files = ['ko_info.txt', 'ko_data.tsv']

    def __init__(self, ko_dir: str = None) -> None:
        """
        Load the table derived from downloaded KEGG KO entry files that relates KOs to KEGG
        reactions and EC numbers.

        Parameters
        ==========
        ko_dir : str, None
            The directory containing reference KEGG Orthology (KO) tables set up by anvi'o. The
            default argument of None expects KO data to be set up in the default anvi'o directory
            used by the program `anvi-setup-kegg-data`.

        Raises
        ======
        ConfigError
            If the given directory does not exist or an expected file is missing from it.
        """
        if ko_dir:
            if not os.path.isdir(ko_dir):
                raise ConfigError(f"There is no such directory, '{ko_dir}'.")
        else:
            ko_dir = self.default_dir

        for expected_file in self.expected_files:
            if not os.path.isfile(os.path.join(ko_dir, expected_file)):
                raise ConfigError(
                    f"No required file named '{expected_file}' was found in the KO directory, "
                    f"'{ko_dir}'."
                )

        self.release = self._read_release(os.path.join(ko_dir, 'ko_info.txt'))

        self.ko_table = pd.read_csv(
            os.path.join(ko_dir, 'ko_data.tsv'), sep='\t', header=0, index_col=0, low_memory=False
        )

    @staticmethod
    def _read_release(info_path: str) -> str:
        """Return the release string from the second line of a KEGG database info file."""
        with open(info_path) as f:
            f.readline()
            # The first field of the second line is a label; the rest is the release.
            return ' '.join(f.readline().strip().split()[1:])

    @staticmethod
    def set_up(
        num_threads: int = 1,
        dir: str = None,
        reset: bool = False,
        run: terminal.Run = terminal.Run(),
        progress: terminal.Progress = terminal.Progress()
    ) -> None:
        """
        Download KEGG KO entry files and parse these files to construct a tab-delimited file
        relating KOs to KEGG reactions and EC numbers.

        Parameters
        ==========
        num_threads : int, 1
            Number of threads to use in parallelizing the download of KO files.

        dir : str, None
            Directory in which to create a subdirectory called `KO_REACTION_NETWORK`,
            in which files are downloaded and set up. This argument overrides
            the default directory.

        reset : bool, False
            If True, remove any existing 'KO_REACTION_NETWORK' database directory and the files
            therein. If False, an exception is raised if there are files in this directory.

        run : anvio.terminal.Run, anvio.terminal.Run()
            This object prints run information to the terminal.

        progress : anvio.terminal.Progress, anvio.terminal.Progress()
            This object prints transient progress information to the terminal.

        Raises
        ======
        ConfigError
            If 'num_threads' is not a positive integer, 'dir' does not exist, the database
            directory already exists and 'reset' is False, or KO files fail to download.
        """
        # Validate before touching the filesystem: without any worker, the download loop below
        # would wait forever on the output queue.
        if type(num_threads) is not int or num_threads < 1:
            raise ConfigError("The number of threads must be a positive integer.")

        if dir:
            if os.path.isdir(dir):
                ko_dir = os.path.join(dir, 'KO_REACTION_NETWORK')
            else:
                raise ConfigError(
                    f"There is no such directory, '{dir}'. You should create it first if you want "
                    "to use it."
                )
        else:
            ko_dir = KODatabase.default_dir
            parent_dir = os.path.dirname(ko_dir)
            if not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
        if os.path.exists(ko_dir):
            if reset:
                shutil.rmtree(ko_dir)
            else:
                raise ConfigError(
                    f"The KO database directory, '{ko_dir}', already exists. 'reset' can be used "
                    "to remove the database at this location and set it up again."
                )
        os.makedirs(ko_dir)

        if num_threads == 1:
            run.warning(
                "Only 1 thread will be used to download KO files. It is advisable to set a higher "
                "number of threads to download faster."
            )

        # Download a file for each entry in a KEGG database.
        download_root = 'https://rest.kegg.jp/'
        while True:
            # Break out of this loop upon confirming that the KEGG release didn't change in the
            # middle of downloading KO files.
            progress.new(f"Downloading KEGG KO files")
            # Get the database version before download.
            progress.update("Database info")
            info_before_path = os.path.join(ko_dir, 'ko_info_before.txt')
            utils.download_file(f'{download_root}info/ko', info_before_path)
            release_before = KODatabase._read_release(info_before_path)

            # Get a list of all KO IDs.
            progress.update("KO list")
            list_path = os.path.join(ko_dir, 'ko_list.txt')
            utils.download_file(f'{download_root}list/ko', list_path)
            with open(list_path) as f:
                # The KO ID is the first tab-delimited field of each non-blank line.
                ko_ids = [line.split('\t')[0] for line in f if line.strip()]

            # Download KO entry files.
            manager = mp.Manager()
            input_queue = manager.Queue()
            output_queue = manager.Queue()
            for ko_id in ko_ids:
                input_queue.put(
                    (f'{download_root}get/{ko_id}', os.path.join(ko_dir, f'{ko_id}.txt'))
                )
            workers: List[mp.Process] = []
            for _ in range(num_threads):
                worker = mp.Process(target=kegg._download_worker, args=(input_queue, output_queue))
                workers.append(worker)
                worker.start()
            downloaded_count = 0
            undownloaded_count = 0
            total = len(ko_ids)
            undownloaded = []
            while downloaded_count + undownloaded_count < total:
                output = output_queue.get()
                if output is True:
                    downloaded_count += 1
                    progress.update(f"{downloaded_count} / {total} KO files")
                else:
                    # A failed download is reported by the path of the file that was to be written.
                    undownloaded_count += 1
                    undownloaded.append(os.path.splitext(os.path.basename(output))[0])
            for worker in workers:
                worker.terminate()
            if undownloaded:
                progress.end()
                raise ConfigError(
                    "Unfortunately, files for the following KOs failed to download despite "
                    "multiple attempts, and so the database needs to be set up again: "
                    f"{', '.join(undownloaded)}"
                )

            # Get the database version after download.
            progress.update("Database info (again)")
            info_after_path = os.path.join(ko_dir, 'ko_info.txt')
            utils.download_file(f'{download_root}info/ko', info_after_path)
            release_after = KODatabase._read_release(info_after_path)

            # Check that the database had the same version before and after download.
            progress.end()
            if release_before == release_after:
                # Retain one of the info files and delete the other.
                info_path = info_after_path
                os.remove(info_before_path)
                break
            else:
                run.warning(
                    "It's your lucky day! The version of KEGG appears to have changed from "
                    f"'{release_before}' to '{release_after}' while anvi'o was downloading files "
                    "from the KO database. Anvi'o will now attempt to redownload all of the files."
                )
        run.info(f"Total number of KOs/entry files", total)
        run.info("KEGG KO database version", release_after)
        run.info("KEGG KO list", list_path)
        run.info("KEGG KO info", info_path)

        progress.new("Processing KEGG KO database")
        # Make a tab-delimited file relating KO IDs and names to KEGG reactions and EC numbers.
        ec_pattern = re.compile(r'\[EC:.*\]')
        kos_data = {}
        paths = glob.glob(os.path.join(ko_dir, 'K*.txt'))
        for num_processed, path in enumerate(paths):
            progress.update(f"{num_processed} / {total} KO files")
            # Parse the KO file.
            ko_data = {}
            section = None
            # Unfortunately, a non-unicode character can crop up.
            with open(path, 'rb') as f:
                lines = f.read().decode(errors='replace').split('\n')
            for line in lines:
                if not line:
                    # Splitting on newlines leaves empty strings, e.g., after the final newline.
                    continue
                if line[0] != ' ':
                    # A line that does not start with a space opens a new section.
                    section = line.split()[0]
                if section == 'NAME':
                    # The name value follows 'NAME' at the beginning of the line.
                    ko_data['name'] = line[4:].strip()
                    # EC numbers associated with the KO are recorded at the end of the name value.
                    ec_string = ec_pattern.search(line)
                    if ec_string:
                        ko_data['ec_numbers'] = ec_string[0][4:-1]
                elif section == 'DBLINKS':
                    # There is a row for each linked database in this section. There can be a row
                    # for KEGG REACTION database entries. The first line of the section starts with
                    # 'DBLINKS' and is followed by a value for a linked database. Values from the
                    # linked database are separated by ': ' from the name of the database, e.g.,
                    # 'RN: R00001'.
                    split_line = line.split()
                    try:
                        rn_index = split_line.index('RN:')
                    except ValueError:
                        continue
                    ko_data['reactions'] = ' '.join(split_line[rn_index + 1:])
                elif section == 'GENES':
                    # This is the section after DBLINKS.
                    break
            ko_id = os.path.splitext(os.path.basename(path))[0]
            kos_data[ko_id] = ko_data
        progress.update("Making a table mapping KOs to KEGG reactions and EC numbers")
        # KOs lacking a value for a column get a null value in that column.
        columns = {
            h: [ko_data.get(h) for ko_data in kos_data.values()]
            for h in ['name', 'reactions', 'ec_numbers']
        }
        table: pd.DataFrame = pd.DataFrame.from_dict(columns)
        table.index = list(kos_data)
        table = table.sort_index()
        table_path = os.path.join(ko_dir, 'ko_data.tsv')
        table.to_csv(table_path, sep='\t')
        progress.end()
        run.info("Table of select KEGG KO data", table_path)

        # Tarball the KO entry files.
        progress.new("Compressing downloaded KEGG KO entry files")
        progress.update("...")
        ko_entries_dir = os.path.join(ko_dir, 'ko_entries')
        os.mkdir(ko_entries_dir)
        for path in paths:
            shutil.move(path, ko_entries_dir)
        tar_path = os.path.join(ko_dir, 'ko_entries.tar.gz')
        with tarfile.open(tar_path, mode='w:gz') as tar:
            tar.add(ko_entries_dir, arcname='.')
        progress.end()
        shutil.rmtree(ko_entries_dir)
        run.info("Archived KEGG KO entry files", tar_path)
+ + compounds_table : pandas.core.frame.DataFrame + The ModelSEED Biochemistry Compound database. + + kegg_reactions_table : pandas.core.frame.DataFrame + The ModelSEED Biochemistry Reaction database, retaining ModelSEED reactions with KEGG reaction aliases + and keying by KEGG reaction ID. + + ec_reactions_table : pandas.core.frame.DataFrame + The ModelSEED Biochemistry Reaction database, retaining ModelSEED reactions with EC number + aliases and keying by EC number. + """ + default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/misc/MODELSEED') + + # Compounds are identified as cytosolic or extracellular in ModelSEED reactions. + compartment_ids = {0: 'c', 1: 'e'} + + def __init__(self, modelseed_dir: str = None) -> None: + """ + Load and set up reorganized tables of reactions and compounds from the ModelSEED directory + to facilitate reaction network construction. + + Parameters + ========== + modelseed_dir : str, None + Directory of ModelSEED files to use instead of the default. + """ + if modelseed_dir: + if not os.path.isdir(modelseed_dir): + raise ConfigError(f"There is no such directory, '{modelseed_dir}'.") + else: + modelseed_dir = self.default_dir + sha_path = os.path.join(modelseed_dir, 'sha.txt') + if not os.path.isfile(sha_path): + raise ConfigError( + "No required file named 'sha.txt' was found in the ModelSEED directory, " + f"'{modelseed_dir}'." + ) + reactions_path = os.path.join(modelseed_dir, 'reactions.tsv') + if not os.path.isfile(reactions_path): + raise ConfigError( + "No required file named 'reactions.tsv' was found in the ModelSEED directory, " + f"'{modelseed_dir}'." + ) + compounds_path = os.path.join(modelseed_dir, 'compounds.tsv') + if not os.path.isfile(compounds_path): + raise ConfigError( + "No required file named 'compounds.tsv' was found in the ModelSEED directory, " + f"'{modelseed_dir}'." 
+ ) + + with open(sha_path) as f: + self.sha = f.read().strip() + reactions_table = pd.read_csv(reactions_path, sep='\t', header=0, low_memory=False) + self.compounds_table: pd.DataFrame = pd.read_csv( + compounds_path, + sep='\t', + header=0, + index_col='id', + low_memory=False + ) + + # Facilitate lookup of reaction data by KEGG REACTION ID via a reorganized reactions table. + # Remove reactions without KEGG aliases. + reactions_table_without_na = reactions_table.dropna(subset=['KEGG']) + expanded = [] + ko_id_col = [] + for ko_ids, row in zip( + reactions_table_without_na['KEGG'], + reactions_table_without_na.itertuples(index=False) + ): + ko_ids: str + # A ModelSEED reaction can have multiple KEGG aliases. + for ko_id in ko_ids.split('; '): + ko_id_col.append(ko_id) + expanded.append(row) + kegg_reactions_table = pd.DataFrame(expanded) + kegg_reactions_table['KEGG_REACTION_ID'] = ko_id_col + self.kegg_reactions_table = kegg_reactions_table + + # Facilitate lookup of reaction data by EC number via a reorganized reactions table. + # Remove reactions without EC number aliases. + reactions_table_without_na = reactions_table.dropna(subset=['ec_numbers']) + expanded = [] + ec_number_col = [] + for ec_numbers, row in zip( + reactions_table_without_na['ec_numbers'], + reactions_table_without_na.itertuples(index=False) + ): + ec_numbers: str + # A ModelSEED reaction can have multiple EC number aliases. + for ec_number in ec_numbers.split('|'): + ec_number_col.append(ec_number) + expanded.append(row) + ec_reactions_table = pd.DataFrame(expanded) + ec_reactions_table['EC_number'] = ec_number_col + self.ec_reactions_table = ec_reactions_table + + def set_up( + dir: str = None, + reset: bool = False, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress() + ) -> None: + """ + Download the ModelSEED Biochemistry database, which consists of two tables of reaction and + metabolite data, and reorganize the tables. 
+ + Parameters + ========== + dir : str, None + Directory in which to create a new subdirectory called 'MODELSEED', in which files are + downloaded and set up. This argument overrides the default directory. + + reset : bool, False + If True, remove any existing 'MODELSEED' database directory and the files therein. If + False, an exception is raised if there are files in this directory. + + run : anvio.terminal.Run, anvio.terminal.Run() + This object prints run information to the terminal. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object prints transient progress information to the terminal. + """ + if dir: + if os.path.isdir(dir): + modelseed_dir = os.path.join(dir, 'MODELSEED') + else: + raise ConfigError(f"There is no such directory, '{dir}'.") + else: + modelseed_dir = ModelSEEDDatabase.default_dir + parent_dir = os.path.dirname(modelseed_dir) + if not os.path.exists(parent_dir): + os.mkdir(parent_dir) + if os.path.exists(modelseed_dir): + if reset: + shutil.rmtree(modelseed_dir) + else: + raise ConfigError( + f"The ModelSEED database directory, '{modelseed_dir}', already exists. 'reset' " + "can be used to remove the database at this location and set it up again." + ) + os.mkdir(modelseed_dir) + + def download(url, path): + max_num_tries = 100 + wait_secs = 10.0 + num_tries = 0 + while True: + try: + utils.download_file(url, path, progress=progress) + break + except ConnectionResetError: + num_tries += 1 + if num_tries > max_num_tries: + raise ConnectionResetError( + f"The connection was reset by the peer more than {max_num_tries} times, " + "the maximum number of attempts. Try setting up the ModelSEED database " + "again." + ) + time.sleep(wait_secs) + # The commit SHA taken from the following file is stored in a text file to track the version + # of the ModelSEED database. 
+ json_url = 'https://api.github.com/repos/ModelSEED/ModelSEEDDatabase/commits' + json_path = os.path.join(modelseed_dir, 'commits.json') + download(json_url, json_path) + with open(json_path) as f: + sha = json.load(f)[0]['sha'] + zip_url = f'https://github.com/ModelSEED/ModelSEEDDatabase/archive/{sha}.zip' + zip_path = os.path.join(modelseed_dir, f'ModelSEEDDatabase-{sha}.zip') + download(zip_url, zip_path) + + progress.new("Setting up ModelSEED files") + progress.update("Extracting") + with zipfile.ZipFile(zip_path, 'r') as f: + f.extractall(modelseed_dir) + reactions_path = os.path.join( + modelseed_dir, f'ModelSEEDDatabase-{sha}', 'Biochemistry', 'reactions.tsv' + ) + compounds_path = os.path.join( + modelseed_dir, f'ModelSEEDDatabase-{sha}', 'Biochemistry', 'compounds.tsv' + ) + shutil.move(reactions_path, modelseed_dir) + shutil.move(compounds_path, modelseed_dir) + reactions_path = os.path.join(modelseed_dir, 'reactions.tsv') + compounds_path = os.path.join(modelseed_dir, 'compounds.tsv') + sha_path = os.path.join(modelseed_dir, 'sha.txt') + with open(sha_path, 'w') as f: + f.write(sha) + os.remove(json_path) + os.remove(zip_path) + shutil.rmtree(os.path.join(modelseed_dir, f'ModelSEEDDatabase-{sha}')) + + progress.update("Loading") + reactions_table = pd.read_csv(reactions_path, sep='\t', header=0, low_memory=False) + compounds_table = pd.read_csv(compounds_path, sep='\t', header=0, low_memory=False) + + progress.update("Reorganizing tables") + # Reorganize the downloaded tables, storing in the same locations. The tables each have a + # column of aliases, or IDs for the same reaction or compound from various databases. Split + # these IDs into separate columns added to the end of the table, dropping the alias column. 
+ def expand_aliases(table: pd.DataFrame) -> pd.DataFrame: + new_rows = [] + for aliases in table.aliases: + aliases: str + new_row = {} + if pd.isna(aliases): + new_rows.append(new_row) + continue + split_aliases = aliases.split('|') + for alias in split_aliases: + sep_index = alias.index(': ') + alias_key = alias[: sep_index] + alias_value = alias[sep_index + 2:].lstrip() + new_row[alias_key] = alias_value + new_rows.append(new_row) + alias_df = pd.DataFrame(new_rows) + alias_df.fillna('') + new_table = pd.concat([table.drop('aliases', axis=1), alias_df], axis=1) + return new_table + reactions_table = expand_aliases(reactions_table) + compounds_table = expand_aliases(compounds_table) + + progress.update("Saving reorganized tables") + reactions_table.to_csv(reactions_path, sep='\t', index=None) + compounds_table.to_csv(compounds_path, sep='\t', index=None) + progress.end() + + run.info("ModelSEED database version (git commit hash)", sha) + run.info("Reorganized ModelSEED reactions table", reactions_path) + run.info("Reorganized ModelSEED compounds table", compounds_path) + +class Constructor: + """Make, store, and load metabolic reaction networks.""" + def __init__( + self, + kegg_dir: str = None, + modelseed_dir: str = None, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress() + ) -> None: + """ + Parameters + ========== + kegg_dir : str, None + The directory containing the anvi'o KEGG database. The default argument of None expects + the KEGG database to be set up in the default directory used by the program + `anvi-setup-kegg-data`. + + modelseed_dir : str, None + The directory containing reference ModelSEED Biochemistry tables set up by anvi'o. The + default argument of None expects ModelSEED data to be set up in the default anvi'o + directory used by the program `anvi-setup-modelseed-database`. + + run : anvio.terminal.Run, anvio.terminal.Run() + This object prints run information to the terminal. 
+ + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object prints transient progress information to the terminal. + """ + self.kegg_dir = kegg_dir + self.modelseed_dir = modelseed_dir + self.run = run + self.progress = progress + + def load_network( + self, + contigs_db: str = None, + pan_db: str = None, + genomes_storage_db: str = None, + check_gene_annotations: bool = True, + load_protein_abundances: bool = False, + load_metabolite_abundances: bool = False, + profile_db: str = None, + quiet: bool = False, + stats_file: str = None + ) -> ReactionNetwork: + """ + Load a reaction network stored in a database as a reaction network object. + + Parameters + ========== + contigs_db : str, None + Path to a contigs database in which a reaction network is stored. + + pan_db : str, None + Path to a pan database in which a reaction network is stored. 'genomes_storage_db' is + also required. + + genomes_storage_db : str, None + Path to a genomes storage database in which KO annotations are stored. 'pan_db' is also + required. + + check_gene_annotations : bool, True + If True, as by default, check that the stored reaction network was made from the set of + gene KO annotations that is currently stored. An exception is raised if this is not the + case. If False, allow the stored reaction network to have been made from a different set + of gene KO annotations than is currently stored. This can result in different KOs in the + returned ReactionNetwork than in the original network that was stored. + + load_protein_abundances : bool, False + If loading the network from a contigs database, also load abundance measurements of + proteins that can be expressed by genes in the network. 'profile_db' is also required, + as abundance profile data is stored there. + + load_metabolite_abundances : bool, False + If loading the network from a contigs database, also load stored abundance measurements + of metabolites found in the network. 
def load_contigs_database_network(
    self,
    contigs_db: str,
    check_gene_annotations: bool = True,
    load_protein_abundances: bool = False,
    load_metabolite_abundances: bool = False,
    profile_db: str = None,
    quiet: bool = False,
    stats_file: str = None
) -> GenomicNetwork:
    """
    Load reaction network data stored in a contigs database as a reaction network object.

    Parameters
    ==========
    contigs_db : str
        Path to a contigs database in which a reaction network is stored.

    check_gene_annotations : bool, True
        If True, as by default, check that the reaction network stored in the contigs database
        was made from the same set of gene KO annotations as currently in the database, and
        throw an error if this is not the case. If False, allow the stored reaction network to
        have been made from a different set of gene KO annotations than is currently stored in
        the database. This can result in different KO assignments to genes in the returned
        GenomicNetwork than in the original network that was stored.

    load_protein_abundances : bool, False
        Load stored abundance measurements of proteins that can be expressed by genes in the
        network. 'profile_db' is also required, as abundance profile data is stored there.

    load_metabolite_abundances : bool, False
        Load stored abundance measurements of metabolites found in the network. 'profile_db' is
        also required, as abundance profile data is stored there.

    profile_db : str, None
        If loading protein or metabolite abundance data, this database is required, as abundance
        profile data is stored there.

    quiet : bool, False
        Print network overview statistics to the terminal if False.

    stats_file : str, None
        Write network overview statistics to a tab-delimited file at this output path.

    Returns
    =======
    GenomicNetwork
        Reaction network loaded from the contigs database.

    Raises
    ======
    ConfigError
        If abundances are requested without 'profile_db', if the contigs database has no KOfam
        annotations, or if the stored network is inconsistent with current KO annotations while
        'check_gene_annotations' is True.
    """
    if stats_file is not None:
        filesnpaths.is_output_file_writable(stats_file)

    # Fail with a clear message before opening any database, rather than with a TypeError from
    # the path handling further down.
    load_abundances = load_protein_abundances or load_metabolite_abundances
    if load_abundances and not profile_db:
        raise ConfigError(
            "A profile database is required to load protein or metabolite abundance data, but "
            "no 'profile_db' was provided."
        )

    # Load the contigs database.
    utils.is_contigs_db(contigs_db)
    cdb = ContigsDatabase(contigs_db)
    cdb_db: DB = cdb.db
    sources: List[str] = cdb.meta['gene_function_sources']
    if not sources or 'KOfam' not in sources:
        cdb.disconnect()
        raise ConfigError(
            "The contigs database indicates that genes were never annotated with KOs. This is "
            "especially strange since to load a reaction network means that a network had to "
            "be constructed from gene KO annotations in the database. "
        )

    # Check that the network stored in the contigs database was made from the same set of KO
    # gene annotations as is in the database.
    stored_hash = cdb_db.get_meta_value('reaction_network_ko_annotations_hash')
    gene_ko_hits_table = cdb_db.get_table_as_dataframe(
        'gene_functions',
        where_clause='source = "KOfam"',
        columns_of_interest=['gene_callers_id', 'accession', 'function', 'e_value']
    )
    current_hash = self.hash_contigs_db_ko_hits(gene_ko_hits_table)
    if stored_hash != current_hash:
        if check_gene_annotations:
            cdb.disconnect()
            raise ConfigError(
                "The reaction network stored in the contigs database was made from a different "
                "set of KEGG KO gene annotations than is current in the database. There are "
                "two solutions to this problem. First, 'anvi-reaction-network' can be run "
                "again to overwrite the existing network stored in the database with a new "
                "network from the new KO gene annotations. Second, 'check_gene_annotations' "
                "can be made False rather than True, allowing the stored network to have been "
                "made from a different set of KO gene annotations than is currently stored in "
                "the database. This can result in different genes being associated with KOs in "
                "the returned GenomicNetwork than in the original network that was stored. The "
                "available version of the KO database that has been set up by anvi'o is used "
                "to fill in data for KOs in the network that are not current gene annotations."
            )
        self.run.warning(
            "The reaction network stored in the contigs database was made from a different set "
            "of KEGG KO gene annotations than is currently in the database. This will be "
            "ignored since 'check_gene_annotations' is False. This can result in different "
            "KO assignments to genes in the returned GenomicNetwork than in the original "
            "network that was stored."
        )

    network = GenomicNetwork(run=self.run, progress=self.progress)
    network.contigs_db_source_path = os.path.abspath(contigs_db)

    # Identify KOs present in the reaction network as it was stored that have disappeared from
    # among the gene-KO annotations. The KEGG table also holds other kinds of KEGG IDs, so only
    # IDs with the form of a KO ID are retained.
    ko_id_pattern = re.compile(r'K\d{5}')
    reaction_network_ko_ids: Set[str] = {
        kegg_id for kegg_id in
        cdb_db.get_single_column_from_table(tables.reaction_network_kegg_table_name, 'kegg_id')
        if ko_id_pattern.fullmatch(kegg_id)
    }
    contigs_db_ko_ids = set(gene_ko_hits_table['accession'])
    missing_ko_ids = reaction_network_ko_ids.difference(contigs_db_ko_ids)
    if missing_ko_ids:
        self.run.warning(
            "The following KOs present in the reaction network as it was originally stored are "
            "not present among the current gene-KO hits in the contigs database, indicating "
            f"that genes were reannotated: {', '.join(sorted(missing_ko_ids))}"
        )

    # Count the genes with KO hits, a summary statistic.
    num_genes_assigned_kos = gene_ko_hits_table['gene_callers_id'].nunique()

    # Make objects representing genes with KO annotations in the stored reaction network. Make
    # objects representing the KOs, initially only assigning their ID attribute. Rows are
    # selected with a boolean mask, since a set is not a valid label indexer in recent pandas.
    networked_ko_ids = reaction_network_ko_ids.intersection(contigs_db_ko_ids)
    networked_hits_table: pd.DataFrame = gene_ko_hits_table[
        gene_ko_hits_table['accession'].isin(networked_ko_ids)
    ].set_index('gene_callers_id')
    for row in networked_hits_table.itertuples():
        gcid = row.Index
        ko_id = row.accession
        ko_name = row.function
        e_value = float(row.e_value)

        try:
            # This is not the first annotation involving the gene, so an object for it already
            # exists.
            gene = network.genes[gcid]
        except KeyError:
            gene = Gene(gcid=gcid)
            network.genes[gcid] = gene

        try:
            # This is not the first annotation involving the KO, so an object for it already
            # exists.
            ko = network.kos[ko_id]
        except KeyError:
            ko = KO(id=ko_id, name=ko_name)
            network.kos[ko_id] = ko
        gene.ko_ids.append(ko_id)
        gene.e_values[ko_id] = e_value

    self._load_modelseed_reactions(cdb, network)
    self._load_modelseed_compounds(cdb, network)
    self._load_ko_classifications(cdb, network)

    if load_abundances:
        network.profile_db_source_path = os.path.abspath(profile_db)
        pdb = ProfileDatabase(profile_db)
        try:
            if load_protein_abundances:
                self._load_protein_abundances(pdb, cdb, network)
            if load_metabolite_abundances:
                self._load_metabolite_abundances(pdb, network)
        finally:
            # Release the profile database even if abundance loading raises.
            pdb.disconnect()

    if quiet and not stats_file:
        # No statistics are needed, so the contigs database can be released immediately.
        cdb.disconnect()
        return network

    precomputed_counts = {
        'total_genes': cdb_db.get_row_counts_from_table('genes_in_contigs'),
        'genes_assigned_kos': num_genes_assigned_kos,
        'kos_assigned_genes': len(contigs_db_ko_ids)
    }
    cdb.disconnect()
    stats = network.get_overview_statistics(precomputed_counts=precomputed_counts)
    if not quiet:
        network.print_overview_statistics(stats=stats)
    if stats_file:
        network.write_overview_statistics(stats_file, stats=stats)

    return network
+ + Returns + ======= + None + """ + protein_abundances_table = profile_database.db.get_table_as_dataframe( + tables.protein_abundances_table_name + ) + if len(protein_abundances_table) == 0: + return + + gene_functions_table = contigs_database.db.get_table_as_dataframe( + 'gene_functions', columns_of_interest=['gene_callers_id', 'source', 'accession'] + ) + gene_functions_table = gene_functions_table[ + gene_functions_table['gene_callers_id'].isin(network.genes) + ] + gene_functions_table = gene_functions_table.rename( + {'source': 'reference_source', 'accession': 'reference_id'}, axis=1 + ) + + protein_abundances_table = protein_abundances_table.merge( + gene_functions_table, how='inner', on=['reference_source', 'reference_id'] + ) + + multiprotein_genes: Dict[int, List[int]] = {} + for key, protein_table in protein_abundances_table.groupby( + ['protein_id', 'reference_source', 'reference_id'] + ): + protein_id = key[0] + protein = Protein() + network.proteins[protein_id] = protein + protein.id = protein_id + for gcid in protein_table['gene_callers_id'].unique(): + gene = network.genes[gcid] + protein.gcids.append(gcid) + if gene.protein_id: + try: + multiprotein_genes[gcid].append(protein_id) + except KeyError: + multiprotein_genes[gcid] = [protein_id] + else: + gene.protein_id = protein_id + for row in protein_table.itertuples(): + protein.abundances[row.sample_name] = row.abundance_value + + if multiprotein_genes: + msg = "" + for gcid, protein_ids in multiprotein_genes.items(): + msg += f"{gcid}: {', '.join(protein_ids)}; " + msg = msg[: -1] + raise ConfigError( + "Certain genes were unexpectedly associated with multiple proteins with abundance " + "data. Unfortunately, multiple protein products are not currently allowed in " + "anvi'o, so the protein abundance data must be edited down in the profile database " + "to permit use with the reaction network. These are as follows, with the gene " + f"callers ID separated by a comma-separated\ list of protein IDs. 
def load_pan_database_network(
    self,
    pan_db: str,
    genomes_storage_db: str,
    check_gene_annotations: bool = True,
    quiet: bool = False,
    stats_file: str = None
) -> PangenomicNetwork:
    """
    Load reaction network data stored in a pan database as a reaction network object.

    Parameters
    ==========
    pan_db : str
        Path to a pan database in which a reaction network is stored.

    genomes_storage_db : str
        Path to the genomes storage database associated with the pan database.

    check_gene_annotations : bool, True
        If True, as by default, check that the reaction network stored in the pan database was
        made from the set of gene KO annotations currently stored in the associated genomes
        storage database. An exception is raised if this is not the case. If False, allow the
        stored reaction network to have been made from a different set of gene KO annotations
        than is currently stored in the genomes storage database. This can result in different
        consensus KOs assigned to gene clusters in the returned PangenomicNetwork than in the
        original network that was stored.

    quiet : bool, False
        Print network overview statistics to the terminal if False.

    stats_file : str, None
        Write network overview statistics to a tab-delimited file at this output path.

    Returns
    =======
    PangenomicNetwork
        Reaction network loaded from the pangenomic databases.

    Raises
    ======
    ConfigError
        If the stored network is inconsistent with the current KO annotations while
        'check_gene_annotations' is True.
    """
    if stats_file is not None:
        filesnpaths.is_output_file_writable(stats_file)

    # Load the pan database.
    pan_db_info = dbinfo.PanDBInfo(pan_db)
    self_table = pan_db_info.get_self_table()
    # No consensus threshold may have been used in network construction, in which case the value
    # of the parameter is None.
    consensus_threshold = self_table['reaction_network_consensus_threshold']
    if consensus_threshold is not None:
        consensus_threshold = float(consensus_threshold)
    discard_ties = bool(int(self_table['reaction_network_discard_ties']))
    args = argparse.Namespace()
    args.pan_db = pan_db
    args.genomes_storage = genomes_storage_db
    args.consensus_threshold = consensus_threshold
    args.discard_ties = discard_ties
    pan_super = PanSuperclass(args, r=run_quiet)
    pan_super.init_gene_clusters()
    pan_super.init_gene_clusters_functions()
    pan_super.init_gene_clusters_functions_summary_dict()
    gene_clusters_functions_summary_dict: Dict = pan_super.gene_clusters_functions_summary_dict

    # Check that the network stored in the pan database was made from the same set of KO gene
    # annotations currently in the associated genomes storage database.
    stored_hash = self_table['reaction_network_ko_annotations_hash']
    current_hash = self.hash_pan_db_ko_annotations(
        genomes_storage_db,
        gene_clusters_functions_summary_dict,
        consensus_threshold,
        discard_ties
    )
    consistent_annotations = stored_hash == current_hash
    if not consistent_annotations:
        if check_gene_annotations:
            # Note that another unstated possible cause of the error could be due to manual
            # meddling with the metavariables, 'consensus_threshold' and 'discard_ties', in the
            # database. Assume that the user was not engaged in mischief.
            raise ConfigError(
                "The reaction network stored in the pan database was made from a different set "
                "of KO gene annotations than is currently in the associated genomes storage "
                "database. There are two solutions to this problem. First, the program, "
                "'anvi-reaction-network', can be run again to overwrite the existing network "
                "stored in the pan database with a new network from the new KO gene "
                "annotations. Second, 'check_gene_annotations' can be given an argument of "
                "False instead of True, preventing this exception from being raised if the "
                "stored network was made from a different set of KO gene annotations than is "
                "currently in the genomes storage database. This can result in different "
                "consensus KOs assigned to gene clusters in the returned PangenomicNetwork "
                "than in the original network that was stored. The available version of the KO "
                "database that has been set up by anvi'o is used to fill in data for any KOs "
                "in the network that are not current gene annotations in the genomes storage "
                "database."
            )
        self.run.warning(
            "The reaction network stored in the pan database was made from a different set of "
            "KO gene annotations than is currently in the genomes storage database. This will "
            "be ignored since 'check_gene_annotations' is False. This can result in different "
            "consensus KO assignments to gene clusters in the returned PangenomicNetwork than "
            "in the original network that was stored."
        )

    # Create the reaction network object.
    network = PangenomicNetwork(run=self.run, progress=self.progress)
    network.pan_db_source_path = os.path.abspath(pan_db)
    network.genomes_storage_db_source_path = os.path.abspath(genomes_storage_db)
    network.consensus_threshold = consensus_threshold
    network.discard_ties = discard_ties
    network.consistent_annotations = consistent_annotations

    # Find gene clusters with consensus KO annotations. Make objects representing gene clusters
    # with KO annotations in the stored reaction network. Make objects representing the KOs,
    # initially only assigning their ID attribute.
    pdb = PanDatabase(pan_db)
    pdb_db: DB = pdb.db
    # The KEGG table also holds other kinds of KEGG IDs, so only IDs with the form of a KO ID are
    # retained. A set is used since membership is tested once per gene cluster below.
    ko_id_pattern = re.compile(r'K\d{5}')
    reaction_network_ko_ids: Set[str] = {
        kegg_id for kegg_id in
        pdb_db.get_single_column_from_table(
            tables.pan_reaction_network_kegg_table_name, 'kegg_id'
        )
        if ko_id_pattern.fullmatch(kegg_id)
    }

    num_gene_clusters_assigned_ko = 0
    ko_ids_assigned_gene_cluster: Set[str] = set()
    for cluster_id, gene_cluster_functions_data in gene_clusters_functions_summary_dict.items():
        # Retrieve the consensus KO across genes in the cluster. Parameterization of the method
        # used to select consensus KOs occurred in pan super initialization. Parameter values
        # were loaded from pan database metavariables.
        gene_cluster_ko_data = gene_cluster_functions_data['KOfam']
        if gene_cluster_ko_data == {'function': None, 'accession': None}:
            # No KO was assigned to the cluster.
            continue
        ko_id = gene_cluster_ko_data['accession']
        num_gene_clusters_assigned_ko += 1
        ko_ids_assigned_gene_cluster.add(ko_id)

        if ko_id not in reaction_network_ko_ids:
            # The KO is not in the stored reaction network, indicating that it is a newer
            # annotation.
            continue

        gene_cluster = GeneCluster(gene_cluster_id=cluster_id, ko_id=ko_id)
        gene_cluster.genomes = list(pan_super.gene_clusters[cluster_id])
        network.gene_clusters[cluster_id] = gene_cluster

        # An object for the KO already exists if this is not the first gene cluster encountered
        # with the KO assigned to it.
        if ko_id not in network.kos:
            network.kos[ko_id] = KO(id=ko_id, name=gene_cluster_ko_data['function'])

    missing_ko_ids = reaction_network_ko_ids.difference(network.kos)
    if missing_ko_ids:
        self.run.warning(
            "The following KOs present in the reaction network as it was originally stored are "
            "not present among the current gene cluster consensus KOs derived from the "
            "pangenomic databases, suggesting that genes underlying the gene clusters were "
            f"reannotated: {', '.join(sorted(missing_ko_ids))}"
        )

    self._load_modelseed_reactions(pdb, network)
    self._load_modelseed_compounds(pdb, network)
    self._load_ko_classifications(pdb, network)

    if quiet and not stats_file:
        # No statistics are needed, so the pan database can be released immediately.
        pdb.disconnect()
        return network

    # Counts derived from current annotations only describe the stored network when the
    # annotations are consistent with it. Otherwise they are left undetermined.
    precomputed_counts = {
        'total_gene_clusters': pdb.meta['num_gene_clusters'],
        'gene_clusters_assigned_ko': None,
        'kos_assigned_gene_clusters': None
    }
    if network.consistent_annotations:
        precomputed_counts['gene_clusters_assigned_ko'] = num_gene_clusters_assigned_ko
        precomputed_counts['kos_assigned_gene_clusters'] = len(ko_ids_assigned_gene_cluster)
    pdb.disconnect()
    stats = network.get_overview_statistics(precomputed_counts=precomputed_counts)
    if not quiet:
        network.print_overview_statistics(stats=stats)
    if stats_file:
        network.write_overview_statistics(stats_file, stats=stats)

    return network
from the database. + + ModelSEED reaction objects are related to KOs through KEGG REACTION and EC number aliases. + + Parameters + ========== + database : ContigsDatabase or PanDatabase + Database storing a reaction network. + + network : ReactionNetwork + Network under construction. A GenomicNetwork is loaded from a ContigsDatabase; a + PangenomicNetwork is loaded from a PanDatabase. + + Returns + ======= + None + """ + # Load the table of reactions data. + if type(database) is ContigsDatabase: + reactions_table = database.db.get_table_as_dataframe( + tables.reaction_network_reactions_table_name + ) + if type(network) is not GenomicNetwork: + raise ConfigError( + "The provided 'database' was of type 'ContigsDatabase', so the provided " + "'network' must be of type 'GenomicNetwork'. Instead, the reaction network " + f"argument was of type '{type(network)}'." + ) + elif type(database) is PanDatabase: + reactions_table = database.db.get_table_as_dataframe( + tables.pan_reaction_network_reactions_table_name + ) + if type(network) is not PangenomicNetwork: + raise ConfigError( + "The provided 'database' was of type 'PanDatabase', so the provided 'network' " + "must be of type 'PangenomicNetwork'. Instead, the reaction network argument " + f"was of type '{type(network)}'." + ) + else: + raise ConfigError( + "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. " + f"Instead, the argument was of type '{type(database)}'." + ) + + # Each row of the table contains information on a different ModelSEED reaction. + for row in reactions_table.itertuples(): + # Check that the reaction is associated with a KO that matches a gene in the contigs + # database (or is assigned to a gene cluster in the pan database). If KO gene + # annotations have been updated from those used to create the stored reaction network, + # then certain KOs and everything inferred from the KOs may no longer annotate genes + # and gene clusters, and are therefore not loaded. 
+ is_networked = False + + # Map KEGG reaction aliases of the ModelSEED reaction to all KOs that were associated + # with the KEGG reaction. + kegg_reaction_sources: str = row.ko_kegg_reaction_source + kegg_reaction_kos: Dict[str, List[KO]] = {} + for kegg_reaction_item in kegg_reaction_sources.split('; '): + if not kegg_reaction_item: + # The ModelSEED reaction was not sourced from KEGG reactions. + continue + kegg_reaction_id, ko_ids = kegg_reaction_item.split(': (') + ko_ids = ko_ids[:-1].split(', ') + kegg_reaction_kos[kegg_reaction_id] = kos = [] + for ko_id in ko_ids: + try: + kos.append(network.kos[ko_id]) + except KeyError: + continue + if kos: + is_networked = True + + # Map EC number aliases of the ModelSEED reaction to all KOs that were associated with + # the EC number. + ec_number_sources: str = row.ko_ec_number_source + ec_number_kos: Dict[str, List[KO]] = {} + for ec_number_item in ec_number_sources.split('; '): + if not ec_number_item: + # The ModelSEED reaction was not sourced from EC numbers. 
+ continue + ec_number, ko_ids = ec_number_item.split(': (') + ko_ids = ko_ids[:-1].split(', ') + ec_number_kos[ec_number] = kos = [] + for ko_id in ko_ids: + try: + kos.append(network.kos[ko_id]) + except KeyError: + continue + if kos: + is_networked = True + + if not is_networked: + continue + + reaction = ModelSEEDReaction() + reaction_id: str = row.modelseed_reaction_id + reaction.modelseed_id = reaction_id + reaction.modelseed_name = row.modelseed_reaction_name + network.reactions[reaction_id] = reaction + + modelseed_compound_ids: str = row.metabolite_modelseed_ids + reaction_compound_ids = [] + for compound_id in modelseed_compound_ids.split(', '): + reaction_compound_ids.append(compound_id) + if compound_id not in network.metabolites: + metabolite = ModelSEEDCompound() + metabolite.modelseed_id = compound_id + network.metabolites[compound_id] = metabolite + reaction.compound_ids = tuple(reaction_compound_ids) + + stoichiometry: str = row.stoichiometry + reaction.coefficients = tuple(int(coeff) for coeff in stoichiometry.split(', ')) + compartments: str = row.compartments + reaction.compartments = tuple(compartments.split(', ')) + reversibility: int = row.reversibility + reaction.reversibility = bool(reversibility) + + # Record *all* KEGG reaction aliases of the ModelSEED reaction, including those not + # associated with KO annotations. + other_kegg_reaction_ids: str = row.other_kegg_reaction_ids + reaction.kegg_aliases = list(kegg_reaction_kos) + if other_kegg_reaction_ids: + reaction.kegg_aliases += other_kegg_reaction_ids.split(', ') + reaction.kegg_aliases = tuple(reaction.kegg_aliases) + + network.modelseed_kegg_aliases[reaction_id] = modelseed_kegg_aliases = [] + for kegg_reaction_id, kos in kegg_reaction_kos.items(): + # Record the ModelSEED reaction as one of the aliases of the KEGG reaction in the + # network. 
def _load_modelseed_compounds(
    self,
    database: Union[ContigsDatabase, PanDatabase],
    network: ReactionNetwork
) -> None:
    """
    Add ModelSEED compound data to the metabolites of the network being loaded from the
    database.

    Only metabolites already present in 'network.metabolites' are filled in; stored compounds
    that are absent from the network are skipped.

    Parameters
    ==========
    database : ContigsDatabase or PanDatabase
        Database storing a reaction network.

    network : GenomicNetwork or PangenomicNetwork
        Network under construction. A GenomicNetwork is loaded from a ContigsDatabase; a
        PangenomicNetwork is loaded from a PanDatabase.

    Returns
    =======
    None

    Raises
    ======
    ConfigError
        If the database is of an unsupported type, or the network type does not match the
        database type.
    """
    # Load the table of compounds data.
    if type(database) is ContigsDatabase:
        metabolites_table = database.db.get_table_as_dataframe(
            tables.reaction_network_metabolites_table_name
        )
        if type(network) is not GenomicNetwork:
            raise ConfigError(
                "The provided 'database' was of type 'ContigsDatabase', so the provided "
                "'network' must be of type 'GenomicNetwork'. Instead, the reaction network "
                f"argument was of type '{type(network)}'."
            )
    elif type(database) is PanDatabase:
        metabolites_table = database.db.get_table_as_dataframe(
            tables.pan_reaction_network_metabolites_table_name
        )
        if type(network) is not PangenomicNetwork:
            # Report the type of the offending *network* argument, not the database.
            raise ConfigError(
                "The provided 'database' was of type 'PanDatabase', so the provided 'network' "
                "must be of type 'PangenomicNetwork'. Instead, the reaction network argument "
                f"was of type '{type(network)}'."
            )
    else:
        raise ConfigError(
            "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. "
            f"Instead, the argument was of type '{type(database)}'."
        )

    # Each row of the table contains information on a different ModelSEED compound.
    for row in metabolites_table.itertuples():
        compound_id = row.modelseed_compound_id
        try:
            metabolite = network.metabolites[compound_id]
        except KeyError:
            # The metabolite in the stored network is not loaded. The metabolite only
            # participates in reactions that also were not loaded. The reactions are only
            # associated with KOs that no longer match genes in the contigs database (or are no
            # longer assigned to gene clusters in the pan database). Gene KO annotations must
            # have been updated from those used to create the stored reaction network.
            continue

        metabolite.modelseed_name = row.modelseed_compound_name

        kegg_aliases: str = row.kegg_aliases
        metabolite.kegg_aliases = tuple(kegg_aliases.split(', ')) if kegg_aliases else tuple()

        # Compounds without a formula have a nominal charge of 10000000 in the ModelSEED
        # compounds database. This is stored as a missing value in the table and is represented
        # by None in the reaction network.
        metabolite.formula = row.formula
        charge = row.charge
        # 'pd.isna' handles both NaN and None, whereas 'np.isnan' raises a TypeError on None.
        metabolite.charge = None if pd.isna(charge) else int(charge)
        metabolite.smiles = row.smiles
def _load_ko_classifications(
    self,
    database: Union[ContigsDatabase, PanDatabase],
    network: ReactionNetwork
) -> None:
    """
    Add information on KEGG module, pathway, and BRITE hierarchy membership to the network being
    loaded from the database.

    Parameters
    ==========
    database : ContigsDatabase or PanDatabase
        Database storing a reaction network.

    network : ReactionNetwork
        Network under construction. A GenomicNetwork is loaded from a ContigsDatabase; a
        PangenomicNetwork is loaded from a PanDatabase.

    Returns
    =======
    None

    Raises
    ======
    ConfigError
        If the database is of an unsupported type, or the network type does not match the
        database type.
    """
    # Load the table of KEGG classification data.
    if type(database) is ContigsDatabase:
        kegg_table = database.db.get_table_as_dataframe(tables.reaction_network_kegg_table_name)
        if type(network) is not GenomicNetwork:
            raise ConfigError(
                "The provided 'database' was of type 'ContigsDatabase', so the provided "
                "'network' must be of type 'GenomicNetwork'. Instead, the reaction network "
                f"argument was of type '{type(network)}'."
            )
    elif type(database) is PanDatabase:
        kegg_table = database.db.get_table_as_dataframe(
            tables.pan_reaction_network_kegg_table_name
        )
        if type(network) is not PangenomicNetwork:
            # Report the type of the offending *network* argument, not the database.
            raise ConfigError(
                "The provided 'database' was of type 'PanDatabase', so the provided 'network' "
                "must be of type 'PangenomicNetwork'. Instead, the reaction network argument "
                f"was of type '{type(network)}'."
            )
    else:
        raise ConfigError(
            "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. "
            f"Instead, the argument was of type '{type(database)}'."
        )

    # Create a separate table for each type of KEGG information, distinguished by the format of
    # the KEGG ID. Raw strings are needed so that '\d' is not an invalid escape sequence.
    kegg_table = kegg_table.fillna('')
    ko_id_pattern = re.compile(r'K\d{5}')
    module_id_pattern = re.compile(r'M\d{5}')
    pathway_id_pattern = re.compile(r'map\d{5}')
    hierarchy_id_pattern = re.compile(r'ko\d{5}')

    def select_rows(id_pattern: 're.Pattern') -> pd.DataFrame:
        # Casting to bool keeps the mask boolean even when the table is empty.
        mask = kegg_table['kegg_id'].apply(
            lambda kegg_id: id_pattern.fullmatch(kegg_id) is not None
        ).astype(bool)
        return kegg_table[mask]

    kos_table = select_rows(ko_id_pattern)
    modules_table = select_rows(module_id_pattern)
    pathways_table = select_rows(pathway_id_pattern)
    hierarchies_table = select_rows(hierarchy_id_pattern)

    # Remove entries in the KO table for stored network KOs that no longer annotate genes in the
    # contigs database. (For pangenomics, instead read this and following comments as indicating
    # that the KOs no longer are consensus annotations of gene clusters from the pan database.)
    # A boolean mask is used rather than '.loc' with a set, as newer versions of pandas do not
    # accept sets as indexers.
    kos_table: pd.DataFrame = kos_table[
        kos_table['kegg_id'].isin(set(network.kos))
    ].set_index('kegg_id')

    # Fill out KEGG classification attributes of KO objects in the loaded network.
    # Record KOs that have names that differ between (newer) gene annotations in the database
    # and (older) information in the stored network due to a KEGG database update.
    inconsistent_kos: Dict[str, Tuple[str, str]] = {}
    for row in kos_table.itertuples():
        ko_id: str = row.Index
        ko = network.kos[ko_id]

        ko_name: str = row.name
        if ko.name != ko_name:
            inconsistent_kos[ko_id] = (ko.name, ko_name)

        # An empty string means that the KO is not classified in a module.
        module_ids_str: str = row.modules
        module_ids = module_ids_str.split(', ') if module_ids_str else []
        for module_id in module_ids:
            ko.module_ids.append(module_id)
            try:
                # Another KO was loaded that is classified in the module.
                module = network.modules[module_id]
            except KeyError:
                # Create an object in the network for the newly encountered module.
                network.modules[module_id] = module = KEGGModule(id=module_id)
            module.ko_ids.append(ko_id)

        # An empty string means that the KO is not classified in a pathway.
        pathway_ids_str: str = row.pathways
        pathway_ids = pathway_ids_str.split(', ') if pathway_ids_str else []
        for pathway_id in pathway_ids:
            ko.pathway_ids.append(pathway_id)
            try:
                # Another KO was loaded that is classified in the pathway.
                pathway = network.pathways[pathway_id]
            except KeyError:
                # Create an object in the network for the newly encountered pathway.
                network.pathways[pathway_id] = pathway = KEGGPathway(id=pathway_id)
            pathway.ko_ids.append(ko_id)

        # An empty string means that the KO is not classified in a BRITE category.
        categorizations_str: str = row.brite_categorization
        categorization_strs = categorizations_str.split(' !!! ') if categorizations_str else []

        # Group the categorizations of the KO by hierarchy. The first item of each
        # ' >>> '-delimited categorization is the hierarchy ID.
        loaded_categories: Dict[str, List[Tuple[str]]] = {}
        for categorization_str in categorization_strs:
            full_categorization = categorization_str.split(' >>> ')
            hierarchy_id = full_categorization[0]
            assert re.fullmatch(hierarchy_id_pattern, hierarchy_id)
            try:
                categorizations = loaded_categories[hierarchy_id]
            except KeyError:
                loaded_categories[hierarchy_id] = categorizations = []
            categorization = tuple(full_categorization[1:])
            categorizations.append(categorization)

        for hierarchy_id, categorizations in loaded_categories.items():
            ko.hierarchies[hierarchy_id] = categorizations

            try:
                # Another KO was loaded that is classified in the hierarchy.
                hierarchy = network.hierarchies[hierarchy_id]
            except KeyError:
                # Create an object in the network for the newly encountered hierarchy.
                network.hierarchies[hierarchy_id] = hierarchy = BRITEHierarchy(id=hierarchy_id)
            hierarchy.ko_ids.append(ko_id)

            try:
                network_categorizations = network.categories[hierarchy_id]
            except KeyError:
                network.categories[hierarchy_id] = network_categorizations = {}

            for categorization in categorizations:
                if categorization in hierarchy.categorizations:
                    # Another KO was loaded that is classified in the category.
                    categories = network_categorizations[categorization]
                    for category in categories:
                        if ko_id not in category.ko_ids:
                            category.ko_ids.append(ko_id)
                    continue

                hierarchy.categorizations.append(categorization)
                categories: List[BRITECategory] = []

                # Add the category and unencountered supercategories to the network.
                for depth, category_name in enumerate(categorization, 1):
                    focus_categorization = categorization[:depth]
                    try:
                        # Another KO was loaded that is classified in the supercategory.
                        category = network_categorizations[focus_categorization][-1]
                        is_added = True
                    except KeyError:
                        is_added = False

                    if is_added:
                        if ko_id not in category.ko_ids:
                            category.ko_ids.append(ko_id)
                        categories.append(category)
                        continue

                    if depth > 1:
                        # The unencountered category is a subcategory of its supercategory.
                        categories[-1].subcategory_names.append(category_name)

                    category = BRITECategory()
                    category.id = f'{hierarchy_id}: {" >>> ".join(focus_categorization)}'
                    category.name = category_name
                    category.hierarchy_id = hierarchy_id
                    category.ko_ids.append(ko_id)
                    categories.append(category)
                    network_categorizations[focus_categorization] = tuple(categories)

    if inconsistent_kos:
        msg = ''.join(
            f"{ko_id}: '{gene_ko_name}' ||| '{network_ko_name}', "
            for ko_id, (gene_ko_name, network_ko_name) in inconsistent_kos.items()
        )
        self.run.warning(
            "KO names differ between certain records in the stored reaction network and the "
            "current gene-KO hits, indicating that genes were reannotated with KOs from a "
            "newer version of the KEGG database in which certain names have changed. Hopefully "
            "the names are similar enough to indicate that the underlying KO ID that unites "
            "them is from the same ortholog. The first name following the KO ID is from the "
            "gene annotation and the second after the '|||' separator is from the stored "
            f"network: {msg}"
        )

    # Fill out module attributes and relate modules to pathways.
    for row in modules_table.itertuples():
        module_id: str = row.kegg_id
        try:
            module = network.modules[module_id]
        except KeyError:
            # The module is not loaded because it no longer contains any KOs that annotate genes
            # in the database.
            continue

        module_name: str = row.name
        module.name = module_name

        pathway_ids: str = row.pathways
        if not pathway_ids:
            continue
        for pathway_id in pathway_ids.split(', '):
            module.pathway_ids.append(pathway_id)
            pathway = network.pathways[pathway_id]
            pathway.module_ids.append(module_id)

    # Fill out pathway and equivalent BRITE category attributes.
    for row in pathways_table.itertuples():
        pathway_id: str = row.kegg_id
        try:
            pathway = network.pathways[pathway_id]
        except KeyError:
            # The pathway is not loaded because it no longer contains any KOs that annotate
            # genes in the database.
            continue

        pathway_name: str = row.name
        pathway.name = pathway_name

        categorization_str: str = row.brite_categorization
        full_categorization = categorization_str.split(' >>> ')
        hierarchy_id = full_categorization[0]
        assert hierarchy_id == 'ko00001'
        categorization = tuple(full_categorization[1:])
        pathway.categorization = categorization
        category = network.categories[hierarchy_id][categorization][-1]
        category.pathway_id = pathway_id

    # Fill out hierarchy names.
    for row in hierarchies_table.itertuples():
        hierarchy_id: str = row.kegg_id
        try:
            hierarchy = network.hierarchies[hierarchy_id]
        except KeyError:
            # The hierarchy is not loaded because it no longer contains any KOs that annotate
            # genes in the database.
            continue

        hierarchy_name: str = row.name
        hierarchy.name = hierarchy_name
def make_network(
    self,
    contigs_db: str = None,
    pan_db: str = None,
    genomes_storage_db: str = None,
    store: bool = True,
    overwrite_existing_network: bool = False,
    consensus_threshold: float = None,
    discard_ties: bool = False,
    stats_file: str = None
) -> ReactionNetwork:
    """
    Make a metabolic reaction network from KEGG Orthologs stored in an anvi'o database,
    associated KEGG annotations, and the ModelSEED Biochemistry database.

    Parameters
    ==========
    contigs_db : str, None
        Path to a contigs database. The database can represent different types of samples,
        including a single genome, metagenome, or transcriptome. The network is derived from
        gene KO annotations stored in the database. If 'store' is True, the network is saved in
        the database.

    pan_db : str, None
        Path to a pan database. The pangenomic network is determined for gene clusters stored in
        the database. If 'store' is True, the network is saved in the database.
        An argument for the paired 'genomes_storage_db' is also required.

    genomes_storage_db : str, None
        Path to a genomes storage database. The pangenomic network is derived from gene KO
        annotations stored in the database. An argument for the paired 'pan_db' is also
        required.

    store : bool, True
        Save the network. A network constructed from a contigs database is stored in that
        database. A pangenomic network constructed from a genomes storage database and pan
        database is stored in the pan database.

    overwrite_existing_network : bool, False
        Overwrite an existing network stored in the contigs or pan database. 'store' is also
        required.

    consensus_threshold : float, None
        This parameter applies to pangenomes. With the default of None, the protein annotation
        most frequent among genes in a cluster is assigned to the cluster itself. If a
        non-default argument is provided (a value on [0, 1]), at least this proportion of genes
        in the cluster must have the most frequent annotation for the cluster to be annotated.

    discard_ties : bool, False
        This parameter applies to pangenomes. If multiple protein annotations are most frequent
        among genes in a cluster, then do not assign an annotation to the cluster itself when
        this argument is True. By default, this argument is False, so one of the most frequent
        annotations would be arbitrarily chosen.

    stats_file : str, None
        Write network overview statistics to a tab-delimited file at this output path.

    Returns
    =======
    ReactionNetwork
        Reaction network made from the input database(s).

    Raises
    ======
    ConfigError
        If no database source is provided, if a contigs database is combined with pangenomic
        databases, or if only one of the pan and genomes storage databases is provided.
    """
    if contigs_db and (pan_db or genomes_storage_db):
        raise ConfigError(
            "Either a contigs database OR both a pan database and genomes storage database are "
            "required to make either a (meta)genomic reaction network or a pangenomic reaction "
            "network, respectively."
        )

    if contigs_db:
        self.run.info_single(
            "A reaction network will be made from protein orthology annotations in the contigs "
            "database."
        )
        return self.make_contigs_database_network(
            contigs_db,
            store=store,
            overwrite_existing_network=overwrite_existing_network,
            stats_file=stats_file
        )

    if pan_db and genomes_storage_db:
        self.run.info_single(
            "A pangenomic reaction network will be made from protein orthology annotations in "
            "the genomes storage database and gene clusters in the pan database."
        )
        return self.make_pangenomic_network(
            pan_db,
            genomes_storage_db,
            store=store,
            overwrite_existing_network=overwrite_existing_network,
            consensus_threshold=consensus_threshold,
            discard_ties=discard_ties,
            stats_file=stats_file
        )

    if pan_db or genomes_storage_db:
        # Fail early with a clear message rather than passing None for the missing database to
        # the pangenomic network constructor.
        provided = "pan database" if pan_db else "genomes storage database"
        missing = "genomes storage database" if pan_db else "pan database"
        raise ConfigError(
            "Both a pan database and a genomes storage database are required to make a "
            f"pangenomic reaction network, but only a {provided} was provided. Please also "
            f"provide the paired {missing}."
        )

    raise ConfigError(
        "A reaction network cannot be made without a database source. Either a contigs "
        "database OR a pan database and genomes storage database are required to make "
        "either a (meta)genomic reaction network or a pangenomic reaction network, "
        "respectively."
    )
def make_contigs_database_network(
    self,
    contigs_db: str,
    store: bool = True,
    overwrite_existing_network: bool = False,
    stats_file: str = None
) -> GenomicNetwork:
    """
    Make a metabolic reaction network from KEGG Orthologs stored in a contigs database.

    Parameters
    ==========
    contigs_db : str
        Path to a contigs database. The database can represent different types of samples,
        including a single genome, metagenome, or transcriptome. The network is derived from
        gene KO annotations stored in the database.

    store : bool, True
        Save the network to the contigs database.

    overwrite_existing_network : bool, False
        Overwrite an existing network stored in the contigs database. 'store' is also required.

    stats_file : str, None
        Write network overview statistics to a tab-delimited file at this output path.

    Returns
    =======
    GenomicNetwork
        The network derived from the contigs database.

    Raises
    ======
    ConfigError
        If genes in the contigs database were not annotated with KOs, or if a network is
        already stored in the database and 'store' is True without
        'overwrite_existing_network'.
    """
    # Here is an example of the information used to create a genomic network.
    # gene 1 ---> KO 1 ---> KEGG rxn 1 ---> ModelSEED rxn 1 ---> ModelSEED metabs 1, 2, ...
    # |      |         |
    # |      |         ---> EC number 1 --> ModelSEED rxn 1 ---> ModelSEED metabs 1, 2, ...
    # |      |         |                |
    # |      |         |                --> ModelSEED rxn 2 ---> ...
    # |      |         |
    # |      |         ---> EC number 2 --> ...
    # |      |
    # |      ---> KO 2 ---> ...
    # |
    # gene 2 ---> ...

    # Preemptively check the statistics file path.
    if stats_file is not None:
        filesnpaths.is_output_file_writable(stats_file)

    # Load the contigs database.
    self.run.info("Contigs database", contigs_db)
    utils.is_contigs_db(contigs_db)
    cdb = ContigsDatabase(contigs_db)
    cdb_db: DB = cdb.db
    sources: List[str] = cdb.meta['gene_function_sources']
    if not sources or 'KOfam' not in sources:
        # Close the database connection before bailing out so that it is not leaked.
        cdb.disconnect()
        raise ConfigError(
            "The contigs database indicates that genes were never annotated with KOs, which is "
            "required to build a reaction network. This can be solved by running "
            "'anvi-run-kegg-kofams' on the contigs database."
        )
    if (
        store and
        cdb_db.get_meta_value('reaction_network_ko_annotations_hash') and
        not overwrite_existing_network
    ):
        cdb.disconnect()
        raise ConfigError(
            "The existing reaction network in the contigs database must be explicitly "
            "overwritten."
        )

    self.progress.new("Building reaction network")

    network = GenomicNetwork(run=self.run, progress=self.progress)
    network.contigs_db_source_path = os.path.abspath(contigs_db)

    # Load reference databases.
    self.progress.update("Loading KEGG reference database")
    kegg_db = KEGGData(kegg_dir=self.kegg_dir)
    kegg_kos_data = kegg_db.ko_data
    kegg_modules_data = kegg_db.module_data
    kegg_pathways_data = kegg_db.pathway_data
    kegg_hierarchies_data = kegg_db.hierarchy_data

    self.progress.update("Loading ModelSEED Biochemistry reference database")
    modelseed_db = ModelSEEDDatabase(modelseed_dir=self.modelseed_dir)
    modelseed_kegg_reactions_table = modelseed_db.kegg_reactions_table
    modelseed_ec_reactions_table = modelseed_db.ec_reactions_table
    modelseed_compounds_table = modelseed_db.compounds_table

    # Record KOs that annotated genes in the contigs database but for some reason aren't found
    # in the KEGG KO database set up by anvi'o.
    undefined_ko_ids: List[str] = []

    # Record ModelSEED reactions that would have been added to the reaction network if the
    # reaction had a chemical equation in the ModelSEED Biochemistry database.
    undefined_modelseed_reaction_ids: List[str] = []

    # Parse gene-KO matches recorded in the contigs database.
    gene_ko_hits_table = cdb_db.get_table_as_dataframe(
        'gene_functions',
        where_clause='source = "KOfam"',
        columns_of_interest=['gene_callers_id', 'accession', 'function', 'e_value']
    )
    total_ko_matches = len(gene_ko_hits_table)
    # The counter reports the number of matches parsed before the current one.
    for num_ko_matches_parsed, row in enumerate(gene_ko_hits_table.itertuples(index=False)):
        self.progress.update(
            f"Gene-KO matches parsed: {num_ko_matches_parsed} / {total_ko_matches}"
        )

        # Get data on the gene-KO match.
        gcid = int(row.gene_callers_id)
        ko_id = row.accession
        ko_name = row.function
        e_value = float(row.e_value)

        # Represent the gene as an object.
        if gcid in network.genes:
            # The gene has already been added to the network.
            gene = network.genes[gcid]
            is_new_gene = False
        else:
            gene = Gene()
            gene.gcid = gcid
            is_new_gene = True

        try:
            # The KO and its associated reactions and metabolites have already been added to the
            # network.
            ko = network.kos[ko_id]
            is_new_ko = False
        except KeyError:
            is_new_ko = True
        if not is_new_ko:
            gene.ko_ids.append(ko_id)
            gene.e_values[ko_id] = e_value
            if is_new_gene:
                # Add the unadded gene to the network.
                network.genes[gcid] = gene
            # Proceed to the next gene-KO match.
            continue

        # Get KEGG REACTION IDs and EC numbers associated with the KO.
        try:
            ko_info = kegg_kos_data[ko_id]
        except KeyError:
            # For some reason the KO annotated a gene in the contigs database but is not found
            # in the KEGG database set up by anvi'o. Do not add the alien KO or the gene, if
            # unadded, to the network.
            undefined_ko_ids.append(ko_id)
            continue
        ko_kegg_reaction_ids = self._get_ko_kegg_reaction_ids(ko_info)
        ko_ec_numbers = self._get_ko_ec_numbers(ko_info)

        if not ko_kegg_reaction_ids and not ko_ec_numbers:
            # The KO is not associated with any KEGG reactions or EC numbers, and therefore
            # anvi'o can't relate the KO to ModelSEED reactions. Do not add the unsystematizable
            # KO or the gene, if unadded, to the network.
            continue

        # Check if KEGG reactions and EC numbers associated with the KO have already been added
        # to the network in processing other gene-KO matches. To have been added to the network,
        # KEGG reactions and EC numbers must have aliased ModelSEED reactions.
        old_kegg_reaction_ids, new_kegg_reaction_ids = self._find_kegg_reaction_ids(
            ko_kegg_reaction_ids, network
        )
        old_ec_numbers, new_ec_numbers = self._find_ec_numbers(ko_ec_numbers, network)

        # Retrieve data on ModelSEED reactions aliasing KEGG reactions that haven't been added
        # to the network. Each row of the reference table represents a unique mapping of KEGG
        # reaction to ModelSEED reaction.
        modelseed_kegg_reactions_dict: Dict[int, Dict] = modelseed_kegg_reactions_table[
            modelseed_kegg_reactions_table['KEGG_REACTION_ID'].isin(new_kegg_reaction_ids)
        ].to_dict(orient='index')

        # Retrieve data on ModelSEED reactions aliasing EC numbers that haven't been added to
        # the network. Each row of the reference table represents a unique mapping of EC number
        # to ModelSEED reaction.
        modelseed_ec_reactions_dict: Dict[int, Dict] = modelseed_ec_reactions_table[
            modelseed_ec_reactions_table['EC_number'].isin(new_ec_numbers)
        ].to_dict(orient='index')

        if not (
            old_kegg_reaction_ids or
            old_ec_numbers or
            modelseed_kegg_reactions_dict or
            modelseed_ec_reactions_dict
        ):
            # None of the KEGG REACTION IDs and EC numbers associated with the KO map to
            # ModelSEED reactions (none are in the ModelSEED Biochemistry reactions table).
            continue

        # Find "undefined" ModelSEED reactions without an equation.
        undefined_modelseed_reaction_ids += self._remove_undefined_reactions(
            ko_kegg_reaction_ids,
            new_kegg_reaction_ids,
            ko_ec_numbers,
            new_ec_numbers,
            modelseed_kegg_reactions_dict,
            modelseed_ec_reactions_dict
        )

        if not (
            old_kegg_reaction_ids or
            new_kegg_reaction_ids or
            old_ec_numbers or
            new_ec_numbers
        ):
            # The KO is not added to the network as none of the reactions have equations.
            continue
        # The newly encountered KO is now known to be associated with KEGG reactions or EC
        # numbers that alias ModelSEED reactions with an equation; this new data can be added to
        # the network.

        if is_new_gene:
            # Add an object representing the unadded gene to the network.
            network.genes[gcid] = gene

        # Add an object representing the newly encountered KO to the network.
        ko = KO()
        ko.id = ko_id
        ko.name = ko_name
        network.kos[ko_id] = ko
        gene.ko_ids.append(ko_id)
        gene.e_values[ko_id] = e_value

        # Associate ModelSEED reactions that have previously been added to the network under
        # construction with the newly encountered KO.
        self._process_added_reactions(
            old_kegg_reaction_ids,
            old_ec_numbers,
            network,
            ko,
            ko_kegg_reaction_ids,
            ko_ec_numbers
        )

        # Add ModelSEED reactions aliasing newly encountered KEGG reactions and EC numbers to
        # the network.
        self._add_reactions(
            modelseed_kegg_reactions_dict,
            modelseed_ec_reactions_dict,
            network,
            modelseed_compounds_table,
            ko,
            old_kegg_reaction_ids,
            new_kegg_reaction_ids,
            old_ec_numbers,
            new_ec_numbers
        )

        # Add KEGG classifications of the KO (modules, pathways, and BRITE hierarchies) to the
        # network.
        self._add_ko_classification(
            ko,
            network,
            ko_info,
            kegg_modules_data,
            kegg_pathways_data,
            kegg_hierarchies_data
        )

    self._relate_modules_pathways(network, kegg_modules_data)

    if DEBUG:
        # Sanity checks: every KO of every gene has reactions, and alias maps cover exactly the
        # reactions in the network.
        for gene in network.genes.values():
            for ko_id in gene.ko_ids:
                ko = network.kos[ko_id]
                assert ko.reaction_ids

        assert sorted(network.modelseed_kegg_aliases) == sorted(network.reactions)
        assert sorted(network.modelseed_ec_number_aliases) == sorted(network.reactions)

    # Deduplicate the recorded IDs; they are sorted when reported for deterministic output.
    undefined_modelseed_reaction_ids = set(undefined_modelseed_reaction_ids)
    undefined_ko_ids = set(undefined_ko_ids)

    self.progress.end()

    if DEBUG and undefined_modelseed_reaction_ids:
        self.run.info_single(
            "The following ModelSEED reactions would have been added to the reaction network "
            "had there been a chemical equation in the ModelSEED database; perhaps it is worth "
            "investigating the ModelSEED reactions table to understand why this is not the "
            f"case: {', '.join(sorted(undefined_modelseed_reaction_ids))}"
        )

    if undefined_ko_ids:
        self.run.info_single(
            "Certain genes matched KOs that were not found in the KEGG reference database. "
            "These KOs will not be used in network construction. It could be that the KOfams "
            "used to annotate genes were not from the same KEGG database version as the "
            "reference. Here are the unrecognized KO IDs from the contigs database: "
            f"{','.join(sorted(undefined_ko_ids))}"
        )

    kegg_dir = kegg_db.kegg_context.kegg_data_dir
    if self.modelseed_dir is None:
        modelseed_dir = ModelSEEDDatabase.default_dir
    else:
        modelseed_dir = self.modelseed_dir
    self.run.info("Reference KEGG database directory", kegg_dir, nl_before=1)
    self.run.info("Reference ModelSEED database directory", modelseed_dir, nl_after=1)

    if store:
        if cdb_db.get_meta_value('reaction_network_ko_annotations_hash'):
            self.run.warning("Deleting existing reaction network from contigs database")
            cdb_db._exec(f'''DELETE from {tables.reaction_network_reactions_table_name}''')
            cdb_db._exec(f'''DELETE from {tables.reaction_network_metabolites_table_name}''')
            cdb_db._exec(f'''DELETE from {tables.reaction_network_kegg_table_name}''')
            self.run.info_single(
                "Deleted data in gene function reactions and metabolites tables", nl_after=1
            )

        self.progress.new("Saving reaction network to contigs database")
        self.progress.update("Reactions table")
        reactions_table = self._get_database_reactions_table(network)
        sql_statement = (
            f"INSERT INTO {tables.reaction_network_reactions_table_name} VALUES "
            f"({','.join('?' * len(tables.reaction_network_reactions_table_structure))})"
        )
        cdb_db._exec_many(sql_statement, reactions_table.values)
        self.progress.update("Metabolites table")
        metabolites_table = self._get_database_metabolites_table(network)
        sql_statement = (
            f"INSERT INTO {tables.reaction_network_metabolites_table_name} VALUES "
            f"({','.join('?' * len(tables.reaction_network_metabolites_table_structure))})"
        )
        cdb_db._exec_many(sql_statement, metabolites_table.values)
        self.progress.update("KEGG KO information table")
        kegg_table = self._get_database_kegg_table(network)
        sql_statement = (
            f"INSERT INTO {tables.reaction_network_kegg_table_name} VALUES "
            f"({','.join('?' * len(tables.reaction_network_kegg_table_structure))})"
        )
        cdb_db._exec_many(sql_statement, kegg_table.values)

        self.progress.update("Metadata")
        ko_annotations_hash = self.hash_contigs_db_ko_hits(gene_ko_hits_table)
        cdb_db.set_meta_value('reaction_network_ko_annotations_hash', ko_annotations_hash)
        # The KEGG database release is now not explicitly that, but instead the hash of the
        # anvi'o modules database.
        cdb_db.set_meta_value('reaction_network_kegg_database_release', kegg_db.modules_db_hash)
        cdb_db.set_meta_value('reaction_network_modelseed_database_sha', modelseed_db.sha)
        self.progress.end()

    # Counts that require the database are computed before it is disconnected.
    precomputed_counts = {
        'total_genes': cdb_db.get_row_counts_from_table('genes_in_contigs'),
        'genes_assigned_kos': gene_ko_hits_table['gene_callers_id'].nunique(),
        'kos_assigned_genes': gene_ko_hits_table['accession'].nunique()
    }
    cdb.disconnect()
    stats = network.get_overview_statistics(precomputed_counts=precomputed_counts)
    network.print_overview_statistics(stats=stats)
    if stats_file:
        network.write_overview_statistics(stats_file, stats=stats)

    return network
* len(tables.reaction_network_metabolites_table_structure))})" + ) + cdb_db._exec_many(sql_statement, metabolites_table.values) + self.progress.update("KEGG KO information table") + kegg_table = self._get_database_kegg_table(network) + sql_statement = ( + f"INSERT INTO {tables.reaction_network_kegg_table_name} VALUES " + f"({','.join('?' * len(tables.reaction_network_kegg_table_structure))})" + ) + cdb_db._exec_many(sql_statement, kegg_table.values) + + self.progress.update("Metadata") + ko_annotations_hash = self.hash_contigs_db_ko_hits(gene_ko_hits_table) + cdb_db.set_meta_value('reaction_network_ko_annotations_hash', ko_annotations_hash) + # The KEGG database release is now not explicitly that, but instead the hash of the + # anvi'o modules database. + cdb_db.set_meta_value('reaction_network_kegg_database_release', kegg_db.modules_db_hash) + cdb_db.set_meta_value('reaction_network_modelseed_database_sha', modelseed_db.sha) + self.progress.end() + + precomputed_counts = { + 'total_genes': cdb_db.get_row_counts_from_table('genes_in_contigs'), + 'genes_assigned_kos': gene_ko_hits_table['gene_callers_id'].nunique(), + 'kos_assigned_genes': gene_ko_hits_table['accession'].nunique() + } + cdb.disconnect() + stats = network.get_overview_statistics(precomputed_counts=precomputed_counts) + network.print_overview_statistics(stats=stats) + if stats_file: + network.write_overview_statistics(stats_file, stats=stats) + + return network + + def make_pangenomic_network( + self, + pan_db: str, + genomes_storage_db: str, + store: bool = True, + overwrite_existing_network: bool = False, + consensus_threshold: float = None, + discard_ties: bool = False, + stats_file: str = None + ) -> PangenomicNetwork: + """ + Make a pangenomic metabolic reaction network from KEGG Orthologs stored a genomes storage + database and gene clusters stored in a pan database. + + Parameters + ========== + pan_db : str + Path to a pan database. 
def make_pangenomic_network(
    self,
    pan_db: str,
    genomes_storage_db: str,
    store: bool = True,
    overwrite_existing_network: bool = False,
    consensus_threshold: float = None,
    discard_ties: bool = False,
    stats_file: str = None
) -> PangenomicNetwork:
    """
    Make a pangenomic metabolic reaction network from KEGG Orthologs stored in a genomes storage
    database and gene clusters stored in a pan database.

    Parameters
    ==========
    pan_db : str
        Path to a pan database. The pangenomic network is determined for gene clusters stored in
        the database.

    genomes_storage_db : str
        Path to a genomes storage database. The pangenomic network is derived from gene KO
        annotations stored in the database.

    store : bool, True
        Save the network to the pan database.

    overwrite_existing_network : bool, False
        Overwrite an existing network stored in the pan database. 'store' is also required.

    consensus_threshold : float, None
        With the default of None, the protein annotation most frequent among genes in a cluster
        is assigned to the cluster itself. If a non-default argument is provided (a value on [0,
        1]), at least this proportion of genes in the cluster must have the most frequent
        annotation for the cluster to be annotated.

    discard_ties : bool, False
        If multiple protein annotations are most frequent among genes in a cluster, then do not
        assign an annotation to the cluster itself when this argument is True. By default, this
        argument is False, so one of the most frequent annotations would be arbitrarily chosen.

    stats_file : str, None
        Write network overview statistics to a tab-delimited file at this output path.

    Returns
    =======
    PangenomicNetwork
        The network derived from the pangenomic databases.

    Raises
    ======
    ConfigError
        If 'store' is requested, the pan database already holds a reaction network, and
        'overwrite_existing_network' is False; or if 'KOfam' is not among the gene function
        sources recorded in the genomes storage database.
    """
    # Preemptively check the statistics file path.
    if stats_file is not None:
        filesnpaths.is_output_file_writable(stats_file)

    # Load the pan database.
    args = Namespace()
    args.pan_db = pan_db
    args.genomes_storage = genomes_storage_db
    args.discard_ties = discard_ties
    args.consensus_threshold = consensus_threshold
    pan_super = PanSuperclass(args, r=run_quiet)

    if (
        store and
        pan_super.p_meta['reaction_network_ko_annotations_hash'] and
        not overwrite_existing_network
    ):
        raise ConfigError(
            "The existing reaction network in the pan database must be explicitly overwritten."
        )

    # Check that genome contigs databases were annotated with KOs before building the pan
    # database. Unlike in contigs super, the initialization of functions by a method of pan
    # super does not allow specification of particular functional annotation sources, with
    # concomitant checks for their existence.
    gs_info = dbinfo.GenomeStorageDBInfo(genomes_storage_db)
    gs_sources: str = gs_info.get_self_table()['gene_function_sources']
    if 'KOfam' not in [source.strip() for source in gs_sources.split(',')]:
        raise ConfigError(
            "The genomes of the pangenome were not annotated with KOs, which can be rectified "
            "by running `anvi-run-kegg-kofams` on the genome contigs databases and remaking "
            "the pangenome."
        )
    pan_super.init_gene_clusters()
    pan_super.init_gene_clusters_functions()
    pan_super.init_gene_clusters_functions_summary_dict()

    self.progress.new("Building reaction network")
    self.progress.update("Loading reference databases")

    # Create the reaction network object.
    network = PangenomicNetwork(run=self.run, progress=self.progress)
    network.pan_db_source_path = os.path.abspath(pan_db)
    network.genomes_storage_db_source_path = os.path.abspath(genomes_storage_db)
    network.consensus_threshold = consensus_threshold
    network.discard_ties = discard_ties
    network.consistent_annotations = True

    # Load reference databases.
    kegg_db = KEGGData(kegg_dir=self.kegg_dir)
    kegg_kos_data = kegg_db.ko_data
    kegg_modules_data = kegg_db.module_data
    kegg_pathways_data = kegg_db.pathway_data
    kegg_hierarchies_data = kegg_db.hierarchy_data

    modelseed_db = ModelSEEDDatabase(self.modelseed_dir)
    modelseed_kegg_reactions_table = modelseed_db.kegg_reactions_table
    modelseed_ec_reactions_table = modelseed_db.ec_reactions_table
    modelseed_compounds_table = modelseed_db.compounds_table

    # Record KOs that annotated gene clusters in the pan database but for some reason aren't
    # found in the KEGG KO database set up by anvi'o.
    undefined_ko_ids: List[str] = []

    # Record ModelSEED reactions that would have been added to the reaction network if the
    # reaction had a chemical equation in the ModelSEED Biochemistry database.
    undefined_modelseed_reaction_ids: List[str] = []

    # Parse gene clusters.
    gene_clusters_functions_summary_dict: Dict = pan_super.gene_clusters_functions_summary_dict
    total_gene_clusters = len(pan_super.gene_clusters)
    num_gene_clusters_assigned_ko = 0
    ko_ids_assigned_gene_cluster: List[str] = []
    # The progress counter reports the number of clusters finished before the current one.
    for num_gene_clusters_parsed, (cluster_id, gene_cluster_functions_data) in enumerate(
        gene_clusters_functions_summary_dict.items()
    ):
        self.progress.update(
            f"Gene clusters parsed: {num_gene_clusters_parsed} / {total_gene_clusters}"
        )

        # Retrieve the consensus KO across genes in the cluster. Parameterization of the method
        # used to select consensus KOs occurred in pan super initialization.
        gene_cluster_ko_data = gene_cluster_functions_data['KOfam']
        if gene_cluster_ko_data == {'function': None, 'accession': None}:
            # No KO was assigned to the cluster.
            continue
        ko_id = gene_cluster_ko_data['accession']
        num_gene_clusters_assigned_ko += 1
        ko_ids_assigned_gene_cluster.append(ko_id)

        # Represent the gene cluster as an object.
        gene_cluster = GeneCluster()
        gene_cluster.gene_cluster_id = cluster_id
        gene_cluster.genomes = list(pan_super.gene_clusters[cluster_id])

        if ko_id in network.kos:
            # The KO and its associated reactions and metabolites have already been added to the
            # network, so only the newly encountered gene cluster needs to be added.
            gene_cluster.ko_id = ko_id
            network.gene_clusters[cluster_id] = gene_cluster
            # Proceed to the next gene cluster.
            continue

        # Get KEGG REACTION IDs and EC numbers associated with the KO.
        try:
            ko_info = kegg_kos_data[ko_id]
        except KeyError:
            # For some reason the KO annotated genes in the pangenomic databases but is not
            # found in the KEGG database set up by anvi'o. Do not add the alien KO or the gene
            # cluster to the network.
            undefined_ko_ids.append(ko_id)
            continue
        ko_kegg_reaction_ids = self._get_ko_kegg_reaction_ids(ko_info)
        ko_ec_numbers = self._get_ko_ec_numbers(ko_info)

        if not ko_kegg_reaction_ids and not ko_ec_numbers:
            # The KO is not associated with any KEGG reactions or EC numbers, and therefore
            # anvi'o can't relate the KO to ModelSEED reactions. Do not add the unsystematizable
            # KO or the gene cluster to the network.
            continue

        # Check if KEGG reactions and EC numbers associated with the KO have already been added
        # to the network in processing other gene clusters. To have been added to the network,
        # KEGG reactions and EC numbers must have aliased ModelSEED reactions.
        old_kegg_reaction_ids, new_kegg_reaction_ids = self._find_kegg_reaction_ids(
            ko_kegg_reaction_ids, network
        )
        old_ec_numbers, new_ec_numbers = self._find_ec_numbers(ko_ec_numbers, network)

        # Retrieve data on ModelSEED reactions aliasing KEGG reactions that haven't been added
        # to the network. Each row of the reference table represents a unique mapping of KEGG
        # reaction to ModelSEED reaction.
        modelseed_kegg_reactions_dict: Dict[int, Dict] = modelseed_kegg_reactions_table[
            modelseed_kegg_reactions_table['KEGG_REACTION_ID'].isin(new_kegg_reaction_ids)
        ].to_dict(orient='index')

        # Retrieve data on ModelSEED reactions aliasing EC numbers that haven't been added to
        # the network. Each row of the reference table represents a unique mapping of EC number
        # to ModelSEED reaction.
        modelseed_ec_reactions_dict: Dict[int, Dict] = modelseed_ec_reactions_table[
            modelseed_ec_reactions_table['EC_number'].isin(new_ec_numbers)
        ].to_dict(orient='index')

        if not (
            old_kegg_reaction_ids or
            old_ec_numbers or
            modelseed_kegg_reactions_dict or
            modelseed_ec_reactions_dict
        ):
            # None of the KEGG REACTION IDs and EC numbers associated with the KO map to
            # ModelSEED reactions (none are in the ModelSEED Biochemistry reactions table).
            continue

        # Find "undefined" ModelSEED reactions without an equation. The call also prunes the
        # ID sets and reference dictionaries passed to it in place.
        undefined_modelseed_reaction_ids += self._remove_undefined_reactions(
            ko_kegg_reaction_ids,
            new_kegg_reaction_ids,
            ko_ec_numbers,
            new_ec_numbers,
            modelseed_kegg_reactions_dict,
            modelseed_ec_reactions_dict
        )

        if not (
            old_kegg_reaction_ids or
            new_kegg_reaction_ids or
            old_ec_numbers or
            new_ec_numbers
        ):
            # The KO is not added to the network as none of the reactions have equations.
            continue
        # The newly encountered KO is now known to be associated with KEGG reactions or EC
        # numbers that alias ModelSEED reactions with an equation; this new data can be added to
        # the network.

        # Add an object representing the gene cluster to the network.
        network.gene_clusters[cluster_id] = gene_cluster

        # Add an object representing the newly encountered KO to the network.
        ko = KO()
        ko.id = ko_id
        ko.name = gene_cluster_ko_data['function']
        network.kos[ko_id] = ko
        gene_cluster.ko_id = ko_id

        # Associate ModelSEED reactions that have been previously added to the network under
        # construction with the newly encountered KO.
        self._process_added_reactions(
            old_kegg_reaction_ids,
            old_ec_numbers,
            network,
            ko,
            ko_kegg_reaction_ids,
            ko_ec_numbers
        )

        # Add ModelSEED reactions aliasing newly encountered KEGG reactions and EC numbers to
        # the network.
        self._add_reactions(
            modelseed_kegg_reactions_dict,
            modelseed_ec_reactions_dict,
            network,
            modelseed_compounds_table,
            ko,
            old_kegg_reaction_ids,
            new_kegg_reaction_ids,
            old_ec_numbers,
            new_ec_numbers
        )

        # Add KEGG classifications of the KO (modules, pathways, and BRITE hierarchies) to the
        # network.
        self._add_ko_classification(
            ko,
            network,
            ko_info,
            kegg_modules_data,
            kegg_pathways_data,
            kegg_hierarchies_data
        )

    self._relate_modules_pathways(network, kegg_modules_data)

    if DEBUG:
        for gene_cluster in network.gene_clusters.values():
            ko = network.kos[gene_cluster.ko_id]
            assert ko.reaction_ids

        assert sorted(network.modelseed_kegg_aliases) == sorted(network.reactions)
        assert sorted(network.modelseed_ec_number_aliases) == sorted(network.reactions)

    undefined_modelseed_reaction_ids = set(undefined_modelseed_reaction_ids)
    undefined_ko_ids = set(undefined_ko_ids)

    self.progress.end()

    if DEBUG:
        self.run.info_single(
            "The following ModelSEED reactions would have been added to the reaction network "
            "had there been a chemical equation in the ModelSEED database; perhaps it is worth "
            "investigating the ModelSEED reactions table to understand why this is not the "
            f"case: {', '.join(undefined_modelseed_reaction_ids)}"
        )

    if undefined_ko_ids:
        self.run.info_single(
            "Certain gene clusters were assigned KOs that were not found in the KEGG reference "
            "database. These KOs will not be used in network construction. It could be that "
            "the KOfams used to annotate genes were not from the same KEGG database version as "
            "the reference. Here are the unrecognized KO IDs from the pangenomic databases: "
            f"{','.join(undefined_ko_ids)}"
        )

    kegg_dir = kegg_db.kegg_context.kegg_data_dir
    if self.modelseed_dir is None:
        modelseed_dir = ModelSEEDDatabase.default_dir
    else:
        modelseed_dir = self.modelseed_dir
    self.run.info("Reference KEGG database directory", kegg_dir, nl_before=1)
    self.run.info("Reference ModelSEED database directory", modelseed_dir, nl_after=1)

    pdb = PanDatabase(pan_db)
    if store:
        if pan_super.p_meta['reaction_network_ko_annotations_hash']:
            self.run.warning("Deleting existing reaction network from pan database")
            pdb.db._exec(
                f'''DELETE from {tables.pan_reaction_network_reactions_table_name}'''
            )
            pdb.db._exec(
                f'''DELETE from {tables.pan_reaction_network_metabolites_table_name}'''
            )
            # The KEGG table is repopulated below, so rows of the previous network must be
            # cleared as well; otherwise stale KO classification rows would persist alongside
            # the new ones.
            pdb.db._exec(
                f'''DELETE from {tables.pan_reaction_network_kegg_table_name}'''
            )
            self.run.info_single(
                "Deleted data in gene cluster function reactions and metabolites tables",
                nl_after=1
            )

        self.progress.new("Saving reaction network to pan database")
        self.progress.update("Reactions table")
        reactions_table = self._get_database_reactions_table(network)
        table_name = tables.pan_reaction_network_reactions_table_name
        table_structure = tables.pan_reaction_network_reactions_table_structure
        pdb.db._exec_many(
            f'''INSERT INTO {table_name} VALUES ({','.join('?' * len(table_structure))})''',
            reactions_table.values
        )
        self.progress.update("Metabolites table")
        metabolites_table = self._get_database_metabolites_table(network)
        table_name = tables.pan_reaction_network_metabolites_table_name
        table_structure = tables.pan_reaction_network_metabolites_table_structure
        pdb.db._exec_many(
            f'''INSERT INTO {table_name} VALUES ({','.join('?' * len(table_structure))})''',
            metabolites_table.values
        )
        self.progress.update("KEGG KO information table")
        kegg_table = self._get_database_kegg_table(network)
        table_name = tables.pan_reaction_network_kegg_table_name
        table_structure = tables.pan_reaction_network_kegg_table_structure
        pdb.db._exec_many(
            f'''INSERT INTO {table_name} VALUES ({','.join('?' * len(table_structure))})''',
            kegg_table.values
        )

        self.progress.update("Metadata")
        ko_annotations_hash = self.hash_pan_db_ko_annotations(
            genomes_storage_db,
            gene_clusters_functions_summary_dict,
            consensus_threshold=consensus_threshold,
            discard_ties=discard_ties
        )
        pdb.db.set_meta_value('reaction_network_ko_annotations_hash', ko_annotations_hash)
        # The KEGG database release is now not explicitly that, but instead the hash of the
        # anvi'o modules database.
        pdb.db.set_meta_value('reaction_network_kegg_database_release', kegg_db.modules_db_hash)
        pdb.db.set_meta_value('reaction_network_modelseed_database_sha', modelseed_db.sha)
        pdb.db.set_meta_value('reaction_network_consensus_threshold', consensus_threshold)
        pdb.db.set_meta_value('reaction_network_discard_ties', int(discard_ties))
        self.progress.end()

    ko_ids_assigned_gene_cluster = set(ko_ids_assigned_gene_cluster)
    precomputed_counts = {
        'total_gene_clusters': pdb.meta['num_gene_clusters'],
        'gene_clusters_assigned_ko': num_gene_clusters_assigned_ko,
        'kos_assigned_gene_clusters': len(ko_ids_assigned_gene_cluster)
    }
    pdb.disconnect()
    stats = network.get_overview_statistics(precomputed_counts=precomputed_counts)
    network.print_overview_statistics(stats=stats)
    if stats_file:
        network.write_overview_statistics(stats_file, stats=stats)

    return network
+ + Parameters + ========== + ko_info : Dict[str, Any] + Information on the KO loaded from the anvi'o KEGG database. + + Returns + ======= + Set[str] + KEGG REACTION IDs associated with the KO. + """ + try: + ko_kegg_reaction_ids = set(ko_info['RN']) + except KeyError: + # The KO is not associated with KEGG reactions. + ko_kegg_reaction_ids = set() + ko_kegg_reaction_ids: Set[str] + + return ko_kegg_reaction_ids + + def _get_ko_ec_numbers(self, ko_info: Dict[str, Any]) -> Set[str]: + """ + Get the set of EC numbers associated with the KO under consideration in network + construction. + + Parameters + ========== + ko_info : Dict[str, Any] + Information on the KO loaded from the anvi'o KEGG database. + + Returns + ======= + Set[str] + EC numbers associated with the KO. + """ + try: + ko_ec_numbers = set(ko_info['EC']) + except KeyError: + # The KO is not associated with EC numbers. + ko_ec_numbers = set() + ko_ec_numbers: Set[str] + + return ko_ec_numbers + + def _find_kegg_reaction_ids( + self, + ko_kegg_reaction_ids: Set[str], + network: ReactionNetwork + ) -> Tuple[List[str], List[str]]: + """ + Find which KEGG reactions associated with a KO under consideration in network construction + are already in the network. + + Parameters + ========== + ko_kegg_reaction_ids : Set[str] + KEGG REACTION IDs associated with the KO. + + Returns + ======= + Tuple[Set[str], Set[str]] + A set of KEGG REACTION IDs in the network and a set of those not in the network. 
+ """ + old_kegg_reaction_ids = [] + new_kegg_reaction_ids = [] + for kegg_reaction_id in ko_kegg_reaction_ids: + if kegg_reaction_id in network.kegg_modelseed_aliases: + old_kegg_reaction_ids.append(kegg_reaction_id) + else: + new_kegg_reaction_ids.append(kegg_reaction_id) + old_kegg_reaction_ids = set(old_kegg_reaction_ids) + new_kegg_reaction_ids = set(new_kegg_reaction_ids) + + return old_kegg_reaction_ids, new_kegg_reaction_ids + + def _find_ec_numbers( + self, + ko_ec_numbers: Set[str], + network: ReactionNetwork + ) -> Tuple[Set[str], Set[str]]: + """ + Find which EC numbers associated with a KO under consideration in network construction are + already in the network. + + Parameters + ========== + ko_ec_numbers : Set[str] + EC numbers associated with the KO. + + Returns + ======= + Tuple[List[str], List[str]] + A set of EC numbers in the network and a set of those not in the network. + """ + old_ec_numbers = [] + new_ec_numbers = [] + for ec_number in ko_ec_numbers: + if ec_number in network.ec_number_modelseed_aliases: + old_ec_numbers.append(ec_number) + else: + new_ec_numbers.append(ec_number) + old_ec_numbers = set(old_ec_numbers) + new_ec_numbers = set(new_ec_numbers) + + return old_ec_numbers, new_ec_numbers + + def _remove_undefined_reactions( + self, + ko_kegg_reaction_ids: Set[str], + new_kegg_reaction_ids: Set[str], + ko_ec_numbers: Set[str], + new_ec_numbers: Set[str], + modelseed_kegg_reactions_dict: Dict[int, Dict], + modelseed_ec_reactions_dict: Dict[int, Dict] + ) -> List[str]: + """ + Find "undefined" ModelSEED reactions lacking a chemical formula, and remove these from + further consideration in network construction. + + Parameters + ========== + ko_kegg_reaction_ids : Set[str] + KEGG REACTION IDs associated with the KO under consideration in network construction. + + new_kegg_reaction_ids : Set[str] + KEGG REACTION IDs associated with the KO that are not already in the network. 
+ + ko_ec_numbers : Set[str] + EC numbers associated with the KO under consideration in network construction. + + new_ec_numbers : Set[str] + EC numbers associated with the KO that are not already in the network. + + modelseed_kegg_reactions_dict : Dict[int, Dict] + Data on ModelSEED reactions aliasing the newly encountered KEGG reactions. + + modelseed_ec_reactions_dict : Dict[int, Dict] + Data on ModelSEED reactions aliasing the newly encountered EC numbers. + + Returns + ======= + List[str] + Undefined ModelSEED reaction IDs. + """ + undefined_modelseed_reaction_ids: List[str] = [] + + defined_kegg_reactions: Dict[str, bool] = {}.fromkeys(new_kegg_reaction_ids, False) + undefined_kegg_indices: List[int] = [] + for idx, modelseed_reaction_data in modelseed_kegg_reactions_dict.items(): + if pd.isna(modelseed_reaction_data['stoichiometry']): + undefined_modelseed_reaction_ids.append(modelseed_reaction_data['id']) + undefined_kegg_indices.append(idx) + else: + defined_kegg_reactions[modelseed_reaction_data['KEGG_REACTION_ID']] = True + + defined_ec_numbers: Dict[str, bool] = {}.fromkeys(new_ec_numbers, False) + undefined_ec_indices: List[int] = [] + for idx, modelseed_reaction_data in modelseed_ec_reactions_dict.items(): + if pd.isna(modelseed_reaction_data['stoichiometry']): + undefined_modelseed_reaction_ids.append(modelseed_reaction_data['id']) + undefined_ec_indices.append(idx) + else: + defined_ec_numbers[modelseed_reaction_data['EC_number']] = True + + for kegg_reaction_id, is_defined in defined_kegg_reactions.items(): + if not is_defined: + ko_kegg_reaction_ids.remove(kegg_reaction_id) + new_kegg_reaction_ids.remove(kegg_reaction_id) + for idx in undefined_kegg_indices: + modelseed_kegg_reactions_dict.pop(idx) + + for ec_number, is_defined in defined_ec_numbers.items(): + if not is_defined: + ko_ec_numbers.remove(ec_number) + new_ec_numbers.remove(ec_number) + for idx in undefined_ec_indices: + modelseed_ec_reactions_dict.pop(idx) + + return 
undefined_modelseed_reaction_ids + + def _add_reactions( + self, + modelseed_kegg_reactions_dict: Dict[int, Dict], + modelseed_ec_reactions_dict: Dict[int, Dict], + network: ReactionNetwork, + modelseed_compounds_table: pd.DataFrame, + ko: KO, + old_kegg_reaction_ids: Set[str], + new_kegg_reaction_ids: Set[str], + old_ec_numbers: Set[str], + new_ec_numbers: Set[str] + ) -> None: + """ + Add ModelSEED reactions aliasing newly encountered KEGG reactions and EC numbers, which are + referenced by the KO under consideration, to the network under construction. + + Parameters + ========== + modelseed_kegg_reactions_dict : Dict[int, Dict] + Data on ModelSEED reactions aliasing the newly encountered KEGG reactions. + + modelseed_ec_reactions_dict : Dict[int, Dict] + Data on ModelSEED reactions aliasing the newly encountered EC numbers. + + network : ReactionNetwork + Reaction network under construction. + + modelseed_compounds_table : pandas.core.frame.DataFrame + Loaded compounds table of ModelSEED Biochemistry database set up by anvi'o. + + ko : KO + KO being added to the network. + + old_kegg_reaction_ids : Set[str] + KEGG REACTION IDs referenced by the KO that are in the network. + + new_kegg_reaction_ids : Set[str] + KEGG REACTION IDs referenced by the KO that are not in the network. + + old_ec_numbers : Set[str] + EC numbers referenced by the KO that are in the network. + + new_ec_numbers : Set[str] + EC numbers referenced by the KO that are not in the network. + + Returns + ======= + None + """ + # Add ModelSEED reactions aliasing newly encountered KEGG reactions to the network. + + # The following dictionary maps KEGG REACTION IDs referenced by the KO to aliasing ModelSEED + # reaction IDs. + kegg_modelseed_alias_dict: Dict[str, List[str]] = {} + # The following dictionary maps ModelSEED reaction ID to aliasing KEGG REACTION IDs + # referenced by the KO. 
+ modelseed_kegg_alias_dict: Dict[str, Tuple[ModelSEEDReaction, List[str]]] = {} + for modelseed_reaction_data in modelseed_kegg_reactions_dict.values(): + # Each entry in the dictionary is unique to a KEGG reaction aliasing a ModelSEED + # reaction. + kegg_reaction_id = modelseed_reaction_data['KEGG_REACTION_ID'] + modelseed_reaction_id = modelseed_reaction_data['id'] + + if kegg_reaction_id in kegg_modelseed_alias_dict: + if DEBUG: + assert modelseed_reaction_id not in kegg_modelseed_alias_dict[kegg_reaction_id] + kegg_modelseed_alias_dict[kegg_reaction_id].append(modelseed_reaction_id) + else: + kegg_modelseed_alias_dict[kegg_reaction_id] = [modelseed_reaction_id] + + if modelseed_reaction_id in modelseed_kegg_alias_dict: + # The ModelSEED reaction was already added to the network, aliased by another KEGG + # reaction referenced by the KO. + reaction = modelseed_kegg_alias_dict[modelseed_reaction_id][0] + is_added = True + else: + try: + # The ModelSEED reaction was already added to the network through another KO. + reaction = network.reactions[modelseed_reaction_id] + is_added = True + except KeyError: + # Generate a new ModelSEED reaction object. + reaction, metabolites = self._get_modelseed_reaction( + modelseed_reaction_data, + modelseed_compounds_table, + network=network + ) + is_added = False + if DEBUG: + # No reactions lacking an equation should make it into the network or be under + # consideration when this method is called during network construction. + assert reaction.coefficients + + if not is_added: + # Add the new reaction to the network. 
+ network.reactions[modelseed_reaction_id] = reaction + for metabolite in metabolites: + if metabolite.modelseed_id not in network.metabolites: + network.metabolites[metabolite.modelseed_id] = metabolite + + if DEBUG: + if is_added and (modelseed_reaction_id not in modelseed_kegg_alias_dict): + # Previously processed KO(s) must have referenced KEGG REACTION ID(s) and/or EC + # number(s) that aliased the ModelSEED reaction. + try: + other_kegg_reaction_ids = network.modelseed_kegg_aliases[ + modelseed_reaction_id + ] + assert not set(other_kegg_reaction_ids).intersection(old_kegg_reaction_ids) + assert not set(other_kegg_reaction_ids).intersection(new_kegg_reaction_ids) + except KeyError: + pass + try: + other_ec_numbers = network.modelseed_ec_number_aliases[ + modelseed_reaction_id + ] + assert not set(other_ec_numbers).intersection(old_ec_numbers) + assert not set(other_ec_numbers).intersection(new_ec_numbers) + except KeyError: + pass + + # Associate the reaction with the KO. + ko.reaction_ids.append(modelseed_reaction_id) + + try: + modelseed_kegg_alias_tuple = modelseed_kegg_alias_dict[modelseed_reaction_id] + modelseed_kegg_alias_tuple[1].append(kegg_reaction_id) + except KeyError: + modelseed_kegg_alias_dict[modelseed_reaction_id] = (reaction, [kegg_reaction_id]) + continue + + # Record KEGG reaction aliases in the network and KO. 
+ for kegg_reaction_id, modelseed_reaction_ids in kegg_modelseed_alias_dict.items(): + if DEBUG: + assert kegg_reaction_id not in network.kegg_modelseed_aliases + network.kegg_modelseed_aliases[kegg_reaction_id] = modelseed_reaction_ids + for modelseed_reaction_id, modelseed_kegg_alias_tuple in modelseed_kegg_alias_dict.items(): + reaction = modelseed_kegg_alias_tuple[0] + if DEBUG: + assert not old_kegg_reaction_ids.intersection(set(reaction.kegg_aliases)) + assert not old_ec_numbers.intersection(set(reaction.ec_number_aliases)) + + kegg_reaction_ids = modelseed_kegg_alias_tuple[1] + try: + kegg_aliases = network.modelseed_kegg_aliases[modelseed_reaction_id] + except KeyError: + network.modelseed_kegg_aliases[modelseed_reaction_id] = kegg_aliases = [] + if DEBUG: + assert not set(kegg_reaction_ids).intersection(set(kegg_aliases)) + kegg_aliases += kegg_reaction_ids + if modelseed_reaction_id not in network.modelseed_ec_number_aliases: + # No previously processed KO(s) referenced EC number(s) that aliased the ModelSEED + # reaction. + network.modelseed_ec_number_aliases[modelseed_reaction_id] = [] + + if DEBUG: + assert not modelseed_reaction_id in ko.kegg_reaction_aliases + ko.kegg_reaction_aliases[modelseed_reaction_id] = kegg_reaction_ids + if DEBUG: + assert not modelseed_reaction_id in ko.ec_number_aliases + ko.ec_number_aliases[modelseed_reaction_id] = [] + + # Add ModelSEED reactions aliasing newly encountered EC numbers to the network. + + # The following dictionary maps EC numbers referenced by the KO to aliasing ModelSEED + # reaction IDs. + ec_modelseed_alias_dict: Dict[str, List[str]] = {} + # The following dictionary maps ModelSEED reaction ID to aliasing EC numbers referenced by + # the KO. + modelseed_ec_alias_dict: Dict[str, Tuple[ModelSEEDReaction, List[str]]] = {} + for modelseed_reaction_data in modelseed_ec_reactions_dict.values(): + # Each entry in the dictionary is unique to a EC number aliasing a ModelSEED reaction. 
+ ec_number = modelseed_reaction_data['EC_number'] + modelseed_reaction_id = modelseed_reaction_data['id'] + + if ec_number in ec_modelseed_alias_dict: + if DEBUG: + assert modelseed_reaction_id not in ec_modelseed_alias_dict[ec_number] + ec_modelseed_alias_dict[ec_number].append(modelseed_reaction_id) + else: + ec_modelseed_alias_dict[ec_number] = [modelseed_reaction_id] + + if modelseed_reaction_id in modelseed_kegg_alias_dict: + # The ModelSEED reaction was aliased by a KEGG reaction referenced by the KO and so + # was already added to the network. + reaction = modelseed_kegg_alias_dict[modelseed_reaction_id][0] + if modelseed_reaction_id in modelseed_ec_alias_dict: + ec_aliases = modelseed_ec_alias_dict[modelseed_reaction_id][1] + if DEBUG: + assert ec_number not in ec_aliases + ec_aliases.append(ec_number) + else: + modelseed_ec_alias_dict[modelseed_reaction_id] = (reaction, [ec_number]) + continue + + if modelseed_reaction_id in modelseed_ec_alias_dict: + # The ModelSEED reaction was already added to the network, aliased by another EC + # number referenced by the KO. + reaction = modelseed_ec_alias_dict[modelseed_reaction_id][0] + is_added = True + else: + try: + # The ModelSEED reaction was already added to the network through another KO. + reaction = network.reactions[modelseed_reaction_id] + is_added = True + except KeyError: + # Generate a new ModelSEED reaction object. + reaction, reaction_metabolites = self._get_modelseed_reaction( + modelseed_reaction_data, + modelseed_compounds_table, + network=network + ) + is_added = False + if DEBUG: + # No reactions lacking an equation should make it into the network or be under + # consideration when this method is called during network construction. + assert reaction.coefficients + + if not is_added: + # Add the new reaction to the network. 
+ network.reactions[modelseed_reaction_id] = reaction + for metabolite in reaction_metabolites: + if metabolite.modelseed_id not in network.metabolites: + network.metabolites[metabolite.modelseed_id] = metabolite + + if DEBUG: + if is_added and (modelseed_reaction_id not in modelseed_ec_alias_dict): + # Previously processed KO(s) must have referenced KEGG REACTION ID(s) and/or EC + # number(s) that aliased the ModelSEED reaction. + try: + other_kegg_reaction_ids = network.modelseed_kegg_aliases[ + modelseed_reaction_id + ] + assert not set(other_kegg_reaction_ids).intersection(old_kegg_reaction_ids) + assert not set(other_kegg_reaction_ids).intersection(new_kegg_reaction_ids) + except KeyError: + pass + try: + other_ec_numbers = network.modelseed_ec_number_aliases[ + modelseed_reaction_id + ] + assert not set(other_ec_numbers).intersection(old_ec_numbers) + assert not set(other_ec_numbers).intersection(new_ec_numbers) + except KeyError: + pass + + # Associate the reaction with the KO. + ko.reaction_ids.append(modelseed_reaction_id) + + try: + modelseed_ec_alias_tuple = modelseed_ec_alias_dict[modelseed_reaction_id] + modelseed_ec_alias_tuple[1].append(ec_number) + except KeyError: + modelseed_ec_alias_dict[modelseed_reaction_id] = (reaction, [ec_number]) + + # Record EC number aliases in the network and KO. 
+ for ec_number, modelseed_reaction_ids in ec_modelseed_alias_dict.items(): + if DEBUG: + assert ec_number not in network.ec_number_modelseed_aliases + network.ec_number_modelseed_aliases[ec_number] = modelseed_reaction_ids + for modelseed_reaction_id, modelseed_ec_alias_tuple in modelseed_ec_alias_dict.items(): + reaction = modelseed_ec_alias_tuple[0] + if DEBUG: + assert not old_kegg_reaction_ids.intersection(set(reaction.kegg_aliases)) + assert not old_ec_numbers.intersection(set(reaction.ec_number_aliases)) + + ec_numbers = modelseed_ec_alias_tuple[1] + try: + ec_aliases = network.modelseed_ec_number_aliases[modelseed_reaction_id] + except KeyError: + network.modelseed_ec_number_aliases[modelseed_reaction_id] = ec_aliases = [] + if DEBUG: + assert not set(ec_numbers).intersection(set(ec_aliases)) + ec_aliases += ec_numbers + if modelseed_reaction_id not in network.modelseed_kegg_aliases: + # Neither this KO nor any previously processed KO(s) referenced KEGG reaction(s) + # that aliased the ModelSEED reaction. + network.modelseed_kegg_aliases[modelseed_reaction_id] = [] + + if modelseed_reaction_id in ko.ec_number_aliases: + # The ModelSEED reaction aliased KEGG reaction(s) refereced by the KO. An empty + # list was added for the ModelSEED reaction in the following attribute. + if DEBUG: + assert not ko.ec_number_aliases[modelseed_reaction_id] + ko.ec_number_aliases[modelseed_reaction_id] += ec_numbers + else: + # The ModelSEED reaction did not alias any KEGG reactions referenced by the KO. 
def _get_modelseed_reaction(
    self,
    modelseed_reaction_data: Dict,
    modelseed_compounds_table: pd.DataFrame,
    network: ReactionNetwork = None
) -> Tuple[ModelSEEDReaction, List[ModelSEEDCompound]]:
    """
    Build a ModelSEED reaction object, along with objects for the compounds in its equation,
    from a row of the ModelSEED reactions table.

    Parameters
    ==========
    modelseed_reaction_data : Dict
        Row of the ModelSEED reactions table set up by anvi'o, represented as a dictionary. The
        keys read here are 'id', 'name', 'KEGG', 'ec_numbers', 'reversibility', 'stoichiometry',
        and 'direction'.

    modelseed_compounds_table : pandas.core.frame.DataFrame
        Loaded ModelSEED Biochemistry compounds table, looked up by ModelSEED compound ID.

    network : ReactionNetwork, None
        Reaction network under construction. When given, compound objects already present in
        'network.metabolites' are reused instead of being created anew. Neither the reaction nor
        any newly created compound is added to the network by this method.

    Returns
    =======
    ModelSEEDReaction
        Representation of the reaction with data sourced from ModelSEED Biochemistry.

    List[ModelSEEDCompound]
        Representations of the compounds in the reaction equation, in equation order. The list
        is empty if the reaction has no stoichiometry entry.

    Raises
    ======
    ConfigError
        A compound ID in the reaction equation has no row in the compounds table.
    """
    rxn = ModelSEEDReaction()

    rxn_id = modelseed_reaction_data['id']
    if DEBUG:
        assert pd.notna(rxn_id)
    rxn.modelseed_id = rxn_id

    name: str = modelseed_reaction_data['name']
    rxn.modelseed_name = None if pd.isna(name) else name

    # KEGG REACTION aliases are delimited by '; ' and EC numbers by '|' in the table.
    kegg_field: str = modelseed_reaction_data['KEGG']
    rxn.kegg_aliases = tuple() if pd.isna(kegg_field) else tuple(kegg_field.split('; '))

    ec_field: str = modelseed_reaction_data['ec_numbers']
    rxn.ec_number_aliases = tuple() if pd.isna(ec_field) else tuple(ec_field.split('|'))

    reversibility: str = modelseed_reaction_data['reversibility']
    if pd.isna(reversibility):
        rxn.reversibility = None
    else:
        # Reactions lacking reversibility data ('?') are assumed to be reversible, like those
        # explicitly marked as such ('=').
        rxn.reversibility = reversibility in ('=', '?')

    stoichiometry: str = modelseed_reaction_data['stoichiometry']
    has_equation = not pd.isna(stoichiometry)
    if DEBUG:
        # Reversibility and direction data should be present exactly when there is an equation.
        if has_equation:
            assert pd.notna(modelseed_reaction_data['reversibility'])
            assert pd.notna(modelseed_reaction_data['direction'])
        else:
            assert pd.isna(modelseed_reaction_data['reversibility'])
            assert pd.isna(modelseed_reaction_data['direction'])

    compound_ids: List[str] = []
    if has_equation:
        decimal_coefficients: List[float] = []
        term_compartments: List[str] = []
        # Each ';'-delimited term has ':'-delimited fields, of which the first three are the
        # coefficient, the compound ID, and the index of the compartment.
        for term in stoichiometry.split(';'):
            fields = term.split(':')
            if DEBUG:
                assert len(fields) > 3
            decimal_coefficients.append(float(fields[0]))
            compound_ids.append(fields[1])
            term_compartments.append(ModelSEEDDatabase.compartment_ids[int(fields[2])])
        rxn.compartments = tuple(term_compartments)

        coefficients = to_lcm_denominator(decimal_coefficients)
        direction = modelseed_reaction_data['direction']
        written_backwards = (
            (direction == '>' and reversibility == '<') or
            (direction == '<' and reversibility == '>')
        )
        if written_backwards:
            # The equation is written in the opposite direction to the one in which the reaction
            # proceeds, so flip the signs of the coefficients.
            coefficients = [-c for c in coefficients]
        rxn.coefficients = tuple(coefficients)
    else:
        rxn.compartments = None
        rxn.coefficients = None
    rxn.compound_ids = compound_ids

    metabolites: List[ModelSEEDCompound] = []
    for compound_id in compound_ids:
        if network:
            try:
                known_metabolite = network.metabolites[compound_id]
            except KeyError:
                pass
            else:
                # The compound was encountered in a previously processed reaction, so reuse the
                # existing object.
                metabolites.append(known_metabolite)
                continue

        # No existing object is available, so make one from the compounds table.
        try:
            compound_row: pd.Series = modelseed_compounds_table.loc[compound_id]
        except KeyError:
            raise ConfigError(
                f"A row for the ModelSEED compound ID, '{compound_id}', was expected but not "
                "found in the ModelSEED compounds table. This ID was found in the equation for "
                f"the ModelSEED reaction, '{rxn_id}'."
            )
        compound_data = compound_row.to_dict()
        compound_data['id'] = compound_id
        metabolites.append(self._get_modelseed_compound(compound_data))

    return rxn, metabolites
Here " + f"is the data in the row for the compound: '{modelseed_compound_data}'" + ) + compound.charge = charge + + smiles = modelseed_compound_data['smiles'] + if pd.isna(smiles): + compound.smiles = None + else: + compound.smiles = smiles + + return compound + + def _process_added_reactions( + self, + old_kegg_reaction_ids: Set[str], + old_ec_numbers: Set[str], + network: ReactionNetwork, + ko: KO, + ko_kegg_reaction_ids: Set[str], + ko_ec_numbers: Set[str] + ) -> None: + """ + Associate ModelSEED reactions that have been previously added to the network under + construction with the newly encountered KO. + + Parameters + ========== + old_kegg_reaction_ids : Set[str] + KEGG REACTION IDs previously added to the network. + + old_ec_numbers : Set[str] + EC numbers previously added to the network. + + network : ReactionNetwork + Reaction network under construction. + + ko : KO + KO being added to the network. + + ko_kegg_reaction_ids : Set[str] + KEGG REACTION IDs associated with the KO under consideration in network construction. + + ko_ec_numbers : Set[str] + EC numbers associated with the KO under consideration in network construction. + + Returns + ======= + None + """ + # Associate reactions aliasing KEGG reactions with the KO. + for kegg_reaction_id in old_kegg_reaction_ids: + for modelseed_reaction_id in network.kegg_modelseed_aliases[kegg_reaction_id]: + reaction = network.reactions[modelseed_reaction_id] + ko.reaction_ids.append(modelseed_reaction_id) + # Record which KEGG REACTION IDs and EC numbers associated with the KO alias the + # ModelSEED reaction. + ko.kegg_reaction_aliases[modelseed_reaction_id] = list( + ko_kegg_reaction_ids.intersection(set(reaction.kegg_aliases)) + ) + ko.ec_number_aliases[modelseed_reaction_id] = list( + ko_ec_numbers.intersection(set(reaction.ec_number_aliases)) + ) + + # Associate reactions aliasing EC numbers with the KO. 
+ for ec_number in old_ec_numbers: + for modelseed_reaction_id in network.ec_number_modelseed_aliases[ec_number]: + if modelseed_reaction_id in ko.reaction_ids: + # The ModelSEED reaction has already been associated with the KO, as it was + # aliased by a KEGG reaction referenced by the KO -- addressed above -- as well + # as the EC number. + continue + reaction = network.reactions[modelseed_reaction_id] + ko.reaction_ids.append(modelseed_reaction_id) + if DEBUG: + assert not ko_kegg_reaction_ids.intersection(set(reaction.kegg_aliases)) + ko.kegg_reaction_aliases[modelseed_reaction_id] = [] + ko.ec_number_aliases[modelseed_reaction_id] = list( + ko_ec_numbers.intersection(set(reaction.ec_number_aliases)) + ) + + def _add_ko_classification( + self, + ko: KO, + network: ReactionNetwork, + ko_info: Dict[str, Any], + kegg_modules_data: Dict[str, Dict[str, Any]], + kegg_pathways_data: Dict[str, Dict[str, Any]], + kegg_hierarchies_data: Dict[str, str] + ) -> None: + """ + Add KEGG classifications of the KO (modules, pathways, and BRITE hierarchies) to the + network under construction. + + Parameters + ========== + ko : KO + KO being added to the network. + + network : ReactionNetwork + Reaction network under construction. + + ko_info : Dict[str, Any] + Information on the KO loaded from the anvi'o KEGG database. + + kegg_modules_data : Dict[str, Dict[str, Any]] + This dictionary of KEGG reference data relates module IDs to module names and pathways. + + kegg_pathways_data : Dict[str, Dict[str, Any]] + This dictionary of KEGG reference data relates pathway IDs to pathway names and + equivalent categories in the BRITE hierarchy, 'ko00001'. + + kegg_hierarchies_data : Dict[str, str] + This dictionary of KEGG reference data relates BRITE hierarchy IDs to hierarchy names. + + Returns + ======= + None + """ + ko_id = ko.id + + # Reference module IDs in the KO. 
+ ko_info_mod: Tuple[str] = ko_info['MOD'] + for module_id in ko_info_mod: + ko.module_ids.append(module_id) + + # Reference pathway IDs in the KO. + ko_info_pth: Tuple[str] = ko_info['PTH'] + for pathway_id in ko_info_pth: + ko.pathway_ids.append(pathway_id) + + # Reference BRITE hierarchy categorizations in the KO. + ko_info_hie: Dict[str, Tuple[Tuple[str]]] = ko_info['HIE'] + for hierarchy_id, categorizations in ko_info_hie.items(): + ko_hierarchy_categorizations: List[Tuple[str]] = [] + for categorization in categorizations: + ko_hierarchy_categorizations.append(categorization) + ko.hierarchies[hierarchy_id] = ko_hierarchy_categorizations + + # Fill out module objects in the network. + for module_id in ko.module_ids: + try: + # The module has already been added to the network via another KO. + module = network.modules[module_id] + except KeyError: + # Create a module object and add it to the network. + module_info = kegg_modules_data[module_id] + module = KEGGModule(id=module_id) + module.name = module_info['NAME'] + # Do not yet add pathway ID references in the module. Certain KOs but not others in + # a module can be in a pathway. Only KOs in the network are relevant. The module is + # only linked to pathways via KOs in the network, so relationships between modules + # and pathways in the network can only be resolved after all KOs have been added to + # the network. + network.modules[module_id] = module + module.ko_ids.append(ko_id) + + # Fill out pathway objects in the network. + # Track BRITE categories that are equivalent to pathways to facilitate category object + # creation. + category_pathways: Dict[Tuple[str], str] = {} + for pathway_id in ko.pathway_ids: + try: + # The pathway has already been added to the network via another KO. + pathway = network.pathways[pathway_id] + except KeyError: + # Create a pathway object and add it to the network. 
+ pathway_info = kegg_pathways_data[pathway_id] + pathway = KEGGPathway(id=pathway_id) + pathway.name = pathway_info['NAME'] + categorization = pathway_info['CAT'] + pathway.categorization = categorization + category_pathways[categorization] = pathway_id + network.pathways[pathway_id] = pathway + + pathway.ko_ids.append(ko_id) + + # Fill out hierarchy objects in the network. + for hierarchy_id, categorizations in ko.hierarchies.items(): + try: + # The hierarchy has already been added to the network via another KO. + hierarchy = network.hierarchies[hierarchy_id] + network_hierarchy_categories = network.categories[hierarchy_id] + except KeyError: + # Create a new hierarchy object and add it to the network. + hierarchy_name = kegg_hierarchies_data[hierarchy_id] + hierarchy = BRITEHierarchy(id=hierarchy_id) + hierarchy.name = hierarchy_name + network.hierarchies[hierarchy_id] = hierarchy + network_hierarchy_categories: Dict[Tuple[str], Tuple[BRITECategory]] = {} + network.categories[hierarchy_id] = network_hierarchy_categories + + hierarchy.ko_ids.append(ko_id) + + # Fill out category objects in the network. + for categorization in categorizations: + try: + # The category has already been added to the network via another KO. + categories = network_hierarchy_categories[categorization] + for category in categories: + if ko_id not in category.ko_ids: + category.ko_ids.append(ko_id) + except KeyError: + # Add a category object to the network for each level of the categorization. + categories: List[BRITECategory] = [] + for depth, focus_category_name in enumerate(categorization, 1): + focus_categorization = categorization[:depth] + try: + # The supercategory has already been added to the network. + focus_categories = network_hierarchy_categories[focus_categorization] + category = focus_categories[-1] + if ko_id not in category.ko_ids: + # It is not the supercategory of another category containing the KO. 
+ category.ko_ids.append(ko_id) + categories = list(focus_categories) + continue + except KeyError: + pass + + # Add the previously unencountered category to the network. + category = BRITECategory() + category.id = f'{hierarchy_id}: {" >>> ".join(focus_categorization)}' + category.name = focus_category_name + category.hierarchy_id = hierarchy_id + category.ko_ids.append(ko_id) + if depth == len(categorization) and hierarchy_id == 'ko00001': + try: + category.pathway_id = category_pathways[categorization] + except KeyError: + pass + categories.append(category) + + if len(categories) > 1: + # Consider the supercategory of the newly encountered category. Add the + # category name as a subcategory reference of the supercategory. + categories[-2].subcategory_names.append(focus_category_name) + + hierarchy.categorizations.append(focus_categorization) + network_hierarchy_categories[focus_categorization] = tuple(categories) + + def _relate_modules_pathways( + self, + network: ReactionNetwork, + kegg_modules_data: Dict[str, Dict[str, Any]] + ) -> None: + """ + Link modules and pathways. + + Certain KOs but not others in a module can be in a pathway. Only KOs in the network are + relevant. A module is only linked to pathways via KOs in the network, so relationships + between modules and pathways in the network are only resolved here after all KOs have been + added to the network. + + Parameters + ========== + network : ReactionNetwork + Reaction network under construction. + + kegg_modules_data : Dict[str, Dict[str, Any]] + This dictionary of KEGG reference data relates module IDs to module names and pathways. 
def _get_database_reactions_table(self, network: ReactionNetwork) -> pd.DataFrame:
    """
    Make a reactions table that can be stored in either a contigs or pan database, as the tables
    have the same structure. A ReactionNetwork can be reconstructed with the same data from the
    reactions, metabolites, and KEGG tables of the database.

    All multi-valued entries are written in sorted order, so the same network always yields the
    same table.

    Parameters
    ==========
    network : ReactionNetwork
        Network generated from gene or gene cluster KO annotations.

    Returns
    =======
    pd.DataFrame
        Table of reactions data to be stored in the contigs or pan database, with rows sorted
        by ModelSEED reaction ID and columns ordered as in
        'tables.reaction_network_reactions_table_structure'.
    """
    if DEBUG:
        assert (
            tables.reaction_network_reactions_table_structure ==
            tables.pan_reaction_network_reactions_table_structure
        )
        assert (
            tables.reaction_network_reactions_table_types ==
            tables.pan_reaction_network_reactions_table_types
        )

    # Transfer data from reaction objects to dictionaries mapping to table entries.
    reactions_data: Dict[str, Dict] = {}
    for reaction_id, reaction in network.reactions.items():
        reaction_data = {}
        reaction_data['modelseed_reaction_id'] = reaction_id
        reaction_data['modelseed_reaction_name'] = reaction.modelseed_name
        reaction_data['metabolite_modelseed_ids'] = ', '.join(reaction.compound_ids)
        reaction_data['stoichiometry'] = ', '.join([str(c) for c in reaction.coefficients])
        reaction_data['compartments'] = ', '.join(reaction.compartments)
        reaction_data['reversibility'] = reaction.reversibility
        # Record KEGG REACTION IDs and EC numbers that are aliases of ModelSEED reactions but
        # are *NOT* associated with gene KO annotations; associated aliases are recorded later.
        # Set iteration order for strings can differ between interpreter runs, so sort.
        reaction_data['other_kegg_reaction_ids'] = ', '.join(sorted(
            set(reaction.kegg_aliases).difference(
                network.modelseed_kegg_aliases[reaction_id]
            )
        ))
        reaction_data['other_ec_numbers'] = ', '.join(sorted(
            set(reaction.ec_number_aliases).difference(
                network.modelseed_ec_number_aliases[reaction_id]
            )
        ))
        reactions_data[reaction_id] = reaction_data

    # Get *KO* KEGG REACTION ID and EC number aliases of each ModelSEED reaction. These are not
    # all possible aliases, but only those associated with KOs in the network. Each ModelSEED
    # reaction ID maps to a pair of dictionaries:
    # (
    #     {KEGG REACTION ID: [KO IDs referencing the KEGG reaction], ...},
    #     {EC number: [KO IDs referencing the EC number], ...}
    # )
    ko_reaction_aliases: Dict[str, Tuple[Dict[str, List[str]], Dict[str, List[str]]]] = {
        modelseed_reaction_id: ({}, {}) for modelseed_reaction_id in reactions_data
    }
    for ko_id, ko in network.kos.items():
        # Visit each reaction of the KO once, guarding against a repeated reaction ID causing
        # the KO ID to be repeated in an entry.
        for modelseed_reaction_id in dict.fromkeys(ko.reaction_ids):
            kegg_reaction_aliases, ec_number_aliases = ko_reaction_aliases[
                modelseed_reaction_id
            ]
            for kegg_reaction_id in ko.kegg_reaction_aliases[modelseed_reaction_id]:
                kegg_reaction_aliases.setdefault(kegg_reaction_id, []).append(ko_id)
            for ec_number in ko.ec_number_aliases[modelseed_reaction_id]:
                ec_number_aliases.setdefault(ec_number, []).append(ko_id)

    for modelseed_reaction_id, aliases in ko_reaction_aliases.items():
        reaction_data = reactions_data[modelseed_reaction_id]
        kegg_reaction_aliases, ec_number_aliases = aliases

        # Make the entry for KO KEGG REACTION aliases, which looks akin to the following
        # arbitrary example: 'R00001: (K00010, K00100); R01234: (K54321)'
        reaction_data['ko_kegg_reaction_source'] = '; '.join(sorted(
            f'{kegg_reaction_id}: ({", ".join(sorted(ko_ids))})'
            for kegg_reaction_id, ko_ids in kegg_reaction_aliases.items()
        ))

        # Make the entry for KO EC number aliases, which looks akin to the following arbitrary
        # example: '1.1.1.1: (K00010, K00100); 1.2.3.4: (K12345); 6.7.8.99: (K65432)'
        reaction_data['ko_ec_number_source'] = '; '.join(sorted(
            f'{ec_number}: ({", ".join(sorted(ko_ids))})'
            for ec_number, ko_ids in ec_number_aliases.items()
        ))

    reactions_table = pd.DataFrame.from_dict(
        reactions_data, orient='index'
    ).reset_index(drop=True).sort_values('modelseed_reaction_id')
    reactions_table = reactions_table[tables.reaction_network_reactions_table_structure]

    return reactions_table
+ metabolites_data = {} + for compound_id, metabolite in network.metabolites.items(): + metabolite_data = {} + metabolite_data['modelseed_compound_id'] = compound_id + metabolite_data['modelseed_compound_name'] = metabolite.modelseed_name + metabolite_data['kegg_aliases'] = ', '.join(metabolite.kegg_aliases) + metabolite_data['formula'] = metabolite.formula + metabolite_data['charge'] = metabolite.charge + metabolite_data['smiles'] = metabolite.smiles + metabolites_data[compound_id] = metabolite_data + + metabolites_table = pd.DataFrame.from_dict( + metabolites_data, orient='index' + ).reset_index(drop=True).sort_values('modelseed_compound_id') + metabolites_table = metabolites_table[tables.reaction_network_metabolites_table_structure] + + return metabolites_table + + def _get_database_kegg_table(self, network: ReactionNetwork) -> pd.DataFrame: + """ + Make a table recording the relationships between KEGG KOs, modules, pathways, and BRITE + hierarchies in the reaction network that can be stored in either a contigs or a pan + database, as tables have the same structure. A ReactionNetwork can be reconstructed with the + same data from the reaction, metabolites, and KEGG tables of the database. + + Parameters + ========== + network : ReactionNetwork + Network generated from gene or gene cluster KO annotations. + + Returns + ======= + pd.DataFrame + Table of KEGG information to be stored. + """ + if DEBUG: + assert ( + tables.reaction_network_kegg_table_structure == + tables.pan_reaction_network_kegg_table_structure + ) + assert ( + tables.reaction_network_kegg_table_types == + tables.pan_reaction_network_kegg_table_types + ) + + # Transfer data from KEGG objects to dictionaries mapping to table entries. + kegg_data = {} + + # The first rows in the table are for KOs. 
+ ko_id_pattern = re.compile('K\d{5}') + for ko_id, ko in network.kos.items(): + ko_data = {} + assert re.fullmatch(ko_id_pattern, ko_id) + ko_data['kegg_id'] = ko_id + ko_data['name'] = ko.name + ko_data['modules'] = ', '.join(ko.module_ids) + ko_data['pathways'] = ', '.join(ko.pathway_ids) + brite_categorizations = [] + for hierarchy_id, categorizations in ko.hierarchies.items(): + for categorization in categorizations: + brite_categorizations.append( + f'{hierarchy_id} >>> {" >>> ".join(categorization)}' + ) + ko_data['brite_categorization'] = ' !!! '.join(brite_categorizations) + kegg_data[f'1{ko_id}'] = ko_data + + # Modules are second in the table. + module_id_pattern = re.compile('M\d{5}') + for module_id, module in network.modules.items(): + module_data = {} + assert re.fullmatch(module_id_pattern, module_id) + module_data['kegg_id'] = module_id + module_data['name'] = module.name + module_data['modules'] = '' + module_data['pathways'] = ', '.join(module.pathway_ids) + module_data['brite_categorization'] = '' + kegg_data[f'2{module_id}'] = module_data + + # Pathways are third in the table. + pathway_id_pattern = re.compile('map\d{5}') + for pathway_id, pathway in network.pathways.items(): + pathway_data = {} + assert re.fullmatch(pathway_id_pattern, pathway_id) + pathway_data['kegg_id'] = pathway_id + pathway_data['name'] = pathway.name + pathway_data['modules'] = '' + pathway_data['pathways'] = '' + pathway_data[ + 'brite_categorization' + ] = f'ko00001 >>> {" >>> ".join(pathway.categorization)}' + kegg_data[f'3{pathway_id}'] = pathway_data + + # Hierarchies are fourth in the table. + hierarchy_id_pattern = re.compile('ko\d{5}') + for hierarchy_id, hierarchy in network.hierarchies.items(): + hierarchy_data = {} + # Only hierarchies of KOs should be in consideration. Hierarchies of other KEGG items + # that do not resolve to KOs, such as reactions and drugs, have IDs that start with 'br' + # rather than 'ko'. 
def hash_contigs_db_ko_hits(self, gene_ko_hits_table: pd.DataFrame) -> str:
    """
    To concisely represent the data underlying a reaction network, hash all gene KO annotations
    in the contigs database.

    Parameters
    ==========
    gene_ko_hits_table : pandas.core.frame.DataFrame
        This table contains gene KO hit data from the contigs database 'gene_functions' table.
        The columns read are 'gene_callers_id', 'accession', 'function', and 'e_value'. The
        table passed in is not modified.

    Returns
    =======
    str
        SHA-1 hex digest representing all gene KO annotations.
    """
    # Sort so that the hash does not depend on the order of rows in the input table.
    sorted_hits_table = gene_ko_hits_table.sort_values(['gene_callers_id', 'accession'])

    hit_fields: List[str] = []
    for row in sorted_hits_table.itertuples(index=False):
        hit_fields.extend(
            (str(row.gene_callers_id), row.accession, row.function, str(row.e_value))
        )
    gene_ko_hits_string = ''.join(hit_fields)

    return hashlib.sha1(gene_ko_hits_string.encode('utf-8')).hexdigest()

def hash_pan_db_ko_annotations(
    self,
    genomes_storage_db: str,
    gene_clusters_functions_summary_dict: Dict,
    consensus_threshold: float,
    discard_ties: bool
) -> str:
    """
    To concisely represent the data underlying a reaction network, hash all gene KO annotations
    in the constituent genomes, all consensus KO annotations of the gene clusters, and
    parameters used to select consensus KOs.

    Parameters
    ==========
    genomes_storage_db : str
        This is the path to a genomes storage database with the underlying gene KO annotations.

    gene_clusters_functions_summary_dict : dict
        This dictionary is loaded by a pan superclass and contains gene cluster KO annotations.
        Each value must have a 'KOfam' entry with 'accession' and 'function' keys.

    consensus_threshold : float, None
        This parameter was used in setting consensus KO annotations of gene clusters.

    discard_ties : bool, False
        This parameter was used in setting consensus KO annotations of gene clusters.

    Returns
    =======
    str
        SHA-1 hex digest representing all gene KO annotations in the genomes, all gene cluster
        consensus KO annotations, and the parameters used to select consensus KOs.
    """
    gsdb = dbinfo.GenomeStorageDBInfo(genomes_storage_db).load_db()
    try:
        functions_table = gsdb.get_table_as_dataframe(
            'gene_function_calls', where_clause='source = "KOfam"'
        )
    finally:
        # Release the database even if the table cannot be read.
        gsdb.disconnect()

    # NOTE: Gene annotations were previously collected but then discarded before hashing, so
    # hashes computed before this correction do not match those computed here.
    gene_ko_annotations = sorted(
        (
            str(row.genome_name),
            str(row.gene_callers_id),
            str(row.accession),
            str(row.function),
            str(row.e_value)
        )
        for row in functions_table.itertuples(index=False)
    )

    # When the consensus KO ID and name of a gene cluster are None, they are converted into
    # 'None'.
    cluster_ko_annotations = sorted(
        (
            (
                str(cluster_id),
                str(gene_cluster_dict['KOfam']['accession']),
                str(gene_cluster_dict['KOfam']['function'])
            )
            for cluster_id, gene_cluster_dict in gene_clusters_functions_summary_dict.items()
        ),
        key=lambda annotation: annotation[0]
    )

    hashed_fields: List[str] = [f'{consensus_threshold}_{int(discard_ties)}_']
    for gene_ko_annotation in gene_ko_annotations:
        hashed_fields.extend(gene_ko_annotation)
    for cluster_ko_annotation in cluster_ko_annotations:
        hashed_fields.extend(cluster_ko_annotation)
    ko_annotations_string = ''.join(hashed_fields)

    return hashlib.sha1(ko_annotations_string.encode('utf-8')).hexdigest()
+ + modelseed_dir : str, None + Directory containing reference ModelSEED Biochemistry tables set up by anvi'o. This + attribute is assigned the argument of the same name upon initialization. + + test_dir : str, None + Directory storing test files, including copied input and output files. With the default + value of None, temporary directories are created and deleted as needed by methods. In + contrast, if a directory is provided, it and its contents will not be deleted. This + attribute is assigned the argument of the same name upon initialization. + + run : anvio.terminal.Run, anvio.terminal.Run() + This object prints run information to the terminal. This attribute is assigned the argument + of the same name upon initialization. + + progress : anvio.terminal.Progress, anvio.terminal.Progress() + This object prints transient progress information to the terminal. This attribute is + assigned the argument of the same name upon initialization. + """ + def __init__( + self, + kegg_dir: str = None, + modelseed_dir: str = None, + test_dir: str = None, + run: terminal.Run = terminal.Run(), + progress: terminal.Progress = terminal.Progress() + ) -> None: + """ + Parameters + ========== + kegg_dir : str, None + Directory containing an anvi'o KEGG database. The default argument of None expects KEGG + data to be set up in the default anvi'o directory used by the program, + `anvi-setup-kegg-data`. + + modelseed_dir : str, None + Directory containing reference ModelSEED Biochemistry tables set up by anvi'o. The + default argument of None expects ModelSEED data to be set up in the default anvi'o + directory used by the program, `anvi-setup-modelseed-database`. + + test_dir : str, None + Directory storing test files. With the default value of None, temporary test directories + are created and deleted by Tester methods; these methods operate on copies of input + files in the test directories. In contrast, if a directory is provided, it and its + contents will not be deleted. 
def test_contigs_database_network(self, contigs_db: str, copy_db: bool = True) -> None:
    """
    Test the construction of a reaction network from a contigs database, and test that network
    methods are able to run and do not fail certain basic tests.

    The network is constructed and stored, loaded back to compare overview statistics, purged of
    metabolites without a formula, and then pruned and subsetted with random samples of each
    type of network item.

    Parameters
    ==========
    contigs_db : str
        Path to a contigs database. The database can represent different types of samples,
        including a single genome, metagenome, or transcriptome. The network is derived from
        gene KO annotations stored in the database.

    copy_db : bool, True
        If True, as by default, store the generated reaction network in a copy of the input
        contigs database. If a test directory has been set, the database copy is placed there
        with a derived filename, e.g., "my-CONTIGS.db" is copied to a file like
        "TEST/my-CONTIGS-k2z9jxjd.db". If False, store the reaction network in the input contigs
        database, overwriting any network that is already stored.

    Returns
    =======
    None

    Raises
    ======
    AssertionError
        Raised if statistics differ between the constructed and loaded network, or if a pruned
        or subsetted network does not have the expected items.
    """
    def get_last_category_ids(categories_by_hierarchy) -> Set[str]:
        # IDs of the last category listed for every categorization of every hierarchy.
        return {
            categories[-1].id
            for categorizations in categories_by_hierarchy.values()
            for categories in categorizations.values()
        }

    if self.test_dir is None:
        test_dir = filesnpaths.get_temp_directory_path()
    else:
        test_dir = self.test_dir
    self.run.info("Test directory", test_dir, nl_after=1)

    self.run.info_single("NETWORK CONSTRUCTION:", mc='magenta', level=0)
    utils.is_contigs_db(contigs_db)

    if copy_db:
        # Operations are performed on a copy of the contigs database in the (provided or
        # temporary) test directory.
        prefix, suffix = os.path.splitext(os.path.basename(contigs_db))
        # mkstemp reserves the uniquely named file until the copy overwrites it, rather than
        # relying on a temporary file object being garbage collected (and its file deleted)
        # before the copy is written to the same path.
        file_descriptor, contigs_db_target = tempfile.mkstemp(
            prefix=f"{prefix}-", suffix=suffix, dir=test_dir
        )
        os.close(file_descriptor)
        shutil.copy(contigs_db, contigs_db_target)
    else:
        contigs_db_target = contigs_db

    con = Constructor(
        kegg_dir=self.kegg_dir,
        modelseed_dir=self.modelseed_dir,
        run=self.run,
        progress=self.progress
    )

    make_stats_file_target = os.path.join(test_dir, "make_contigs_db_network_stats.tsv")
    network = con.make_contigs_database_network(
        contigs_db=contigs_db_target,
        overwrite_existing_network=True,
        stats_file=make_stats_file_target
    )

    self.run.info_single("NETWORK LOADING:", mc='magenta', level=0)
    load_stats_file_target = os.path.join(test_dir, "load_contigs_db_network_stats.tsv")
    con.load_contigs_database_network(contigs_db_target, stats_file=load_stats_file_target)

    # Check that the statistics for the network constructed and saved in the contigs database
    # are the same as the statistics for the same network loaded back into memory from the
    # contigs database.
    inconsistent_stats = self._get_inconsistent_statistics(
        make_stats_file_target, load_stats_file_target
    )
    if inconsistent_stats:
        msg = "; ".join(
            f"{stat}: {stat_tuple[0]}, {stat_tuple[1]}"
            for stat, stat_tuple in inconsistent_stats.items()
        )
        raise AssertionError(
            "Statistics on the network constructed and saved to the contigs database differ "
            "from what should be the same statistics on the same network loaded from the "
            "contigs database. Here are the different statistics, with the value from network "
            f"construction before the value from network loading. {msg}"
        )

    self.run.info_single(
        "PURGE OF METABOLITES WITHOUT FORMULA:", mc='magenta', nl_before=1, level=0
    )
    deepcopy(network).remove_metabolites_without_formula(
        output_path=os.path.join(test_dir, "removed.tsv")
    )
    print()

    self.progress.new("Testing network purge methods")
    self.progress.update("...")
    # Network pruning tests use a random sample of a tenth of the network items (nodes) of each
    # type, rounded up.
    sample_proportion = 0.1
    sample_seed = RANDOM_SEED
    samples = self._get_common_item_samples(
        network, proportion=sample_proportion, seed=sample_seed
    )

    # Genes are specific to genomic networks, so they are sampled here rather than with the
    # item types shared with pangenomic networks.
    random.seed(sample_seed)
    gene_sample = set(
        random.sample(list(network.genes), math.ceil(sample_proportion * len(network.genes)))
    )

    self._test_common_prune(network, samples)

    copied_network = deepcopy(network)
    removed = copied_network.prune(genes_to_remove=gene_sample)
    assert gene_sample.isdisjoint(copied_network.genes)
    assert gene_sample.issubset({gene.gcid for gene in removed['gene']})

    # Test the pruning of multiple types of items at once.
    copied_network = deepcopy(network)
    metabolite_sample: Set[str] = samples['metabolite']
    reaction_sample: Set[str] = samples['reaction']
    ko_sample: Set[str] = samples['ko']
    module_sample: Set[str] = samples['module']
    pathway_sample: Set[str] = samples['pathway']
    hierarchy_sample: Set[str] = samples['hierarchy']
    category_sample_dict: Dict[str, List[Tuple[str]]] = samples['category_dict']
    removed = copied_network.prune(
        genes_to_remove=gene_sample,
        kos_to_remove=ko_sample,
        modules_to_remove=module_sample,
        pathways_to_remove=pathway_sample,
        hierarchies_to_remove=hierarchy_sample,
        categories_to_remove=category_sample_dict,
        reactions_to_remove=reaction_sample,
        metabolites_to_remove=metabolite_sample
    )
    # Each pair of assertions checks that the requested items are gone from the network and
    # that they are all reported among the removed items.
    assert metabolite_sample.isdisjoint(copied_network.metabolites)
    assert metabolite_sample.issubset(
        {metabolite.modelseed_id for metabolite in removed['metabolite']}
    )
    assert reaction_sample.isdisjoint(copied_network.reactions)
    assert reaction_sample.issubset(
        {reaction.modelseed_id for reaction in removed['reaction']}
    )
    assert ko_sample.isdisjoint(copied_network.kos)
    assert ko_sample.issubset({ko.id for ko in removed['ko']})
    assert module_sample.isdisjoint(copied_network.modules)
    assert module_sample.issubset({module.id for module in removed['module']})
    assert pathway_sample.isdisjoint(copied_network.pathways)
    assert pathway_sample.issubset({pathway.id for pathway in removed['pathway']})
    assert hierarchy_sample.isdisjoint(copied_network.hierarchies)
    assert hierarchy_sample.issubset({hierarchy.id for hierarchy in removed['hierarchy']})
    category_sample: Set[str] = samples['category']
    assert category_sample.isdisjoint(get_last_category_ids(copied_network.categories))
    assert category_sample.issubset({category.id for category in removed['category']})
    assert gene_sample.isdisjoint(copied_network.genes)
    assert gene_sample.issubset({gene.gcid for gene in removed['gene']})
    self.progress.end()

    self.progress.new("Testing network subset methods")
    self.progress.update("...")
    self._test_common_subset(network, samples)

    subnetwork = network.subset_network(genes_to_subset=gene_sample)
    assert gene_sample.issubset(subnetwork.genes)

    # Test network merging functionality by subsetting samples of items of all types at the same
    # time. Test subsetting with and without the "inclusive" option.
    ko_module_sample: Set[str] = samples['ko_module']
    ko_pathway_sample: Set[str] = samples['ko_pathway']
    ko_hierarchy_sample: Set[str] = samples['ko_hierarchy']
    ko_category_sample: Set[str] = samples['ko_category']
    for inclusive in (False, True):
        subnetwork = network.subset_network(
            genes_to_subset=gene_sample,
            kos_to_subset=ko_sample,
            modules_to_subset=module_sample,
            pathways_to_subset=pathway_sample,
            hierarchies_to_subset=hierarchy_sample,
            categories_to_subset=category_sample_dict,
            reactions_to_subset=reaction_sample,
            metabolites_to_subset=metabolite_sample,
            inclusive=inclusive
        )
        subnetwork_category_ids = get_last_category_ids(subnetwork.categories)
        assert metabolite_sample.issubset(subnetwork.metabolites)
        assert reaction_sample.issubset(subnetwork.reactions)
        assert ko_sample.issubset(subnetwork.kos)
        # Modules, pathways, hierarchies, and categories of the subsetted KOs come along.
        assert ko_module_sample.issubset(subnetwork.modules)
        assert ko_pathway_sample.issubset(subnetwork.pathways)
        assert ko_hierarchy_sample.issubset(subnetwork.hierarchies)
        assert ko_category_sample.issubset(subnetwork_category_ids)
        assert module_sample.issubset(subnetwork.modules)
        assert pathway_sample.issubset(subnetwork.pathways)
        assert hierarchy_sample.issubset(subnetwork.hierarchies)
        assert category_sample.issubset(subnetwork_category_ids)
        assert gene_sample.issubset(subnetwork.genes)
    self.progress.end()

    # Only a temporary directory made by this method is cleaned up.
    if self.test_dir is None:
        shutil.rmtree(test_dir)

    self.run.info_single(
        "All tests passed for the contigs database reaction network",
        mc='magenta',
        nl_before=1,
        level=0
    )
    for passed_test in (
        "Network construction and storage in the contigs database",
        "Purge metabolites without formula",
        "Purge select metabolites",
        "Purge select reactions",
        "Purge select KOs",
        "Purge KOs in select KEGG modules",
        "Purge KOs in select KEGG pathways",
        "Purge KOs in select KEGG BRITE hierarchies",
        "Purge KOs in select KEGG BRITE hierarchy categories",
        "Purge select genes",
        "Subset select metabolites",
        "Subset select reactions",
        "Subset select KOs",
        "Subset KOs in select KEGG modules",
        "Subset KOs in select KEGG pathways",
        "Subset KOs in select KEGG BRITE hierarchies",
        "Subset KOs in select KEGG BRITE hierarchy categories",
        "Subset select genes",
    ):
        self.run.info_single(passed_test)
    self.run.info_single("Subset select metabolites, reactions, KOs, and genes", nl_after=1)
def test_pan_database_network(
    self,
    pan_db: str,
    genomes_storage_db: str,
    copy_db: bool = True,
    consensus_threshold: float = None,
    discard_ties: bool = False
) -> None:
    """
    Test the construction of a reaction network from a pan database, and test that network
    methods are able to run and do not fail certain basic tests.

    The network is constructed and stored, loaded back to compare overview statistics, purged of
    metabolites without a formula, and then pruned and subsetted with random samples of each
    type of network item.

    Parameters
    ==========
    pan_db : str
        Path to a pan database. The pangenomic network is determined for gene clusters stored in
        the database.

    genomes_storage_db : str
        Path to a genomes storage database. The pangenomic network is derived from gene KO
        annotations stored in the database.

    copy_db : bool, True
        If True, as by default, store the generated reaction network in a copy of the input pan
        database. If a test directory has been set, the database copy is placed there with a
        derived filename, e.g., "my-PAN.db" is copied to a file like "TEST/my-PAN-spiba5e7.db".
        If False, store the reaction network in the input pan database, overwriting any network
        that is already stored.

    consensus_threshold : float, None
        With the default of None, the protein annotation most frequent among genes in a cluster
        is assigned to the cluster itself. If a non-default argument is provided (a value on [0,
        1]), at least this proportion of genes in the cluster must have the most frequent
        annotation for the cluster to be annotated.

    discard_ties : bool, False
        If multiple protein annotations are most frequent among genes in a cluster, then do not
        assign an annotation to the cluster itself when this argument is True. By default, this
        argument is False, so one of the most frequent annotations would be arbitrarily chosen.

    Returns
    =======
    None

    Raises
    ======
    AssertionError
        Raised if statistics differ between the constructed and loaded network, or if a pruned
        or subsetted network does not have the expected items.
    """
    def get_last_category_ids(categories_by_hierarchy) -> Set[str]:
        # IDs of the last category listed for every categorization of every hierarchy.
        return {
            categories[-1].id
            for categorizations in categories_by_hierarchy.values()
            for categories in categorizations.values()
        }

    if self.test_dir is None:
        test_dir = filesnpaths.get_temp_directory_path()
    else:
        test_dir = self.test_dir
    self.run.info("Test directory", test_dir, nl_after=1)

    self.run.info_single("NETWORK CONSTRUCTION:", mc='magenta', level=0)
    utils.is_pan_db(pan_db)
    utils.is_genome_storage(genomes_storage_db)

    if copy_db:
        # Operations are performed on a copy of the pan database in the (provided or temporary)
        # test directory.
        prefix, suffix = os.path.splitext(os.path.basename(pan_db))
        # mkstemp reserves the uniquely named file until the copy overwrites it, rather than
        # relying on a temporary file object being garbage collected (and its file deleted)
        # before the copy is written to the same path.
        file_descriptor, pan_db_target = tempfile.mkstemp(
            prefix=f"{prefix}-", suffix=suffix, dir=test_dir
        )
        os.close(file_descriptor)
        shutil.copy(pan_db, pan_db_target)
    else:
        pan_db_target = pan_db

    con = Constructor(
        kegg_dir=self.kegg_dir,
        modelseed_dir=self.modelseed_dir,
        run=self.run,
        progress=self.progress
    )

    make_stats_file_target = os.path.join(test_dir, "make_pan_db_network_stats.tsv")
    network = con.make_pangenomic_network(
        pan_db=pan_db_target,
        genomes_storage_db=genomes_storage_db,
        overwrite_existing_network=True,
        consensus_threshold=consensus_threshold,
        discard_ties=discard_ties,
        stats_file=make_stats_file_target
    )

    self.run.info_single("NETWORK LOADING:", mc='magenta', level=0)
    load_stats_file_target = os.path.join(test_dir, "load_pan_db_network_stats.tsv")
    con.load_pan_database_network(
        pan_db_target, genomes_storage_db, stats_file=load_stats_file_target
    )

    # Check that the statistics for the network constructed and saved in the pan database are
    # the same as the statistics for the same network loaded back into memory from the pan
    # database.
    inconsistent_stats = self._get_inconsistent_statistics(
        make_stats_file_target, load_stats_file_target
    )
    if inconsistent_stats:
        msg = "; ".join(
            f"{stat}: {stat_tuple[0]}, {stat_tuple[1]}"
            for stat, stat_tuple in inconsistent_stats.items()
        )
        raise AssertionError(
            "Statistics on the network constructed and saved to the pan database differ from "
            "what should be the same statistics on the same network loaded from the pan "
            "database. Here are the different statistics, with the value from network "
            f"construction before the value from network loading. {msg}"
        )

    self.run.info_single(
        "PURGE OF METABOLITES WITHOUT FORMULA:", mc='magenta', nl_before=1, level=0
    )
    deepcopy(network).remove_metabolites_without_formula(
        output_path=os.path.join(test_dir, "removed.tsv")
    )
    print()

    self.progress.new("Testing network purge methods")
    self.progress.update("...")
    # Network pruning tests use a random sample of a tenth of the network items (nodes) of each
    # type, rounded up.
    sample_proportion = 0.1
    sample_seed = RANDOM_SEED
    samples = self._get_common_item_samples(
        network, proportion=sample_proportion, seed=sample_seed
    )

    # Gene clusters are specific to pangenomic networks, so they are sampled here rather than
    # with the item types shared with genomic networks.
    random.seed(sample_seed)
    gene_cluster_sample = set(random.sample(
        list(network.gene_clusters), math.ceil(sample_proportion * len(network.gene_clusters))
    ))

    self._test_common_prune(network, samples)

    copied_network = deepcopy(network)
    removed = copied_network.prune(gene_clusters_to_remove=gene_cluster_sample)
    assert gene_cluster_sample.isdisjoint(copied_network.gene_clusters)
    assert gene_cluster_sample.issubset(
        {gene_cluster.gene_cluster_id for gene_cluster in removed['gene_cluster']}
    )

    # Test the pruning of multiple types of items at the same time.
    copied_network = deepcopy(network)
    metabolite_sample: Set[str] = samples['metabolite']
    reaction_sample: Set[str] = samples['reaction']
    ko_sample: Set[str] = samples['ko']
    module_sample: Set[str] = samples['module']
    pathway_sample: Set[str] = samples['pathway']
    hierarchy_sample: Set[str] = samples['hierarchy']
    category_sample_dict: Dict[str, List[Tuple[str]]] = samples['category_dict']
    removed = copied_network.prune(
        gene_clusters_to_remove=gene_cluster_sample,
        kos_to_remove=ko_sample,
        modules_to_remove=module_sample,
        pathways_to_remove=pathway_sample,
        hierarchies_to_remove=hierarchy_sample,
        categories_to_remove=category_sample_dict,
        reactions_to_remove=reaction_sample,
        metabolites_to_remove=metabolite_sample
    )
    # Each pair of assertions checks that the requested items are gone from the network and
    # that they are all reported among the removed items.
    assert metabolite_sample.isdisjoint(copied_network.metabolites)
    assert metabolite_sample.issubset(
        {metabolite.modelseed_id for metabolite in removed['metabolite']}
    )
    assert reaction_sample.isdisjoint(copied_network.reactions)
    assert reaction_sample.issubset(
        {reaction.modelseed_id for reaction in removed['reaction']}
    )
    assert ko_sample.isdisjoint(copied_network.kos)
    assert ko_sample.issubset({ko.id for ko in removed['ko']})
    assert module_sample.isdisjoint(copied_network.modules)
    assert module_sample.issubset({module.id for module in removed['module']})
    assert pathway_sample.isdisjoint(copied_network.pathways)
    assert pathway_sample.issubset({pathway.id for pathway in removed['pathway']})
    assert hierarchy_sample.isdisjoint(copied_network.hierarchies)
    assert hierarchy_sample.issubset({hierarchy.id for hierarchy in removed['hierarchy']})
    category_sample: Set[str] = samples['category']
    assert category_sample.isdisjoint(get_last_category_ids(copied_network.categories))
    assert category_sample.issubset({category.id for category in removed['category']})
    assert gene_cluster_sample.isdisjoint(copied_network.gene_clusters)
    assert gene_cluster_sample.issubset(
        {gene_cluster.gene_cluster_id for gene_cluster in removed['gene_cluster']}
    )
    self.progress.end()

    self.progress.new("Testing network subset methods")
    self.progress.update("...")
    self._test_common_subset(network, samples)

    subnetwork = network.subset_network(gene_clusters_to_subset=gene_cluster_sample)
    assert gene_cluster_sample.issubset(subnetwork.gene_clusters)

    # Test network merging functionality by subsetting samples of items of all types. Test
    # subsetting with and without the "inclusive" option.
    ko_module_sample: Set[str] = samples['ko_module']
    ko_pathway_sample: Set[str] = samples['ko_pathway']
    ko_hierarchy_sample: Set[str] = samples['ko_hierarchy']
    ko_category_sample: Set[str] = samples['ko_category']
    for inclusive in (False, True):
        subnetwork = network.subset_network(
            gene_clusters_to_subset=gene_cluster_sample,
            kos_to_subset=ko_sample,
            modules_to_subset=module_sample,
            pathways_to_subset=pathway_sample,
            hierarchies_to_subset=hierarchy_sample,
            categories_to_subset=category_sample_dict,
            reactions_to_subset=reaction_sample,
            metabolites_to_subset=metabolite_sample,
            inclusive=inclusive
        )
        subnetwork_category_ids = get_last_category_ids(subnetwork.categories)
        assert metabolite_sample.issubset(subnetwork.metabolites)
        assert reaction_sample.issubset(subnetwork.reactions)
        assert ko_sample.issubset(subnetwork.kos)
        # Modules, pathways, hierarchies, and categories of the subsetted KOs come along.
        assert ko_module_sample.issubset(subnetwork.modules)
        assert ko_pathway_sample.issubset(subnetwork.pathways)
        assert ko_hierarchy_sample.issubset(subnetwork.hierarchies)
        assert ko_category_sample.issubset(subnetwork_category_ids)
        assert module_sample.issubset(subnetwork.modules)
        assert pathway_sample.issubset(subnetwork.pathways)
        assert hierarchy_sample.issubset(subnetwork.hierarchies)
        assert category_sample.issubset(subnetwork_category_ids)
        assert gene_cluster_sample.issubset(subnetwork.gene_clusters)
    self.progress.end()

    # Only a temporary directory made by this method is cleaned up.
    if self.test_dir is None:
        shutil.rmtree(test_dir)

    self.run.info_single(
        "All tests passed for the pan database reaction network",
        mc='magenta',
        nl_before=1,
        level=0
    )
    for passed_test in (
        "Network construction and storage in the pan database",
        "Purge metabolites without formula",
        "Purge select metabolites",
        "Purge select reactions",
        "Purge select KOs",
        "Purge KOs in select KEGG modules",
        "Purge KOs in select KEGG pathways",
        "Purge KOs in select KEGG BRITE hierarchies",
        "Purge KOs in select KEGG BRITE hierarchy categories",
        "Purge select gene clusters",
        "Subset select metabolites",
        "Subset select reactions",
        "Subset select KOs",
        "Subset KOs in select KEGG modules",
        "Subset KOs in select KEGG pathways",
        "Subset KOs in select KEGG BRITE hierarchies",
        "Subset KOs in select KEGG BRITE hierarchy categories",
        "Subset select gene clusters",
    ):
        self.run.info_single(passed_test)
    self.run.info_single(
        "Subset select metabolites, reactions, KOs, and gene clusters", nl_after=1
    )
def _get_inconsistent_statistics(
    self,
    make_stats_file_target: str,
    load_stats_file_target: str
) -> Dict[str, Tuple[float, float]]:
    """
    Compare statistics for the network constructed and saved in an anvi'o database with
    statistics for what should be the same network loaded back into memory from the database,
    returning any inconsistent statistics.

    Only statistics named in both files are compared, as the two tables are joined on the
    statistic names they share. A statistic with a missing value in both files is treated as
    consistent.

    Parameters
    ==========
    make_stats_file_target : str
        Path to tab-delimited file of statistics for network construction, with 'Statistic' and
        'Value' columns.

    load_stats_file_target : str
        Path to tab-delimited file of statistics for network loading, with 'Statistic' and
        'Value' columns.

    Returns
    =======
    Dict[str, Tuple[float, float]]
        Inconsistent statistics, with keys being names of the statistics and values being pairs
        of the statistic from network construction and loading, respectively.
    """
    def read_statistics(path: str, value_column: str) -> pd.DataFrame:
        # Load the 'Value' column of a statistics file, indexed by statistic name, under a
        # column name that identifies where the values came from.
        table = pd.read_csv(
            path,
            sep='\t',
            header=0,
            index_col='Statistic',
            usecols=['Statistic', 'Value']
        )
        return table.rename({'Value': value_column}, axis=1)

    stats_table = pd.merge(
        read_statistics(make_stats_file_target, 'make'),
        read_statistics(load_stats_file_target, 'load'),
        left_index=True,
        right_index=True
    )

    inconsistent_stats: Dict[str, Tuple[float, float]] = {}
    for row in stats_table.itertuples():
        if row.make == row.load:
            continue
        # NaN never compares equal to itself, so a value missing from both tables would
        # otherwise be reported as a difference.
        if pd.isna(row.make) and pd.isna(row.load):
            continue
        inconsistent_stats[row.Index] = (row.make, row.load)

    return inconsistent_stats
def _get_common_item_samples(
    self,
    network: ReactionNetwork,
    proportion: float = 0.1,
    seed: int = 0
) -> Dict:
    """
    Get a random sample of the network items (nodes) of each type shared in common between
    genomic and pangenomic networks.

    Parameters
    ==========
    network : ReactionNetwork
        Network generated from gene or gene cluster KO annotations.

    proportion : float, 0.1
        Proportion to be randomly sampled of network items of each type. The number of items
        sampled is rounded up.

    seed : int, 0
        Seed for random number generation. The generator is reseeded before each type of item is
        sampled.

    Returns
    =======
    dict
        Dictionary with keys indicating item type and values being samples of item IDs. Keys
        prefixed with 'ko_' hold the modules, pathways, hierarchies, and categories of the
        sampled KOs. 'category_dict' holds the sampled categories reformatted as a dictionary
        keyed by hierarchy ID, with values being lists of category tuples.
    """
    def sample_ids(item_ids) -> Set[str]:
        # Reseed before every draw so that each sample only depends on the seed and items.
        random.seed(seed)
        item_ids = list(item_ids)
        return set(random.sample(item_ids, math.ceil(proportion * len(item_ids))))

    samples = {}
    samples['metabolite'] = sample_ids(network.metabolites)
    samples['reaction'] = sample_ids(network.reactions)
    samples['ko'] = sample_ids(network.kos)

    # Record what the sampled KOs are classified in, including every level of each
    # categorization, from the top category down to the one containing the KO.
    ko_module_ids: Set[str] = set()
    ko_pathway_ids: Set[str] = set()
    ko_hierarchy_ids: Set[str] = set()
    ko_category_ids: Set[str] = set()
    for ko_id in samples['ko']:
        ko = network.kos[ko_id]
        ko_module_ids.update(ko.module_ids)
        ko_pathway_ids.update(ko.pathway_ids)
        for hierarchy_id, categorizations in ko.hierarchies.items():
            ko_hierarchy_ids.add(hierarchy_id)
            for categorization in categorizations:
                for depth in range(1, len(categorization) + 1):
                    ko_category_ids.add(
                        f"{hierarchy_id}: {' >>> '.join(categorization[:depth])}"
                    )
    samples['ko_module'] = ko_module_ids
    samples['ko_pathway'] = ko_pathway_ids
    samples['ko_hierarchy'] = ko_hierarchy_ids
    samples['ko_category'] = ko_category_ids

    samples['module'] = sample_ids(network.modules)
    samples['pathway'] = sample_ids(network.pathways)
    samples['hierarchy'] = sample_ids(network.hierarchies)

    all_category_ids: List[str] = [
        categories[-1].id
        for categorizations in network.categories.values()
        for categories in categorizations.values()
    ]
    samples['category'] = category_sample = sample_ids(all_category_ids)

    # Reformat the category IDs into an argument for pruning and subsetting.
    category_sample_dict: Dict[str, List[Tuple[str]]] = {}
    hierarchy_id_pattern = re.compile(r'ko\d{5}')
    for category_id in category_sample:
        hierarchy_id = category_id.split(':')[0]
        assert re.fullmatch(hierarchy_id_pattern, hierarchy_id)
        # Skip the hierarchy ID and the ': ' separator that follows it.
        category_names = category_id[len(hierarchy_id) + 2:].split(' >>> ')
        category_sample_dict.setdefault(hierarchy_id, []).append(tuple(category_names))
    samples['category_dict'] = category_sample_dict

    return samples
def _test_common_prune(
    self,
    network: Union[GenomicNetwork, PangenomicNetwork],
    samples: Dict
) -> None:
    """
    Test the prune method of the reaction network, purging items of types in common to genomic
    and pangenomic networks.

    Each item type is pruned from its own deep copy of the network, so the network passed in is
    left untouched.

    Parameters
    ==========
    network : Union[GenomicNetwork, PangenomicNetwork]
        Network generated from gene or gene cluster KO annotations.

    samples : Dict
        Dictionary with keys indicating item type and values being samples of item IDs.

    Returns
    =======
    None
    """
    # The most basic test of the purge (pruning) method is that the network no longer contains
    # the items that were requested to be removed. What remains untested, and would require a
    # curated test dataset, is the removal of certain other "upstream" and "downstream" nodes
    # associated with the nodes requested to be removed, e.g., KOs and genes or gene clusters
    # upstream and metabolites downstream of requested reactions.
    # Each entry: item type (key of samples and of removed items), prune keyword argument,
    # network attribute holding items of the type, and item attribute holding the ID.
    simple_item_types = (
        ('metabolite', 'metabolites_to_remove', 'metabolites', 'modelseed_id'),
        ('reaction', 'reactions_to_remove', 'reactions', 'modelseed_id'),
        ('ko', 'kos_to_remove', 'kos', 'id'),
        ('module', 'modules_to_remove', 'modules', 'id'),
        ('pathway', 'pathways_to_remove', 'pathways', 'id'),
        ('hierarchy', 'hierarchies_to_remove', 'hierarchies', 'id'),
    )
    for item_type, prune_keyword, network_attribute, id_attribute in simple_item_types:
        pruned_network = deepcopy(network)
        requested_ids: Set[str] = samples[item_type]
        removed = pruned_network.prune(**{prune_keyword: requested_ids})
        # None of the requested items remain, and all of them are reported as removed.
        assert requested_ids.isdisjoint(getattr(pruned_network, network_attribute))
        assert requested_ids.issubset(
            {getattr(item, id_attribute) for item in removed[item_type]}
        )

    # Categories are requested with a dictionary argument but checked by category ID.
    pruned_network = deepcopy(network)
    category_sample_dict: Dict[str, List[Tuple[str]]] = samples['category_dict']
    removed = pruned_network.prune(categories_to_remove=category_sample_dict)
    remaining_category_ids = {
        categories[-1].id
        for categorizations in pruned_network.categories.values()
        for categories in categorizations.values()
    }
    category_sample: Set[str] = samples['category']
    assert category_sample.isdisjoint(remaining_category_ids)
    assert category_sample.issubset({category.id for category in removed['category']})
def _test_common_subset(
    self,
    network: Union[GenomicNetwork, PangenomicNetwork],
    samples: Dict
) -> None:
    """
    Test the subset method of the reaction network, subsetting items of types in common to
    genomic and pangenomic networks.

    Parameters
    ==========
    network : Union[GenomicNetwork, PangenomicNetwork]
        Network generated from gene or gene cluster KO annotations.

    samples : Dict
        Dictionary with keys indicating item type and values being samples of item IDs.

    Returns
    =======
    None
    """
    def get_last_category_ids(subsetted_network) -> Set[str]:
        # IDs of the last category listed for every categorization of every hierarchy.
        return {
            categories[-1].id
            for categorizations in subsetted_network.categories.values()
            for categories in categorizations.values()
        }

    def subset_and_check(item_type: str, subset_keyword: str, network_attribute: str):
        # Subset the sampled items of one type, assert that all of the items requested to be
        # subsetted were subsetted, and hand back the subnetwork for any further checks.
        requested_ids: Set[str] = samples[item_type]
        subsetted_network = network.subset_network(**{subset_keyword: requested_ids})
        assert requested_ids.issubset(getattr(subsetted_network, network_attribute))
        return subsetted_network

    # The most basic test of the subset method is that the new network contains the requested
    # items. What remains untested, and would require a curated test dataset, is the inclusion
    # of certain other "upstream" and "downstream" nodes associated with the nodes requested to
    # be removed, e.g., KOs and gene clusters upstream and metabolites downstream of requested
    # reactions.
    subset_and_check('metabolite', 'metabolites_to_subset', 'metabolites')
    subset_and_check('reaction', 'reactions_to_subset', 'reactions')

    # Subsetting KOs should also bring along what the KOs are classified in.
    subnetwork = subset_and_check('ko', 'kos_to_subset', 'kos')
    ko_module_sample: Set[str] = samples['ko_module']
    assert ko_module_sample.issubset(subnetwork.modules)
    ko_pathway_sample: Set[str] = samples['ko_pathway']
    assert ko_pathway_sample.issubset(subnetwork.pathways)
    ko_hierarchy_sample: Set[str] = samples['ko_hierarchy']
    assert ko_hierarchy_sample.issubset(subnetwork.hierarchies)
    ko_category_sample: Set[str] = samples['ko_category']
    assert ko_category_sample.issubset(get_last_category_ids(subnetwork))

    subset_and_check('module', 'modules_to_subset', 'modules')
    subset_and_check('pathway', 'pathways_to_subset', 'pathways')
    subset_and_check('hierarchy', 'hierarchies_to_subset', 'hierarchies')

    # Categories are requested with a dictionary argument but checked by category ID.
    category_sample_dict: Dict[str, List[Tuple[str]]] = samples['category_dict']
    subnetwork = network.subset_network(categories_to_subset=category_sample_dict)
    remaining_category_ids = get_last_category_ids(subnetwork)
    category_sample: Set[str] = samples['category']
    assert category_sample.issubset(remaining_category_ids)
class FormulaMatcher:
    """
    Match chemical formulas to metabolites in a reaction network.

    Attributes
    ==========
    network : ReactionNetwork
        Network generated from gene or gene cluster KO annotations.
    """
    def __init__(self, network: ReactionNetwork) -> None:
        """
        Parameters
        ==========
        network : ReactionNetwork
            Network generated from gene or gene cluster KO annotations. This becomes the attribute
            of the same name.
        """
        self.network = network

    def match_metabolites(self, formula: str) -> List[ModelSEEDCompound]:
        """
        Match a formula written the standard way to metabolites in the network, returning a list of
        metabolites.

        Parameters
        ==========
        formula : str
            Chemical formula written the standard way.

        Returns
        =======
        List[ModelSEEDCompound]
            Metabolites with the same formula. The comparison is an exact string match, and the
            list is empty if nothing matches.
        """
        return [
            metabolite for metabolite in self.network.metabolites.values()
            if formula == metabolite.formula
        ]

    def match_metabolites_network(
        self,
        formula: str
    ) -> Tuple[List[ModelSEEDCompound], Union[ReactionNetwork, None]]:
        """
        Match a formula written the standard way to metabolites in the network, returning a list of
        metabolites and the subsetted network containing those metabolites.

        Parameters
        ==========
        formula : str
            Chemical formula written the standard way.

        Returns
        =======
        Tuple[List[ModelSEEDCompound], Union[ReactionNetwork, None]]
            Metabolites with the same formula and the subsetted network containing those
            metabolites. If no metabolite has the formula, the list is empty and None is returned
            in place of the network.
        """
        metabolites = self.match_metabolites(formula)
        if not metabolites:
            # There is nothing to subset by, so no subnetwork is built.
            return metabolites, None

        subnetwork = self.network.subset_network(
            metabolites_to_subset=[metabolite.modelseed_id for metabolite in metabolites]
        )

        return metabolites, subnetwork

def get_chemical_equation(
    reaction: ModelSEEDReaction,
    use_compound_names: Iterable[str] = None,
    ignore_compartments: bool = False
) -> str:
    """
    Get a decent-looking chemical equation.

    Parameters
    ==========
    reaction : ModelSEEDReaction
        The representation of the reaction with data sourced from ModelSEED Biochemistry. The
        attributes read here are 'compound_ids', 'coefficients', 'compartments', and
        'reversibility'.

    use_compound_names : Iterable[str], None
        Rather than showing ModelSEED compound IDs in the equation, show ModelSEED compound names --
        except for compounds lacking a name, in which case ID is shown instead. Provide the compound
        names to be used in lieu of IDs, in the same order as the compound IDs in the reaction, and
        with entries of None for nameless compounds.

    ignore_compartments : bool, False
        If True, do not show metabolite compartments in the equation.

    Returns
    =======
    str
        Terms are written as "<coefficient> <compound>", followed by the compartment in square
        brackets, e.g., "[c]", unless compartments are ignored. Terms on the same side are joined
        by " + ". Compounds preceding the first positive coefficient are reactants, shown with the
        sign of their coefficient flipped; the remaining compounds are products. The sides are
        separated by a unidirectional arrow, "->", if the reaction is irreversible and a
        bidirectional arrow, "<->", if reversible. No arrow is shown if there are no products.
    """
    if use_compound_names:
        # Fall back to the compound ID wherever a name was not supplied.
        compounds: List[str] = [
            compound_name if compound_name else compound_id
            for compound_name, compound_id in zip(use_compound_names, reaction.compound_ids)
        ]
    else:
        compounds = reaction.compound_ids

    reactant_terms: List[str] = []
    product_terms: List[str] = []
    leftside = True
    for coefficient, compound, compartment in zip(
        reaction.coefficients, compounds, reaction.compartments
    ):
        if leftside and coefficient > 0:
            # The first positive coefficient marks the switch from reactants to products.
            leftside = False

        coeff = -coefficient if leftside else coefficient
        if ignore_compartments:
            term = f"{coeff} {compound}"
        else:
            term = f"{coeff} {compound} [{compartment}]"

        if leftside:
            reactant_terms.append(term)
        else:
            product_terms.append(term)

    # Terms are joined rather than concatenated and trimmed with `rstrip('+ ')`: `rstrip` removes
    # any trailing '+' and ' ' characters, which truncated compound names ending in '+', such as
    # "H+" and "NAD+", when compartments were not shown.
    leftside_equation = " + ".join(reactant_terms)
    if not product_terms:
        return leftside_equation

    arrow = "<->" if reaction.reversibility else "->"
    rightside_equation = " + ".join(product_terms)

    return f"{leftside_equation} {arrow} {rightside_equation}"

def to_lcm_denominator(floats: Iterable[float]) -> List[int]:
    """
    Convert a list of floats to a list of integers, with a list containing fractional numbers
    transformed to a list of lowest common integer multiples.

    Parameters
    ==========
    floats : Iterable[float]
        Numbers to convert. Each is approximated by the closest fraction with a denominator of at
        most one million (the default of `Fraction.limit_denominator`).

    Returns
    =======
    List[int]
        List of integers transformed from the input list, in the same order. An empty input yields
        an empty list.
    """
    def lcm(a, b):
        return a * b // math.gcd(a, b)

    rationals = [fractions.Fraction(f).limit_denominator() for f in floats]
    # The initial value of 1 makes the reduction well-defined for an empty input.
    lcm_denom = functools.reduce(lcm, [r.denominator for r in rationals], 1)

    # The LCM is a multiple of every denominator, so integer division is exact and avoids the
    # precision loss of float division with large numerators.
    return [r.numerator * (lcm_denom // r.denominator) for r in rationals]
'gene_cluster_function_metabolites' -pan_gene_cluster_function_metabolites_table_structure = ['modelseed_compound_id', 'modelseed_compound_name', 'kegg_aliases', 'formula', 'charge'] -pan_gene_cluster_function_metabolites_table_types = [ 'text' , 'text' , 'text' , 'text' , 'numeric'] +pan_reaction_network_metabolites_table_name = 'pan_reaction_network_metabolites' +pan_reaction_network_metabolites_table_structure = ['modelseed_compound_id', 'modelseed_compound_name', 'kegg_aliases', 'formula', 'charge' , 'smiles'] +pan_reaction_network_metabolites_table_types = [ 'text' , 'text' , 'text' , 'text' , 'numeric', 'text' ] +pan_reaction_network_kegg_table_name = 'pan_reaction_network_kegg' +pan_reaction_network_kegg_table_structure = ['kegg_id', 'name', 'modules', 'pathways', 'brite_categorization'] +pan_reaction_network_kegg_table_types = [ 'text' , 'text', 'text' , 'text' , 'text' ] #################################################################################################### # @@ -93,13 +96,17 @@ gene_function_calls_table_structure = ['gene_callers_id', 'source', 'accession', 'function', 'e_value'] gene_function_calls_table_types = [ 'numeric' , 'text' , 'text' , 'text' , 'numeric'] -gene_function_reactions_table_name = 'gene_function_reactions' -gene_function_reactions_table_structure = ['modelseed_reaction_id', 'modelseed_reaction_name', 'ko_kegg_reaction_source', 'ko_ec_number_source', 'other_kegg_reaction_ids', 'other_ec_numbers', 'metabolite_modelseed_ids', 'stoichiometry', 'compartments', 'reversibility'] -gene_function_reactions_table_types = [ 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'bool' ] +reaction_network_reactions_table_name = 'reaction_network_reactions' +reaction_network_reactions_table_structure = ['modelseed_reaction_id', 'modelseed_reaction_name', 'ko_kegg_reaction_source', 'ko_ec_number_source', 'other_kegg_reaction_ids', 'other_ec_numbers', 'metabolite_modelseed_ids', 'stoichiometry', 'compartments', 
'reversibility'] +reaction_network_reactions_table_types = [ 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'bool' ] -gene_function_metabolites_table_name = 'gene_function_metabolites' -gene_function_metabolites_table_structure = ['modelseed_compound_id', 'modelseed_compound_name', 'kegg_aliases', 'formula', 'charge'] -gene_function_metabolites_table_types = [ 'text' , 'text' , 'text' , 'text' , 'numeric'] +reaction_network_metabolites_table_name = 'reaction_network_metabolites' +reaction_network_metabolites_table_structure = ['modelseed_compound_id', 'modelseed_compound_name', 'kegg_aliases', 'formula', 'charge' , 'smiles'] +reaction_network_metabolites_table_types = [ 'text' , 'text' , 'text' , 'text' , 'numeric', 'text' ] + +reaction_network_kegg_table_name = 'reaction_network_kegg' +reaction_network_kegg_table_structure = ['kegg_id', 'name', 'modules', 'pathways', 'brite_categorization'] +reaction_network_kegg_table_types = [ 'text' , 'text', 'text' , 'text' , 'text' ] taxon_names_table_name = 'taxon_names' taxon_names_table_structure = ['taxon_id', "t_phylum", "t_class", "t_order", "t_family", "t_genus", "t_species"] @@ -400,12 +407,18 @@ 'max_normalized_ratio_splits': False, 'relative_abundance_splits': False, pan_gene_clusters_table_name: True, - pan_gene_cluster_function_reactions_table_name: False, - pan_gene_cluster_function_metabolites_table_name: False, + 'gene_cluster_function_reactions': False, # renamed to 'pan_reaction_network_reactions' + pan_reaction_network_reactions_table_name: False, + 'gene_cluster_function_metabolites': False, # renamed to 'pan_reaction_network_metabolites' + pan_reaction_network_metabolites_table_name: False, + pan_reaction_network_kegg_table_name: False, genes_in_splits_table_name: True, gene_function_calls_table_name: True, - gene_function_reactions_table_name: False, - gene_function_metabolites_table_name: False, + 'gene_function_reactions': False, # renamed to 
#!/bin/bash
# Component test for `anvi-draw-kegg-pathways`: builds a small pangenome from mock contigs
# databases, stores a pangenomic reaction network, and draws KO data on KEGG pathway maps from
# single/multiple contigs databases and from the pan database.
source 00.sh

SETUP_WITH_OUTPUT_DIR $1 $2 $3

rn_python_script=$(readlink -f run_component_tests_for_reaction_network)

INFO "Checking for the required KEGG database set up by anvi'o in a default location"
"${rn_python_script}" --check-default-kegg-database

INFO "Setting up the KEGG mapping analysis directory"
mkdir -p "${output_dir}/"
# These databases should already contain KO annotations.
cp "${files}"/mock_data_for_pangenomics/*.db "${output_dir}/"
cp "${files}/mock_data_for_pangenomics/external-genomes.txt" "${output_dir}/"
cp "${files}/example_description.md" "${output_dir}/"
cd "${output_dir}/"
# One output directory per `anvi-draw-kegg-pathways` run below.
mkdir contigs_db_kos
mkdir contigs_dbs_kos_count
mkdir contigs_dbs_kos_membership
mkdir pan_db_kos_genome_count_emphasize_shared
mkdir pan_db_kos_genome_count_emphasize_unshared
mkdir pan_db_kos_presence_absence

INFO "Migrating all databases"
anvi-migrate *db --migrate-quickly

INFO "Generating an anvi'o genomes storage"
anvi-gen-genomes-storage -e external-genomes.txt -o TEST-GENOMES.db --no-progress

INFO "Running the pangenome analysis with default parameters"
# ${thread_controller} is deliberately unquoted: it may be empty or hold several words.
anvi-pan-genome -g TEST-GENOMES.db \
                -o TEST/ \
                -n TEST \
                --use-ncbi-blast \
                --description example_description.md \
                --no-progress \
                ${thread_controller}

use_default_modelseed_db=$("${rn_python_script}" --check-default-modelseed-database)
if [ "${use_default_modelseed_db}" == "True" ]
then
    INFO "Using the ModelSEED Biochemistry database already set up by anvi'o in a default location"
else
    INFO "Setting up the ModelSEED Biochemistry database in a temporary directory (a permanent ModelSEED database can be installed in the default location with 'anvi-setup-modelseed-database')"
    data_dir=$(mktemp -d)
    anvi-setup-modelseed-database --dir "${data_dir}"
    modelseed_data_dir=${data_dir}/MODELSEED
    # Normalize the flag so that the checks below agree with the branch actually taken here,
    # even if the check script printed something other than exactly "False".
    use_default_modelseed_db="False"
fi

INFO "Generating a pangenomic reaction network"
args=()
args+=( "--pan-db" "TEST/TEST-PAN.db" )
args+=( "--genomes-storage" "TEST-GENOMES.db" )
if [ "${use_default_modelseed_db}" == "False" ]
then
    args+=( "--modelseed-dir" "${modelseed_data_dir}" )
fi
args+=( "--no-progress" )
anvi-reaction-network "${args[@]}"

pathway_numbers=( "00010" "01100" "01200" )

INFO "Testing mapping KOs from a genomic contigs database"
args=()
args+=( "--contigs-db" "E_faecalis_6240.db" )
args+=( "--output-dir" "${output_dir}/contigs_db_kos" )
args+=( "--ko" )
args+=( "--pathway-numbers" "${pathway_numbers[@]}" )
args+=( "--no-progress" )
anvi-draw-kegg-pathways "${args[@]}"

INFO "Testing mapping KOs from multiple contigs databases, displaying database counts \
emphasizing shared reactions, drawing grid maps"
args=()
args+=( "--external-genomes" "external-genomes.txt" )
args+=( "--output-dir" "${output_dir}/contigs_dbs_kos_count" )
args+=( "--ko" )
args+=( "--pathway-numbers" "${pathway_numbers[@]}" )
args+=( "--draw-grid" )
args+=( "--no-progress" )
anvi-draw-kegg-pathways "${args[@]}"

INFO "Testing mapping KOs from multiple contigs databases, displaying database membership"
args=()
args+=( "--contigs-dbs" \
"E_faecalis_6240.db" \
"E_faecalis_6255.db" \
"E_faecalis_6512.db"
)
args+=( "--output-dir" "${output_dir}/contigs_dbs_kos_membership" )
args+=( "--ko" )
args+=( "--pathway-numbers" "${pathway_numbers[@]}" )
args+=( "--no-progress" )
anvi-draw-kegg-pathways "${args[@]}"

INFO "Testing mapping KOs from a pangenomic database, displaying genome counts \
emphasizing shared reactions"
args=()
args+=( "--pan-db" "TEST/TEST-PAN.db" )
args+=( "--genomes-storage" "TEST-GENOMES.db" )
args+=( "--output-dir" "${output_dir}/pan_db_kos_genome_count_emphasize_shared" )
args+=( "--ko" )
args+=( "--pathway-numbers" "${pathway_numbers[@]}" )
args+=( "--no-progress" )
anvi-draw-kegg-pathways "${args[@]}"

INFO "Testing mapping KOs from a pangenomic database, displaying genome counts \
emphasizing unshared reactions, drawing grid maps and maps for each genome"
args=()
args+=( "--pan-db" "TEST/TEST-PAN.db" )
args+=( "--genomes-storage" "TEST-GENOMES.db" )
args+=( "--output-dir" "${output_dir}/pan_db_kos_genome_count_emphasize_unshared" )
args+=( "--draw-individual-files" )
args+=( "--draw-grid" )
args+=( "--ko" )
args+=( "--pathway-numbers" "${pathway_numbers[@]}" )
args+=( "--colormap" "plasma" "0.1" "0.9")
args+=( "--reverse-overlay" )
args+=( "--no-progress" )
anvi-draw-kegg-pathways "${args[@]}"

INFO "Testing mapping KOs from a pangenomic database, displaying presence/absence"
args=()
args+=( "--pan-db" "TEST/TEST-PAN.db" )
args+=( "--genomes-storage" "TEST-GENOMES.db" )
args+=( "--output-dir" "${output_dir}/pan_db_kos_presence_absence" )
args+=( "--ko" )
args+=( "--pathway-numbers" "${pathway_numbers[@]}" )
args+=( "--set-color" )
args+=( "--no-progress" )
anvi-draw-kegg-pathways "${args[@]}"

# Remove the temporary directory made by `mktemp -d` above, so repeated test runs do not
# accumulate ModelSEED downloads.
if [ "${use_default_modelseed_db}" == "False" ] && [ -n "${data_dir}" ]
then
    INFO "Removing the temporary ModelSEED Biochemistry database directory"
    rm -rf "${data_dir}"
fi
#!/usr/bin/env python
# -*- coding: utf-8
DESCRIPTION = """Run by the shell script of the same name that tests anvi'o reaction networks."""

import sys

from argparse import Namespace

import anvio.terminal as terminal
import anvio.reactionnetwork as rn

from anvio.errors import ConfigError
from anvio.argparse import ArgumentParser
from anvio import A, K, __version__ as VERSION


__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2024, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = VERSION
__authors__ = ["semiller10"]
__requires__ = []
__provides__ = []
__description__ = DESCRIPTION


run = terminal.Run()


def main() -> None:
    """
    Functionality required to run reaction network component testing.

    A check flag, if given, is handled first and ends the program: the KEGG check takes precedence
    over the ModelSEED check. Otherwise a network test is run from a contigs database or, failing
    that, from a pan database and its genomes storage database.

    Raises
    ======
    ConfigError
        If neither a check flag nor a usable database source is provided.
    """
    args = get_args()

    if args.check_default_kegg_database:
        check_default_kegg_database()
        return
    elif args.check_default_modelseed_database:
        check_default_modelseed_database()
        return

    has_pan_source = bool(args.pan_db and args.genomes_storage)
    if not args.contigs_db and not has_pan_source:
        # Without this guard the program would exit successfully without having tested anything,
        # e.g., when a pan database is given without its genomes storage database.
        raise ConfigError(
            "No reaction network test was run. Provide either a contigs database, or both a pan "
            "database and its genomes storage database."
        )

    tester = rn.Tester(
        modelseed_dir=args.modelseed_dir,
        test_dir=args.test_dir,
        run=run
    )
    if args.contigs_db:
        tester.test_contigs_database_network(args.contigs_db, copy_db=args.copy_db)
    else:
        tester.test_pan_database_network(
            args.pan_db,
            args.genomes_storage,
            copy_db=args.copy_db,
            consensus_threshold=args.consensus_threshold,
            discard_ties=args.discard_ties
        )

def check_default_kegg_database() -> None:
    """
    Check if the KEGG database has been set up by anvi'o in the default location, raising an error
    if this is not the case.

    Raises
    ======
    ConfigError
        If constructing `rn.KEGGData()` with default arguments raises a ConfigError.
    """
    try:
        rn.KEGGData()
    except ConfigError:
        raise ConfigError(
            "The KEGG database is not set up by anvi'o in the default location, which can be "
            "rectified by running the command, 'anvi-setup-kegg-data --kegg-snapshot v2024-08-30'."
        )

def check_default_modelseed_database() -> None:
    """
    Check if the ModelSEED Biochemistry database has been set up by anvi'o in the default location,
    print "True" to the terminal if this is the case, and "False" otherwise.

    The printed value is meant to be captured by the calling shell script, so nothing else is
    written to standard output here.
    """
    try:
        rn.ModelSEEDDatabase()
    except ConfigError:
        print("False")
        return
    print("True")

def get_args() -> Namespace:
    """
    Get arguments from the command line.

    Returns
    =======
    Namespace
        Parsed arguments.
    """
    parser = ArgumentParser(description=DESCRIPTION)

    groupA = parser.add_argument_group(
        "CHECKS",
        "Check that reference databases have been installed by anvi'o in default locations by "
        "using one (and only one) of the following flags."
    )
    groupA.add_argument(
        "--check-default-kegg-database", default=False, action='store_true', help=
        "Check if a KEGG database has been set up by anvi'o in the default location, raising an "
        "error if absent."
    )
    groupA.add_argument(
        "--check-default-modelseed-database", default=False, action='store_true', help=
        "Prints 'True' to the terminal if the ModelSEED Biochemistry database has been set up by "
        "anvi'o in the default location, and 'False' otherwise."
    )

    groupB = parser.add_argument_group(
        "TESTS",
        "Test reaction networks constructed from either a contigs database or a pan database (and "
        "its associated genomes storage database)."
    )
    groupB.add_argument(*A('contigs-db'), **K('contigs-db', {'required': False}))
    groupB.add_argument(*A('pan-db'), **K('pan-db', {'required': False}))
    groupB.add_argument(*A('genomes-storage'), **K('genomes-storage', {'required': False}))
    groupB.add_argument(
        "--test-dir", type=str, metavar='PATH', help=
        "The directory storing test files. With the default value of None, temporary directories "
        "are created and deleted as needed by the program. In contrast, if a directory path is "
        "provided, none of the test files in the directory are deleted."
    )
    groupB.add_argument(
        "--copy-db", default=False, action='store_true', help=
        "This flag causes the reaction network to be stored in a copy of the input contigs or pan "
        "database. If a test directory is provided by the argument, '--test-dir', the database "
        "copy is placed in the test directory with a derived filename, e.g., 'my-CONTIGS.db' is "
        "copied to a file like 'TEST/my-k2z9jxjd.db' and 'my-PAN.db' is copied to a file like "
        "'TEST/my-PAN-spiba5e7.db'."
    )
    groupB.add_argument(
        "--modelseed-dir", type=str, metavar='PATH', help=
        "Path to ModelSEED Biochemistry database directory. If this option is not used, the "
        "program expects a database set up in the default location used by "
        "'anvi-setup-modelseed-database'."
    )
    groupB.add_argument(
        "--consensus-threshold", default=None, type=float, metavar='FLOAT', help=
        "If this argument is provided with a pan database, then a protein annotation must be "
        "assigned to this minimum proportion of genes in a cluster in order to be imputed to the "
        "cluster as a whole. By default, without this argument, the annotation assigned to the "
        "most genes becomes the annotation of the cluster (also see --discard-ties). The consensus "
        "threshold must be a number from 0 to 1."
    )
    groupB.add_argument(
        "--discard-ties", default=False, action='store_true', help=
        "By default, a gene cluster in a pan database is assigned a protein annotation by finding "
        "the protein ortholog that occurs in the greatest number of genes in the cluster (see "
        "--consensus-threshold) and arbitrarily choosing one ortholog in case of a tie. With this "
        "flag, a tie instead results in an ortholog annotation not being assigned to the cluster."
    )

    args = parser.get_args(parser)
    return args

if __name__ == '__main__':
    try:
        main()
    except ConfigError as e:
        print(e)
        # `sys.exit` is used rather than the `exit` helper, which is only injected by the `site`
        # module and is not guaranteed to exist.
        sys.exit(-1)
modelseed_data_dir=${data_dir}/MODELSEED +fi + +INFO "Testing a genomic reaction network generated from a contigs database" +args=() +args+=( "--contigs-db" "E_faecalis_6240.db" ) +args+=( "--test-dir" ${output_dir} ) +if [ "${use_default_modelseed_db}" == "False" ] +then + args+=( "--modelseed-dir" ${modelseed_data_dir} ) +fi +args+=( "--no-progress" ) +${python_script} "${args[@]}" + +INFO "Exporting the genomic reaction network to a file" +anvi-get-metabolic-model-file --contigs-db E_faecalis_6240.db \ + --output-file E_faecalis_6240-network.json + +INFO "Testing a pangenomic reaction network generated from the pan and genomes storage databases" +args=() +args+=( "--pan-db" "TEST/TEST-PAN.db" ) +args+=( "--genomes-storage" "TEST-GENOMES.db" ) +args+=( "--test-dir" ${output_dir} ) +if [ ${use_default_modelseed_db} == "False" ] +then + args+=( "--modelseed-dir" ${modelseed_data_dir} ) +fi +args+=( "--no-progress" ) +${python_script} "${args[@]}" + +INFO "Exporting the pangenomic reaction network to a file" +anvi-get-metabolic-model-file --pan-db TEST/TEST-PAN.db \ + --genomes-storage TEST-GENOMES.db \ + --record-genomes \ + --output-file TEST-PAN-network.json + +if [ ${use_default_modelseed_db} == "False" ] +then + INFO "Removing the temporary ModelSEED Biochemistry database directory" + rm -rf ${modelseed_data_dir} +fi diff --git a/bin/anvi-draw-kegg-pathways b/bin/anvi-draw-kegg-pathways new file mode 100755 index 0000000000..320d7e5630 --- /dev/null +++ b/bin/anvi-draw-kegg-pathways @@ -0,0 +1,463 @@ +#!/usr/bin/env python +# -*- coding: utf-8 +DESCRIPTION = "Write KEGG pathway map files incorporating data sourced from anvi'o databases." 
import os
import re
import sys
import functools
import traceback
import pandas as pd

from argparse import Namespace

import anvio.kegg as kegg
import anvio.filesnpaths as filesnpaths

from anvio.keggmapping import Mapper
from anvio.argparse import ArgumentParser
from anvio import A, K, __version__ as VERSION
from anvio.errors import ConfigError, FilesNPathsError


__authors__ = ["semiller10"]
__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)"
__license__ = "GPL 3.0"
__version__ = VERSION
__requires__ = ["contigs-db", "external-genomes", "pan-db", "genomes-storage-db", "kegg-data"]
__provides__ = ["kegg-pathway-map"]
# NOTE(review): DESCRIPTION is not defined in this part of the file; it is presumably assigned
# above the imports, as in other anvi'o programs -- confirm.
__description__ = DESCRIPTION


def main() -> None:
    """Parse arguments, check KEGG data, and draw the requested pathway maps.

    Raises
    ======
    ConfigError
        If no combination of data source and data type resulted in a drawing task.
    """
    args = get_args()
    check_kegg_data(args)
    consolidate_contigs_dbs(args)

    mapper = Mapper(
        kegg_dir=args.kegg_dir,
        overwrite_output=args.overwrite_output_destinations,
        name_files=args.name_files
    )

    performed = False
    if (
        args.contigs_dbs is not None and
        len(args.contigs_dbs) == 1 and
        args.colormap is None and
        args.ko is True
    ):
        # A lone contigs database without a colormap is drawn as plain presence/absence.
        map_single_contigs_db_ko_data(args, mapper)
        performed = True
    elif (
        args.contigs_dbs is not None and
        args.ko is True
    ):
        map_multiple_contigs_dbs_ko_data(args, mapper)
        performed = True

    # Pangenomic data can be drawn in addition to contigs database data in the same run.
    if (
        args.pan_db is not None and
        args.genomes_storage is not None and
        args.ko is True
    ):
        map_pan_db_ko_data(args, mapper)
        performed = True

    if not performed:
        raise ConfigError(
            "No task was performed! The minimum requirements are a database source, such as "
            "`--contigs-dbs`, and a data type to draw, such as `--ko`."
        )

def get_args() -> Namespace:
    """Define the command line interface and return the parsed arguments."""
    parser = ArgumentParser(description=DESCRIPTION)

    groupCONTIGS = parser.add_argument_group(
        "CONTIGS DATABASE",
        "Display data from one or more contigs databases, e.g., for genomes and metagenomes."
    )
    groupCONTIGS.add_argument(
        '--contigs-dbs', type=str, nargs='+', help=
        "One or more anvi'o contigs databases generated by 'anvi-gen-contigs-database'. Contigs "
        "databases can alternatively be provided using the argument, '--external-genomes'."
    )
    groupCONTIGS.add_argument(
        '--external-genomes', type=str, help=
        "A two-column tab-delimited flat text file that lists anvi'o contigs databases generated "
        "by 'anvi-gen-contigs-database'. Contigs databases can alternatively be provided using the "
        "argument, '--contigs-dbs'. The first item in the header line of the file should read "
        "'name', and the second should read 'contigs_db_path'. Each line in the file should "
        "describe a single entry, where the first column is a name for the database, and the "
        "second is the path to the database."
    )

    groupPAN = parser.add_argument_group(
        "PANGENOMIC DATABASE",
        "Display data from a pangenome."
    )
    groupPAN.add_argument(*A('pan-db'), **K('pan-db', {'required': False}))
    groupPAN.add_argument(*A('genomes-storage'), **K('genomes-storage', {'required': False}))
    groupPAN.add_argument(
        '--consensus-threshold', default=None, type=float, metavar='FLOAT', help=
        "If this argument is provided, then a KO annotation must be assigned to this minimum "
        "proportion of genes in a cluster to be imputed to the cluster as a whole. By default, "
        "without this argument, the annotation assigned to the most genes becomes the annotation "
        "of the cluster (also see '--discard-ties'). The consensus threshold must be a number from "
        "0 to 1."
    )
    groupPAN.add_argument(
        '--discard-ties', action='store_true', default=False, help=
        "By default, a gene cluster is assigned a KO annotation by finding the protein ortholog "
        "that occurs in the greatest number of genes in the cluster (see '--consensus-threshold') "
        "and arbitrarily choosing one ortholog in case of a tie. With this flag, a tie instead "
        "results in an ortholog annotation not being assigned to the cluster."
    )

    groupOUT = parser.add_argument_group("OUTPUT", "Output files")
    groupOUT.add_argument(*A('output-dir'), **K('output-dir'))
    groupOUT.add_argument(*A('overwrite-output-destinations'), **K('overwrite-output-destinations'))
    groupOUT.add_argument(
        '--name-files', action='store_true', default=False, help=
        "Include the pathway name along with the number in output map file names. For example, in "
        "drawing KO presence/absence data, the 'Glycolysis / Gluconeogenesis' map would be saved "
        "by default to a file named 'kos_00010.pdf', but with this flag would be saved to a file "
        "named 'kos_00010_Glycolysis_Gluconeogenesis.pdf'. Likewise, the file name for "
        "'Glycosylphosphatidylinositol (GPI)-anchor biosynthesis' would be "
        "'kos_00563_Glycosylphosphatidylinositol_(GPI)_anchor_biosynthesis.pdf', and the file "
        "name for 'Biosynthesis of 12-, 14- and 16-membered macrolides' would be "
        "'kos_00522_Biosynthesis_of_12_14_and_16_membered_macrolides.pdf' with this flag."
    )
    groupOUT.add_argument(
        '--draw-individual-files', nargs='*', help=
        "Draw pathway maps for individual contigs databases if multiple databases are provided, or "
        "for individual genomes of the pangenome. If used as a flag (without values), save files "
        "for all of the individual databases or genomes. Alternatively, the project names of a "
        "subset of contigs databases or the names of a subset of genomes can be provided."
    )
    groupOUT.add_argument(
        '--draw-grid', nargs='*', help=
        "Draw a grid for each pathway map. If using multiple contigs databases, the grid shows "
        "the unified map of data from all databases and maps for individual databases. If using a "
        "pangenomic database, the grid shows the pangenomic map and maps for individual genomes. "
        "The grid view facilitates identification of the contigs databases or genomes containing "
        "reactions highlighted in the integrative map. If used as a flag (without values), all of "
        "the contigs databases or genomes are included in the grid. Alternatively, the project "
        "names of a subset of contigs databases or the names of a subset of genomes can be "
        "provided."
    )
    groupOUT.add_argument(
        '--draw-bare-maps', action='store_true', default=False, help=
        "By default, without this flag, only draw maps containing select data from anvi'o "
        "databases, such as KOs with the '--ko' flag. Even if pathway maps are given explicitly "
        "with '--pathway-numbers' (e.g., 00010 01100), if they do not contain anvi'o data, they "
        "are not drawn unless this flag is used."
    )

    groupDATA = parser.add_argument_group(
        "DATA", "Types of data to display from anvi'o databases."
    )
    groupDATA.add_argument(
        '--ko', action='store_true', default=False, help=
        "Map KOs in the contigs or pangenomic database. For contigs databases, the "
        "presence/absence of KOs is displayed. For pangenomic databases, by default, the count of "
        "genomes contributing to gene clusters annotated by KOs is translated to a color map. "
        "The argument, '--set-color', can be used to display presence/absence rather than counts."
    )

    groupMAP = parser.add_argument_group("MAP", "Pathway maps to draw")
    groupMAP.add_argument(
        '--pathway-numbers', type=str, nargs='+', help=
        "Five-digit numbers identify pathway maps to draw. By default, all maps are drawn. Numbers "
        "are five-digits long. This argument accepts regular expression patterns. For example, the "
        "values, 01100 03... , will draw the global 'Metabolic pathways' map '01100' and all of "
        "the 'Genetic Information Processing' maps with numbers starting '03'. See the following "
        "website for a classification of the maps: https://www.genome.jp/kegg/pathway.html"
    )
    groupMAP.add_argument(
        '--kegg-dir', type=str, metavar='PATH', help=
        "Path to KEGG database directory containing map files. If this option is not used, the "
        "program expects a database set up in the default location used by 'anvi-setup-kegg-data'."
    )

    groupCOLOR = parser.add_argument_group("COLOR", "Color scheme to use")
    groupCOLOR.add_argument(
        '--set-color', nargs='?', const=True, help=
        "This argument can be used as a flag (without a value), to use a single default color for "
        "presence/absence data. Alternatively, it can be used with the value 'original' to use "
        "original colors from the reference map. Lastly, this argument can take a color hex code, "
        "such as '#FFA500' for orange, to use instead of the default color. A COLOR HEX CODE "
        "ARGUMENT MUST BE ENCLOSED IN QUOTES, AS # OTHERWISE CAUSES THE REST OF THE COMMAND TO BE "
        "IGNORED AS A COMMENT. Use of this argument with certain data types preempts the default "
        "way of displaying the data. For instance, pangenomic KO data that would be dynamically "
        "colored by genome count is instead colored by presence/absence."
    )
    groupCOLOR.add_argument(
        '--colormap', nargs='+', help=
        "This option takes the name of a Matplotlib Colormap which is sampled in coloring data. In "
        "addition to the colormap name, two decimal values between 0.0 and 1.0, with the first "
        "value smaller than the second, can be provided to limit the fraction of the colormap "
        "used. For example, the values, plasma 0.2 0.9 , would extract 70%% of the 'plasma' "
        "colormap, ignoring the darkest 20%% and lightest 10%%. Here is how a colormap is applied "
        "to KO occurrence data. KO reactions can be dynamically colored by occurrence in multiple "
        "contigs databases or the genomes of a pangenome. Pangenomes by default use the sequential "
        "colormap, 'plasma_r' ('_r' can be added to colormap names to reverse the order of "
        "colors), trimming the top and bottom 10%%. 'plasma_r' spans yellow (fewer genomes) to "
        "blue-violet (more genomes), which accentuates in darker colors reactions that are shared "
        "rather than unshared across genomes. In contrast, a colormap spanning dark to light, such "
        "as 'plasma', is better for drawing attention to unshared reactions. Multiple contigs "
        "databases can use two 'schemes' for dynamic coloring, 'by_count' and 'by_database' (see "
        "the argument, '--colormap-scheme'). As with pangenomes, 'by_count' by default uses the "
        "'plasma_r' colormap trimming the top and bottom 10%%. 'by_database' by default uses the "
        "qualitative colormap, 'tab10', without trimming. This colormap contains distinct colors "
        "suitable for clearly differentiating the different databases containing reactions. See "
        "the following webpage for named colormaps: "
        "https://matplotlib.org/stable/users/explain/colors/colormaps.html#classes-of-colormaps "
    )
    groupCOLOR.add_argument(
        '--colormap-scheme', choices=['by_count', 'by_database'], help=
        "There are two ways of dynamically coloring KO reactions by inclusion in multiple contigs "
        "databases (using options '--contigs-dbs' and '--ko'): by count or by database. By default, "
        "with 4 or more databases, reactions are colored by count, and with 2 or 3, by database. "
        "In coloring by count of databases, the colormap should be sequential, such that the color "
        "of a reaction changes 'smoothly' with the count. In contrast, coloring by database means "
        "reaction color is determined by membership in a database or combination of databases, so "
        "a qualitative colormap can be used, as by default, instead of a sequential colormap to "
        "give a distinct color to each membership category."
    )
    groupCOLOR.add_argument(
        '--reverse-overlay', action='store_true', default=False, help=
        "By default, without this flag, reactions with a greater numerical value (e.g., in more "
        "contigs databases or pangenomic genomes) are drawn on top of those with a lesser value. "
        "With this flag, the opposite applies; especially in global maps with a non-default "
        "colormap spanning dark to light, this accentuates unshared rather than shared parts of "
        "a pathway."
    )

    args = parser.get_args(parser)
    return args

def check_kegg_data(args: Namespace) -> None:
    """Make sure the KEGG map file needed for drawing exists in the KEGG data directory.

    Raises
    ======
    ConfigError
        If the file path reported by the KEGG context does not exist.
    """
    kegg_args = Namespace()
    kegg_args.kegg_data_dir = args.kegg_dir
    kegg_context = kegg.KeggContext(kegg_args)

    if not os.path.exists(kegg_context.kegg_map_image_kgml_file):
        raise ConfigError(
            "One of the key files required by KEGG pathway maps is missing in your active "
            "anvi'o installation. If your KEGG data are not stored at the default KEGG data "
            "location, include that path using the `--kegg-dir` parameter. Otherwise, please "
            "consider using the program `anvi-setup-kegg-data` to set up the latest KEGG data "
            "that includes the necessary files for KEGG pathway maps."
        )

def consolidate_contigs_dbs(args: Namespace) -> None:
    """Transfer contigs database paths from an external_genomes file to the contigs_dbs argument.

    `args.contigs_dbs` is modified in place (and created if absent). Nothing happens when no
    external genomes file was provided.

    Raises
    ======
    ConfigError
        If the header of the external genomes file is not 'name' followed by 'contigs_db_path'.
    """
    if args.external_genomes is None:
        return

    if args.contigs_dbs is None:
        args.contigs_dbs = []

    filesnpaths.is_file_tab_delimited(args.external_genomes, expected_number_of_fields=2)
    external_genomes_table = pd.read_csv(args.external_genomes, sep='\t', header=0)

    # The header is user input, so report a problem as a ConfigError rather than through an
    # `assert`, which is stripped under `python -O` and otherwise surfaces as a bare traceback.
    header = external_genomes_table.columns.tolist()
    if header != ['name', 'contigs_db_path']:
        raise ConfigError(
            f"The header line of the external genomes file, '{args.external_genomes}', must "
            f"contain the two column names, 'name' and 'contigs_db_path', in that order. The "
            f"column names found were: {', '.join(str(column) for column in header)}"
        )

    args.contigs_dbs += external_genomes_table['contigs_db_path'].tolist()

def _check_color_args(args: Namespace) -> None:
    """Raise a ConfigError if both `--set-color` and `--colormap` were provided."""
    if args.set_color is not None and args.colormap is not None:
        raise ConfigError(
            "The arguments, `--set-color` and `--colormap`, cannot be used together. Please "
            "choose either a static color or a colormap."
        )

def _get_colormap_kwargs(colormap) -> dict:
    """Translate the values of `--colormap` into keyword arguments for a mapper method.

    Parameters
    ==========
    colormap : list of str or None
        Values given to `--colormap`: either the colormap name alone, or the name followed by
        lower and upper limits.

    Returns
    =======
    dict
        Empty if no colormap was given, else 'colormap' and, with limits, 'colormap_limits'.

    Raises
    ======
    ConfigError
        If the number of values is not 1 or 3, or the limits are not numbers satisfying
        0.0 <= lower < upper <= 1.0.
    """
    if colormap is None:
        return {}

    if len(colormap) == 1:
        return {'colormap': colormap[0]}

    if len(colormap) != 3:
        raise ConfigError(
            f"`--colormap` takes either a colormap name alone or a colormap name followed by two "
            f"limits, but {len(colormap)} values were provided: {' '.join(colormap)}"
        )

    try:
        min_limit = float(colormap[1])
        max_limit = float(colormap[2])
    except ValueError as e:
        raise ConfigError(
            f"The two limits given to `--colormap` after the colormap name must be numbers, but "
            f"the following values were provided: {colormap[1]} {colormap[2]}"
        ) from e

    if not 0.0 <= min_limit < max_limit <= 1.0:
        raise ConfigError(
            f"The two limits given to `--colormap` must be between 0.0 and 1.0, with the first "
            f"smaller than the second, but the following values were provided: "
            f"{colormap[1]} {colormap[2]}"
        )

    return {'colormap': colormap[0], 'colormap_limits': (min_limit, max_limit)}

def map_single_contigs_db_ko_data(args: Namespace, mapper: Mapper) -> None:
    """Draw KO data from a single contigs database source in the absence of a colormap."""
    map_contigs_database_kos = mapper.map_contigs_database_kos

    if args.set_color is None or args.set_color is True:
        # Use the default color to highlight reactions.
        pass
    else:
        map_contigs_database_kos = functools.partial(
            map_contigs_database_kos, color_hexcode=args.set_color
        )

    map_contigs_database_kos(
        args.contigs_dbs[0],
        args.output_dir,
        pathway_numbers=args.pathway_numbers,
        draw_maps_lacking_kos=args.draw_bare_maps
    )

def map_multiple_contigs_dbs_ko_data(args: Namespace, mapper: Mapper) -> None:
    """Draw KO data from contigs database sources.

    Raises
    ======
    ConfigError
        If `--set-color` and `--colormap` are both used, or `--colormap` values are malformed.
    """
    _check_color_args(args)

    map_contigs_databases_kos = mapper.map_contigs_databases_kos

    if args.draw_individual_files is None:
        pass
    elif len(args.draw_individual_files) == 0:
        # Draw maps for all contigs databases.
        map_contigs_databases_kos = functools.partial(
            map_contigs_databases_kos, draw_contigs_db_files=True
        )
    else:
        # Draw maps for select contigs databases.
        map_contigs_databases_kos = functools.partial(
            map_contigs_databases_kos, draw_contigs_db_files=args.draw_individual_files
        )

    if args.draw_grid is None:
        pass
    elif len(args.draw_grid) == 0:
        # Draw a grid of maps including all contigs databases.
        map_contigs_databases_kos = functools.partial(map_contigs_databases_kos, draw_grid=True)
    else:
        # Include select contigs databases. `--draw-grid` uses nargs='*', so the value is already
        # a list of names and is passed as is, like `--draw-individual-files` above.
        map_contigs_databases_kos = functools.partial(
            map_contigs_databases_kos, draw_grid=args.draw_grid
        )

    # Without `--colormap`, reactions in unified maps are dynamically colored using the default
    # colormap. Otherwise use the provided colormap name and, if given, limits.
    colormap_kwargs = _get_colormap_kwargs(args.colormap)
    if colormap_kwargs:
        map_contigs_databases_kos = functools.partial(
            map_contigs_databases_kos, **colormap_kwargs
        )

    if args.colormap_scheme is None:
        # The scheme is determined automatically by the number of contigs databases.
        pass
    else:
        map_contigs_databases_kos = functools.partial(
            map_contigs_databases_kos, colormap_scheme=args.colormap_scheme
        )

    if args.set_color is None:
        # Dynamically color reactions in unified maps by number of contigs databases.
        pass
    elif args.set_color is True:
        # Color reactions in unified maps the default static color.
        map_contigs_databases_kos = functools.partial(map_contigs_databases_kos, colormap=False)
    else:
        # Use the provided color.
        map_contigs_databases_kos = functools.partial(
            map_contigs_databases_kos, colormap=False, color_hexcode=args.set_color
        )

    map_contigs_databases_kos(
        args.contigs_dbs,
        args.output_dir,
        pathway_numbers=args.pathway_numbers,
        reverse_overlay=args.reverse_overlay,
        draw_maps_lacking_kos=args.draw_bare_maps
    )

def map_pan_db_ko_data(args: Namespace, mapper: Mapper) -> None:
    """Draw KO data from a pangenomic database source.

    Raises
    ======
    ConfigError
        If `--set-color` and `--colormap` are both used, or `--colormap` values are malformed.
    """
    _check_color_args(args)

    map_pan_database_kos = mapper.map_pan_database_kos

    if args.draw_individual_files is None:
        pass
    elif len(args.draw_individual_files) == 0:
        # Draw maps for all genomes in the pangenome.
        map_pan_database_kos = functools.partial(map_pan_database_kos, draw_genome_files=True)
    else:
        # Draw maps for select genomes.
        map_pan_database_kos = functools.partial(
            map_pan_database_kos, draw_genome_files=args.draw_individual_files
        )

    if args.draw_grid is None:
        pass
    elif len(args.draw_grid) == 0:
        # Draw a grid of maps including all genomes in the pangenome.
        map_pan_database_kos = functools.partial(map_pan_database_kos, draw_grid=True)
    else:
        # Include select genomes. `--draw-grid` uses nargs='*', so the value is already a list of
        # names and is passed as is, like `--draw-individual-files` above.
        map_pan_database_kos = functools.partial(
            map_pan_database_kos, draw_grid=args.draw_grid
        )

    # NOTE(review): the static color options disable the colormap with `None` here but with
    # `False` for multiple contigs databases -- confirm that both match the Mapper methods.
    if args.set_color is None:
        # Dynamically color reactions in pangenomic maps by number of genomes.
        pass
    elif args.set_color is True:
        # Color reactions in pangenomic maps the default static color.
        map_pan_database_kos = functools.partial(map_pan_database_kos, colormap=None)
    else:
        # Use the provided color.
        map_pan_database_kos = functools.partial(
            map_pan_database_kos, colormap=None, color_hexcode=args.set_color
        )

    # Without `--colormap`, reactions in pangenomic maps are dynamically colored using the
    # default colormap. Otherwise use the provided colormap name and, if given, limits.
    colormap_kwargs = _get_colormap_kwargs(args.colormap)
    if colormap_kwargs:
        map_pan_database_kos = functools.partial(map_pan_database_kos, **colormap_kwargs)

    map_pan_database_kos(
        args.pan_db,
        args.genomes_storage,
        args.output_dir,
        pathway_numbers=args.pathway_numbers,
        reverse_overlay=args.reverse_overlay,
        draw_maps_lacking_kos=args.draw_bare_maps,
        consensus_threshold=args.consensus_threshold,
        discard_ties=args.discard_ties
    )

if __name__ == '__main__':
    try:
        main()
    except ConfigError as e:
        error = e
        e_str = re.sub(r'\s+', ' ', str(e))
        if (
            "Unprioritized entry graphics cannot be assigned the same combination of foreground "
            "and background colors as prioritized entries of the same entry and graphics types."
        ) in e_str:
            # Swap the low-level message for a user-oriented one. The replacement is printed
            # rather than raised: raising inside this handler would escape it and end the
            # program with a chained traceback instead of the message and exit code below.
            error = ConfigError(
                "The colors of highlighted reactions and compounds cannot be set to reserved "
                "colors of other un-highlighted reactions and compounds, respectively. In global "
                "maps, other reactions and compounds are colored gray ('#E0E0E0'), so this should "
                "not be used as a static color or dynamic color in a colormap. In overview maps, "
                "other reactions are colored black ('#000000') and other compounds are colored "
                "white ('#FFFFFF'), so these should not be used as colors. In standard maps, other "
                "reactions and compounds are colored white, so this should not be used as a color."
            )
        print(error)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-1)
    except Exception as e:
        # Failed assertions are internal errors, for which the installation hint is irrelevant.
        if not isinstance(e, AssertionError):
            print(
                "\nHave you installed the necessary Python package requirements for "
                "`anvi-draw-kegg-pathways`? This might be the cause of the error that was "
                "encountered, reported below. Run the following command in your terminal and then "
                "try rerunning the program:\npip install biopython ReportLab pymupdf frontend\n"
            )
        traceback.print_exception(type(e), e, e.__traceback__)
        sys.exit(-1)
Here " + f"is a comma-separated list of what you have in there (please note that you can, but do not need " + f"to include the distance/clustering types after the ':' character in each name): " + f"{', '.join(item_order_names)}") order_data_type_newick = items_order_of_interest['type'] == 'newick' run.info("Database", db_path) diff --git a/bin/anvi-get-metabolic-model-file b/bin/anvi-get-metabolic-model-file index 92c1a11993..4270a4fe94 100755 --- a/bin/anvi-get-metabolic-model-file +++ b/bin/anvi-get-metabolic-model-file @@ -5,7 +5,7 @@ DESCRIPTION = """This program exports a metabolic reaction network to a file sui from sys import exit from argparse import Namespace -import anvio.biochemistry.reactionnetwork as reactionnetwork +import anvio.reactionnetwork as reactionnetwork from anvio import A, K from anvio.errors import ConfigError diff --git a/bin/anvi-import-protein-profile b/bin/anvi-import-protein-profile index 33a82c63c4..9eb26f5b66 100755 --- a/bin/anvi-import-protein-profile +++ b/bin/anvi-import-protein-profile @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -DESCRIPTION = """This program imports protein abundance data and stores it in a profile database.""" +DESCRIPTION = """This program imports protein abundance data into a profile database.""" import pandas as pd @@ -77,10 +77,10 @@ def get_args() -> Namespace: """This tab-delimited table contains protein abundance data from different samples. It \ should have the following named columns: 'source', 'accession', 'sample', and 'abundance'. \ Each row corresponds to a distinct protein abundance measurement. 'source' is the source of \ - the identifying accessions, and must correspond to a gene function annotation source stored \ - in the anvi'o database (available sources can be found with the program, `anvi-db-info`). 
\ + the protein accessions, and must be a gene function annotation source stored in the anvi'o \ + profile database (available sources can be found with the program, `anvi-db-info`). \ 'accession' is the protein ID in the annotation source. A contigs database built from a \ - GenBank, for example, can contain the source 'NCBI_PGAP' and the accession, \ + GenBank file, for example, could contain the source, 'NCBI_PGAP', and the accession, \ 'WP_011862028.1'. 'sample' is the name of the sample in which the measurement was made. It \ need not be the same as any nucleotide sequence samples stored in the profile database. \ 'abundance' is the protein abundance value itself.""" diff --git a/bin/anvi-reaction-network b/bin/anvi-reaction-network index 0bc12d76af..16de02f147 100755 --- a/bin/anvi-reaction-network +++ b/bin/anvi-reaction-network @@ -7,9 +7,9 @@ import sys from argparse import Namespace from anvio.argparse import ArgumentParser +from anvio.reactionnetwork import Constructor from anvio.errors import ConfigError, FilesNPathsError from anvio import A as A, K as K, __version__ as VERSION -from anvio.biochemistry.reactionnetwork import Constructor __copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" @@ -23,7 +23,7 @@ __description__ = DESCRIPTION def main() -> None: args = get_args() - constructor = Constructor(ko_dir=args.ko_dir, modelseed_dir=args.modelseed_dir) + constructor = Constructor(kegg_dir=args.kegg_dir, modelseed_dir=args.modelseed_dir) if args.contigs_db: constructor.make_network( contigs_db=args.contigs_db, @@ -41,11 +41,9 @@ def main() -> None: ) else: raise ConfigError( - f"""\ - Either a contigs database (`--contigs-db`) OR a pan database (`--pan-db`) and genomes - storage database (`--genomes-storage`) must be provided to make a (meta)genomic or - pangenomic reaction network, respectively.\ - """ + "Either a contigs database (`--contigs-db`) OR a pan database (`--pan-db`) and genomes " + "storage database 
(`--genomes-storage`) must be provided to make a (meta)genomic or " + "pangenomic reaction network, respectively." ) def get_args() -> Namespace: @@ -59,76 +57,51 @@ def get_args() -> Namespace: groupB = parser.add_argument_group( "PANGENOME INPUT", - "Generate a reaction network from a pan database and genomes storage database, " + "Generate a reaction network from a pan database and associated genomes storage database, " "and store the network in the pan database." ) groupB.add_argument(*A('pan-db'), **K('pan-db', {'required': False})) groupB.add_argument(*A('genomes-storage'), **K('genomes-storage', {'required': False})) groupB.add_argument( - '--consensus-threshold', default=None, type=float, - help=( - f"""\ - If this argument is provided, then a protein annotation must be assigned to this minimum - proportion of genes in a cluster to be imputed to the cluster as a whole. By default, - without this argument, the annotation assigned to the most genes becomes the annotation - of the cluster (also see --discard-ties). The consensus threshold must be a number from - 0 to 1.\ - """ - ) + '--consensus-threshold', default=None, type=float, metavar='FLOAT', help= + "If this argument is provided, then a protein annotation must be assigned to this minimum " + "proportion of genes in a cluster to be imputed to the cluster as a whole. By default, " + "without this argument, the annotation assigned to the most genes becomes the annotation " + "of the cluster (also see --discard-ties). The consensus threshold must be a number from 0 " + "to 1." ) groupB.add_argument( - '--discard-ties', default=False, action='store_true', - help=( - f"""\ - By default, a gene cluster is assigned a protein annotation by finding the protein - ortholog that occurs in the greatest number of genes in the cluster (see - --consensus-threshold) and arbitrarily choosing one ortholog in case of a tie. 
With this - flag, a tie instead results in an ortholog annotation not being assigned to the - cluster.\ - """ - ) + '--discard-ties', default=False, action='store_true', help= + "By default, a gene cluster is assigned a protein annotation by finding the protein " + "ortholog that occurs in the greatest number of genes in the cluster (see " + "--consensus-threshold) and arbitrarily choosing one ortholog in case of a tie. With this " + "flag, a tie instead results in an ortholog annotation not being assigned to the cluster." ) groupC = parser.add_argument_group( "DATABASE", "KEGG and ModelSEED reference database information" ) groupC.add_argument( - '--ko-dir', type=str, metavar='PATH', - help=( - f"""\ - Path to KEGG KO database directory. If this option is not used, the program expects a - database set up in the default location used by 'anvi-setup-kegg-data'.\ - """ - ) + '--kegg-dir', type=str, metavar='PATH', help= + "Path to KEGG database directory. If this option is not used, the program expects a " + "database set up in the default location used by 'anvi-setup-kegg-data'." ) groupC.add_argument( - '--modelseed-dir', type=str, metavar='PATH', - help=( - f"""\ - Path to ModelSEED Biochemistry database directory. If this option is not used, the - program expects a database set up in the default location used by - 'anvi-setup-modelseed-database'.\ - """ - ) + '--modelseed-dir', type=str, metavar='PATH', help= + "Path to ModelSEED Biochemistry database directory. If this option is not used, the " + "program expects a database set up in the default location used by " + "'anvi-setup-modelseed-database'." 
) groupD = parser.add_argument_group("OTHER OPTIONS") groupD.add_argument( - '--overwrite-existing-network', default=False, action='store_true', - help=( - f"""\ - Overwrite an existing reaction network in the database with the newly computed network.\ - """ - ) + '--overwrite-existing-network', default=False, action='store_true', help= + "Overwrite an existing reaction network in the database with the newly computed network." ) groupD.add_argument( - '--stats-file', type=str, metavar='PATH', - help=( - f"""\ - Write a tab-delimited file of network overview statistics (statistics also printed to - the terminal) to the output path.\ - """ - ) + '--stats-file', type=str, metavar='PATH', help= + "Write a tab-delimited file of network overview statistics (statistics also printed to the " + "terminal) to the output path." ) args = parser.get_args(parser) diff --git a/bin/anvi-self-test b/bin/anvi-self-test index f4b1b383ad..596da89fe1 100755 --- a/bin/anvi-self-test +++ b/bin/anvi-self-test @@ -22,7 +22,9 @@ __credits__ = [] __license__ = "GPL 3.0" __version__ = anvio.__version__ __authors__ = ['meren', 'semiller10', 'ekiefl', 'ivagljiva', 'mschecht'] -__description__ = "A script for anvi'o to test itself" +__requires__ = [] +__provides__ = [] +__description__ = "A program for anvi'o to test itself" tests = {'mini' : ['run_component_tests_for_minimal_metagenomics.sh'], @@ -30,6 +32,8 @@ tests = {'mini' : ['run_component_tests_for_minimal_metagenomic 'pangenomics' : ['run_component_tests_for_pangenomics.sh'], 'interactive-interface' : ['run_component_tests_for_manual_interactive.sh'], 'metabolism' : ['run_component_tests_for_metabolism.sh'], + 'reaction-network' : ['run_component_tests_for_reaction_network.sh'], + 'kegg-mapping' : ['run_component_tests_for_kegg_mapping.sh'], 'display-functions' : ['run_component_tests_for_display_functions.sh'], 'trnaseq' : ['run_component_tests_for_trnaseq.sh'], 'inversions' : ['run_component_tests_for_inversions.sh'], diff --git 
a/bin/anvi-setup-kegg-data b/bin/anvi-setup-kegg-data index bfe6b6e571..40a77ce8f3 100755 --- a/bin/anvi-setup-kegg-data +++ b/bin/anvi-setup-kegg-data @@ -7,10 +7,9 @@ import argparse import anvio import anvio.kegg as kegg -from anvio.biochemistry.reactionnetwork import KODatabase -from anvio.errors import ConfigError, FilesNPathsError from anvio.terminal import time_program from anvio.ttycolors import color_text as c +from anvio.errors import ConfigError, FilesNPathsError __copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" __license__ = "GPL 3.0" @@ -27,11 +26,12 @@ DOWNLOAD_MODES = {'KOfam': {'description': 'only KOfam annotation models (HMMs). 'only-processing': {'flags': anvio.A('only-processing'), 'definition': anvio.K('only-processing')}, 'include-stray-KOs': {'flags': anvio.A('include-stray-KOs'), - 'definition': anvio.K('include-stray-KOs')} + 'definition': anvio.K('include-stray-KOs')} } }, 'modules': {'description': 'metabolic pathways from the KEGG MODULES database and BRITE hierarchies. Use this mode AND "KOfam" ' - 'mode if you want to run pathway prediction with `anvi-estimate-metabolism`.', + 'mode if you want to run pathway prediction with `anvi-estimate-metabolism`. This mode does the ' + 'necessary setup that allows you to run `anvi-reaction-network` and visualize KEGG pathway maps.', 'arguments': {'only-download': {'flags': anvio.A('only-download'), 'definition': anvio.K('only-download')}, 'only-processing': {'flags': anvio.A('only-processing'), @@ -44,20 +44,12 @@ DOWNLOAD_MODES = {'KOfam': {'description': 'only KOfam annotation models (HMMs). 
"are using the --only-processing flag"})}, 'skip-brite-hierarchies': {'flags': anvio.A('skip-brite-hierarchies'), 'definition': anvio.K('skip-brite-hierarchies')}, + 'skip-binary-relations': {'flags': anvio.A('skip-binary-relations'), + 'definition': anvio.K('skip-binary-relations')}, + 'skip-map-images': {'flags': anvio.A('skip-map-images'), + 'definition': anvio.K('skip-map-images')} } }, - 'modeling': {'description': 'KEGG orthologs and reactions. Use this mode if ' - 'you want to run metabolic modeling with `anvi-reaction-network`.', - 'arguments': {'dir': {'flags': ['--dir'], - 'definition': {'default': None, - 'type': str, - 'help': "You have the option to store the modeling data in a different location " - "on your computer than other KEGG stuff (ie, NOT --kegg-data-dir). " - "Use this argument to select a custom directory in which to store " - "the modeling data." - f"(default: {KODatabase.default_dir})"}} - }, - }, 'all': {'description': 'Download ALL KEGG data. This is the default mode.', 'arguments': {'kegg-snapshot': {'flags': anvio.A('kegg-snapshot'), 'definition': anvio.K('kegg-snapshot')}, @@ -67,7 +59,6 @@ DOWNLOAD_MODES = {'KOfam': {'description': 'only KOfam annotation models (HMMs). 
'definition': anvio.K('kegg-archive')} }, } - } @@ -98,11 +89,11 @@ def main(args, unknown_args): p_flag = "--" + p.replace("_", "-") if p_flag in mode_unknown: mode_unknown.remove(p_flag) - + args = argparse.Namespace(**vars(args), **vars(mode_args)) # global flags are already handled by anvi'o and shouldn't be in the unknown list # these are coming from __init__.py and not all are relevant to this code but we catch them anyway - global_flags_to_catch = ['--debug', '--no-progress', '--force', '--quiet', '--as-markdown', + global_flags_to_catch = ['--debug', '--no-progress', '--force', '--quiet', '--as-markdown', '--force-overwrite', '--fix-sad-tables', '--display-db-calls', '--I-know-this-is-not-a-good-idea', '--force-use-my-tree', '--debug-auto-fill-anvio-dbs'] for gf in global_flags_to_catch: @@ -127,20 +118,6 @@ def main(args, unknown_args): args.download_from_kegg = True setup = kegg.ModulesDownload(args) setup.setup_modules_data() - if mode == "modeling" or mode == "all": - setup_directory = None - # we ignore the dir parameter for all mode - if mode == "all": - setup_directory = args.kegg_data_dir - else: - # the --dir parameter overrides --kegg-data-dir - if args.dir: - setup_directory = args.dir - elif not args.dir and args.kegg_data_dir: - setup_directory = args.kegg_data_dir - - KODatabase.set_up(num_threads = args.num_threads, dir = setup_directory, reset = args.reset) - if __name__ == '__main__': from anvio.argparse import ArgumentParser @@ -154,7 +131,7 @@ if __name__ == '__main__': "certain subsets of the data available from KEGG. Use --list-modes to see " + \ f"a description of the options. 
Available modes: {', '.join(DOWNLOAD_MODES.keys())}" groupM.add_argument('--mode', choices=DOWNLOAD_MODES.keys(), help=mode_help, default='all') - groupM.add_argument('--list-modes', **{'default': False, 'action': 'store_true', + groupM.add_argument('--list-modes', **{'default': False, 'action': 'store_true', 'help': "List the available modes and their descriptions."}) # common arguments @@ -171,7 +148,7 @@ if __name__ == '__main__': if show_help: parser.print_help() - + subparsers = {} for mode, info_dict in DOWNLOAD_MODES.items(): subparser = argparse.ArgumentParser(usage=argparse.SUPPRESS, add_help=False) @@ -182,7 +159,7 @@ if __name__ == '__main__': if show_help: subparser.print_help() - + subparsers[mode] = subparser diff --git a/bin/anvi-setup-modelseed-database b/bin/anvi-setup-modelseed-database index 6a2f0227fb..379bd02280 100755 --- a/bin/anvi-setup-modelseed-database +++ b/bin/anvi-setup-modelseed-database @@ -7,7 +7,7 @@ import os from sys import exit from argparse import Namespace -import anvio.biochemistry.reactionnetwork as reactionnetwork +import anvio.reactionnetwork as reactionnetwork from anvio import A, K from anvio.errors import ConfigError diff --git a/requirements.txt b/requirements.txt index 08f6dacbec..324c2317f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,7 @@ multiprocess plotext networkx pulp==2.7.0 +biopython +ReportLab +pymupdf +frontend diff --git a/sandbox/anvi-script-reformat-fasta b/sandbox/anvi-script-reformat-fasta index 42e6aca8f4..a6d6a8484e 100755 --- a/sandbox/anvi-script-reformat-fasta +++ b/sandbox/anvi-script-reformat-fasta @@ -110,9 +110,11 @@ def reformat_FASTA(args): replace_chars = True if args.seq_type == 'AA': acceptable_chars = set(sorted(list(constants.AA_to_single_letter_code.values()))) + acceptable_chars.add('X') replacement = 'X' else: acceptable_chars = set(constants.nucleotides) + acceptable_chars.add('N') replacement = 'N' else: replace_chars = False