From 7274a917f91c90bb239c09ea1bb7eca06c996bb0 Mon Sep 17 00:00:00 2001
From: Rosina Torres Ortega
Date: Fri, 18 Oct 2024 11:15:00 +0200
Subject: [PATCH 1/2] '[ADD] strain mapping fuct and notebook'

---
 notebooks/local_strain_mapping.ipynb | 136 +++++++++++++++++++++++++++++++++
 src/nplinker/strain/utils.py         | 165 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 301 insertions(+)
 create mode 100644 notebooks/local_strain_mapping.ipynb

diff --git a/notebooks/local_strain_mapping.ipynb b/notebooks/local_strain_mapping.ipynb
new file mode 100644
index 00000000..ba2697af
--- /dev/null
+++ b/notebooks/local_strain_mapping.ipynb
@@ -0,0 +1,136 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Strain info to get the genome_id and spectras_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nplinker.strain.utils import extract_strain_metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "strain_genome=extract_strain_metadata(\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/3strains_metadata_genome.txt\")\n",
+    "strain_spectra=extract_strain_metadata(\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/3strains_metadata_extract.txt\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Extracts the bgcs from antismash results, with the associated genome_id from the metadatafile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nplinker.strain.utils import extract_bgcs_genome_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bgcs_path = \"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/antismash\" # Replace with the path to your antiSMASH results\n",
+    "bgc_dict,strain_bgcs = extract_bgcs_genome_id(strain_genome, bgcs_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Extracts the features from gnps results, with the associated spectra from the metadatafile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nplinker.strain.utils import extract_features_metabolome_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "features_path =\"/Users/rosinatorres/Documents/PhD/WP3/nplinker_workshop/nplinker/tests/unit/data3/gnps/file_mappings.csv\"\n",
+    "strain_features = extract_features_metabolome_id(strain_spectra, features_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Strain_mapping creation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nplinker.strain.utils import create_strain_mappings\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "JSON file 'strain_mappings_2.json' has been created successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "create_strain_mappings(strain_genome, bgc_dict, strain_features)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "npl_dev_2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/nplinker/strain/utils.py b/src/nplinker/strain/utils.py
index 56f19b49..4203e2c6 100644
--- a/src/nplinker/strain/utils.py
+++ b/src/nplinker/strain/utils.py
@@ -3,6 +3,7 @@
 import logging
 from os import PathLike
 from jsonschema import validate
+from nplinker.genomics.antismash import AntismashBGCLoader
 from nplinker.schemas import USER_STRAINS_SCHEMA
 from ..genomics.utils import extract_mappings_original_genome_id_resolved_genome_id
 from ..genomics.utils import extract_mappings_resolved_genome_id_bgc_id
@@ -138,3 +139,167 @@ def podp_generate_strain_mappings(
     logger.info("Generated strain mappings JSON file: %s", output_json_file)
 
     return sc
+
+
+def extract_strain_metadata(strain_path: str | PathLike) -> dict[str, str | list[str]]:
+    """Extract strain metadata from a two-column tab-separated file.
+
+    Can be used for the strain_id -> genome_id mapping or for the strain_id -> spectra_id
+    mapping.
+
+    Args:
+        strain_path: Path to the tab-separated file. Every non-empty line holds a strain id
+            and one genome id or spectra id.
+
+    Returns:
+        A dictionary with the strain id as key. The value is a single id if the strain is
+            listed once in the file, or the list of its ids (in file order) if the strain
+            is listed on several lines. Empty lines and the header row (first column
+            `StrainID`) are not part of the result.
+
+    Raises:
+        ValueError: If a non-empty line does not have exactly two tab-separated columns.
+
+    Examples:
+        For a file with the tab-separated content
+
+            StrainID    GenomeID
+            strain1     genome1
+            strain2     genome2
+
+        the result is `{'strain1': 'genome1', 'strain2': 'genome2'}`.
+    """
+    dictionary: dict[str, str | list[str]] = {}
+    with open(strain_path, encoding="utf-8") as file:
+        for line_number, line in enumerate(file, start=1):
+            stripped = line.strip()
+            if not stripped:
+                # Blank lines (e.g. at the end of the file) carry no mapping
+                continue
+            fields = [field.strip() for field in stripped.split("\t")]
+            if len(fields) != 2:
+                raise ValueError(
+                    f"Line {line_number} of '{strain_path}' must have exactly two "
+                    f"tab-separated columns, found {len(fields)}."
+                )
+            key, value = fields
+            if key == "StrainID":
+                # Header row, not a strain
+                continue
+            if key not in dictionary:
+                dictionary[key] = value
+            elif isinstance(dictionary[key], list):
+                dictionary[key].append(value)
+            else:
+                # Second occurrence of the strain: turn the single id into a list
+                dictionary[key] = [dictionary[key], value]
+    return dictionary
+
+
+def extract_bgcs_genome_id(
+    strain_genome: dict, bgc_path: str | PathLike
+) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
+    """Extract BGCs based on the strain_genome mapping.
+
+    Args:
+        strain_genome: Mapping from strain id to genome id(s), as returned by
+            `extract_strain_metadata`.
+        bgc_path: Path of the folder with the antiSMASH results.
+
+    Returns:
+        A tuple of two dictionaries. The first one maps every genome id found by
+            `AntismashBGCLoader` to its list of BGCs. The second one maps every strain id
+            with at least one genome in the first dictionary to the BGCs of its genome(s).
+    """
+    bgc_loader = AntismashBGCLoader(bgc_path)
+    bgc_dict = bgc_loader.get_genome_bgcs_mapping()
+
+    # Make a dict for the bgcs based on the strain_id
+    strain_bgcs = {}
+    for strain_id, genome_ids in strain_genome.items():
+        # A strain listed on several lines of the metadata file has a list of genome ids,
+        # which is unhashable and cannot be used as a dictionary key
+        if isinstance(genome_ids, str):
+            genome_ids = [genome_ids]
+        bgcs = [bgc for genome_id in genome_ids for bgc in bgc_dict.get(genome_id, [])]
+        if bgcs:
+            strain_bgcs[strain_id] = bgcs
+
+    return bgc_dict, strain_bgcs
+
+
+def extract_features_metabolome_id(
+    strain_spectra: dict, features_file: str | PathLike
+) -> dict[str, list]:
+    """Extract features based on the strain_spectra mapping.
+
+    Args:
+        strain_spectra: Mapping from strain id to spectra id(s), as returned by
+            `extract_strain_metadata`.
+        features_file: Path of the file mappings file of the GNPS results.
+
+    Returns:
+        A dictionary with the strain id as key and the sorted list of the features of its
+            spectra as value. The list is empty if none of the spectra is found in
+            `features_file`.
+    """
+    features_dict = extract_mappings_ms_filename_spectrum_id(features_file)
+    strain_features = {}
+    for strain_id, spectra in strain_spectra.items():
+        if strain_id == "StrainID":
+            # Header row of the metadata file, not a strain
+            continue
+        if isinstance(spectra, str):
+            spectra = [spectra]
+
+        # A set drops the features shared by several spectra of the same strain
+        features_set = set()
+        for spectrum in spectra:
+            if spectrum in features_dict:
+                features_set.update(features_dict[spectrum])
+
+        strain_features[strain_id] = sorted(features_set)
+
+    return strain_features
+
+
+def create_strain_mappings(
+    strain_genome: dict,
+    bgc_dict: dict,
+    strain_features: dict,
+    output_file: str | PathLike = "strain_mappings_2.json",
+) -> None:
+    """Create a JSON file with the strain mappings for NPLinker.
+
+    Only the strains present in both `strain_genome` and `strain_features` are written. The
+    aliases of a strain are the BGCs of its genome(s) followed by its features.
+
+    Args:
+        strain_genome: Mapping from strain id to genome id(s), as returned by
+            `extract_strain_metadata`.
+        bgc_dict: Mapping from genome id to BGCs, as returned by `extract_bgcs_genome_id`.
+        strain_features: Mapping from strain id to features, as returned by
+            `extract_features_metabolome_id`.
+        output_file: Path of the JSON file to write.
+    """
+    # NOTE(review): the version is written as a number; confirm that USER_STRAINS_SCHEMA
+    # accepts it
+    strain_mappings = {"version": 1.0, "strain_mappings": []}
+
+    for strain_id, genome_ids in strain_genome.items():
+        if strain_id not in strain_features:
+            continue
+        # A strain listed on several lines of the metadata file has a list of genome ids,
+        # which is unhashable and cannot be used as a dictionary key
+        if isinstance(genome_ids, str):
+            genome_ids = [genome_ids]
+        bgcs = [bgc for genome_id in genome_ids for bgc in bgc_dict.get(genome_id, [])]
+        strain_alias = bgcs + strain_features[strain_id]
+        strain_mappings["strain_mappings"].append(
+            {"strain_id": strain_id, "strain_alias": strain_alias}
+        )
+
+    with open(output_file, "w", encoding="utf-8") as json_file:
+        json.dump(strain_mappings, json_file, indent=4)
+
+    print(f"JSON file '{output_file}' has been created successfully.")

From 9e969c95492d2069f396386ab3c7ff60f52868e7 Mon Sep 17 00:00:00 2001
From: Rosina Torres Ortega
Date: Fri, 18 Oct 2024 11:18:51 +0200
Subject: [PATCH 2/2] '[ADD] antismash loader'

---
 src/nplinker/genomics/antismash/antismash_loader.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py
index 8c00c065..057681ab 100644
--- a/src/nplinker/genomics/antismash/antismash_loader.py
+++ b/src/nplinker/genomics/antismash/antismash_loader.py
@@ -57,6 +57,17 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
             bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
         }
 
+    def get_genome_bgcs_mapping(self) -> dict[str, list[str]]:
+        """Get the mapping from genome to BGCs.
+
+        Returns:
+            The key is genome id and value is a list of BGC names (gbk file names).
+        """
+        genome_to_bgcs: dict[str, list[str]] = {}
+        for bgc, genome in self.get_bgc_genome_mapping().items():
+            genome_to_bgcs.setdefault(genome, []).append(bgc)
+        return genome_to_bgcs
+
     def get_files(self) -> dict[str, str]:
         """Get BGC gbk files.
 