add class DatasetArranger #215

Merged · 12 commits · Mar 5, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -48,7 +48,7 @@ namespaces = true # enable data directory to be identified

 [tool.pytest.ini_options]
 minversion = "6.0"
-addopts = "-ra -n auto" # -ra: show summary info for all test outcomes; -n auto: run tests in parallel
+addopts = "-ra -n 1" # -ra: show summary info for all test outcomes; -n 1: run tests on a single worker (serially)
 testpaths = ["tests"]

 [tool.coverage.run]
452 changes: 452 additions & 0 deletions src/nplinker/arranger.py

Large diffs are not rendered by default.
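The 452-line arranger module is not rendered, but from its usage in nplinker.py below (no constructor arguments, a single arrange() entry point, settings taken from the global config) a minimal sketch might look like the following. Everything other than the class name and arrange() is an illustrative assumption, not the actual implementation:

    class DatasetArranger:
        """Sketch only: put every part of a dataset in place before loading.

        Constructed with no arguments and driven by the global config
        (mode, root_dir, ...), mirroring how nplinker.py uses it below.
        """

        def arrange(self) -> None:
            """Download, generate, or validate each data source in turn."""
            # Hypothetical per-source steps; these method names are
            # illustrative assumptions, not the actual implementation.
            self._arrange_strain_mappings()
            self._arrange_gnps()
            self._arrange_antismash()
            self._arrange_bigscape()

        def _arrange_strain_mappings(self) -> None: ...

        def _arrange_gnps(self) -> None: ...

        def _arrange_antismash(self) -> None: ...

        def _arrange_bigscape(self) -> None: ...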

368 changes: 25 additions & 343 deletions src/nplinker/loader.py

Large diffs are not rendered by default.

67 changes: 11 additions & 56 deletions src/nplinker/nplinker.py
@@ -2,6 +2,7 @@
 import logging
 import sys
 from typing import TYPE_CHECKING
+from .arranger import DatasetArranger
 from .config import config
 from .genomics import BGC
 from .genomics import GCF

@@ -38,12 +39,12 @@ class NPLinker:
     def __init__(self):
         """Initialise an NPLinker instance."""
         # configure logging based on the supplied config params
-        LogConfig.setLogLevelStr(config.loglevel)
-        logfile = config.get("logfile")
+        LogConfig.setLogLevelStr(config.log.level)
+        logfile = config.get("log.file")
         if logfile:
             logfile_dest = logging.FileHandler(logfile)
             # if we want to log to stdout plus logfile, add the new destination
-            if config.get("log_to_stdout"):  # default to True
+            if config.get("log.to_stdout"):  # default to True
                 LogConfig.addLogDestination(logfile_dest)
             else:
                 # otherwise overwrite the default stdout destination
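Aside: this hunk switches the flat config keys (loglevel, logfile, log_to_stdout) to nested ones read in two styles: attribute chains (config.log.level) and dotted-key get() with an implicit default (config.get("log.file")). A minimal sketch of a mapping supporting both styles, purely for illustration (not NPLinker's actual config class):

    class DotConfig(dict):
        """Illustrative stand-in for a nested config object."""

        def __getattr__(self, name):
            # Attribute access: config.log.level
            try:
                value = self[name]
            except KeyError as e:
                raise AttributeError(name) from e
            return DotConfig(value) if isinstance(value, dict) else value

        def get(self, dotted_key, default=None):
            # Dotted lookup: config.get("log.to_stdout")
            node = self
            for part in dotted_key.split("."):
                if not isinstance(node, dict) or part not in node:
                    return default
                node = node[part]
            return node

    config = DotConfig({"log": {"level": "DEBUG", "file": "", "to_stdout": True}})
    assert config.log.level == "DEBUG"
    assert config.get("log.to_stdout") is True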

@@ -124,22 +125,7 @@ def root_dir(self):
         Returns:
             str: the path to the dataset root directory currently in use
         """
-        return self._loader._root
-
-    @property
-    def dataset_id(self):
-        """Returns dataset "ID".
-
-        For local datasets this will just be the last component of the directory path,
-        e.g. /path/to/my_dataset would produce an ID of "my_dataset".
-
-        For datasets loaded from the Paired Omics platform the ID will be the platform
-        project ID, e.g. "MSV000079284"
-
-        Returns:
-            str: the dataset ID
-        """
-        return self._loader.dataset_id
+        return config.root_dir

     @property
     def data_dir(self):

@@ -149,30 +135,13 @@ def data_dir(self):

     @property
     def bigscape_cutoff(self):
         """Returns the current BiGSCAPE clustering cutoff value."""
-        return self._loader._bigscape_cutoff
+        return config.bigscape.cutoff

-    def load_data(self, new_bigscape_cutoff=None):
-        """Loads the basic components of a dataset.
-
-        This method is responsible for loading the various pieces of the supplied dataset into
-        memory and doing any initial parsing/object manipulation required. After it completes,
-        applications can access the lists of GCFs, Spectra, MolecularFamilies and strains
-        using the corresponding properties of the NPLinker class.
-
-        Returns:
-            bool: True if successful, False otherwise
-        """
-        logger.debug("load_data(new_bigscape_cutoff=%s)", new_bigscape_cutoff)
-        if new_bigscape_cutoff is None:
-            self._loader.validate()
-            self._loader.generate_strain_mappings()
-            if not self._loader.load():
-                return False
-        else:
-            # CG: only reload genomics data when changing bigscape cutoff
-            self._loader._bigscape_cutoff = new_bigscape_cutoff
-            # TODO: only need to reload gcfs using load_gcfs()
-            self._loader._load_genomics()
+    def load_data(self):
+        """Loads the basic components of a dataset."""
+        arranger = DatasetArranger()
+        arranger.arrange()
+        self._loader.load()

         self._spectra = self._loader.spectra
         self._molfams = self._loader.molfams

@@ -184,20 +153,6 @@ def load_data(self):
         self._chem_classes = self._loader.chem_classes
         self._class_matches = self._loader.class_matches

-        logger.debug("Generating lookup tables: genomics")
-        self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs}
-        self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs}
-
-        # don't need to do these two if cutoff changed (indicating genomics data
-        # was reloaded but not metabolomics)
-        if new_bigscape_cutoff is None:
-            logger.debug("Generating lookup tables: metabolomics")
-            self._spec_lookup = {spec.spectrum_id: spec for spec in self._spectra}
-            self._mf_lookup = {mf.family_id: mf for mf in self._molfams}
-
-        logger.debug("load_data: completed")
-        return True

     # TODO CG: refactor this method and update its unit tests
     def get_links(self, input_objects, scoring_methods, and_mode=True):
         """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams).
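Taken together, the nplinker.py changes reduce dataset preparation to a two-step arrange-then-load pipeline driven entirely by the global config: the old new_bigscape_cutoff reload path and the bool return value are gone, and load_data() now always arranges the dataset files first, then loads them. A hypothetical caller after this PR (the import path and the property comment are assumptions for illustration):

    from nplinker import NPLinker  # assumed public import path

    npl = NPLinker()   # picks up log settings and paths from the global config
    npl.load_data()    # DatasetArranger().arrange(), then self._loader.load()

    # The loaded collections (_spectra, _molfams, _bgcs, _gcfs, ...) populated
    # above are then available behind whatever public properties NPLinker exposes.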
180 changes: 0 additions & 180 deletions src/nplinker/pairedomics/downloader.py

This file was deleted.

56 changes: 2 additions & 54 deletions src/nplinker/pairedomics/runbigscape.py
@@ -1,17 +1,3 @@
-# Copyright 2021 The NPLinker Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import os
 import subprocess
 import sys
@@ -21,7 +7,19 @@

 logger = LogConfig.getLogger(__name__)

+# NOTE: for simplicity this is currently written with assumption it will only be
+# called in context of nplinker Docker image, where bigscape should be available
+PFAM_PATH = os.path.join(sys.prefix, "nplinker_lib")
+
+
 def run_bigscape(
-    bigscape_py_path: str | PathLike,
     antismash_path: str | PathLike,
     output_path: str | PathLike,
-    pfam_path: str | PathLike,
     extra_params: str,
 ):
+    bigscape_py_path = "bigscape.py"
     logger.info(
         f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"'
     )

     if os.path.exists(os.path.join(output_path, "completed")):
         logger.info("BiG-SCAPE appears to have been run already, skipping!")
         logger.info("To force re-run, delete {%s}", os.path.join(output_path, "completed"))
         return True

     try:
         subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True)
     except Exception as e:
@@ -51,7 +29,7 @@ def run_bigscape(
         raise Exception(f'antismash_path "{antismash_path}" does not exist!')

     # configure the IO-related parameters, including pfam_dir
-    args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", pfam_path]
+    args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", PFAM_PATH]

     # append the user supplied params, if any
     if len(extra_params) > 0:
@@ -65,34 +43,4 @@
     # which will indicate to the PODPDownloader module that something went wrong.
     result.check_returncode()

-    # use presence of this file as a quick way to check if a previous run
-    # finished or not
-    with open(os.path.join(output_path, "completed"), "w") as f:
-        f.close()
-
     return True
-
-
-def podp_run_bigscape(
-    project_file_cache: str | PathLike,
-    PFAM_PATH: str | PathLike,
-    do_bigscape: bool,
-    extra_bigscape_parameters,
-):
-    # TODO this currently assumes docker environment, allow customisation?
-    # can check if in container with: https://stackoverflow.com/questions/20010199/how-to-determine-if-a-process-runs-inside-lxc-docker
-    if not do_bigscape:
-        logger.info("BiG-SCAPE disabled by configuration, not running it")
-        return
-
-    logger.info('Running BiG-SCAPE! extra_bigscape_parameters="%s"', extra_bigscape_parameters)
-    try:
-        run_bigscape(
-            "bigscape.py",
-            os.path.join(project_file_cache, "antismash"),
-            os.path.join(project_file_cache, "bigscape"),
-            PFAM_PATH,
-            extra_bigscape_parameters,
-        )
-    except Exception as e:
-        logger.warning('Failed to run BiG-SCAPE on antismash data, error was "%s"', e)
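run_bigscape() is now the module's only entry point: podp_run_bigscape() is deleted, the bigscape.py path is hard-coded, and the Pfam directory comes from the module-level PFAM_PATH. A hypothetical call after this PR (the paths and the extra flag are illustrative, not taken from the diff):

    from nplinker.pairedomics.runbigscape import run_bigscape

    # Only the antiSMASH input dir, the output dir and extra CLI params remain;
    # bigscape.py and PFAM_PATH are resolved inside the module (per the Docker
    # image assumption noted in the diff above).
    run_bigscape(
        antismash_path="/data/my_project/antismash",  # illustrative path
        output_path="/data/my_project/bigscape",      # illustrative path
        extra_params="--mibig",                       # illustrative BiG-SCAPE flag
    )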