diff --git a/docs/installation.rst b/docs/installation.rst
index 624bed6e..7de14dbd 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -142,7 +142,7 @@ Generate conda requirements files `from setup.cfg`:
 
     $ python _requirements.py
 
-First, if you would like a specific PyTorch version to use with PyHDX (ie CUDA/ROCm support), you should install this first.
+If you would like a specific PyTorch version to use with PyHDX (i.e. CUDA/ROCm support), you should install this first.
 Installation instructions are on the Pytorch_ website.
 
 Then, install the other base dependencies and optional extras. For example, to install PyHDX with web app:
diff --git a/pyhdx/batch_processing.py b/pyhdx/batch_processing.py
index 6777de92..96e3b296 100644
--- a/pyhdx/batch_processing.py
+++ b/pyhdx/batch_processing.py
@@ -1,20 +1,31 @@
 import warnings
+from functools import reduce
 from pathlib import Path
 import os
-from pyhdx.models import PeptideMasterTable, HDXMeasurement, HDXMeasurementSet
-from pyhdx.fileIO import read_dynamx
+import re
+from pyhdx import TorchFitResult
+from pyhdx.models import PeptideMasterTable, HDXMeasurement, HDXMeasurementSet
+from pyhdx.fileIO import read_dynamx, csv_to_dataframe, save_fitresult
+from pyhdx.fitting import fit_rates_half_time_interpolate, fit_rates_weighted_average, \
+    fit_gibbs_global, fit_gibbs_global_batch, RatesFitResult, GenericFitResult
+import param
+import pandas as pd
+from pyhdx.support import gen_subclasses
+import yaml
 
 time_factors = {"s": 1, "m": 60.0, "min": 60.0, "h": 3600, "d": 86400}
 temperature_offsets = {"c": 273.15, "celsius": 273.15, "k": 0, "kelvin": 0}
 
-# todo add data filters in yaml spec
+
+# todo add data filters in state spec?
 # todo add proline, n_term options
-class YamlParser(object):
-    """object used to parse yaml data input files into PyHDX HDXMeasurement object"""
+class StateParser(object):
+    """object used to parse yaml state input files into PyHDX HDXMeasurement object"""
 
-    def __init__(self, yaml_dict, data_src=None, data_filters=None):
-        self.yaml_dict = yaml_dict
+    # todo yaml_dict -> state_spec
+    def __init__(self, state_spec, data_src=None, data_filters=None):
+        self.state_spec = state_spec
         if isinstance(data_src, (os.PathLike, str)):
             self.data_src = Path(data_src)
         elif isinstance(data_src, dict):
@@ -44,7 +55,7 @@ def load_data(self, *filenames, reader='dynamx'):
     def load_hdxmset(self):
         """Batch read the full yaml spec into an HDXMeasurementSet"""
         hdxm_list = []
-        for state in self.yaml_dict.keys():
+        for state in self.state_spec.keys():
             hdxm = self.load_hdxm(state, name=state)
             hdxm_list.append(hdxm)
 
@@ -55,7 +66,7 @@ def load_hdxm(self, state, **kwargs):
 
         kwargs: additional kwargs passed to HDXMeasurement
         """
-        state_dict = self.yaml_dict[state]
+        state_dict = self.state_spec[state]
 
         filenames = state_dict["filenames"]
         df = self.load_data(*filenames)
@@ -95,8 +106,8 @@ def load_hdxm(self, state, **kwargs):
             raise ValueError("Must specify either 'c_term' or 'sequence'")
 
         state_data = pmt.get_state(state_dict["state"])
-        for filter in self.data_filters:
-            state_data = filter(state_data)
+        for flt in self.data_filters:
+            state_data = flt(state_data)
 
         hdxm = HDXMeasurement(
             state_data,
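
Note: the renamed StateParser keeps the YamlParser call signature (spec dict, data_src, data_filters), so call sites only change the class name. A minimal usage sketch, mirroring the updated test at the end of this diff (the data directory and the 'SecB_tetramer' state come from the test data):

    import yaml
    from pathlib import Path
    from pyhdx.batch_processing import StateParser

    data_dir = Path('tests/test_data/input')
    state_spec = yaml.safe_load((data_dir / 'data_states.yaml').read_text())

    parser = StateParser(state_spec, data_src=data_dir)
    hdxm = parser.load_hdxm('SecB_tetramer')  # single state -> HDXMeasurement
    hdxm_set = parser.load_hdxmset()          # all states -> HDXMeasurementSet
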
@@ -111,9 +122,157 @@
 
         return hdxm
 
 
+process_functions = {
+    'csv_to_dataframe': csv_to_dataframe,
+    'fit_rates_half_time_interpolate': fit_rates_half_time_interpolate,
+    'fit_rates_weighted_average': fit_rates_weighted_average,
+    'fit_gibbs_global': fit_gibbs_global
+}
+
+
+# task objects should be param.Parameterized subclasses
+class Task(param.Parameterized):
+    ...
+
+    scheduler_address = param.String(doc='Optional scheduler address for dask task')
+
+    cwd = param.ClassSelector(Path, doc='Path of the current working directory')
+
+
+class LoadHDXMeasurementSetTask(Task):
+    _type = 'load_hdxm_set'
+
+    state_file = param.String()  # string path to the yaml state file
+
+    out = param.ClassSelector(HDXMeasurementSet)
+
+    def execute(self, *args, **kwargs):
+        state_spec = yaml.safe_load((self.cwd / self.state_file).read_text())
+        parser = StateParser(state_spec, self.cwd, default_filters)
+        hdxm_set = parser.load_hdxmset()
+
+        self.out = hdxm_set
+
+
+class EstimateRates(Task):
+    _type = 'estimate_rates'
+
+    hdxm_set = param.ClassSelector(HDXMeasurementSet)
+
+    select_state = param.String(doc='If set, only use this state for creating initial guesses')
+
+    out = param.ClassSelector((RatesFitResult, GenericFitResult))
+
+    def execute(self, *args, **kwargs):
+        if self.select_state:  # refactor to 'state'?
+            hdxm = self.hdxm_set.get(self.select_state)
+            result = fit_rates_half_time_interpolate(hdxm)
+        else:
+            results = []
+            for hdxm in self.hdxm_set:
+                r = fit_rates_half_time_interpolate(hdxm)
+                results.append(r)
+            result = RatesFitResult(results)
+
+        self.out = result
+
+
+# todo allow guesses from deltaG
+class ProcessGuesses(Task):
+    _type = 'create_guess'
+
+    hdxm_set = param.ClassSelector(HDXMeasurementSet)
+
+    select_state = param.String(doc='If set, only use this state for creating initial guesses')
+
+    rates_df = param.ClassSelector(pd.DataFrame)
+
+    out = param.ClassSelector((pd.Series, pd.DataFrame))
+
+    def execute(self, *args, **kwargs):
+        if self.select_state:
+            hdxm = self.hdxm_set.get(self.select_state)
+            if self.rates_df.columns.nlevels == 2:
+                rates_series = self.rates_df[(self.select_state, 'rate')]
+            else:
+                rates_series = self.rates_df['rate']
+
+            guess = hdxm.guess_deltaG(rates_series)
+        else:
+            rates = self.rates_df.xs('rate', level=-1, axis=1)
+            guess = self.hdxm_set.guess_deltaG(rates)
+
+        self.out = guess
+
+
+class FitGlobalBatch(Task):
+    _type = 'fit_global_batch'
+
+    hdxm_set = param.ClassSelector(HDXMeasurementSet)
+
+    initial_guess = param.ClassSelector(
+        (pd.Series, pd.DataFrame), doc='Initial guesses for fits')
+
+    out = param.ClassSelector(TorchFitResult)
+
+    def execute(self, *args, **kwargs):
+        result = fit_gibbs_global_batch(self.hdxm_set, self.initial_guess, **kwargs)
+
+        self.out = result
+
+
+class SaveFitResult(Task):
+    _type = 'save_fit_result'
+
+    fit_result = param.ClassSelector(TorchFitResult)
+
+    output_dir = param.String()
+
+    def execute(self, *args, **kwargs):
+        save_fitresult(self.cwd / self.output_dir, self.fit_result)
+
+
+class JobParser(object):
+
+    cwd = param.ClassSelector(Path, doc='Path of the current working directory')
+
+    def __init__(self, job_spec, cwd=None):
+        self.job_spec = job_spec
+        self.cwd = cwd or Path().cwd()
+
+        self.tasks = {}
+        self.task_classes = {cls._type: cls for cls in gen_subclasses(Task) if getattr(cls, "_type", None)}
+
+    def resolve_var(self, var_string):
+        task_name, *attrs = var_string.split('.')
+
+        return reduce(getattr, attrs, self.tasks[task_name])
+
+    def execute(self):
+
+        for task_spec in self.job_spec['steps']:
+            task_klass = self.task_classes[task_spec['task']]
+            skip = {'args', 'kwargs', 'task'}
+
+            resolved_params = {}
+            for par_name in task_spec.keys() - skip:
+                value = task_spec[par_name]
+                if isinstance(value, str):
+                    m = re.findall(r'\$\((.*?)\)', value)
+                    if m:
+                        value = self.resolve_var(m[0])
+                resolved_params[par_name] = value
+            task = task_klass(cwd=self.cwd, **resolved_params)
+            task.execute(*task_spec.get('args', []), **task_spec.get('kwargs', {}))
+
+            self.tasks[task.name] = task
+
 
 def yaml_to_hdxmset(yaml_dict, data_dir=None, **kwargs):
     """Reads files according to `yaml_dict` spec from `data_dir` into an HDXMeasurementSet"""
+    warnings.warn("yaml_to_hdxmset is deprecated, use 'StateParser'", DeprecationWarning)
     hdxm_list = []
     for k, v in yaml_dict.items():
         hdxm = yaml_to_hdxm(v, data_dir=data_dir, name=k)
@@ -121,6 +280,11 @@ def yaml_to_hdxmset(yaml_dict, data_dir=None, **kwargs):
 
     return HDXMeasurementSet(hdxm_list)
 
+# todo configurable
+default_filters = [
+    lambda df: df.query('exposure > 0')
+]
+
 
 def yaml_to_hdxm(yaml_dict, data_dir=None, data_filters=None, **kwargs):
     # todo perhaps classmethod on HDXMeasurement object?
@@ -142,7 +306,7 @@
         Output data object as specified by `yaml_dict`.
 
     """
-    warnings.warn('This method is deprecated in favor of YamlParser', DeprecationWarning)
+    warnings.warn('This method is deprecated in favor of StateParser', DeprecationWarning)
 
     if data_dir is not None:
         input_files = [Path(data_dir) / fname for fname in yaml_dict["filenames"]]
@@ -270,3 +434,5 @@ def load_from_yaml_v040b2(yaml_dict, data_dir=None, **kwargs):  # pragma: no cover
     )
 
     return hdxm
+
+
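
Note: JobParser wires task outputs into later task parameters via `$(...)` references; resolve_var splits the reference on dots, looks up the named finished task, and walks the remaining attributes with reduce(getattr, ...). A small illustration, assuming `tasks` holds a completed EstimateRates task registered under the name 'rates' (names taken from the jobfile at the end of this diff):

    from functools import reduce

    var_string = 'rates.out.output'            # written in the jobfile as $(rates.out.output)
    task_name, *attrs = var_string.split('.')  # 'rates', ['out', 'output']
    value = reduce(getattr, attrs, tasks[task_name])  # equivalent to tasks['rates'].out.output
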
diff --git a/pyhdx/cli.py b/pyhdx/cli.py
index 04ebf995..cfb92dfd 100644
--- a/pyhdx/cli.py
+++ b/pyhdx/cli.py
@@ -1,32 +1,30 @@
-import argparse
 import time
-from ipaddress import ip_address
-from pyhdx.web import serve
-from pyhdx.config import cfg
-from pyhdx.local_cluster import verify_cluster, default_cluster
+from typing import Union, Optional
+from pathlib import Path
 
+import typer
+from ipaddress import ip_address
+import yaml
 
-# todo add check to see if the web module requirements are installed
+app = typer.Typer()
 
-def main():
-    parser = argparse.ArgumentParser(prog="pyhdx", description="PyHDX Launcher")
 
-    parser.add_argument("serve", help="Runs PyHDX Dashboard")
-    parser.add_argument(
-        "--scheduler_address", help="Run with local cluster <ip>:<port>"
-    )
-    args = parser.parse_args()
+@app.command()
+def serve(scheduler_address: Optional[str] = typer.Option(None, help="Address for dask scheduler to use")):
+    """Launch the PyHDX web application"""
 
-    if args.scheduler_address:
-        ip, port = args.scheduler_address.split(":")
+    from pyhdx.config import cfg
+    from pyhdx.local_cluster import verify_cluster, default_cluster
+
+    if scheduler_address is not None:
+        ip, port = scheduler_address.split(":")
         if not ip_address(ip):
             print("Invalid IP Address")
             return
         elif not 0 <= int(port) < 2 ** 16:
             print("Invalid port, must be 0-65535")
             return
-        cfg.set("cluster", "scheduler_address", args.scheduler_address)
+        cfg.set("cluster", "scheduler_address", scheduler_address)
 
     scheduler_address = cfg.get("cluster", "scheduler_address")
     if not verify_cluster(scheduler_address):
@@ -37,8 +35,9 @@
         scheduler_address = f"{ip}:{port}"
         print(f"Started new Dask LocalCluster at {scheduler_address}")
 
-    if args.serve:
-        serve.run_apps()
+    # Start the PyHDX web application
+    from pyhdx.web import serve as serve_pyhdx
+    serve_pyhdx.run_apps()
 
     loop = True
     while loop:
@@ -49,11 +48,22 @@
         loop = False
 
 
-if __name__ == "__main__":
-    import sys
+@app.command()
+def process(
+    jobfile: Path = typer.Argument(..., help="Path to .yaml jobfile"),
+    cwd: Optional[Path] = typer.Option(None, help="Optional path to working directory")
+):
+    """
+    Process an HDX dataset according to a jobfile
+    """
+
+    from pyhdx.batch_processing import JobParser
 
-    sys.argv.append("serve")
-    sys.argv.append("--scheduler_address")
-    sys.argv.append("127.0.0.1:53270")
+    job_spec = yaml.safe_load(jobfile.read_text())
+    parser = JobParser(job_spec, cwd=cwd)
 
-    main()
+    parser.execute()
+
+
+if __name__ == "__main__":
+    app()
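
Note: with the switch from argparse to typer, the single launcher becomes two subcommands. A usage sketch, assuming typer's default underscore-to-dash option naming (the address and paths below are examples only):

    $ pyhdx serve --scheduler-address 127.0.0.1:53270
    $ pyhdx process jobfile.yaml --cwd tests/test_data/input
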
diff --git a/pyhdx/fileIO.py b/pyhdx/fileIO.py
index cdda9492..309f8673 100644
--- a/pyhdx/fileIO.py
+++ b/pyhdx/fileIO.py
@@ -371,7 +371,7 @@ def save_fitresult(output_dir, fit_result, log_lines=None):
     dataframe_to_file(output_dir / "losses.csv", fit_result.losses)
     dataframe_to_file(output_dir / "losses.txt", fit_result.losses, fmt="pprint")
 
-    if isinstance(fit_result.hdxm_set, pyhdx.HDXMeasurement):
+    if isinstance(fit_result.hdxm_set, pyhdx.HDXMeasurement):  # check, but this should always be hdxm_set
         fit_result.hdxm_set.to_file(output_dir / "HDXMeasurement.csv")
     if isinstance(fit_result.hdxm_set, pyhdx.HDXMeasurementSet):
         fit_result.hdxm_set.to_file(output_dir / "HDXMeasurements.csv")
diff --git a/pyhdx/fitting.py b/pyhdx/fitting.py
index d3336f00..1e016411 100644
--- a/pyhdx/fitting.py
+++ b/pyhdx/fitting.py
@@ -929,6 +929,7 @@ class GenericFitResult:
 
 @dataclass
 class RatesFitResult:
+    """Accumulates multiple Generic/KineticsFit results"""
     results: list
 
     @property
diff --git a/pyhdx/models.py b/pyhdx/models.py
index d25f8cf5..fe60f6ae 100644
--- a/pyhdx/models.py
+++ b/pyhdx/models.py
@@ -1219,6 +1219,12 @@ def __iter__(self):
     def __getitem__(self, item):
         return self.hdxm_list.__getitem__(item)
 
+    def get(self, name):
+        """Find an HDXMeasurement by name"""
+
+        idx = self.names.index(name)
+        return self[idx]
+
     @property
     def Ns(self):
         return len(self.hdxm_list)
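
Note: the new HDXMeasurementSet.get enables the name-based lookups used by EstimateRates and ProcessGuesses above. Continuing the StateParser sketch earlier in this diff:

    hdxm_set = parser.load_hdxmset()
    hdxm = hdxm_set.get('SecB_tetramer')  # same as hdxm_set[hdxm_set.names.index('SecB_tetramer')]
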
diff --git a/pyhdx/web/controllers.py b/pyhdx/web/controllers.py
index d1844a1e..d4629348 100644
--- a/pyhdx/web/controllers.py
+++ b/pyhdx/web/controllers.py
@@ -20,7 +20,7 @@
 from proplot import to_hex
 from skimage.filters import threshold_multiotsu
 
-from pyhdx.batch_processing import YamlParser
+from pyhdx.batch_processing import StateParser
 from pyhdx.config import cfg
 from pyhdx.fileIO import read_dynamx, csv_to_dataframe, dataframe_to_stringio
 from pyhdx.fitting import (
@@ -499,7 +499,7 @@ def _add_dataset_batch(self):
         ios = {name: StringIO(byte_content.decode("UTF-8")) for name, byte_content in
                zip(self.widgets['input_files'].filename, self.input_files)}
 
         filters = [lambda df: df.query('exposure > 0')]
-        parser = YamlParser(yaml_dict, data_src=ios, data_filters=filters)
+        parser = StateParser(yaml_dict, data_src=ios, data_filters=filters)
 
         for state in yaml_dict.keys():
             hdxm = parser.load_hdxm(state, name=state)
diff --git a/setup.cfg b/setup.cfg
index 28068999..42ab74e5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,9 +1,9 @@
 [metadata]
 name = PyHDX
 author = Jochem H. Smit
-author-email = jhsmit@gmail.com
+author_email = jhsmit@gmail.com
 maintainer = Jochem H. Smit
-maintainer-email = jhsmit@gmail.com
+maintainer_email = jhsmit@gmail.com
 url = https://github.com/Jhsmit/PyHDX
 license = MIT
 license_files = LICENSE
@@ -31,22 +31,24 @@ install_requires =
     sympy==1.5.1
     torch
     tqdm
+    typer
     dask
     distributed
     packaging
+    param
+
 python_requires = >=3.8
 
 [options.entry_points]
 console_scripts =
-    pyhdx = pyhdx.cli:main
+    pyhdx = pyhdx.cli:app
 
 [options.extras_require]
 web =
     panel>=0.12.6
     bokeh
-    param
     holoviews
     colorcet >= 3.0.0
     hvplot
diff --git a/templates/02_guesses_from_yaml.py b/templates/02_guesses_from_yaml.py
index 8b520417..09c5a764 100644
--- a/templates/02_guesses_from_yaml.py
+++ b/templates/02_guesses_from_yaml.py
@@ -1,5 +1,5 @@
 """Load HDX-MS data from yaml spec and perform initial guess of exchange rates"""
-from pyhdx.batch_processing import YamlParser
+from pyhdx.batch_processing import StateParser
 from pathlib import Path
 from pyhdx.fitting import fit_rates_weighted_average
 import yaml
@@ -16,7 +16,7 @@
 # Requires local_cluster.py to be running (or other Dask client on default address in config)
 client = default_client()
 
-parser = YamlParser(data_dict, data_src=data_dir)
+parser = StateParser(data_dict, data_src=data_dir)
 
 for name in data_dict.keys():
     print(name)
     dic = data_dict[name]
diff --git a/templates/06_fitting_with_logs.py b/templates/06_fitting_with_logs.py
index 6f80af5e..c6202050 100644
--- a/templates/06_fitting_with_logs.py
+++ b/templates/06_fitting_with_logs.py
@@ -1,5 +1,5 @@
 """Perform fitting with a range of regularizers"""
-from pyhdx.batch_processing import yaml_to_hdxmset, YamlParser
+from pyhdx.batch_processing import yaml_to_hdxmset, StateParser
 from pathlib import Path
 from pyhdx.fitting import fit_gibbs_global_batch
 import yaml
@@ -20,7 +20,7 @@
 output_dir = current_dir / 'fit'
 output_dir.mkdir(exist_ok=True)
 
-parser = YamlParser(data_dict, data_src=input_dir)
+parser = StateParser(data_dict, data_src=input_dir)
 hdx_set = parser.load_hdxmset()
 
 rates_list = [csv_to_protein(current_dir / 'guesses' / f'{name}_rates_guess.csv')['rate']
               for name in data_dict.keys()]
diff --git a/templates/12_jobfiles.py b/templates/12_jobfiles.py
new file mode 100644
index 00000000..05392a63
--- /dev/null
+++ b/templates/12_jobfiles.py
@@ -0,0 +1,27 @@
+"""
+Execute a PyHDX data processing pipeline according to a yaml jobfile specification
+"""
+
+
+from pathlib import Path
+from pyhdx.batch_processing import JobParser
+import yaml
+
+#%%
+# Pycharm scientific mode
+if '__file__' not in locals():
+    __file__ = Path().cwd() / 'templates' / 'script.py'
+
+current_dir = Path(__file__).parent
+output_dir = current_dir / 'output'
+output_dir.mkdir(exist_ok=True)
+test_data_dir = current_dir.parent / 'tests' / 'test_data'
+input_dir = test_data_dir / 'input'
+
+#%%
+
+job_spec = yaml.safe_load((input_dir / 'jobfile.yaml').read_text())
+job_parser = JobParser(job_spec, cwd=input_dir)
+job_parser.execute()
\ No newline at end of file
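
Note: in the jobfile below, each `task:` value selects a Task subclass through its _type attribute (load_hdxm_set, estimate_rates, create_guess, fit_global_batch, save_fit_result), `name:` registers the finished task in JobParser.tasks for later $() references, and `kwargs:` is forwarded to the task's execute call (here into fit_gibbs_global_batch).
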
diff --git a/tests/test_batchprocessing.py b/tests/test_batchprocessing.py
index fc660f33..c4526e5b 100644
--- a/tests/test_batchprocessing.py
+++ b/tests/test_batchprocessing.py
@@ -1,8 +1,9 @@
-from pyhdx.batch_processing import yaml_to_hdxm, yaml_to_hdxmset, YamlParser
+from pyhdx.batch_processing import StateParser, JobParser
 from pyhdx.models import HDXMeasurement, HDXMeasurementSet
 import numpy as np
 from pathlib import Path
 import yaml
+import shutil
 
 cwd = Path(__file__).parent
 input_dir = cwd / 'test_data' / 'input'
@@ -17,17 +18,7 @@ def test_load_from_yaml(self):
         yaml_pth = Path(input_dir / 'data_states.yaml')
         data_dict = yaml.safe_load(yaml_pth.read_text())
 
-        hdxm = yaml_to_hdxm(data_dict['SecB_tetramer'], data_dir=input_dir)
-        assert isinstance(hdxm, HDXMeasurement)
-
-        assert hdxm.metadata['temperature'] == data_dict['SecB_tetramer']['temperature']['value'] + 273.15
-        assert hdxm.name == 'SecB WT apo'
-
-        hdxm_set = yaml_to_hdxmset(data_dict, data_dir=input_dir)
-        assert isinstance(hdxm_set, HDXMeasurementSet)
-        assert hdxm_set.names == list(data_dict.keys())
-
-        parser = YamlParser(data_dict, data_src=input_dir)
+        parser = StateParser(data_dict, data_src=input_dir)
 
         hdxm = parser.load_hdxm('SecB_tetramer')
         assert isinstance(hdxm, HDXMeasurement)
@@ -39,4 +30,14 @@ def test_load_from_yaml(self):
         assert isinstance(hdxm_set, HDXMeasurementSet)
         assert hdxm_set.names == list(data_dict.keys())
 
+    def test_load_job_parser(self):
+        fit_output_dir = input_dir / 'fit_result_output_1'
+        if fit_output_dir.exists():
+            shutil.rmtree(fit_output_dir, ignore_errors=True)
+
+        job_spec = yaml.safe_load((input_dir / 'jobfile.yaml').read_text())
+        parser = JobParser(job_spec, cwd=input_dir)
+        parser.execute()
+
+        assert fit_output_dir.exists()
+        shutil.rmtree(fit_output_dir, ignore_errors=True)
diff --git a/tests/test_data/input/jobfile.yaml b/tests/test_data/input/jobfile.yaml
new file mode 100644
index 00000000..29d0a5ab
--- /dev/null
+++ b/tests/test_data/input/jobfile.yaml
@@ -0,0 +1,21 @@
+steps:
+  - task: load_hdxm_set
+    name: load_data
+    state_file: data_states.yaml
+  - task: estimate_rates
+    name: rates
+    hdxm_set: $(load_data.out)
+  - task: create_guess
+    name: guess
+    rates_df: $(rates.out.output)
+    hdxm_set: $(load_data.out)
+  - task: fit_global_batch
+    name: global_fit
+    hdxm_set: $(load_data.out)
+    initial_guess: $(guess.out)
+    kwargs:
+      epochs: 100
+      stop_loss: 1.e-6
+  - task: save_fit_result
+    fit_result: $(global_fit.out)
+    output_dir: fit_result_output_1
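
Note: the final save_fit_result step carries no `name:` key, so JobParser stores it under the auto-generated param.Parameterized instance name; later steps could not reference it via $(), which is acceptable here since it is the last step. After execute() completes, save_fitresult writes at least losses.csv, losses.txt, and HDXMeasurements.csv into fit_result_output_1, which is what the new test asserts.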