diff --git a/examples/load_datasets.py b/examples/load_datasets.py index cbe78ff..b9c4aa1 100644 --- a/examples/load_datasets.py +++ b/examples/load_datasets.py @@ -1,11 +1,21 @@ +# %% + from hdxms_datasets import DataVault +from pathlib import Path + -# Creating a DataVault without giving a cache path name uses $home/.hdxms_datasets by default -vault = DataVault() +# %% +# create a data vault, specify cache_dir to download datasets to +cache_dir = Path.home() / ".hdxms_datasets" +vault = DataVault(cache_dir=cache_dir) +vault +# %% # Download a specific HDX dataset vault.fetch_dataset("20221007_1530_SecA_Krishnamurthy") +vault.datasets +# %% # Load the dataset ds = vault.load_dataset("20221007_1530_SecA_Krishnamurthy") @@ -24,3 +34,5 @@ # States can also be referenced by their index, used here to load the peptides corresponding to # the experiment. peptides = ds.load_peptides(0, "experiment") + +# %% diff --git a/examples/load_from_yaml.py b/examples/load_from_yaml.py index 9a2d063..cfeb3ba 100644 --- a/examples/load_from_yaml.py +++ b/examples/load_from_yaml.py @@ -10,7 +10,7 @@ hdx_spec = yaml.safe_load((data_pth / data_id / "hdx_spec.yaml").read_text()) metadata = yaml.safe_load((data_pth / data_id / "metadata.yaml").read_text()) -#%% +# %% dataset = HDXDataSet.from_spec(hdx_spec, data_dir=data_pth / data_id, metadata=metadata) diff --git a/examples/load_to_pyhdx.py b/examples/load_to_pyhdx.py index 547ab5b..9d08be1 100644 --- a/examples/load_to_pyhdx.py +++ b/examples/load_to_pyhdx.py @@ -6,10 +6,9 @@ test_pth = Path("../tests").resolve() data_pth = test_pth / "datasets" -vault = DataVault() +vault = DataVault(cache_dir=data_pth) ds = vault.load_dataset("20221007_1530_SecB_Krishnamurthy") -# Not implemented yet hdxm = HDXMeasurement.from_dataset(ds) print(hdxm) diff --git a/hdxms_datasets/__init__.py b/hdxms_datasets/__init__.py index cdc14a4..28cc266 100644 --- a/hdxms_datasets/__init__.py +++ b/hdxms_datasets/__init__.py @@ -1,7 +1,6 @@ """Top-level package for HDXMS Datasets.""" from hdxms_datasets.__version__ import __version__ -from hdxms_datasets.config import cfg from hdxms_datasets.datasets import HDXDataSet, DataFile from hdxms_datasets.datavault import DataVault from hdxms_datasets.process import ( diff --git a/hdxms_datasets/config.py b/hdxms_datasets/config.py deleted file mode 100644 index d4e724d..0000000 --- a/hdxms_datasets/config.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations - -import os -from pathlib import Path -from typing import Any - -from omegaconf import OmegaConf, DictConfig, DictKeyType -from packaging import version - -PACKAGE_NAME = "hdxms_datasets" - - -def reset_config(): - """Create a new config.yaml file in the user home dir/.hdxms_datasets folder""" - - with open(conf_home_pth, "w") as target: - from hdxms_datasets.__version__ import __version__ - - version_string = f"# {PACKAGE_NAME} configuration file " + __version__ + "\n\n" - target.write(version_string) - - with open(current_dir / "config.yaml") as source: - for line in source: - target.write(line) - - -class Singleton(type): - _instances: dict[type, Singleton] = {} - - def __call__(cls, *args: Any, **kwargs: Any) -> Any: - if cls not in cls._instances: - cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) - return cls._instances[cls] - - def instance(cls: Any, *args: Any, **kwargs: Any) -> Any: - return cls(*args, **kwargs) - - -class HDXMSDatasetsConfig(metaclass=Singleton): - __slots__ = ["conf"] - - def __init__(self) -> None: - self.conf = None - - def __getattr__(self, item: str) -> Any: - return getattr(self.conf, item) - - def __setattr__(self, key: str, value: Any) -> None: - if key in self.__slots__: - super().__setattr__(key, value) - elif key in self.conf.keys(): - setattr(self.conf, key, value) - else: - raise AttributeError(f"Config has no attribute {key}") - - def load_config(self, config_file: os.PathLike[str]): - conf = OmegaConf.create(Path(config_file).read_text()) - self.set_config(conf) - - def set_config(self, conf: DictConfig) -> None: - self.conf = conf - - def get(self, key: DictKeyType, default_value: Any = None) -> Any: - return self.conf.get(key, default_value) - - @property - def database_dir(self) -> Path: - pth = self.conf.database_dir - if "~" in pth: - database_dir = Path(pth.replace("~", str(Path.home()))) - elif "$home" in pth: - database_dir = Path(pth.replace("$home", str(Path.home()))) - else: - database_dir = Path(pth) - - return database_dir - - -def valid_config() -> bool: - """Checks if the current config file in the user home directory is a valid config - file for the current hdxms_datasets version - - """ - if not conf_home_pth.exists(): - return False - else: - with open(conf_home_pth, "r") as f: - version_string = f.readline().strip("; ").split(" ")[-1] - - from hdxms_datasets.__version__ import __version__ - - hdxms_datasets_version = version.parse(__version__) - cfg_version = version.parse(version_string) - - return hdxms_datasets_version.public == cfg_version.public - - -# https://stackoverflow.com/questions/6198372/most-pythonic-way-to-provide-global-configuration-variables-in-config-py/25880082 -class CfgClass(metaclass=Singleton): - def __init__(self, config=None): - self._config = {} if config is None else config - - def __getitem__(self, item): - return self._config[item] - - -home_dir = Path.home() -config_dir = home_dir / f".{PACKAGE_NAME}" -config_dir.mkdir(parents=False, exist_ok=True) -conf_home_pth = config_dir / "config.yaml" - -current_dir = Path(__file__).parent -conf_src_pth = current_dir / "config.yaml" - -# Current config version is outdated -if not valid_config(): - try: - reset_config() - conf = OmegaConf.load(conf_home_pth) - except FileNotFoundError: - # This will happen on conda-forge docker build. - # When no config.yaml file is in home_dir / '.{PACKAGE_NAME}' - # ConfigurationSettings will use the hardcoded version - conf = OmegaConf.load(conf_src_pth) - # (this is run twice due to import but should be OK since conf is singleton) -else: - conf = OmegaConf.load(conf_home_pth) - - -cfg = HDXMSDatasetsConfig() -cfg.set_config(conf) diff --git a/hdxms_datasets/config.yaml b/hdxms_datasets/config.yaml deleted file mode 100644 index 829ac54..0000000 --- a/hdxms_datasets/config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -database_dir: $home/.hdxms_datasets/datasets -database_url: https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/ -time_unit: s - -dynamx: - time_unit: min \ No newline at end of file diff --git a/hdxms_datasets/datasets.py b/hdxms_datasets/datasets.py index 8d551c9..b83daff 100644 --- a/hdxms_datasets/datasets.py +++ b/hdxms_datasets/datasets.py @@ -11,7 +11,6 @@ import pandas as pd import yaml -from hdxms_datasets.config import cfg from hdxms_datasets.process import filter_peptides, convert_temperature, parse_data_files from hdxms_datasets.reader import read_dynamx @@ -24,13 +23,14 @@ class DataFile(object): filepath_or_buffer: Union[Path, StringIO] + time_conversion: tuple[Literal["h", "min", "s"], Literal["h", "min", "s"]] = ("min", "s") + # from, to time conversion + @cached_property def data(self) -> pd.DataFrame: + # TODO convert time after reading if self.format == "DynamX": - # from, to time conversion - time_conversion = (cfg.dynamx.time_unit, cfg.time_unit) - - data = read_dynamx(self.filepath_or_buffer, time_conversion=time_conversion) + data = read_dynamx(self.filepath_or_buffer, time_conversion=self.time_conversion) else: raise ValueError(f"Invalid format {self.format!r}") diff --git a/hdxms_datasets/datavault.py b/hdxms_datasets/datavault.py index eef0725..22cb983 100644 --- a/hdxms_datasets/datavault.py +++ b/hdxms_datasets/datavault.py @@ -10,22 +10,22 @@ import requests import yaml -from hdxms_datasets.config import cfg from hdxms_datasets.datasets import HDXDataSet +DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/" + + class DataVault(object): def __init__( self, - cache_dir: Optional[Union[Path[str], str]] = None, + cache_dir: Union[Path, str], + remote_url: str = DATABASE_URL, ): - if cache_dir is None: - self.cache_dir = cfg.database_dir - self.cache_dir.mkdir(exist_ok=True, parents=True) - else: - self.cache_dir: Path = Path(cache_dir) - if not self.cache_dir.exists(): - raise FileNotFoundError(f"Cache directory '{self.cache_dir}' does not exist") + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(exist_ok=True, parents=True) + + self.remote_url = remote_url def filter(self, *spec: dict): # filters list of available datasets @@ -35,7 +35,7 @@ def filter(self, *spec: dict): def remote_index(self) -> list[str]: """List of available datasets in the remote database""" - url = urllib.parse.urljoin(cfg.database_url, "index.txt") + url = urllib.parse.urljoin(self.remote_url, "index.txt") response = requests.get(url) if response.ok: index = response.text.split("\n")[1:] @@ -90,7 +90,7 @@ def fetch_dataset(self, data_id: str) -> bool: else: output_pth.mkdir() - dataset_url = urllib.parse.urljoin(cfg.database_url, data_id + "/") + dataset_url = urllib.parse.urljoin(self.remote_url, data_id + "/") files = ["hdx_spec.yaml", "metadata.yaml"] optional_files = ["CITATION.cff"] diff --git a/hdxms_datasets/process.py b/hdxms_datasets/process.py index 8c173f4..1065e2e 100644 --- a/hdxms_datasets/process.py +++ b/hdxms_datasets/process.py @@ -5,8 +5,6 @@ import pandas as pd -from hdxms_datasets.config import cfg - if TYPE_CHECKING: from hdxms_datasets import DataFile @@ -71,6 +69,7 @@ def filter_peptides( exposure: Optional[dict] = None, query: Optional[list[str]] = None, dropna: bool = True, + time_unit: str = "s", ) -> pd.DataFrame: """ Convenience function to filter a peptides DataFrame. . @@ -82,6 +81,7 @@ def filter_peptides( exposure value, and "unit" for the time unit. query: Additional queries to pass to [pandas.DataFrame.query][]. dropna: Drop rows with `NaN` uptake entries. + time_unit: Time unit for exposure column of supplied dataframe. Examples: Filter peptides for a specific protein state and exposure time: @@ -97,7 +97,7 @@ def filter_peptides( df = df[df["state"] == state] if exposure is not None: - t_val = convert_time(exposure, target_unit=cfg.time_unit) + t_val = convert_time(exposure, time_unit) # type: ignore if isinstance(t_val, list): df = df[df["exposure"].isin(t_val)] else: diff --git a/pyproject.toml b/pyproject.toml index 4fed193..8050c9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ classifiers = [ ] dependencies = [ - "omegaconf", "pandas", "PyYAML", "requests", @@ -73,3 +72,4 @@ line-length = 100 [tool.ruff] line-length = 100 +target-version = "py310" \ No newline at end of file