Skip to content

Commit

Permalink
remove config and omegaconf dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
Jhsmit committed Jan 2, 2024
1 parent 1157d20 commit f72b6f2
Show file tree
Hide file tree
Showing 10 changed files with 36 additions and 163 deletions.
16 changes: 14 additions & 2 deletions examples/load_datasets.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
# %%

from hdxms_datasets import DataVault
from pathlib import Path


# Creating a DataVault without giving a cache path name uses $home/.hdxms_datasets by default
vault = DataVault()
# %%
# create a data vault, specify cache_dir to download datasets to
cache_dir = Path.home() / ".hdxms_datasets"
vault = DataVault(cache_dir=cache_dir)
vault

# %%
# Download a specific HDX dataset
vault.fetch_dataset("20221007_1530_SecA_Krishnamurthy")
vault.datasets

# %%
# Load the dataset
ds = vault.load_dataset("20221007_1530_SecA_Krishnamurthy")

Expand All @@ -24,3 +34,5 @@
# States can also be referenced by their index, used here to load the peptides corresponding to
# the experiment.
peptides = ds.load_peptides(0, "experiment")

# %%
2 changes: 1 addition & 1 deletion examples/load_from_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
hdx_spec = yaml.safe_load((data_pth / data_id / "hdx_spec.yaml").read_text())
metadata = yaml.safe_load((data_pth / data_id / "metadata.yaml").read_text())

#%%
# %%

dataset = HDXDataSet.from_spec(hdx_spec, data_dir=data_pth / data_id, metadata=metadata)

Expand Down
3 changes: 1 addition & 2 deletions examples/load_to_pyhdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
test_pth = Path("../tests").resolve()
data_pth = test_pth / "datasets"

vault = DataVault()
vault = DataVault(cache_dir=data_pth)
ds = vault.load_dataset("20221007_1530_SecB_Krishnamurthy")

# Not implemented yet
hdxm = HDXMeasurement.from_dataset(ds)

print(hdxm)
1 change: 0 additions & 1 deletion hdxms_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Top-level package for HDXMS Datasets."""

from hdxms_datasets.__version__ import __version__
from hdxms_datasets.config import cfg
from hdxms_datasets.datasets import HDXDataSet, DataFile
from hdxms_datasets.datavault import DataVault
from hdxms_datasets.process import (
Expand Down
131 changes: 0 additions & 131 deletions hdxms_datasets/config.py

This file was deleted.

6 changes: 0 additions & 6 deletions hdxms_datasets/config.yaml

This file was deleted.

10 changes: 5 additions & 5 deletions hdxms_datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import pandas as pd
import yaml

from hdxms_datasets.config import cfg
from hdxms_datasets.process import filter_peptides, convert_temperature, parse_data_files
from hdxms_datasets.reader import read_dynamx

Expand All @@ -24,13 +23,14 @@ class DataFile(object):

filepath_or_buffer: Union[Path, StringIO]

time_conversion: tuple[Literal["h", "min", "s"], Literal["h", "min", "s"]] = ("min", "s")
# from, to time conversion

@cached_property
def data(self) -> pd.DataFrame:
# TODO convert time after reading
if self.format == "DynamX":
# from, to time conversion
time_conversion = (cfg.dynamx.time_unit, cfg.time_unit)

data = read_dynamx(self.filepath_or_buffer, time_conversion=time_conversion)
data = read_dynamx(self.filepath_or_buffer, time_conversion=self.time_conversion)
else:
raise ValueError(f"Invalid format {self.format!r}")

Expand Down
22 changes: 11 additions & 11 deletions hdxms_datasets/datavault.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,22 @@
import requests
import yaml

from hdxms_datasets.config import cfg
from hdxms_datasets.datasets import HDXDataSet


DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/"


class DataVault(object):
def __init__(
self,
cache_dir: Optional[Union[Path[str], str]] = None,
cache_dir: Union[Path, str],
remote_url: str = DATABASE_URL,
):
if cache_dir is None:
self.cache_dir = cfg.database_dir
self.cache_dir.mkdir(exist_ok=True, parents=True)
else:
self.cache_dir: Path = Path(cache_dir)
if not self.cache_dir.exists():
raise FileNotFoundError(f"Cache directory '{self.cache_dir}' does not exist")
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(exist_ok=True, parents=True)

self.remote_url = remote_url

def filter(self, *spec: dict):
# filters list of available datasets
Expand All @@ -35,7 +35,7 @@ def filter(self, *spec: dict):
def remote_index(self) -> list[str]:
"""List of available datasets in the remote database"""

url = urllib.parse.urljoin(cfg.database_url, "index.txt")
url = urllib.parse.urljoin(self.remote_url, "index.txt")
response = requests.get(url)
if response.ok:
index = response.text.split("\n")[1:]
Expand Down Expand Up @@ -90,7 +90,7 @@ def fetch_dataset(self, data_id: str) -> bool:
else:
output_pth.mkdir()

dataset_url = urllib.parse.urljoin(cfg.database_url, data_id + "/")
dataset_url = urllib.parse.urljoin(self.remote_url, data_id + "/")

files = ["hdx_spec.yaml", "metadata.yaml"]
optional_files = ["CITATION.cff"]
Expand Down
6 changes: 3 additions & 3 deletions hdxms_datasets/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@

import pandas as pd

from hdxms_datasets.config import cfg

if TYPE_CHECKING:
from hdxms_datasets import DataFile

Expand Down Expand Up @@ -71,6 +69,7 @@ def filter_peptides(
exposure: Optional[dict] = None,
query: Optional[list[str]] = None,
dropna: bool = True,
time_unit: str = "s",
) -> pd.DataFrame:
"""
    Convenience function to filter a peptides DataFrame.
Expand All @@ -82,6 +81,7 @@ def filter_peptides(
exposure value, and "unit" for the time unit.
query: Additional queries to pass to [pandas.DataFrame.query][].
dropna: Drop rows with `NaN` uptake entries.
time_unit: Time unit for exposure column of supplied dataframe.
Examples:
Filter peptides for a specific protein state and exposure time:
Expand All @@ -97,7 +97,7 @@ def filter_peptides(
df = df[df["state"] == state]

if exposure is not None:
t_val = convert_time(exposure, target_unit=cfg.time_unit)
t_val = convert_time(exposure, time_unit) # type: ignore
if isinstance(t_val, list):
df = df[df["exposure"].isin(t_val)]
else:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ classifiers = [
]

dependencies = [
"omegaconf",
"pandas",
"PyYAML",
"requests",
Expand Down Expand Up @@ -73,3 +72,4 @@ line-length = 100

[tool.ruff]
line-length = 100
target-version = "py310"

0 comments on commit f72b6f2

Please sign in to comment.