Skip to content

Commit

Permalink
Merge pull request #8 from Jhsmit/remove_config
Browse files Browse the repository at this point in the history
remove config and omegaconf dependency
  • Loading branch information
Jhsmit authored Jan 2, 2024
2 parents 1157d20 + 4a9e7c4 commit 6ee99e3
Show file tree
Hide file tree
Showing 16 changed files with 97 additions and 259 deletions.
16 changes: 14 additions & 2 deletions examples/load_datasets.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
# %%

from hdxms_datasets import DataVault
from pathlib import Path


# Creating a DataVault without giving a cache path name uses $home/.hdxms_datasets by default
vault = DataVault()
# %%
# create a data vault, specify cache_dir to download datasets to
cache_dir = Path.home() / ".hdxms_datasets"
vault = DataVault(cache_dir=cache_dir)
vault

# %%
# Download a specific HDX dataset
vault.fetch_dataset("20221007_1530_SecA_Krishnamurthy")
vault.datasets

# %%
# Load the dataset
ds = vault.load_dataset("20221007_1530_SecA_Krishnamurthy")

Expand All @@ -24,3 +34,5 @@
# States can also be referenced by their index, used here to load the peptides corresponding to
# the experiment.
peptides = ds.load_peptides(0, "experiment")

# %%
2 changes: 1 addition & 1 deletion examples/load_from_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
hdx_spec = yaml.safe_load((data_pth / data_id / "hdx_spec.yaml").read_text())
metadata = yaml.safe_load((data_pth / data_id / "metadata.yaml").read_text())

#%%
# %%

dataset = HDXDataSet.from_spec(hdx_spec, data_dir=data_pth / data_id, metadata=metadata)

Expand Down
4 changes: 2 additions & 2 deletions examples/load_to_pyhdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
test_pth = Path("../tests").resolve()
data_pth = test_pth / "datasets"

vault = DataVault()
vault = DataVault(cache_dir=data_pth)
ds = vault.load_dataset("20221007_1530_SecB_Krishnamurthy")

# Not implemented yet
# not yet in latest release
hdxm = HDXMeasurement.from_dataset(ds)

print(hdxm)
1 change: 0 additions & 1 deletion hdxms_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Top-level package for HDXMS Datasets."""

from hdxms_datasets.__version__ import __version__
from hdxms_datasets.config import cfg
from hdxms_datasets.datasets import HDXDataSet, DataFile
from hdxms_datasets.datavault import DataVault
from hdxms_datasets.process import (
Expand Down
131 changes: 0 additions & 131 deletions hdxms_datasets/config.py

This file was deleted.

6 changes: 0 additions & 6 deletions hdxms_datasets/config.yaml

This file was deleted.

10 changes: 5 additions & 5 deletions hdxms_datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import pandas as pd
import yaml

from hdxms_datasets.config import cfg
from hdxms_datasets.process import filter_peptides, convert_temperature, parse_data_files
from hdxms_datasets.reader import read_dynamx

Expand All @@ -24,13 +23,14 @@ class DataFile(object):

filepath_or_buffer: Union[Path, StringIO]

time_conversion: tuple[Literal["h", "min", "s"], Literal["h", "min", "s"]] = ("min", "s")
# from, to time conversion

@cached_property
def data(self) -> pd.DataFrame:
# TODO convert time after reading
if self.format == "DynamX":
# from, to time conversion
time_conversion = (cfg.dynamx.time_unit, cfg.time_unit)

data = read_dynamx(self.filepath_or_buffer, time_conversion=time_conversion)
data = read_dynamx(self.filepath_or_buffer, time_conversion=self.time_conversion)
else:
raise ValueError(f"Invalid format {self.format!r}")

Expand Down
22 changes: 11 additions & 11 deletions hdxms_datasets/datavault.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,22 @@
import requests
import yaml

from hdxms_datasets.config import cfg
from hdxms_datasets.datasets import HDXDataSet


DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/"


class DataVault(object):
def __init__(
self,
cache_dir: Optional[Union[Path[str], str]] = None,
cache_dir: Union[Path, str],
remote_url: str = DATABASE_URL,
):
if cache_dir is None:
self.cache_dir = cfg.database_dir
self.cache_dir.mkdir(exist_ok=True, parents=True)
else:
self.cache_dir: Path = Path(cache_dir)
if not self.cache_dir.exists():
raise FileNotFoundError(f"Cache directory '{self.cache_dir}' does not exist")
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(exist_ok=True, parents=True)

self.remote_url = remote_url

def filter(self, *spec: dict):
# filters list of available datasets
Expand All @@ -35,7 +35,7 @@ def filter(self, *spec: dict):
def remote_index(self) -> list[str]:
"""List of available datasets in the remote database"""

url = urllib.parse.urljoin(cfg.database_url, "index.txt")
url = urllib.parse.urljoin(self.remote_url, "index.txt")
response = requests.get(url)
if response.ok:
index = response.text.split("\n")[1:]
Expand Down Expand Up @@ -90,7 +90,7 @@ def fetch_dataset(self, data_id: str) -> bool:
else:
output_pth.mkdir()

dataset_url = urllib.parse.urljoin(cfg.database_url, data_id + "/")
dataset_url = urllib.parse.urljoin(self.remote_url, data_id + "/")

files = ["hdx_spec.yaml", "metadata.yaml"]
optional_files = ["CITATION.cff"]
Expand Down
6 changes: 3 additions & 3 deletions hdxms_datasets/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@

import pandas as pd

from hdxms_datasets.config import cfg

if TYPE_CHECKING:
from hdxms_datasets import DataFile

Expand Down Expand Up @@ -71,6 +69,7 @@ def filter_peptides(
exposure: Optional[dict] = None,
query: Optional[list[str]] = None,
dropna: bool = True,
time_unit: str = "s",
) -> pd.DataFrame:
"""
Convenience function to filter a peptides DataFrame.
Expand All @@ -82,6 +81,7 @@ def filter_peptides(
exposure value, and "unit" for the time unit.
query: Additional queries to pass to [pandas.DataFrame.query][].
dropna: Drop rows with `NaN` uptake entries.
time_unit: Time unit for exposure column of supplied dataframe.
Examples:
Filter peptides for a specific protein state and exposure time:
Expand All @@ -97,7 +97,7 @@ def filter_peptides(
df = df[df["state"] == state]

if exposure is not None:
t_val = convert_time(exposure, target_unit=cfg.time_unit)
t_val = convert_time(exposure, time_unit) # type: ignore
if isinstance(t_val, list):
df = df[df["exposure"].isin(t_val)]
else:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ classifiers = [
]

dependencies = [
"omegaconf",
"pandas",
"PyYAML",
"requests",
Expand Down Expand Up @@ -73,3 +72,4 @@ line-length = 100

[tool.ruff]
line-length = 100
target-version = "py310"
26 changes: 10 additions & 16 deletions requirements/requirements-macOS-latest-3.10.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,29 @@
#
# pip-compile --output-file=requirements-macOS-latest-3.10.txt pyproject.toml
#
antlr4-python3-runtime==4.9.3
# via omegaconf
certifi==2023.7.22
certifi==2023.11.17
# via requests
charset-normalizer==3.2.0
charset-normalizer==3.3.2
# via requests
idna==3.4
idna==3.6
# via requests
numpy==1.25.2
numpy==1.26.2
# via pandas
omegaconf==2.3.0
packaging==23.2
# via hdxms-datasets (pyproject.toml)
packaging==23.1
# via hdxms-datasets (pyproject.toml)
pandas==2.0.3
pandas==2.1.4
# via hdxms-datasets (pyproject.toml)
python-dateutil==2.8.2
# via pandas
pytz==2023.3
pytz==2023.3.post1
# via pandas
pyyaml==6.0.1
# via
# hdxms-datasets (pyproject.toml)
# omegaconf
# via hdxms-datasets (pyproject.toml)
requests==2.31.0
# via hdxms-datasets (pyproject.toml)
six==1.16.0
# via python-dateutil
tzdata==2023.3
tzdata==2023.4
# via pandas
urllib3==2.0.4
urllib3==2.1.0
# via requests
Loading

0 comments on commit 6ee99e3

Please sign in to comment.