From 0f2ea505aa8ca298c1726d57fa565187d3032d8c Mon Sep 17 00:00:00 2001 From: Jochem Smit Date: Tue, 2 Jan 2024 14:04:28 +0100 Subject: [PATCH 1/3] typing fixes --- hdxms_datasets/datavault.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hdxms_datasets/datavault.py b/hdxms_datasets/datavault.py index 22cb983..5e79ef6 100644 --- a/hdxms_datasets/datavault.py +++ b/hdxms_datasets/datavault.py @@ -94,6 +94,7 @@ def fetch_dataset(self, data_id: str) -> bool: files = ["hdx_spec.yaml", "metadata.yaml"] optional_files = ["CITATION.cff"] + hdx_spec = None for f in files + optional_files: url = urllib.parse.urljoin(dataset_url, f) response = requests.get(url) @@ -103,12 +104,19 @@ def fetch_dataset(self, data_id: str) -> bool: elif f in files: raise urllib.error.HTTPError( - url, response.status_code, f"Error for file {f!r}", response.headers, None + url, + response.status_code, + f"Error for file {f!r}", + response.headers, # type: ignore + None, ) if f == "hdx_spec.yaml": hdx_spec = yaml.safe_load(response.text) + if hdx_spec is None: + raise ValueError(f"Could not find HDX spec for data_id {data_id!r}") + data_pth = output_pth / "data" data_pth.mkdir() @@ -124,7 +132,7 @@ def fetch_dataset(self, data_id: str) -> bool: f_url, response.status_code, f"Error for data file {filename!r}", - response.headers, + response.headers, # type: ignore None, ) From b2e59a18eb54099c1d49a23f7eefee65ba8bfd28 Mon Sep 17 00:00:00 2001 From: Jochem Smit Date: Tue, 2 Jan 2024 14:33:24 +0100 Subject: [PATCH 2/3] add template and dataset creation function --- hdxms_datasets/__init__.py | 3 +- hdxms_datasets/datasets.py | 41 +++++++++++++ hdxms_datasets/template/CITATION.cff | 10 +++ hdxms_datasets/template/data/data_file.csv | 0 hdxms_datasets/template/hdx_spec.yaml | 71 ++++++++++++++++++++++ hdxms_datasets/template/metadata.yaml | 25 ++++++++ tests/test_hdxms_datasets.py | 16 ++++- 7 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 
hdxms_datasets/template/CITATION.cff create mode 100644 hdxms_datasets/template/data/data_file.csv create mode 100644 hdxms_datasets/template/hdx_spec.yaml create mode 100644 hdxms_datasets/template/metadata.yaml diff --git a/hdxms_datasets/__init__.py b/hdxms_datasets/__init__.py index 5eb169c..c1f70f5 100644 --- a/hdxms_datasets/__init__.py +++ b/hdxms_datasets/__init__.py @@ -1,7 +1,7 @@ """Top-level package for HDXMS Datasets.""" from hdxms_datasets.__version__ import __version__ -from hdxms_datasets.datasets import HDXDataSet, DataFile +from hdxms_datasets.datasets import HDXDataSet, DataFile, create_dataset from hdxms_datasets.datavault import DataVault from hdxms_datasets.process import ( convert_temperature, @@ -14,6 +14,7 @@ __all__ = [ "HDXDataSet", "DataFile", + "create_dataset", "DataVault", "convert_temperature", "convert_time", diff --git a/hdxms_datasets/datasets.py b/hdxms_datasets/datasets.py index b83daff..3071daa 100644 --- a/hdxms_datasets/datasets.py +++ b/hdxms_datasets/datasets.py @@ -1,4 +1,6 @@ from __future__ import annotations +import shutil +import time import uuid from dataclasses import dataclass, field @@ -15,6 +17,45 @@ from hdxms_datasets.reader import read_dynamx +TEMPLATE_DIR = Path(__file__).parent / "template" + + +def create_dataset( + target_dir: Path, + author_name: str, + tag: Optional[str] = None, + template_dir: Path = TEMPLATE_DIR, +) -> str: + """ + Create a dataset in the specified target directory. + + Args: + target_dir: The directory where the dataset will be created. + author_name: The name of the author of the dataset. + tag: An optional tag to append to the directory name. Defaults to None. + template_dir: The directory containing the template files for the dataset. Defaults to TEMPLATE_DIR. + + Returns: + The id of the created dataset. 
+ + """ + dirname = str(int(time.time())) + + if tag: + dirname += f"_{tag}" + + dirname += f"_{author_name}" + + target_dir.mkdir(parents=True, exist_ok=True) + target_dir = target_dir / dirname + + shutil.copytree(template_dir, target_dir) + + (target_dir / "readme.md").write_text(f"# {dirname}") + + return dirname + + @dataclass(frozen=True) class DataFile(object): name: str diff --git a/hdxms_datasets/template/CITATION.cff b/hdxms_datasets/template/CITATION.cff new file mode 100644 index 0000000..e734c00 --- /dev/null +++ b/hdxms_datasets/template/CITATION.cff @@ -0,0 +1,10 @@ +cff-version: 1.2.0 +message: "In lieu of an associated paper, please cit this dataset as below" +authors: + - family-names: Englander + given-names: Walter + orcid: https://orcid.org/0000-0000-0000-0000 +title: "My dataset" +version: 1.0.0 +doi: 10.5281/zenodo.1234 +date-released: 1970-01-01 \ No newline at end of file diff --git a/hdxms_datasets/template/data/data_file.csv b/hdxms_datasets/template/data/data_file.csv new file mode 100644 index 0000000..e69de29 diff --git a/hdxms_datasets/template/hdx_spec.yaml b/hdxms_datasets/template/hdx_spec.yaml new file mode 100644 index 0000000..9cc12ac --- /dev/null +++ b/hdxms_datasets/template/hdx_spec.yaml @@ -0,0 +1,71 @@ +data_files: + data_1: + filename: data\data_file.csv + format: DynamX + data_2: + filename: data\data_FD.csv + format: DynamX + +metadata: # global metadata equal for all protein states + pH: 7.5 + protein: + uniprot: P10408 + model: # Optional protein model information + database: pdb + entry: 2vda + +states: + SecB_ADP_glucose: # Short human-readable identifier for this protein HDX state + peptides: # Dictionary of peptides defined for this HDX state (typically experiment, FD_control, ND_control) + experiment: + data_file: data_1 # Reference to data files defined above + state: protein_state # which protein state to select from data file table + exposure: # Exposure value(s) to select from data file table + values: + - 10 
+ - 30 + - 60 + - 300 + - 1200 + unit: s + FD_control: + data_file: data_FD + state: SecB WT apo + exposure: + value: 0.167 + unit: min + description: # Optional additional experimental details per peptide set. + method: GdHCl and heating + DOI: ... + metadata: + pH: 7.5 # pH_read, uncorrected + d_percentage: 90. # Percentage deuterium in exchange buffer + temperature: # Temperature of the exchange buffer and unit + value: 30. + unit: Celsius + sequence: MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTV # FASTA Sequence including deletions and mutations, tags + n_term: 1 # Residue number of the N terminal residue (can be negative) + c_term: 155 # Residue number of the C terminal residue + concentration: 20e-6 # Concentration of the protein during H/D labelling, in M + oligomeric_state: 1 # Oligomeric state of the protein during H/D labelling + ligand: # Section for added ligand(s) during H/D labelling ( + - name: ATP + concentration: 1e-3 + ChemSpider: 5800 + - name: glucose + concentration: 2e-5 + InChI: "1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1" + chaotrope: null + mutations: + - A123B + - H484Q + deletions: + - [1, 20] # Deletion of residues 1 up to and including 20 + protein_complex: # Description of the protein complex this protein is in + - name: SecY # human readable name + UniProt: P0AGA2 + stoichiometry: 1 # Number of this protein per POI + - name: SecB + UniProt: P0AG86 + stoichiometry: 2 + diff --git a/hdxms_datasets/template/metadata.yaml b/hdxms_datasets/template/metadata.yaml new file mode 100644 index 0000000..53eebfe --- /dev/null +++ b/hdxms_datasets/template/metadata.yaml @@ -0,0 +1,25 @@ +data: + protein: SecB + RCSB_entry: null + identifier: null + description: + +#citation.cff author format? 
+authors: + - name: Kaj Ulrik Linderstrøm-Lang + email: null + ORCID: null + affiliation: null + +# List of publications where the datasets are published +publications: + - title: null + DOI: null + URL: null + +# Repositories where the data is also published or where the raw data is deposited +repositories: + - repository: Zenodo + DOI: null + URL: null + comments: Raw data \ No newline at end of file diff --git a/tests/test_hdxms_datasets.py b/tests/test_hdxms_datasets.py index 3998a6b..f8fc40f 100644 --- a/tests/test_hdxms_datasets.py +++ b/tests/test_hdxms_datasets.py @@ -1,6 +1,6 @@ import textwrap -from hdxms_datasets.datasets import HDXDataSet +from hdxms_datasets.datasets import HDXDataSet, create_dataset from hdxms_datasets.datavault import DataVault from pathlib import Path import pytest @@ -63,6 +63,20 @@ def test_dataset(dataset: HDXDataSet): assert textwrap.dedent(s.lstrip("\n")) == dataset.describe() +def test_create_dataset(tmp_path): + author_name = "smit" + human_readable_tag = "testing" # optional tag + + data_id = create_dataset(tmp_path / "datasets", author_name, human_readable_tag) + + dataset_pth = tmp_path / "datasets" / data_id + + assert human_readable_tag == data_id.split("_")[1] + assert author_name == data_id.split("_")[2] + + assert (dataset_pth / "readme.md").read_text() == f"# {data_id}" + + def test_metadata(dataset: HDXDataSet): test_metadata = yaml.safe_load((TEST_PTH / "datasets" / DATA_ID / "metadata.yaml").read_text()) assert dataset.metadata == test_metadata From 31ab7e11088fc915033e39fcea41aff561b15eba Mon Sep 17 00:00:00 2001 From: Jochem Smit Date: Tue, 2 Jan 2024 14:43:58 +0100 Subject: [PATCH 3/3] test and example --- examples/create_dataset.py | 12 ++++++++++++ tests/test_hdxms_datasets.py | 3 +++ 2 files changed, 15 insertions(+) create mode 100644 examples/create_dataset.py diff --git a/examples/create_dataset.py b/examples/create_dataset.py new file mode 100644 index 0000000..7b198a9 --- /dev/null +++ 
b/examples/create_dataset.py @@ -0,0 +1,12 @@ +""" +Run this script to copy the template directory to create a new dataset +""" + +from pathlib import Path +from hdxms_datasets import create_dataset + + +author_name = "Krishnamurthy" +human_readable_tag = "SecB" # optional tag + +data_id = create_dataset(Path().resolve() / "datasets", "smit", "testing") diff --git a/tests/test_hdxms_datasets.py b/tests/test_hdxms_datasets.py index f8fc40f..3512a3d 100644 --- a/tests/test_hdxms_datasets.py +++ b/tests/test_hdxms_datasets.py @@ -76,6 +76,9 @@ def test_create_dataset(tmp_path): assert (dataset_pth / "readme.md").read_text() == f"# {data_id}" + assert (dataset_pth / "hdx_spec.yaml").exists() + assert (dataset_pth / "data" / "data_file.csv").exists() + def test_metadata(dataset: HDXDataSet): test_metadata = yaml.safe_load((TEST_PTH / "datasets" / DATA_ID / "metadata.yaml").read_text())