import shutil
import time

# Directory shipped with the package that holds the skeleton files
# (hdx_spec.yaml, metadata.yaml, CITATION.cff, data/) for a new dataset.
TEMPLATE_DIR = Path(__file__).parent / "template"


def create_dataset(
    target_dir: Path,
    author_name: str,
    tag: Optional[str] = None,
    template_dir: Path = TEMPLATE_DIR,
) -> str:
    """
    Create a new dataset directory from a template.

    The dataset id has the form ``<unix timestamp>[_<tag>]_<author_name>`` and is
    used as the name of the new directory, which is created inside `target_dir`
    as a copy of `template_dir`. A ``readme.md`` holding the id as its title is
    written into the new directory (replacing a template readme, if any).

    Args:
        target_dir: The directory where the dataset will be created. It is created
            (including parents) if it does not exist. Strings are accepted as well.
        author_name: The name of the author of the dataset.
        tag: An optional tag to insert in the directory name. Defaults to None.
        template_dir: The directory containing the template files for the dataset.
            Defaults to TEMPLATE_DIR. Strings are accepted as well.

    Returns:
        The id of the created dataset.

    Raises:
        FileExistsError: If a dataset directory with the same id already exists
            in `target_dir` (same second, tag and author).

    """
    # Coerce so that callers may pass plain strings / os.PathLike objects.
    target_dir = Path(target_dir)
    template_dir = Path(template_dir)

    # Second-resolution timestamp first, author last, optional tag in between.
    id_parts = [str(int(time.time()))]
    if tag:
        id_parts.append(tag)
    id_parts.append(author_name)
    dirname = "_".join(id_parts)

    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_dir = target_dir / dirname

    # copytree creates `dataset_dir` itself and refuses to overwrite an existing one.
    shutil.copytree(template_dir, dataset_dir)

    # Explicit encoding: the id embeds the free-form author name, so the file
    # content must not depend on the platform's default encoding.
    (dataset_dir / "readme.md").write_text(f"# {dirname}", encoding="utf-8")

    return dirname
+ - 30 + - 60 + - 300 + - 1200 + unit: s + FD_control: + data_file: data_2 + state: SecB WT apo + exposure: + value: 0.167 + unit: min + description: # Optional additional experimental details per peptide set. + method: GdHCl and heating + DOI: ... + metadata: + pH: 7.5 # pH_read, uncorrected + d_percentage: 90. # Percentage deuterium in exchange buffer + temperature: # Temperature of the exchange buffer and unit + value: 30. + unit: Celsius + sequence: MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTV # FASTA Sequence including deletions and mutations, tags + n_term: 1 # Residue number of the N terminal residue (can be negative) + c_term: 155 # Residue number of the C terminal residue + concentration: 20e-6 # Concentration of the protein during H/D labelling, in M + oligomeric_state: 1 # Oligomeric state of the protein during H/D labelling + ligand: # Section for added ligand(s) during H/D labelling + - name: ATP + concentration: 1e-3 + ChemSpider: 5800 + - name: glucose + concentration: 2e-5 + InChI: "1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1" + chaotrope: null + mutations: + - A123B + - H484Q + deletions: + - [1, 20] # Deletion of residues 1 up to and including 20 + protein_complex: # Description of the protein complex this protein is in + - name: SecY # human readable name + UniProt: P0AGA2 + stoichiometry: 1 # Number of this protein per POI + - name: SecB + UniProt: P0AG86 + stoichiometry: 2 + diff --git a/hdxms_datasets/template/metadata.yaml b/hdxms_datasets/template/metadata.yaml new file mode 100644 index 0000000..53eebfe --- /dev/null +++ b/hdxms_datasets/template/metadata.yaml @@ -0,0 +1,25 @@ +data: + protein: SecB + RCSB_entry: null + identifier: null + description: + +#citation.cff author format?
def test_create_dataset(tmp_path):
    """Check that `create_dataset` builds the id, copies the template and writes the readme."""
    author_name = "smit"
    human_readable_tag = "testing"  # optional tag
    datasets_dir = tmp_path / "datasets"

    data_id = create_dataset(datasets_dir, author_name, human_readable_tag)
    dataset_pth = datasets_dir / data_id

    # id layout: <unix timestamp>_<tag>_<author>
    timestamp, tag, author = data_id.split("_")
    assert timestamp.isdigit()
    assert human_readable_tag == tag
    assert author_name == author

    assert dataset_pth.is_dir()
    assert (dataset_pth / "readme.md").read_text() == f"# {data_id}"

    # The packaged template files must have been copied into the new dataset.
    for template_file in ("hdx_spec.yaml", "metadata.yaml", "CITATION.cff"):
        assert (dataset_pth / template_file).is_file()

    # Without a tag the id consists of timestamp and author only.
    # A different author keeps this id distinct from the one created above.
    untagged_id = create_dataset(datasets_dir, "jones")
    untagged_timestamp, untagged_author = untagged_id.split("_")
    assert untagged_timestamp.isdigit()
    assert untagged_author == "jones"
    assert (datasets_dir / untagged_id / "readme.md").read_text() == f"# {untagged_id}"