Skip to content

Commit

Permalink
add template and dataset creation function
Browse files Browse the repository at this point in the history
  • Loading branch information
Jhsmit committed Jan 2, 2024
1 parent 0f2ea50 commit b2e59a1
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 2 deletions.
3 changes: 2 additions & 1 deletion hdxms_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Top-level package for HDXMS Datasets."""

from hdxms_datasets.__version__ import __version__
from hdxms_datasets.datasets import HDXDataSet, DataFile
from hdxms_datasets.datasets import HDXDataSet, DataFile, create_dataset
from hdxms_datasets.datavault import DataVault
from hdxms_datasets.process import (
convert_temperature,
Expand All @@ -14,6 +14,7 @@
__all__ = [
"HDXDataSet",
"DataFile",
"create_dataset",
"DataVault",
"convert_temperature",
"convert_time",
Expand Down
41 changes: 41 additions & 0 deletions hdxms_datasets/datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from __future__ import annotations
import shutil
import time

import uuid
from dataclasses import dataclass, field
Expand All @@ -15,6 +17,45 @@
from hdxms_datasets.reader import read_dynamx


TEMPLATE_DIR = Path(__file__).parent / "template"


def create_dataset(
    target_dir: Path,
    author_name: str,
    tag: Optional[str] = None,
    template_dir: Path = TEMPLATE_DIR,
) -> str:
    """
    Create a new dataset directory from a template.

    The dataset id has the form ``<unix timestamp>[_<tag>]_<author_name>``.
    A directory with that name is created inside ``target_dir`` as a copy of
    ``template_dir``, and its ``readme.md`` is (over)written with the id as
    a markdown title.

    Args:
        target_dir: The directory where the dataset will be created. It is
            created, including parents, if it does not exist. A ``str`` or
            other path-like value is accepted as well as a ``Path``.
        author_name: The name of the author of the dataset.
        tag: An optional tag to include in the directory name. ``None`` or
            an empty string means no tag. Defaults to None.
        template_dir: The directory containing the template files for the
            dataset. Defaults to TEMPLATE_DIR.

    Returns:
        The id of the created dataset, which is also its directory name.

    Raises:
        FileExistsError: If a dataset directory with the same id already
            exists, e.g. when called twice within the same second with the
            same tag and author (raised by ``shutil.copytree``).
        FileNotFoundError: If ``template_dir`` does not exist (raised by
            ``shutil.copytree``).
    """
    # Whole-second unix timestamp; keeps ids roughly sortable by creation time.
    id_parts = [str(int(time.time()))]
    if tag:
        id_parts.append(tag)
    id_parts.append(author_name)
    dirname = "_".join(id_parts)

    # Coerce so that callers may pass a plain string path.
    root_dir = Path(target_dir)
    root_dir.mkdir(parents=True, exist_ok=True)
    dataset_dir = root_dir / dirname

    shutil.copytree(template_dir, dataset_dir)

    # Explicit encoding: the id may contain non-ASCII characters from the
    # author name or tag, and the platform default encoding is not portable.
    (dataset_dir / "readme.md").write_text(f"# {dirname}", encoding="utf-8")

    return dirname


@dataclass(frozen=True)
class DataFile(object):
name: str
Expand Down
10 changes: 10 additions & 0 deletions hdxms_datasets/template/CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "In lieu of an associated paper, please cite this dataset as below"
authors:
- family-names: Englander
given-names: Walter
orcid: https://orcid.org/0000-0000-0000-0000
title: "My dataset"
version: 1.0.0
doi: 10.5281/zenodo.1234
date-released: 1970-01-01
Empty file.
71 changes: 71 additions & 0 deletions hdxms_datasets/template/hdx_spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
data_files:
data_1:
filename: data/data_file.csv
format: DynamX
data_2:
filename: data/data_FD.csv
format: DynamX

metadata: # global metadata equal for all protein states
pH: 7.5
protein:
uniprot: P10408
model: # Optional protein model information
database: pdb
entry: 2vda

states:
SecB_ADP_glucose: # Short human-readable identifier for this protein HDX state
peptides: # Dictionary of peptides defined for this HDX state (typically experiment, FD_control, ND_control)
experiment:
data_file: data_1 # Reference to data files defined above
state: protein_state # which protein state to select from data file table
exposure: # Exposure value(s) to select from data file table
values:
- 10
- 30
- 60
- 300
- 1200
unit: s
FD_control:
data_file: data_2
state: SecB WT apo
exposure:
value: 0.167
unit: min
description: # Optional additional experimental details per peptide set.
method: GdHCl and heating
DOI: ...
metadata:
pH: 7.5 # pH_read, uncorrected
d_percentage: 90. # Percentage deuterium in exchange buffer
temperature: # Temperature of the exchange buffer and unit
value: 30.
unit: Celsius
sequence: MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTV # FASTA Sequence including deletions and mutations, tags
n_term: 1 # Residue number of the N terminal residue (can be negative)
c_term: 155 # Residue number of the C terminal residue
concentration: 20e-6 # Concentration of the protein during H/D labelling, in M
oligomeric_state: 1 # Oligomeric state of the protein during H/D labelling
ligand: # Section for added ligand(s) during H/D labelling
- name: ATP
concentration: 1e-3
ChemSpider: 5800
- name: glucose
concentration: 2e-5
InChI: "1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1"
chaotrope: null
mutations:
- A123B
- H484Q
deletions:
- [1, 20] # Deletion of residues 1 up to and including 20
protein_complex: # Description of the protein complex this protein is in
- name: SecY # human readable name
UniProt: P0AGA2
stoichiometry: 1 # Number of this protein per POI
- name: SecB
UniProt: P0AG86
stoichiometry: 2

25 changes: 25 additions & 0 deletions hdxms_datasets/template/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
data:
protein: SecB
RCSB_entry: null
identifier: null
description:

#citation.cff author format?
authors:
- name: Kaj Ulrik Linderstrøm-Lang
email: null
ORCID: null
affiliation: null

# List of publications where the datasets are published
publications:
- title: null
DOI: null
URL: null

# Repositories where the data is also published or where the raw data is deposited
repositories:
- repository: Zenodo
DOI: null
URL: null
comments: Raw data
16 changes: 15 additions & 1 deletion tests/test_hdxms_datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import textwrap

from hdxms_datasets.datasets import HDXDataSet
from hdxms_datasets.datasets import HDXDataSet, create_dataset
from hdxms_datasets.datavault import DataVault
from pathlib import Path
import pytest
Expand Down Expand Up @@ -63,6 +63,20 @@ def test_dataset(dataset: HDXDataSet):
assert textwrap.dedent(s.lstrip("\n")) == dataset.describe()


def test_create_dataset(tmp_path):
    """Check that ``create_dataset`` builds the expected id and readme title."""
    datasets_root = tmp_path / "datasets"
    author = "smit"
    tag = "testing"  # optional human-readable tag

    data_id = create_dataset(datasets_root, author, tag)

    # The id is laid out as <timestamp>_<tag>_<author>.
    id_parts = data_id.split("_")
    assert tag == id_parts[1]
    assert author == id_parts[2]

    readme = datasets_root / data_id / "readme.md"
    assert readme.read_text() == f"# {data_id}"


def test_metadata(dataset: HDXDataSet):
test_metadata = yaml.safe_load((TEST_PTH / "datasets" / DATA_ID / "metadata.yaml").read_text())
assert dataset.metadata == test_metadata
Expand Down

0 comments on commit b2e59a1

Please sign in to comment.