Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Template #9

Merged
merged 3 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions examples/create_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Run this script to copy the template directory to create a new dataset.
"""

from pathlib import Path

from hdxms_datasets import create_dataset


author_name = "Krishnamurthy"
human_readable_tag = "SecB"  # optional tag

# Pass the variables defined above (previously the call hard-coded
# "smit"/"testing", silently ignoring author_name and human_readable_tag).
data_id = create_dataset(Path().resolve() / "datasets", author_name, human_readable_tag)
3 changes: 2 additions & 1 deletion hdxms_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Top-level package for HDXMS Datasets."""

from hdxms_datasets.__version__ import __version__
from hdxms_datasets.datasets import HDXDataSet, DataFile
from hdxms_datasets.datasets import HDXDataSet, DataFile, create_dataset
from hdxms_datasets.datavault import DataVault
from hdxms_datasets.process import (
convert_temperature,
Expand All @@ -14,6 +14,7 @@
__all__ = [
"HDXDataSet",
"DataFile",
"create_dataset",
"DataVault",
"convert_temperature",
"convert_time",
Expand Down
41 changes: 41 additions & 0 deletions hdxms_datasets/datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from __future__ import annotations
import shutil
import time

import uuid
from dataclasses import dataclass, field
Expand All @@ -15,6 +17,45 @@
from hdxms_datasets.reader import read_dynamx


# Packaged template copied for every newly created dataset.
TEMPLATE_DIR = Path(__file__).parent / "template"


def create_dataset(
    target_dir: Path,
    author_name: str,
    tag: Optional[str] = None,
    template_dir: Path = TEMPLATE_DIR,
) -> str:
    """
    Create a new dataset directory from the template.

    The dataset id (and directory name) has the form
    ``<unix_timestamp>[_<tag>]_<author_name>``.

    Args:
        target_dir: Directory in which the dataset directory is created
            (created itself if missing).
        author_name: Author name, appended as the last id component.
        tag: Optional human-readable tag inserted before the author name.
            Defaults to None (no tag component).
        template_dir: Directory whose contents are copied into the new
            dataset. Defaults to the packaged template.

    Returns:
        The id of the created dataset.
    """
    # Whole-second unix timestamp keeps ids sortable and unique per second.
    id_parts = [str(int(time.time()))]
    if tag:
        id_parts.append(tag)
    id_parts.append(author_name)
    dataset_id = "_".join(id_parts)

    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_pth = target_dir / dataset_id

    # copytree creates dataset_pth; raises if it already exists.
    shutil.copytree(template_dir, dataset_pth)

    # Stamp the readme with the dataset id (replaces the template readme).
    (dataset_pth / "readme.md").write_text(f"# {dataset_id}")

    return dataset_id


@dataclass(frozen=True)
class DataFile(object):
name: str
Expand Down
12 changes: 10 additions & 2 deletions hdxms_datasets/datavault.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def fetch_dataset(self, data_id: str) -> bool:

files = ["hdx_spec.yaml", "metadata.yaml"]
optional_files = ["CITATION.cff"]
hdx_spec = None
for f in files + optional_files:
url = urllib.parse.urljoin(dataset_url, f)
response = requests.get(url)
Expand All @@ -103,12 +104,19 @@ def fetch_dataset(self, data_id: str) -> bool:

elif f in files:
raise urllib.error.HTTPError(
url, response.status_code, f"Error for file {f!r}", response.headers, None
url,
response.status_code,
f"Error for file {f!r}",
response.headers, # type: ignore
None,
)

if f == "hdx_spec.yaml":
hdx_spec = yaml.safe_load(response.text)

if hdx_spec is None:
raise ValueError(f"Could not find HDX spec for data_id {data_id!r}")

data_pth = output_pth / "data"
data_pth.mkdir()

Expand All @@ -124,7 +132,7 @@ def fetch_dataset(self, data_id: str) -> bool:
f_url,
response.status_code,
f"Error for data file {filename!r}",
response.headers,
response.headers, # type: ignore
None,
)

Expand Down
10 changes: 10 additions & 0 deletions hdxms_datasets/template/CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "In lieu of an associated paper, please cite this dataset as below"
authors:
- family-names: Englander
given-names: Walter
orcid: https://orcid.org/0000-0000-0000-0000
title: "My dataset"
version: 1.0.0
doi: 10.5281/zenodo.1234
date-released: 1970-01-01
Empty file.
71 changes: 71 additions & 0 deletions hdxms_datasets/template/hdx_spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
data_files:
data_1:
filename: data/data_file.csv
format: DynamX
data_2:
filename: data/data_FD.csv
format: DynamX

metadata: # global metadata equal for all protein states
pH: 7.5
protein:
uniprot: P10408
model: # Optional protein model information
database: pdb
entry: 2vda

states:
SecB_ADP_glucose: # Short human-readable identifier for this protein HDX state
peptides: # Dictionary of peptides defined for this HDX state (typically experiment, FD_control, ND_control)
experiment:
data_file: data_1 # Reference to data files defined above
state: protein_state # which protein state to select from data file table
exposure: # Exposure value(s) to select from data file table
values:
- 10
- 30
- 60
- 300
- 1200
unit: s
FD_control:
data_file: data_2
state: SecB WT apo
exposure:
value: 0.167
unit: min
description: # Optional additional experimental details per peptide set.
method: GdHCl and heating
DOI: ...
metadata:
pH: 7.5 # pH_read, uncorrected
d_percentage: 90. # Percentage deuterium in exchange buffer
temperature: # Temperature of the exchange buffer and unit
value: 30.
unit: Celsius
sequence: MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTV # FASTA Sequence including deletions and mutations, tags
n_term: 1 # Residue number of the N terminal residue (can be negative)
c_term: 155 # Residue number of the C terminal residue
concentration: 20e-6 # Concentration of the protein during H/D labelling, in M
oligomeric_state: 1 # Oligomeric state of the protein during H/D labelling
ligand: # Section for added ligand(s) during H/D labelling
- name: ATP
concentration: 1e-3
ChemSpider: 5800
- name: glucose
concentration: 2e-5
InChI: "1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1"
chaotrope: null
mutations:
- A123B
- H484Q
deletions:
- [1, 20] # Deletion of residues 1 up to and including 20
protein_complex: # Description of the protein complex this protein is in
- name: SecY # human readable name
UniProt: P0AGA2
stoichiometry: 1 # Number of this protein per POI
- name: SecB
UniProt: P0AG86
stoichiometry: 2

25 changes: 25 additions & 0 deletions hdxms_datasets/template/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
data:
protein: SecB
RCSB_entry: null
identifier: null
description:

#citation.cff author format?
authors:
- name: Kaj Ulrik Linderstrøm-Lang
email: null
ORCID: null
affiliation: null

# List of publications where the datasets are published
publications:
- title: null
DOI: null
URL: null

# Repositories where the data is also published or where the raw data is deposited
repositories:
- repository: Zenodo
DOI: null
URL: null
comments: Raw data
19 changes: 18 additions & 1 deletion tests/test_hdxms_datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import textwrap

from hdxms_datasets.datasets import HDXDataSet
from hdxms_datasets.datasets import HDXDataSet, create_dataset
from hdxms_datasets.datavault import DataVault
from pathlib import Path
import pytest
Expand Down Expand Up @@ -63,6 +63,23 @@ def test_dataset(dataset: HDXDataSet):
assert textwrap.dedent(s.lstrip("\n")) == dataset.describe()


def test_create_dataset(tmp_path):
    """create_dataset copies the template and encodes tag/author in the id."""
    author = "smit"
    tag = "testing"  # optional tag

    data_id = create_dataset(tmp_path / "datasets", author, tag)
    dataset_dir = tmp_path / "datasets" / data_id

    # id format: <timestamp>_<tag>_<author>
    id_parts = data_id.split("_")
    assert id_parts[1] == tag
    assert id_parts[2] == author

    # readme is stamped with the dataset id
    assert (dataset_dir / "readme.md").read_text() == f"# {data_id}"

    # template contents were copied over
    assert (dataset_dir / "hdx_spec.yaml").exists()
    assert (dataset_dir / "data" / "data_file.csv").exists()


def test_metadata(dataset: HDXDataSet):
test_metadata = yaml.safe_load((TEST_PTH / "datasets" / DATA_ID / "metadata.yaml").read_text())
assert dataset.metadata == test_metadata
Expand Down