Skip to content

Commit

Permalink
More tests on different molecule formats
Browse files Browse the repository at this point in the history
  • Loading branch information
Old-Shatterhand committed Mar 21, 2024
1 parent 7edf382 commit 2e8b7e4
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Install environment
shell: bash -l {0}
run: |
mamba install -c conda-forge -c bioconda -y numpy pandas networkx matplotlib pytest setuptools pyscipopt"<4.0.0" foldseek mmseqs2 cd-hit mash tmalign diamond cvxpy pytest-cov rdkit">=2022.09.1" pytest-cases scikit-learn">=1.2,<1.6" pyyaml h5py
mamba install -c conda-forge -c bioconda -y numpy pandas networkx matplotlib pytest setuptools pyscipopt"<4.0.0" foldseek mmseqs2 cd-hit mash tmalign diamond cvxpy pytest-cov rdkit">=2023.09.1" pytest-cases scikit-learn pyyaml h5py
pip install grakel
- name: Run tests
Expand Down
5 changes: 3 additions & 2 deletions datasail/reader/read_genomes.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from typing import List, Tuple, Optional

from datasail.reader.read_molecules import remove_duplicate_values
Expand Down Expand Up @@ -35,8 +36,8 @@ def read_genome_data(
"""
dataset = DataSet(type=G_TYPE, location=UNK_LOCATION, format=FORM_FASTA)

def read_dir(ds):
ds.data = dict(read_folder(data))
def read_dir(ds: DataSet, path: Path) -> None:
ds.data = dict(read_folder(path))
ds.format = FORM_GENOMES

read_data_input(data, dataset, read_dir)
Expand Down
16 changes: 8 additions & 8 deletions datasail/reader/read_molecules.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@


mol_reader = {
".mol": MolFromMolFile,
".mol2": MolFromMol2File,
".mrv": MolFromMrvFile,
"mol": MolFromMolFile,
"mol2": MolFromMol2File,
"mrv": MolFromMrvFile,
# "sdf": MolFromMol2File,
".pdb": MolFromPDBFile,
".tpl": MolFromTPLFile,
".xyz": MolFromXYZFile,
"pdb": MolFromPDBFile,
"tpl": MolFromTPLFile,
"xyz": MolFromXYZFile,
}


Expand Down Expand Up @@ -54,11 +54,11 @@ def read_molecule_data(
"""
dataset = DataSet(type=M_TYPE, format=FORM_SMILES, location=UNK_LOCATION)

def read_dir(ds: DataSet, path: Path):
def read_dir(ds: DataSet, path: Path) -> None:
ds.data = {}
for file in path.iterdir():
if file.suffix[1:].lower() != "sdf" and mol_reader[file.suffix[1:].lower()] is not None:
ds.data[file.stem] = mol_reader[file.suffix[1:].lower()](file)
ds.data[file.stem] = Chem.MolToSmiles(mol_reader[file.suffix[1:].lower()](str(file)))
else:
ds.data = read_sdf_file(file)

Expand Down
5 changes: 3 additions & 2 deletions datasail/reader/read_other.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from typing import List, Tuple, Optional

from datasail.reader.read_genomes import read_folder
Expand Down Expand Up @@ -37,8 +38,8 @@ def read_other_data(
"""
dataset = DataSet(type=O_TYPE, location=UNK_LOCATION, format=FORM_OTHER)

def read_dir(ds):
ds.data = dict(read_folder(data))
def read_dir(ds: DataSet, path: Path) -> None:
ds.data = dict(read_folder(path))

read_data_input(data, dataset, read_dir)

Expand Down
4 changes: 2 additions & 2 deletions datasail/reader/read_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def read_protein_data(
"""
dataset = DataSet(type=P_TYPE, location=UNK_LOCATION)

def read_dir(ds):
ds.data = dict(read_folder(data, "pdb"))
def read_dir(ds: DataSet, path: Path) -> None:
ds.data = dict(read_folder(path, "pdb"))

read_data_input(data, dataset, read_dir)

Expand Down
4 changes: 2 additions & 2 deletions datasail/reader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def read_folder(folder_path: Path, file_extension: Optional[str] = None) -> Gene
yield filename.stem, filename


def read_data_input(data: DATA_INPUT, dataset: DataSet, read_dir: Callable[[DataSet], None]):
def read_data_input(data: DATA_INPUT, dataset: DataSet, read_dir: Callable[[DataSet, Path], None]):
"""
Read in the data from different sources and store it in the dataset.
Expand Down Expand Up @@ -386,7 +386,7 @@ def read_data_input(data: DATA_INPUT, dataset: DataSet, read_dir: Callable[[Data
else:
raise ValueError("Unknown file format. Supported formats are: .fasta, .fna, .fa, tsv, .csv, .pkl, .h5")
elif data.is_dir():
read_dir(dataset)
read_dir(dataset, data)
else:
raise ValueError("Unknown data input type. Path encodes neither a file nor a directory.")
dataset.location = data
Expand Down
36 changes: 36 additions & 0 deletions tests/data/molecules.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
Drug_ID, SMILES
D001, C
D002, N
D003, O
D004, C#C
D005, C#N
D006, C=O
D007, CC
D008, CO
D009, CC#C
D010, CC#N
D011, CC=O
D012, NC=O
D013, CCC
D014, CCO
D015, COC
D016, C1CC1
D017, C1CO1
D018, CC(C)=O
D019, CC(N)=O
D020, NC(N)=O
D021, CC(C)C
D022, CC(C)O
D023, C#CC#C
D024, C#CC#N
D025, N#CC#N
D026, O=CC#C
D027, O=CC#N
D028, O=CC=O
D029, CC#CC
D030, CCC#C
D031, CCC#N
D032, NCC#N
D033, OCC#C
D034, OCC#N
D035, CCC=O
46 changes: 38 additions & 8 deletions tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,31 +234,31 @@ def md_calculator():

@pytest.mark.parametrize("mode", ["CSV", "TSV", "PKL", "H5PY", "SDF"])
def test_input_formats(mode, md_calculator):
base = Path("data") / "pipeline"
base = Path("data") / "pipeline" / "input_forms"
drugs = pd.read_csv(base / "drugs.tsv", sep="\t")
ddict = {row["Drug_ID"]: row["SMILES"] for index, row in drugs.iterrows()}
(base / "input_forms").mkdir(exist_ok=True, parents=True)
base.mkdir(exist_ok=True, parents=True)

if mode == "CSV":
filepath = base / "input_forms" / "drugs.csv"
filepath = base / "drugs.csv"
drugs.to_csv(filepath, sep=",", index=False)
elif mode == "TSV":
filepath = base / "input_forms" / "drugs.tsv"
filepath = base / "drugs.tsv"
drugs.to_csv(filepath, sep="\t", index=False)
elif mode == "PKL":
data = {}
for k, v in ddict.items():
data[k] = AllChem.MolToSmiles(Chem.MolFromSmiles(v))
filepath = base / "input_forms" / "drugs.pkl"
filepath = base / "drugs.pkl"
with open(filepath, "wb") as f:
pickle.dump(data, f)
elif mode == "H5PY":
filepath = base / "input_forms" / "drugs.h5"
filepath = base / "drugs.h5"
with h5py.File(filepath, "w") as f:
for k, v in ddict.items():
f[k] = list(md_calculator.CalcDescriptors(Chem.MolFromSmiles(v)))
elif mode == "SDF":
filepath = base / "input_forms" / "drugs.sdf"
filepath = base / "drugs.sdf"
with Chem.SDWriter(str(filepath)) as w:
for k, v in ddict.items():
mol = Chem.MolFromSmiles(v)
Expand All @@ -269,11 +269,41 @@ def test_input_formats(mode, md_calculator):

dataset = read_molecule_data(filepath)

shutil.rmtree(base / "input_forms", ignore_errors=True)
shutil.rmtree(base, ignore_errors=True)

assert set(dataset.names) == set(ddict.keys())


@pytest.mark.parametrize("mode", ["MOL", "MRV", "PDB", "TPL"])
def test_molecule_formats(mode):
    """
    Check that a folder of single-molecule files is read back under the right names.

    Every SMILES listed in ``data/molecules.csv`` gets one embedded conformer and is
    written to a scratch directory in the file format selected by ``mode``. The
    directory is then parsed with ``read_molecule_data`` and the names of the
    resulting dataset must be exactly the IDs from the CSV file.

    NOTE: the XYZ format is not part of the parametrization; that case was
    commented out in the original version of this test.

    Args:
        mode: Upper-case name of the molecule file format to exercise.

    Raises:
        ValueError: If ``mode`` is not one of the supported formats.
    """
    import csv  # local import so the block does not depend on the module header

    # Name of the RDKit writer per mode; resolved lazily via getattr so that only
    # the writer that is actually needed has to exist in the installed RDKit.
    writer_names = {
        "MOL": "MolToMolFile",
        "MRV": "MolToMrvFile",
        "PDB": "MolToPDBFile",
        "TPL": "MolToTPLFile",
    }
    if mode not in writer_names:
        raise ValueError(f"Unknown mode: {mode}")
    write_mol = getattr(Chem, writer_names[mode])
    suffix = mode.lower()

    mols = {}
    with open(Path("data") / "molecules.csv", "r", newline="") as f:
        # skipinitialspace drops the blank that follows each comma in the file,
        # so the SMILES string is passed to RDKit without leading whitespace.
        reader = csv.reader(f, skipinitialspace=True)
        next(reader, None)  # header line
        for row in reader:
            if not any(cell.strip() for cell in row):
                continue  # tolerate blank lines, e.g. at the end of the file
            name, smiles = row
            mols[name.strip()] = Chem.MolFromSmiles(smiles.strip())

    base = Path("data") / "pipeline" / "input_forms"
    base.mkdir(exist_ok=True, parents=True)
    try:
        for name, mol in mols.items():
            AllChem.EmbedMultipleConfs(mol, numConfs=1)
            write_mol(mol, str(base / f"{name}.{suffix}"))
        dataset = read_molecule_data(base)
    finally:
        # Always remove the scratch directory, even if writing or reading failed,
        # so leftovers cannot leak into other tests using the same folder.
        shutil.rmtree(base, ignore_errors=True)

    assert set(dataset.names) == set(mols)


@pytest.mark.todo
def test_genomes():
base = Path("data") / "genomes"
Expand Down

0 comments on commit 2e8b7e4

Please sign in to comment.