Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

prepare chemcaption data #466

Merged
merged 12 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
342 changes: 342 additions & 0 deletions data/tabular/chemcaption_rdkit/meta.yaml

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions data/tabular/chemcaption_rdkit/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import ast
from collections import defaultdict
from glob import glob

import pandas as pd
from datasets import Dataset

# Every entry below is the ``str()`` of a list of feature labels, i.e. the
# text a label list turns into once it has been cast to a string. Building
# the entries with ``str([...])`` keeps that relationship explicit instead
# of hand-typing the brackets and quotes.

# Featurizer outputs consisting of exactly one label.
allowed_single_output_features = [
    str([label])
    for label in (
        "num_valence_electrons",
        "monoisotopic_molecular_mass",
        "molecular_formula",
        "num_hydrogen_bond_acceptors",
        "num_hydrogen_bond_donors",
        "num_lipinski_violations",
        "inertial_shape_factor",
        "eccentricity",
        "asphericity",
        "num_chiral_centers",
    )
]

# Featurizer outputs that report several labels together.
allowed_multi_output_features = [
    str(list(label_group))
    for label_group in (
        ("rotable_proportion", "non_rotable_proportion"),
        (
            "num_unspecified_bond",
            "num_single_bonds",
            "num_double_bonds",
            "num_triple_bonds",
            "num_quadruple_bonds",
            "num_quintuple_bonds",
            "num_hextuple_bonds",
            "num_oneandahalf_bonds",
            "num_twoandahalf_bonds",
            "num_threeandahalf_bonds",
            "num_fourandahalf_bonds",
            "num_fiveandahalf_bonds",
            "num_aromatic_bonds",
            "num_ionic_bonds",
            "num_hydrogen_bonds",
            "num_threecenter_bonds",
            "num_dativeone_bonds",
            "num_dative_bonds",
            "num_other_bonds",
            "num_zero_bonds",
            "num_bonds",
        ),
        ("carbon_mass", "hydrogen_mass", "nitrogen_mass", "oxygen_mass"),
        (
            "num_carbon_atoms",
            "num_hydrogen_atoms",
            "num_nitrogen_atoms",
            "num_oxygen_atoms",
        ),
        ("npr1_value", "npr2_value"),
        ("pmi1_value", "pmi2_value", "pmi3_value"),
    )
]


def get_allowed_features():
    """Return a new list with every allowed feature string.

    The single-output entries come first, followed by the multi-output
    entries, in the order in which the two module-level lists define them.
    """
    return [*allowed_single_output_features, *allowed_multi_output_features]


def extract_output_feature(row):
    """Turn one record into a ``{feature label: value}`` dictionary.

    Args:
        row: Mapping (for example a pandas row) whose ``"completion"`` and
            ``"completion_labels"`` entries are strings holding Python list
            literals. NOTE(review): this assumes ``"completion"`` carries the
            computed values and ``"completion_labels"`` the feature names,
            which is what the label-based filtering and the label-named
            columns read downstream imply - confirm against the input files.

    Returns:
        dict: one entry per label, pairing the i-th label with the i-th
        value. If the two lists differ in length, the surplus entries of the
        longer one are ignored (``zip`` semantics).

    Raises:
        ValueError, SyntaxError: if either string is not a valid Python
            literal (raised by ``ast.literal_eval``).
    """
    values = ast.literal_eval(row["completion"])
    labels = ast.literal_eval(row["completion_labels"])

    # The labels have to be the keys: keying by value would merge every
    # feature that happens to share a value (e.g. several bond counts of 0)
    # into a single entry and would not yield label-named columns.
    return dict(zip(labels, values))


def extract_features_frame(file):
    """Collect the allowed features of one JSON-lines file, one row per molecule.

    Args:
        file: Path (or anything ``pandas.read_json`` accepts) of a JSON-lines
            file whose records provide the ``representation``,
            ``representation_type``, ``completion`` and ``completion_labels``
            fields.

    Returns:
        pandas.DataFrame: one row per distinct ``representation``. The
        columns are the extracted features, ``representation_type`` and
        ``representation``. When a molecule occurs in several records, their
        features are merged and later records overwrite earlier ones on
        clashing keys.
    """
    # Local import: the module's import section does not provide ``re``.
    import re

    df = pd.read_json(file, lines=True)
    # Both columns are compared and parsed as text further down.
    df["completion_labels"] = df["completion_labels"].astype(str)
    df["completion"] = df["completion"].astype(str)

    # ``str.contains`` interprets its pattern as a regular expression. The
    # allowed strings contain ``[``, ``]`` and quotes, so without escaping
    # each of them would be read as a character class and match almost any
    # row instead of the literal label list.
    pattern = "|".join(re.escape(feature) for feature in get_allowed_features())
    subset = df[df["completion_labels"].str.contains(pattern)]

    molecular_features = defaultdict(dict)
    for _index, row in subset.iterrows():
        features = extract_output_feature(row)
        features["representation_type"] = row["representation_type"]
        molecular_features[row["representation"]].update(features)

    # Put the molecule identifier into each record so that it becomes a column.
    records = [
        {**features, "representation": molecule}
        for molecule, features in molecular_features.items()
    ]
    return pd.DataFrame(records)


if __name__ == "__main__":
    # Sorted so that the row order of the uploaded dataset does not depend on
    # the arbitrary order in which ``glob`` lists the files.
    all_files = sorted(glob("*.jsonl"))
    if not all_files:
        # ``pd.concat`` on an empty list fails with an unrelated-looking
        # error; stop early with a message that names the actual problem.
        raise SystemExit("No *.jsonl files found in the current directory.")

    # Every per-file frame starts its index at 0 again. ``ignore_index``
    # gives the combined frame a fresh 0..n-1 index so that the duplicated
    # per-file indices are not carried into the dataset.
    df = pd.concat(
        [extract_features_frame(file) for file in all_files],
        ignore_index=True,
    )

    ds = Dataset.from_pandas(df)
    ds.push_to_hub(repo_id="kjappelbaum/chemnlp-chem-caption", config_name="rdkit_feat")
68 changes: 68 additions & 0 deletions data/tabular/chemcaption_rdkit/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import fire
import pandas as pd


def process():
    """Download the RDKit feature table, clean it and write ``data_clean.csv``.

    Steps:
        1. Read the ``rdkit_feat`` parquet file from the Hugging Face hub.
        2. Add ``num_bonds_simple``: the sum of the single, double, triple,
           quadruple, quintuple and aromatic bond counts.
        3. Keep only the rows in which that sum equals ``num_bonds``, i.e.
           drop molecules that contain any other kind of bond.
        4. Cast all count columns to ``int``.
        5. Print the number of remaining rows and write them to
           ``data_clean.csv`` (without the index) in the working directory.
    """
    simple_bond_columns = [
        "num_single_bonds",
        "num_double_bonds",
        "num_triple_bonds",
        "num_quadruple_bonds",
        "num_quintuple_bonds",
        "num_aromatic_bonds",
    ]
    # All columns that hold counts and are therefore stored as integers.
    integer_columns = [
        "num_valence_electrons",
        *simple_bond_columns,
        "num_bonds",
        "num_carbon_atoms",
        "num_hydrogen_atoms",
        "num_nitrogen_atoms",
        "num_oxygen_atoms",
        "num_hydrogen_bond_acceptors",
        "num_hydrogen_bond_donors",
        "num_lipinski_violations",
        "num_chiral_centers",
    ]

    df = pd.read_parquet(
        "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/rdkit_feat/train-00000-of-00001-7cea16ab26bf74cf.parquet?download=true"  # noqa
    )
    df["num_bonds_simple"] = df[simple_bond_columns].sum(axis=1)

    has_only_simple_bonds = df["num_bonds_simple"].astype(int) == df["num_bonds"].astype(int)

    # ``astype`` with a column mapping returns a new frame, so the cast never
    # writes into the boolean-filtered selection (which would be a chained
    # assignment) and the column list only has to be spelled out once.
    df = df[has_only_simple_bonds].astype({column: int for column in integer_columns})

    print(len(df))
    df.to_csv("data_clean.csv", index=False)


if __name__ == "__main__":
    # Expose ``process`` as the command-line entry point of this script.
    fire.Fire(component=process)
1 change: 1 addition & 0 deletions data/text_sampling/text_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
"cav3_t-type_calcium_channels_butkiewicz", # because it is boolean target data
"chebi_20", # target is text description
"chembl_v29", # text only, no SMILES
"chemcaption_rdkit", # text only, no SMILES
"choline_transporter_butkiewicz", # because it is boolean target data
"clintox", # because it is boolean target data
"cyp2c9_substrate_carbonmangels", # boolean target data
Expand Down
3 changes: 2 additions & 1 deletion src/chemnlp/data/reprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import deepsmiles
import pubchempy as pcp
import requests
import safe
import selfies
from rdkit import Chem

Expand Down Expand Up @@ -43,6 +42,8 @@ def smiles_to_safe(smiles: str) -> str:
"""
Takes a SMILES and return the SAFE.
"""
import safe

return safe.encode(smiles, seed=42, canonical=True, randomize=False)


Expand Down
20 changes: 10 additions & 10 deletions tests/test_reprs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from chemnlp.data.reprs import smiles_to_iupac_name, smiles_to_safe


def test_smiles_to_safe():
safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
# equivalent, only rotations, it is not completely deterministic
assert (
safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
)
from chemnlp.data.reprs import smiles_to_iupac_name

# not used at the moment
# def test_smiles_to_safe():
# safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
# # equivalent, only rotations, it is not completely deterministic
# assert (
# safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
# or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
# )


def test_smiles_to_iupac_name():
Expand Down