OpenBioML · kjappelbaum · Nov 8, 2023 · Oct 29, 2023 · Oct 30, 2023 · Oct 30, 2023
diff --git a/data/tabular/chemcaption_rdkit/meta.yaml b/data/tabular/chemcaption_rdkit/meta.yaml
diff --git a/data/tabular/chemcaption_rdkit/preprocess.py b/data/tabular/chemcaption_rdkit/preprocess.py
@@ -0,0 +1,79 @@
+import ast
+from collections import defaultdict
+from glob import glob
+
+import pandas as pd
+from datasets import Dataset
+
+# Names of the features that are kept. The public lists below hold the
+# *stringified* label lists, because ``completion_labels`` is cast to ``str``
+# before it is compared against them.
+_SINGLE_OUTPUT_NAMES = [
+    "num_valence_electrons",
+    "monoisotopic_molecular_mass",
+    "molecular_formula",
+    "num_hydrogen_bond_acceptors",
+    "num_hydrogen_bond_donors",
+    "num_lipinski_violations",
+    "inertial_shape_factor",
+    "eccentricity",
+    "asphericity",
+    "num_chiral_centers",
+]
+
+_MULTI_OUTPUT_NAME_GROUPS = [
+    ["rotable_proportion", "non_rotable_proportion"],
+    [
+        "num_unspecified_bond",
+        "num_single_bonds",
+        "num_double_bonds",
+        "num_triple_bonds",
+        "num_quadruple_bonds",
+        "num_quintuple_bonds",
+        "num_hextuple_bonds",
+        "num_oneandahalf_bonds",
+        "num_twoandahalf_bonds",
+        "num_threeandahalf_bonds",
+        "num_fourandahalf_bonds",
+        "num_fiveandahalf_bonds",
+        "num_aromatic_bonds",
+        "num_ionic_bonds",
+        "num_hydrogen_bonds",
+        "num_threecenter_bonds",
+        "num_dativeone_bonds",
+        "num_dative_bonds",
+        "num_other_bonds",
+        "num_zero_bonds",
+        "num_bonds",
+    ],
+    ["carbon_mass", "hydrogen_mass", "nitrogen_mass", "oxygen_mass"],
+    ["num_carbon_atoms", "num_hydrogen_atoms", "num_nitrogen_atoms", "num_oxygen_atoms"],
+    ["npr1_value", "npr2_value"],
+    ["pmi1_value", "pmi2_value", "pmi3_value"],
+]
+
+# ``str`` of a list of plain names yields e.g. "['a', 'b']", which is exactly
+# the textual form the entries of these two lists are expected to have.
+allowed_single_output_features = [str([name]) for name in _SINGLE_OUTPUT_NAMES]
+
+allowed_multi_output_features = [str(group) for group in _MULTI_OUTPUT_NAME_GROUPS]
+
+
+def get_allowed_features():
+    """Return every allowed label string: single-output ones first, then multi-output."""
+    return [*allowed_single_output_features, *allowed_multi_output_features]
+
+
+def extract_output_feature(row):
+    """Pair each feature label of one record with its value.
+
+    Args:
+        row: Mapping with the string fields ``completion`` and
+            ``completion_labels``; each must be the text of a Python literal
+            sequence (e.g. ``"[1, 2]"`` and ``"['a', 'b']"``).
+
+    Returns:
+        A dict keyed by label, with the completion entry at the same position
+        as value. Surplus entries on either side are dropped by ``zip``.
+
+    Raises:
+        ValueError, SyntaxError: If a field is not a valid Python literal.
+    """
+    values = ast.literal_eval(row["completion"])
+    labels = ast.literal_eval(row["completion_labels"])
+
+    # The labels have to be the keys: keying by value would merge features
+    # that share a value (e.g. two counts of 0) and would name the resulting
+    # table columns after values instead of after features.
+    return dict(zip(labels, values))
+
+
+def extract_features_frame(file):
+    """Collect the allowed features of one JSON-lines file, one row per molecule.
+
+    Args:
+        file: Path (or anything ``pd.read_json`` accepts) of a JSON-lines file
+            whose records have the fields ``representation``,
+            ``representation_type``, ``completion`` and ``completion_labels``.
+
+    Returns:
+        A ``pd.DataFrame`` with one row per distinct ``representation``. Its
+        columns are the keys returned by ``extract_output_feature`` for the
+        kept records, plus ``representation_type`` and ``representation``.
+    """
+    # Local import so this function does not depend on the module header.
+    import re
+
+    molecular_features = defaultdict(dict)
+    df = pd.read_json(file, lines=True)
+    df["completion_labels"] = df["completion_labels"].astype(str)
+    df["completion"] = df["completion"].astype(str)
+
+    # ``str.contains`` treats the pattern as a regular expression. Unescaped,
+    # every "['name']" entry is a character class matching any single one of
+    # its letters, so the filter would keep practically every record.
+    pattern = "|".join(re.escape(feature) for feature in get_allowed_features())
+    subset = df[df["completion_labels"].str.contains(pattern)]
+
+    # Merge all records of the same molecule into a single feature dict.
+    for _index, row in subset.iterrows():
+        features = extract_output_feature(row)
+        features["representation_type"] = row["representation_type"]
+        molecular_features[row["representation"]].update(features)
+
+    list_of_dicts = []
+    for molecule, features in molecular_features.items():
+        features["representation"] = molecule
+        list_of_dicts.append(features)
+
+    return pd.DataFrame(list_of_dicts)
+
+
+if __name__ == "__main__":
+    # Sorted so that the row order of the result does not depend on the order
+    # in which the file system happens to list the files.
+    all_files = sorted(glob("*.jsonl"))
+    if not all_files:
+        # Without this, pd.concat fails with a message that hides the cause.
+        raise FileNotFoundError("no *.jsonl files found in the current directory")
+
+    all_dfs = [extract_features_frame(file) for file in all_files]
+
+    # Every per-file frame is indexed from 0; renumber so the combined frame
+    # has a plain unique index instead of repeated labels that would be
+    # carried along into the dataset.
+    df = pd.concat(all_dfs, ignore_index=True)
+
+    ds = Dataset.from_pandas(df)
+    ds.push_to_hub(repo_id="kjappelbaum/chemnlp-chem-caption", config_name="rdkit_feat")
diff --git a/data/tabular/chemcaption_rdkit/transform.py b/data/tabular/chemcaption_rdkit/transform.py
@@ -0,0 +1,68 @@
+import fire
+import pandas as pd
+
+
+# Bond-count columns whose sum has to equal ``num_bonds`` for a row to be kept.
+_SIMPLE_BOND_COLUMNS = [
+    "num_single_bonds",
+    "num_double_bonds",
+    "num_triple_bonds",
+    "num_quadruple_bonds",
+    "num_quintuple_bonds",
+    "num_aromatic_bonds",
+]
+
+# Columns that are cast to ``int`` before the table is written.
+_INTEGER_COLUMNS = [
+    "num_valence_electrons",
+    "num_single_bonds",
+    "num_double_bonds",
+    "num_triple_bonds",
+    "num_quadruple_bonds",
+    "num_quintuple_bonds",
+    "num_aromatic_bonds",
+    "num_bonds",
+    "num_carbon_atoms",
+    "num_hydrogen_atoms",
+    "num_nitrogen_atoms",
+    "num_oxygen_atoms",
+    "num_hydrogen_bond_acceptors",
+    "num_hydrogen_bond_donors",
+    "num_lipinski_violations",
+    "num_chiral_centers",
+]
+
+
+def process():
+    """Download the feature table, filter it and write ``data_clean.csv``.
+
+    Steps:
+        1. Read the parquet file from the Hugging Face hub.
+        2. Add ``num_bonds_simple``, the sum of the simple bond-count columns.
+        3. Keep only rows where that sum equals ``num_bonds``.
+        4. Cast the count columns to ``int``.
+        5. Print the number of remaining rows and write the CSV (no index).
+
+    Returns:
+        None. The result is the file ``data_clean.csv`` in the working directory.
+    """
+    df = pd.read_parquet(
+        "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/rdkit_feat/train-00000-of-00001-7cea16ab26bf74cf.parquet?download=true"  # noqa
+    )
+    df["num_bonds_simple"] = df[_SIMPLE_BOND_COLUMNS].sum(axis=1)
+
+    # ``.copy()`` makes the filtered result an independent frame, so the
+    # column assignment below is not a write into a slice of the original
+    # (which triggers pandas' SettingWithCopyWarning).
+    bonds_consistent = df["num_bonds_simple"].astype(int) == df["num_bonds"].astype(int)
+    df = df[bonds_consistent].copy()
+
+    df[_INTEGER_COLUMNS] = df[_INTEGER_COLUMNS].astype(int)
+    print(len(df))
+    df.to_csv("data_clean.csv", index=False)
+
+
+# Script entry point: hand ``process`` to python-fire so it runs from the
+# command line. ``process`` takes no parameters.
+if __name__ == "__main__":
+    fire.Fire(process)
diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py
@@ -80,6 +80,7 @@
     "cav3_t-type_calcium_channels_butkiewicz",  # because it is boolean target data
     "chebi_20",  # target is text description
     "chembl_v29",  # text only, no SMILES
+    "chemcaption_rdkit",  # text only, no SMILES
     "choline_transporter_butkiewicz",  # because it is boolean target data
     "clintox",  # because it is boolean target data
     "cyp2c9_substrate_carbonmangels",  # boolean target data

diff --git a/src/chemnlp/data/reprs.py b/src/chemnlp/data/reprs.py
@@ -2,7 +2,6 @@
 import deepsmiles
 import pubchempy as pcp
 import requests
-import safe
 import selfies
 from rdkit import Chem
 
@@ -43,6 +42,8 @@ def smiles_to_safe(smiles: str) -> str:
     """
     Takes a SMILES and return the SAFE.
     """
+    import safe
+
     return safe.encode(smiles, seed=42, canonical=True, randomize=False)
 
 

diff --git a/tests/test_reprs.py b/tests/test_reprs.py
@@ -1,13 +1,13 @@
-from chemnlp.data.reprs import smiles_to_iupac_name, smiles_to_safe
-
-
-def test_smiles_to_safe():
-    safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
-    # equivalent, only rotations, it is not completely deterministic
-    assert (
-        safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
-        or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
-    )
+from chemnlp.data.reprs import smiles_to_iupac_name
+
+# not used at the moment
+# def test_smiles_to_safe():
+#     safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
+#     # equivalent, only rotations, it is not completely deterministic
+#     assert (
+#         safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
+#         or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
+#     )
 
 
 def test_smiles_to_iupac_name():