diff --git a/data/tabular/mol2svg/meta.yaml b/data/tabular/mol2svg/meta.yaml
new file mode 100644
index 000000000..840134303
--- /dev/null
+++ b/data/tabular/mol2svg/meta.yaml
@@ -0,0 +1,26 @@
+---
+# NOTE(review): name differs from this directory's name (mol2svg) - confirm it is intended.
+name: chem_caption_smarts
+description: |-
+    This dataset contains SVG images of molecules, including some with substructures
+    highlighted.
+targets:
+    - id: completion
+      type: text
+      description: completion
+identifiers:
+    - id: prompt
+      type: text
+      description: prompt
+    - id: smiles
+      type: SMILES
+      description: SMILES
+license: CC BY 4.0
+num_points: 16019
+links:
+    - url: https://github.com/lamalab-org/chem-caption
+      description: Original codebase used to generate this dataset
+templates:
+    - |-
+      {prompt#}
+      {completion#}
diff --git a/data/tabular/mol2svg/transform.py b/data/tabular/mol2svg/transform.py
new file mode 100644
index 000000000..b7969fbe2
--- /dev/null
+++ b/data/tabular/mol2svg/transform.py
@@ -0,0 +1,23 @@
+"""Build data_clean.csv for the mol2svg dataset."""
+
+from datasets import load_dataset
+
+
+def preprocess():
+    """Write the complete rows of the source dataset to ``data_clean.csv``.
+
+    Loads ``kjappelbaum/chemnlp-mol-svg`` with ``datasets.load_dataset``,
+    converts its ``train`` split to a pandas DataFrame, drops every row that
+    has a missing value in any column, prints the number of rows left, and
+    writes them to ``data_clean.csv`` (relative to the current working
+    directory) without the index column.
+    """
+    dataset = load_dataset("kjappelbaum/chemnlp-mol-svg")
+    df = dataset["train"].to_pandas()
+    df.dropna(inplace=True)
+    print(len(df))
+    df.to_csv("data_clean.csv", index=False)
+
+
+if __name__ == "__main__":
+    preprocess()