diff --git a/data/tabular/mol2svg/meta.yaml b/data/tabular/mol2svg/meta.yaml
new file mode 100644
index 000000000..840134303
--- /dev/null
+++ b/data/tabular/mol2svg/meta.yaml
@@ -0,0 +1,25 @@
+---
+name: chem_caption_smarts  # NOTE(review): directory is mol2svg — confirm this name is intended
+description: |-
+    This dataset contains SVG images of molecules, including some with substructures
+    highlighted.
+targets:
+    - id: completion
+      type: text
+      description: completion
+identifiers:
+    - id: prompt
+      type: text
+      description: prompt
+    - id: smiles
+      type: SMILES
+      description: SMILES
+license: CC BY 4.0
+num_points: 16019  # presumably the row count printed by transform.py — confirm
+links:
+    - url: https://github.com/lamalab-org/chem-caption
+      description: Original codebase used to generate this dataset
+templates:  # placeholders below reuse the prompt / completion ids declared above
+    - |-
+      {prompt#}
+      {completion#}
diff --git a/data/tabular/mol2svg/transform.py b/data/tabular/mol2svg/transform.py
new file mode 100644
index 000000000..b7969fbe2
--- /dev/null
+++ b/data/tabular/mol2svg/transform.py
@@ -0,0 +1,13 @@
+from datasets import load_dataset
+
+
+def preprocess():
+    """Fetch the mol-svg train split, drop rows with missing values, write a CSV."""
+    splits = load_dataset("kjappelbaum/chemnlp-mol-svg")
+    frame = splits["train"].to_pandas().dropna()
+    print(len(frame))  # number of rows left after dropping incomplete ones
+    frame.to_csv("data_clean.csv", index=False)
+
+
+if __name__ == "__main__":  # run the preprocessing only when executed as a script
+    preprocess()