diff --git a/data/tabular/chem_caption_smarts/meta.yaml b/data/tabular/chem_caption_smarts/meta.yaml new file mode 100644 index 000000000..57f8ecfbe --- /dev/null +++ b/data/tabular/chem_caption_smarts/meta.yaml @@ -0,0 +1,41 @@ +--- +name: chem_caption_smarts +description: |- + This dataset contains the number of times a substructure, specified as a SMARTS pattern, occurs in a molecule. +targets: + - id: smarts + type: text + description: SMARTS pattern of the substructure + names: + - noun: SMARTS + - noun: SMiles ARbitrary Target Specification (SMARTS) + - id: completion + type: categorical + description: number of times the substructure matches the molecule + - id: completion_labels + type: text + description: name of the substructure +identifiers: + - id: representation + type: text + description: representation of the molecule + - id: representation_type + type: text + description: type of the molecular representation +license: CC BY 4.0 +links: + - url: https://github.com/lamalab-org/chem-caption + description: Original codebase used to generate this dataset +templates: + - |- + Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain the substructure with the {smarts__names__noun} {#smarts#}? + Answer: {completion#} + - |- + Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain a {completion#} substructure? + Answer: {smarts__names__noun} {#smarts#} + - |- + User: {#I want to|I have to|I must|I would like to!} know {#how many times|how often!} the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#}. + Assistant: The {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. 
+ - |- + User: {#I want to|I have to|I must|I would like to!} know how many times the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains a {completion#} substructure. + Assistant: The {#molecule|chemical|compound|chemical structure!} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. diff --git a/data/tabular/chem_caption_smarts/preprocess.py b/data/tabular/chem_caption_smarts/preprocess.py new file mode 100644 index 000000000..06eeafe4e --- /dev/null +++ b/data/tabular/chem_caption_smarts/preprocess.py @@ -0,0 +1,810 @@ +# flake8: noqa +"""Preprocess the raw outputs from the text-output of chem-caption to a tabular dataset.""" +from glob import glob + +import pandas as pd +from datasets import Dataset +from tqdm import tqdm + +NAME_SMARTS_MAP = { + "tert-butyloxycarbonyl": "[#8]=[#6]-[#8]-[#6](-[#6])(-[#6])-[#6]", + "trityl": "[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "3,5-dimethoxyphenylisoproxycarbonyl": "[#6]-[#8]-[#6]1:[#6]:[#6](-[#6](-[#6])(-[#8]-[#6]=[#8])-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:1", + "2-(4-biphenyl)isopropoxycarbonyl": "[#6]-[#6](-[#6])(-[#8]-[#6]=[#8])-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "2-nitrophenylsulfenyl": "[#16]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "boc": "[#8]=[#6]-[#8]-[#6](-[#6])(-[#6])-[#6]", + "trt": "[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "ddz": "[#6]-[#8]-[#6]1:[#6]:[#6](-[#6](-[#6])(-[#8]-[#6]=[#8])-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:1", + "bpoc": "[#6]-[#6](-[#6])(-[#8]-[#6]=[#8])-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "nps": "[#16]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "9-fluorenylmethoxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](-[#6]3:[#6]-1:[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:[#6]:[#6]:2", + 
"2-(4-nitrophenylsulfonyl)ethoxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]-[#16](=[#8])(-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1)=[#8]", + "(1,1-dioxobenzo[b]thiophene-2-yl)methyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1=[#6]-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2-[#16]-1(=[#8])=[#8]", + "(1,1-dioxonaptho[1,2-b]thiophene-2-yl)methyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1=[#6]-[#6]2:[#6]:[#6]:[#6]3:[#6]:[#6]:[#6]:[#6]:[#6]:3:[#6]:2-[#16]-1(=[#8])=[#8]", + "1-(4,4-dimethyl-2,6-dioxocyclohex-1-ylidene)-3-methylbutyl": "[#6]-[#6](-[#6])-[#6]-[#6]=[#6]1-[#6](-[#6]-[#6](-[#6])(-[#6])-[#6]-[#6]-1=[#8])=[#8]", + "2,7-di-tert-butyl-fmoc": "[#6]-[#6]1:[#6]:[#6]2-[#6](-[#6]-[#8]-[#6]=[#8])-[#6]3:[#6](-[#6]:2:[#6]:[#6]:1):[#6]:[#6]:[#6](:[#6]:3)-[#6](-[#6])(-[#6])-[#6]", + "2-fluoro-fmoc": "[#9]-[#6]1:[#6]:[#6]2:[#6](-[#6]3:[#6]:[#6]:[#6]:[#6]:[#6]:3-[#6]-2-[#6]-[#8]-[#6]=[#8]):[#6]:[#6]:1", + "2-monoisooctyl-fmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]-1:[#6]:[#6](-[#6](-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6])=[#8]):[#6]:[#6]:2", + "2,7-diisooctyl-fmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](:[#6]:[#6]:[#6](-[#6](-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6])=[#8]):[#6]:2)-[#6]2:[#6]-1:[#6]:[#6](-[#6](-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6])=[#8]):[#6]:[#6]:2", + "tetrachlorophthaloyl": "[#8]=[#6]-[#6]1:[#6](-[#17]):[#6](-[#17]):[#6](-[#17]):[#6](-[#17]):[#6]:1-[#6]=[#8]", + "2-[phenyl(methyl)sulfonio])ethyloxycarbonyltetrafluoroborate": "[#6]-[#16+](-[#6]-[#6]-[#8]-[#6]=[#8])-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "ethanesulfonylethoxycarbonyl": "[#8]=[#6]-[#8]-[#6](-[#16](=[#8])(-[#6]-[#6])=[#8])-[#6]", + "2-(4-sulfophenylsulfonyl)ethoxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]-[#16](=[#8])(-[#6]1:[#6]:[#6]:[#6](-[#16](=[#8])(-[#8])=[#8]):[#6]:[#6]:1)=[#8]", + "fmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](-[#6]3:[#6]-1:[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:[#6]:[#6]:2", + "nsc": 
"[#8]=[#6]-[#8]-[#6]-[#6]-[#16](=[#8])(-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1)=[#8]", + "bsmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1=[#6]-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2-[#16]-1(=[#8])=[#8]", + "alpha-nsmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1=[#6]-[#6]2:[#6]:[#6]:[#6]3:[#6]:[#6]:[#6]:[#6]:[#6]:3:[#6]:2-[#16]-1(=[#8])=[#8]", + "ivdde": "[#6]-[#6](-[#6])-[#6]-[#6]=[#6]1-[#6](-[#6]-[#6](-[#6])(-[#6])-[#6]-[#6]-1=[#8])=[#8]", + "fmoc*": "[#6]-[#6]1:[#6]:[#6]2-[#6](-[#6]-[#8]-[#6]=[#8])-[#6]3:[#6](-[#6]:2:[#6]:[#6]:1):[#6]:[#6]:[#6](:[#6]:3)-[#6](-[#6])(-[#6])-[#6]", + "fmoc(fmoc(2f))": "[#9]-[#6]1:[#6]:[#6]2:[#6](-[#6]3:[#6]:[#6]:[#6]:[#6]:[#6]:3-[#6]-2-[#6]-[#8]-[#6]=[#8]):[#6]:[#6]:1", + "mio-fmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]-1:[#6]:[#6](-[#6](-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6])=[#8]):[#6]:[#6]:2", + "dio-fmoc": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](:[#6]:[#6]:[#6](-[#6](-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6])=[#8]):[#6]:2)-[#6]2:[#6]-1:[#6]:[#6](-[#6](-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6])=[#8]):[#6]:[#6]:2", + "tcp": "[#8]=[#6]-[#6]1:[#6](-[#17]):[#6](-[#17]):[#6](-[#17]):[#6](-[#17]):[#6]:1-[#6]=[#8]", + "pms": "[#6]-[#16+](-[#6]-[#6]-[#8]-[#6]=[#8])-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "esc": "[#8]=[#6]-[#8]-[#6](-[#16](=[#8])(-[#6]-[#6])=[#8])-[#6]", + "sps": "[#8]=[#6]-[#8]-[#6]-[#6]-[#16](=[#8])(-[#6]1:[#6]:[#6]:[#6](-[#16](=[#8])(-[#8])=[#8]):[#6]:[#6]:1)=[#8]", + "benzyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "allyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]=[#6]", + "o-nitrobenzenesulfonyl": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8])=[#8]", + "2,4-dinitrobenzenesulfonyl": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1-[#7+](-[#8-])=[#8])=[#8]", + "benzothiazole-2-sulfonyl": "[#8]=[#16](-[#6]1:[#7]:[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2:[#16]:1)=[#8]", + "2,2,2-trichloroethyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6](-[#17])(-[#17])-[#17]", + 
"dithiasuccinoyl": "[#8]=[#6]-[#16]-[#16]-[#6]=[#8]", + "p-nitrobenzyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1", + "alpha-azidoacids": "[#7-]=[#7+]=[#7]-[#6]-[#6](-[#8])=[#8]", + "proparglyoxycarbonyl": "[#6]#[#6]-[#8]-[#6](-[#6])=[#8]", + "o-nitrobenzylcarbonyl": "[#8]=[#6]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "4-nitroveratryloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6](-[#8]-[#6]):[#6](-[#8]-[#6]):[#6]:1", + "2-(2-nitrophenyl)propyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8])-[#6]", + "2-(3,4-methylenedioxy-6-nitrophenyl)propyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6](-[#6]1:[#6]:[#6]2-[#8]-[#6]-[#8]-[#6]:2:[#6]:[#6]:1-[#7+](-[#8-])=[#8])-[#6]", + "9-(4-bromophenyl)-9-fluorenyl": "[#35]-[#6]1:[#6]:[#6]:[#6](-[#6]2-[#6]3:[#6](-[#6]4:[#6]-2:[#6]:[#6]:[#6]:[#6]:4):[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:1", + "azidomethoxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#7]=[#7+]=[#7-]", + "hexafluoroacetone": "[#8]=[#6]1-[#8]-[#6](-[#6](-[#6](-[#9])(-[#9])-[#9])-[#6](-[#9])(-[#9])-[#9])-[#7]-[#6]-1", + "Z": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "alloc": "[#8]=[#6]-[#8]-[#6]-[#6]=[#6]", + "o-nbs": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8])=[#8]", + "d-nbs": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1-[#7+](-[#8-])=[#8])=[#8]", + "bts": "[#8]=[#16](-[#6]1:[#7]:[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2:[#16]:1)=[#8]", + "troc": "[#8]=[#6]-[#8]-[#6]-[#6](-[#17])(-[#17])-[#17]", + "dts": "[#8]=[#6]-[#16]-[#16]-[#6]=[#8]", + "pnz": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1", + "poc": "[#6]#[#6]-[#8]-[#6](-[#6])=[#8]", + "onz": "[#8]=[#6]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "nvoc": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6](-[#8]-[#6]):[#6](-[#8]-[#6]):[#6]:1", + "nppoc": 
"[#8]=[#6]-[#8]-[#6]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8])-[#6]", + "mnppoc": "[#8]=[#6]-[#8]-[#6]-[#6](-[#6]1:[#6]:[#6]2-[#8]-[#6]-[#8]-[#6]:2:[#6]:[#6]:1-[#7+](-[#8-])=[#8])-[#6]", + "brphf": "[#35]-[#6]1:[#6]:[#6]:[#6](-[#6]2-[#6]3:[#6](-[#6]4:[#6]-2:[#6]:[#6]:[#6]:[#6]:4):[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:1", + "azoc": "[#8]=[#6]-[#8]-[#6]-[#7]=[#7+]=[#7-]", + "hfa": "[#8]=[#6]1-[#8]-[#6](-[#6](-[#6](-[#9])(-[#9])-[#9])-[#6](-[#9])(-[#9])-[#9])-[#7]-[#6]-1", + "2-chlorobenzyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#17]", + "4-methyltrityl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#6](-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:1", + "cl-z": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#17]", + "mtt": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#6](-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:1", + "1-(4,4-dimethyl-2,6-dioxocylohex-1-ylidene)-3-methylbutyl": "[#8]=[#6]1-[#6](-[#6](-[#6]-[#6](-[#6])-[#6]-1)=[#8])=[#6]-[#6]-[#6](-[#6])-[#6]", + "trifluoroacetyl": "[#8]=[#6]-[#6](-[#9])(-[#9])-[#9]", + "2-(methylsulfonyl)ethoxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]-[#16](=[#8])(-[#6])=[#8]", + "tfa": "[#8]=[#6]-[#6](-[#9])(-[#9])-[#9]", + "msc": "[#8]=[#6]-[#8]-[#6]-[#6]-[#16](=[#8])(-[#6])=[#8]", + "phenyldisulphanylethyloxycarbonyl": "[#8]=[#6]-[#8]-[#6](-[#16]-[#16]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]", + "2-pyridyldisulphanylethyloxycarbonyl": "[#8]=[#6]-[#8]-[#6](-[#16]-[#16]-[#6]1:[#7]:[#6]:[#6]:[#6]:[#6]:1)-[#6]", + "phdec": "[#8]=[#6]-[#8]-[#6](-[#16]-[#16]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]", + "pydec": "[#8]=[#6]-[#8]-[#6](-[#16]-[#16]-[#6]1:[#7]:[#6]:[#6]:[#6]:[#6]:1)-[#6]", + "tert-butyl": "[#6]-[#6](-[#6])-[#6]", + "2-chlorotrityl": "[#17]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "2-4-dimethyoxybenzyl": "[#6]-[#6]1:[#6]:[#6](-[#8]-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:1", + 
"2-phenylisopropyl": "[#6]-[#6](-[#6])-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "5-phenyl-3,4-ethylenedioxythenyl": "[#6]-[#6]1:[#6]2-[#8]-[#6]-[#6]-[#8]-[#6]:2:[#6](:[#16]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "bu": "[#6]-[#6](-[#6])-[#6]", + "2-cl-trt": "[#17]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "dmb": "[#6]-[#6]1:[#6]:[#6](-[#8]-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:1", + "2-ph-pr": "[#6]-[#6](-[#6])-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "phenyl-edotn": "[#6]-[#6]1:[#6]2-[#8]-[#6]-[#6]-[#8]-[#6]:2:[#6](:[#16]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "9-fluorenylmethyl": "[#6]-[#6]1-[#6]2:[#6](-[#6]3:[#6]-1:[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:[#6]:[#6]:2", + "4-(N-[1-(4,4-dimethyl-2,6-dioxocylocheylidene)-3-methylbutyl]-amino)benzyl": "[#6]-[#6]1(-[#6]-[#6](-[#6](=[#6](-[#7]-[#6]2:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:2)-[#6]-[#6](-[#6])-[#6])-[#6](-[#6]-1)=[#8])=[#8])-[#6]", + "methyl": "[#6H3]", + "ethyl": "[#6H2]-[#6]", + "carbamoylmethyl": "[#6]-[#6](-[#7])=[#8]", + "fm": "[#6]-[#6]1-[#6]2:[#6](-[#6]3:[#6]-1:[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:[#6]:[#6]:2", + "dmab": "[#6]-[#6]1(-[#6]-[#6](-[#6](=[#6](-[#7]-[#6]2:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:2)-[#6]-[#6](-[#6])-[#6])-[#6](-[#6]-1)=[#8])=[#8])-[#6]", + "me": "[#6]", + "et": "[#6]-[#6]", + "cam": "[#6]-[#6](-[#7])=[#8]", + "allyl": "[#6H2]-[#6]=[#6]", + "benzyl": "[#6H2]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "phenacyl": "[#6H2]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "p-nitrobenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1", + "2-trimethylsilyethyl": "[#6]-[#6]-[#6]-[Si](-[#6])(-[#6])-[#6]", + "(2-phenyl-2-trimethylsilyl)ethyl": "[#6]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[Si](-[#6])(-[#6])-[#6]", + "2-(trimethylsilyl)isopropyl": "[#6]-[#6](-[#6])(-[Si](-[#6])(-[#6])-[#6])-[#6]", + "2,2,2-trichloroethyl": "[#6]-[#6](-[#17])(-[#17])-[#17]", + "p-hydroxyphenacyl": "[#6]-[#6](-[#6]1:[#6]:[#6]:[#6](-[#8]):[#6]:[#6]:1)=[#8]", + 
"4,5-dimethyoxy-2-nitrobenzyl": "[#6]-[#6]1:[#6]:[#6](-[#8]-[#6]):[#6](-[#8]-[#6]):[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "1,1-dimethylallyl": "[#6]=[#6]-[#6](-[#6])-[#6]", + "pentaaminecobalt_III": "[#7]-[Co](-[#7])(-[#7])(-[#7])(-[#17])(-[#17])-[#7]", + "al": "[#6]-[#6]=[#6]", + "bn": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "pac": "[#6]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "pnb": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1", + "tmse": "[#6]-[#6]-[#6]-[Si](-[#6])(-[#6])-[#6]", + "ptmse": "[#6]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[Si](-[#6])(-[#6])-[#6]", + "tmsi": "[#6]-[#6](-[#6])(-[Si](-[#6])(-[#6])-[#6])-[#6]", + "tce": "[#6]-[#6](-[#17])(-[#17])-[#17]", + "php": "[#6]-[#6](-[#6]1:[#6]:[#6]:[#6](-[#8]):[#6]:[#6]:1)=[#8]", + "dmnb": "[#6]-[#6]1:[#6]:[#6](-[#8]-[#6]):[#6](-[#8]-[#6]):[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "dma": "[#6]=[#6]-[#6](-[#6])-[#6]", + "cyclohexyl": "[#6]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "b-menthyl": "[#6]-[#6@H]1-[#6@H](-[#6](-[#6])-[#6])-[#6]-[#6]-[#6@@H](-[#6])-[#6]-1", + "b-3-methylpent-3-yl": "[#6]-[#6]-[#6](-[#6])-[#6]-[#6]", + "4-(3,6,9-trioxadecyl)oxybenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]-[#6]-[#8]-[#6]-[#6]-[#8]-[#6]-[#6]-[#8]-[#6]):[#6]:[#6]:1", + "chx": "[#6]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "men": "[#6]-[#6H]1-[#6H](-[#6](-[#6])-[#6])-[#6]-[#6]-[#6H](-[#6])-[#6]-1", + "mpe": "[#6]-[#6]-[#6](-[#6])-[#6]-[#6]", + "tegbz": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]-[#6]-[#8]-[#6]-[#6]-[#8]-[#6]-[#6]-[#8]-[#6]):[#6]:[#6]:1", + "9-fluoroenylmethyl": "[#6]-[#6]1-[#6]2:[#6](-[#6]3:[#6]-1:[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:[#6]:[#6]:2", + "4-(N-[1-(4,4-dimethyl-2,6-dioxocyclohexylidene)-3-methyl-butyl]-amino)benzyl": "[#6]-[#6]1(-[#6]-[#6](-[#6](=[#6](-[#7]-[#6]2:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:2)-[#6]-[#6](-[#6])-[#6])-[#6](-[#6]-1)=[#8])=[#8])-[#6]", + "trimethylsilylethyl": "[#6]-[#6]-[#6]-[Si](-[#6])(-[#6])-[#6]", + "4,5-dimethoxy-2-nitrobenzyloxycarbonyl": 
"[#6]-[#6]1:[#6]:[#6](-[#8]-[#6]):[#6](-[#8]-[#6]):[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "pseudoprolines": "[#6]-[#6]1(-[#6])-[#7]-[#6](-[#6](-[#8])=[#8])-[#6]-[#8]-1", + "2-hydroxy-4-methoxybenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]", + "2,4-dimethoxybenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]-[#6]", + "2,4,6-trimethoxybenzyl": "[#6]-[#6]1:[#6](-[#8]-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]-[#6]", + "1-methyl-3-indolylmethyl": "[#6]-[#6]-[#6]1:[#6]:[#7H]:[#6]2:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "3,4-ethylene-dioxy-2-thenyl": "[#6]-[#6]1:[#6]2-[#8]-[#6]-[#6]-[#8]-[#6]:2:[#6]:[#16]:1", + "hmb": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]", + "tmob": "[#6]-[#6]1:[#6](-[#8]-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]-[#6]", + "mim": "[#6]-[#6]-[#6]1:[#6]:[#7H]:[#6]2:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "edot": "[#6]-[#6]1:[#6]2-[#8]-[#6]-[#6]-[#8]-[#6]:2:[#6]:[#16]:1", + "4-methoxy-2-nitro-benzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "(6-hydroxy-3-oxido-1,3-benz[d]oxathiol-5-yl)methyl": "[#8]=[#16]1-[#6]-[#8]-[#6]2:[#6]-1:[#6]:[#6](-[#6]):[#6](-[#8]):[#6]:2", + "2-hydroxy-4-methoxy-5-(methylsulfinyl)benzyl": "[#6]-[#6]1:[#6]:[#6](-[#16](-[#6])=[#8]):[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]", + "n-boc-n-methyl[2-(methylamino)ethyl]carbamoyl-hmb": "[#6]-[#6](-[#6])(-[#8]-[#6](-[#7](-[#6]-[#6]-[#7](-[#6](-[#8]-[#6]1:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:[#6]:1)=[#8])-[#6])-[#6])=[#8])-[#6]", + "9-xanthenyl": "[#6]12-[#6]-[#6]3:[#6](:[#6]:[#6]:[#6]:[#6]:3)-[#8]-[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "cyclopropyldimethylcarbinyl": "[#6]-[#6](-[#6]1-[#6]-[#6]-1)-[#6]", + "4,4-dimethoxybenzhydryl": "[#6]-[#8]-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1", + "xan": "[#6]12-[#6]-[#6]3:[#6](:[#6]:[#6]:[#6]:[#6]:3)-[#8]-[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "cpd": "[#6]-[#6](-[#6]1-[#6]-[#6]-1)-[#6]", + "mbh": 
"[#6]-[#8]-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1", + "p-toluenesulfonyl": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1)=[#8]", + "2,2,5,7,8-pentamethylchroman-6-sulfonyl": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#6]2-[#6]-[#6]-[#6](-[#6])(-[#6])-[#8]-[#6]:2:[#6](-[#6]):[#6]:1-[#6])=[#8]", + "2,2,4,6,7-pentamethyl-2,3-dihydrobenzofuran-5-sulfonyl": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#6](-[#6]):[#6]2-[#8]-[#6](-[#6])(-[#6])-[#6]-[#6]:2:[#6]:1-[#6])=[#8]", + "mesityl-2-sulfonyl": "[#6]-[#6]1:[#6](-[#16](=[#8])(-[#7]-[#6](-[#7])=[#7])=[#8]):[#6](-[#6]):[#6]:[#6](-[#6]):[#6]:1", + "4-methoxy-2,3,6-trimethylphenylsulfonyl": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#6]:[#6](-[#8]-[#6]):[#6](-[#6]):[#6]:1-[#6])=[#8]", + "1,2-dimethylindole-3-sulfonyl": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#7](-[#6]):[#6]2:[#6]:1:[#6]:[#6]:[#6]:[#6]:2)=[#8]", + "w,w-bis-tert-butyloxycarbonyl": "[#6]-[#6](-[#6])(-[#8]-[#6](/[#7]=[#6](/[#7]-[#6](-[#8]-[#6](-[#6])(-[#6])-[#6])=[#8])-[#7])=[#8])-[#6]", + "5-dibenzosuberenyl": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]-[#6]-[#6]1-[#6]=[#6]-[#6]=[#6]-[#6]-1=[#6]-2", + "5-dibenzosuberyl": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]-[#6]-[#6]1:[#6](:[#6]:[#6]:[#6]:[#6]:1)-[#6]-2", + "2-methoxy-5-dibenzosuberyl": "[#6]-[#8]-[#6]1:[#6]:[#6]2-[#6]-[#6]-[#6]3:[#6](-[#6]-[#6]:2:[#6]:[#6]:1):[#6]:[#6]:[#6]:[#6]:3", + "nitro": "[#8]=[#7+]-[#8-]", + "tos": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1)=[#8]", + "pmc": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#6]2-[#6]-[#6]-[#6](-[#6])(-[#6])-[#8]-[#6]:2:[#6](-[#6]):[#6]:1-[#6])=[#8]", + "pbf": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#6](-[#6]):[#6]2-[#8]-[#6](-[#6])(-[#6])-[#6]-[#6]:2:[#6]:1-[#6])=[#8]", + "mts": "[#6]-[#6]1:[#6](-[#16](=[#8])(-[#7]-[#6](-[#7])=[#7])=[#8]):[#6](-[#6]):[#6]:[#6](-[#6]):[#6]:1", + "mtr": "[#8]=[#16](-[#6]1:[#6](-[#6]):[#6]:[#6](-[#8]-[#6]):[#6](-[#6]):[#6]:1-[#6])=[#8]", + "mis": 
"[#8]=[#16](-[#6]1:[#6](-[#6]):[#7](-[#6]):[#6]2:[#6]:1:[#6]:[#6]:[#6]:[#6]:2)=[#8]", + "bis-boc": "[#6]-[#6](-[#6])(-[#8]-[#6](/[#7]=[#6](/[#7]-[#6](-[#8]-[#6](-[#6])(-[#6])-[#6])=[#8])-[#7])=[#8])-[#6]", + "suben": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]-[#6]-[#6]1-[#6]=[#6]-[#6]=[#6]-[#6]-1=[#6]-2", + "sub": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]-[#6]-[#6]1:[#6](:[#6]:[#6]:[#6]:[#6]:1)-[#6]-2", + "mesub": "[#6]-[#8]-[#6]1:[#6]:[#6]2-[#6]-[#6]-[#6]3:[#6](-[#6]-[#6]:2:[#6]:[#6]:1):[#6]:[#6]:[#6]:[#6]:3", + "no2": "[#8]=[#7+]-[#8-]", + "w,w-bis-benzyloxycarbonyl": "[#8]=[#6](/[#7]=[#6](/[#7]-[#6](-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8])-[#7])-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "w,w-bis-allyloxycarbonyl": "[#8]=[#6](/[#7]=[#6](/[#7]-[#6](-[#8]-[#6]-[#6]=[#6])=[#8])-[#7])-[#8]-[#6]-[#6]=[#6]", + "z-small": "[#8]=[#6](/[#7]=[#6](/[#7]-[#6](-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8])-[#7])-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "p-methylbenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1", + "p-methoxybenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1", + "monomethoxytrityl": "[#6]-[#8]-[#6]1:[#6]:[#6]:[#6](-[#6](-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:1", + "trimethoxybenzyl": "[#6]-[#6]1:[#6](-[#8]-[#6]):[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1-[#8]-[#6]", + "2,2,4,6,7-pentamethyl-5-dihydrobenzofuranylmethyl": "[#6]-[#6]1:[#6](-[#6]):[#6](-[#6]):[#6]2-[#8]-[#6](-[#6])(-[#6])-[#6]-[#6]:2:[#6]:1-[#6]", + "1-adamantyl": "[#6]12-[#6]-[#6]3-[#6]-[#6](-[#6]-1)-[#6]-[#6](-[#6]-3)-[#6]-2", + "meb": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1", + "mob": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1", + "mmt": "[#6]-[#8]-[#6]1:[#6]:[#6]:[#6](-[#6](-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:1", + "pmbf": "[#6]-[#6]1:[#6](-[#6]):[#6](-[#6]):[#6]2-[#8]-[#6](-[#6])(-[#6])-[#6]-[#6]:2:[#6]:1-[#6]", + "1-ada": 
"[#6]12-[#6]-[#6]3-[#6]-[#6](-[#6]-1)-[#6]-[#6](-[#6]-3)-[#6]-2", + "2-(2,4-dinitrophenyl)ethyl": "[#6]-[#6]-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "9-fluororenylmethoxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1-[#6]2:[#6](-[#6]3:[#6]-1:[#6]:[#6]:[#6]:[#6]:3):[#6]:[#6]:[#6]:[#6]:2", + "dnpe": "[#6]-[#6]-[#6]1:[#6]:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "acetamidomethyl": "[#6]-[#7]-[#6](-[#6])=[#8]", + "phenylacetamidomethyl": "[#6]-[#7]-[#6](-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "5-tert-butylmercapto": "[#6]-[#6](-[#6])(-[#16])-[#6]", + "3-nitro-2-pyridinesulfenyl": "[#16]-[#6]1:[#7]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "2-pyridinesulfenyl": "[#16]-[#6]1:[#7]:[#6]:[#6]:[#6]:[#6]:1", + "N-allyloxycarbonyl-N-[2,3,5,6-tetrafluoro-4-(phenylthio)phenyl]]aminomethyl": "[#9]-[#6]1:[#6](-[#9]):[#6](-[#16]-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6](-[#9]):[#6](-[#9]):[#6]:1-[#7](-[#6](-[#8]-[#6]-[#6]=[#6])=[#8])-[#6]", + "o-nitrobenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "4-picolyl": "[#6]-[#6]1:[#6]:[#6]:[#7]:[#6]:[#6]:1", + "ninhydrin": "[#8]=[#6]1-[#6]2(-[#16]-[#6]-[#6](-[#6](-[#8])=[#8])-[#7]-2)-[#6](-[#6]2:[#6]-1:[#6]:[#6]:[#6]:[#6]:2)=[#8]", + "acm": "[#6]-[#7]-[#6](-[#6])=[#8]", + "phacm": "[#6]-[#7]-[#6](-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "sbu": "[#6]-[#6](-[#6])(-[#16])-[#6]", + "npys": "[#16]-[#6]1:[#7]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "s-pyr": "[#16]-[#6]1:[#7]:[#6]:[#6]:[#6]:[#6]:1", + "fsam": "[#9]-[#6]1:[#6](-[#9]):[#6](-[#16]-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6](-[#9]):[#6](-[#9]):[#6]:1-[#7](-[#6](-[#8]-[#6]-[#6]=[#6])=[#8])-[#6]", + "onb": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7+](-[#8-])=[#8]", + "nin": "[#8]=[#6]1-[#6]2(-[#16]-[#6]-[#6](-[#6](-[#8])=[#8])-[#7]-2)-[#6](-[#6]2:[#6]-1:[#6]:[#6]:[#6]:[#6]:2)=[#8]", + "n-tosyl": "[#8]=[#16](-[#7]1:[#6]:[#6]:[#7]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1)=[#8]", + 
"n-trityl": "[#7]1(-[#6](-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)(-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:[#7]:[#6]:1", + "n-monomethoxytrityl": "[#6]-[#8]-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "n-methyltrityl": "[#6]-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "n-tert-butyloxycarbonyl": "[#8]=[#6](-[#8]-[#6](-[#6])(-[#6])-[#6])-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "n-2,4-dimethylpent-3-yloxycarbonyl": "[#8]=[#6](-[#8]-[#6](-[#6](-[#6])-[#6])-[#6](-[#6])-[#6])-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "n-benzyloxymethyl": "[#7+]1(-[#6]-[#8]-[#6]-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#7H]:[#6]:[#6]:1", + "n-tert-butoxymethyl": "[#6]-[#6](-[#6])(-[#6])-[#8]-[#6]-[#7+]1:[#6]:[#7H]:[#6]:[#6]:1", + "ntos": "[#8]=[#16](-[#7]1:[#6]:[#6]:[#7]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1)=[#8]", + "ntrt": "[#7]1(-[#6](-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)(-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2)-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:[#7]:[#6]:1", + "nmtt": "[#6]-[#8]-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "nmmt": "[#6]-[#6]1:[#6]:[#6]:[#6](:[#6]:[#6]:1)-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "nboc": "[#8]=[#6](-[#8]-[#6](-[#6])(-[#6])-[#6])-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "ndoc": "[#8]=[#6](-[#8]-[#6](-[#6](-[#6])-[#6])-[#6](-[#6])-[#6])-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "nbom": "[#7+]1(-[#6]-[#8]-[#6]-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#7H]:[#6]:[#6]:1", + "nbum": "[#6]-[#6](-[#6])(-[#6])-[#8]-[#6]-[#7+]1:[#6]:[#7H]:[#6]:[#6]:1", + "N-9-fluorenylmethoxycarbonyl": "[#8]=[#6](-[#8]-[#6]1-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2-[#6]2:[#6]-1:[#6]:[#6]:[#6]:[#6]:2)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", 
+ "N-2,6-dimethoxybenzoyl": "[#8]=[#6](-[#6]1:[#6](-[#8]-[#6]):[#6]:[#6]:[#6]:[#6]:1-[#8]-[#6])-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "dmbz": "[#8]=[#6](-[#6]1:[#6](-[#8]-[#6]):[#6]:[#6]:[#6]:[#6]:1-[#8]-[#6])-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "N-2,4-dinitrophenyl": "[#8]=[#6](-[#6]1:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "dnp": "[#8]=[#6](-[#6]1:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6]:1)-[#7]1:[#6]:[#6]:[#7]:[#6]:1", + "cyclohexyl;": "[#6]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "tert-butyldimethylsilyl": "[#6]-[Si](-[#6](-[#6])(-[#6])-[#6])-[#6]", + "tbdms": "[#6]-[Si](-[#6](-[#6])(-[#6])-[#6])-[#6]", + "tert-butyldiphenylsilyl": "[#6]-[#6](-[Si](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6])-[#6]", + "propargyloxycarbonyl": "[#6]#[#6]-[#8]-[#6](-[#6])=[#8]", + "tbdps": "[#6]-[#6](-[Si](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)(-[#6])-[#6]", + "2,6-dichlorobenzyl": "[#6]-[#6]1:[#6](-[#17]):[#6]:[#6]:[#6]:[#6]:1-[#17]", + "2-bromobenzyl": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#35]", + "2-bromobenzyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#35]", + "3-pentyl": "[#6]-[#6]-[#6]-[#6]-[#6]", + "dcb": "[#6]-[#6]1:[#6](-[#17]):[#6]:[#6]:[#6]:[#6]:1-[#17]", + "brbn": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#35]", + "brz": "[#8]=[#6]-[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#35]", + "pen": "[#6]-[#6]-[#6]-[#6]-[#6]", + "tegb": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]-[#6]-[#8]-[#6]-[#6]-[#8]-[#6]-[#6]-[#8]-[#6]):[#6]:[#6]:1", + "boc-n-methyl-n-[2-(methylamino)ethyl]carbamoyl": "[#8]=[#6](-[#6](-[#8]-[#6](-[#6])(-[#6])-[#6])=[#8])-[#7](-[#6])-[#6]-[#6]-[#7]-[#6]", + "boc-nmec": "[#8]=[#6](-[#6](-[#8]-[#6](-[#6])(-[#6])-[#6])=[#8])-[#7](-[#6])-[#6]-[#6]-[#7]-[#6]", + "formyl": "[#6H]=[#8]", + "cyclohexyloxycarbonyl": "[#8]=[#6]-[#8]-[#6]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "for": "[#6]=[#8]", + "hoc": 
"[#8]=[#6]-[#8]-[#6]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "cyclopropane": "[#6]1-[#6]-[#6]-1", + "spiropentane": "[#6]1-[#6]-[#6]-12-[#6]-[#6]-2", + "cyclobutane": "[#6]1-[#6]-[#6]-[#6]-1", + "cyclopentane": "[#6]1-[#6]-[#6]-[#6]-[#6]-1", + "furan": "[#6]1:[#6]:[#6]:[#6]:[#8]:1", + "thiophene": "[#6]1:[#6]:[#6]:[#6]:[#16]:1", + "pyrrole": "[#7H]1:[#6]:[#6]:[#6]:[#6]:1", + "2H-pyrrole": "[#7]1=[#6]-[#6]=[#6]-[#6]-1", + "3H-pyrrole": "[#7]1=[#6]-[#6]-[#6]=[#6]-1", + "pyrazole": "[#7H]1:[#7]:[#6]:[#6]:[#6]:1", + "2H-imidazole": "[#6]1-[#7]=[#6]-[#6]=[#7]-1", + "1,2,3-triazole": "[#7H]1:[#7]:[#7]:[#6]:[#6]:1", + "1,2,4-triazole": "[#6]1:[#7]:[#6]:[#7]:[#7H]:1", + "1,2-dithiole": "[#16]1-[#16]-[#6]=[#6]-[#6]-1", + "1,3-dithiole": "[#16]1-[#6]-[#16]-[#6]=[#6]-1", + "3H-1,2-oxathiole": "[#8]1-[#16]-[#6]-[#6]=[#6]-1", + "isoxazole": "[#8]1:[#7]:[#6]:[#6]:[#6]:1", + "oxazole": "[#8]1:[#6]:[#7]:[#6]:[#6]:1", + "thiazole": "[#16]1:[#6]:[#7]:[#6]:[#6]:1", + "isothiazole": "[#16]1:[#7]:[#6]:[#6]:[#6]:1", + "1,2,3-oxadiazole": "[#8]1:[#7]:[#7]:[#6]:[#6]:1", + "1,2,4-oxadiazole": "[#8]1:[#7]:[#6]:[#7]:[#6]:1", + "1,2,5-oxadiazole": "[#8]1:[#7]:[#6]:[#6]:[#7]:1", + "1,3,4-oxadiazole": "[#8]1:[#6]:[#7]:[#7]:[#6]:1", + "1,2,3,4-oxatriazole": "[#8]1:[#7]:[#7]:[#7]:[#6]:1", + "1,2,3,5-oxatriazole": "[#8]1:[#7]:[#7]:[#6]:[#7]:1", + "3H-1,2,3-dioxazole": "[#8]1-[#8]-[#7]-[#6]=[#6]-1", + "1,2,4-dioxazole": "[#8]1-[#8]-[#6]=[#7]-[#6]-1", + "1,3,2-dioxazole": "[#8]1-[#7]-[#8]-[#6]=[#6]-1", + "1,3,4-dioxazole": "[#8]1-[#6]-[#8]-[#7]=[#6]-1", + "5H-1,2,5-oxathiazole": "[#8]1-[#16]-[#6]=[#6]-[#7]-1", + "1,3-oxathiole": "[#8]1-[#6]-[#16]-[#6]=[#6]-1", + "benzene": "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "cyclohexane": "[#6]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "2H-pyran": "[#6]1-[#6]=[#6]-[#6]=[#6]-[#8]-1", + "4H-pyran": "[#6]1=[#6]-[#6]-[#6]=[#6]-[#8]-1", + "2H-pyran-2-one": "[#8]=[#6]1:[#6]:[#6]:[#6]:[#6]:[#8]:1", + "4H-pyran-4-one": "[#8]=[#6]1:[#6]:[#6]:[#8]:[#6]:[#6]:1", + "1,2-dioxin": 
"[#8]1-[#8]-[#6]=[#6]-[#6]=[#6]-1", + "1,3-dioxin": "[#8]1-[#6]-[#8]-[#6]=[#6]-[#6]-1", + "pyridine": "[#6]1:[#6]:[#6]:[#7]:[#6]:[#6]:1", + "pyridazine": "[#6]1:[#7]:[#7]:[#6]:[#6]:[#6]:1", + "pyrimidine": "[#6]1:[#7]:[#6]:[#6]:[#6]:[#7]:1", + "pyrazine": "[#6]1:[#7]:[#6]:[#6]:[#7]:[#6]:1", + "piperazine": "[#7]1-[#6]-[#6]-[#7]-[#6]-[#6]-1", + "1,3,5-triazine": "[#7]1:[#6]:[#7]:[#6]:[#7]:[#6]:1", + "1,2,4-triazine": "[#7]1:[#7]:[#6]:[#7]:[#6]:[#6]:1", + "1,2,3-triazine": "[#7]1:[#7]:[#7]:[#6]:[#6]:[#6]:1", + "4H-1,2-Oxazine": "[#8]1-[#7]=[#6]-[#6]-[#6]=[#6]-1", + "2H-1,3-Oxazine": "[#8]1-[#6]-[#7]=[#6]-[#6]=[#6]-1", + "6H-1,3-Oxazine": "[#8]1-[#6]=[#7]-[#6]=[#6]-[#6]-1", + "6H-1,2-Oxazine": "[#8]1-[#7]=[#6]-[#6]=[#6]-[#6]-1", + "1,4-Oxazine": "[#8]1-[#6]=[#6]-[#7]=[#6]-[#6]-1", + "2H-1,2-Oxazine": "[#8]1-[#7]-[#6]=[#6]-[#6]=[#6]-1", + "4H-1,4-Oxazine": "[#8]1-[#6]=[#6]-[#7]-[#6]=[#6]-1", + "1,2,5-Oxathiazine": "[#8]1-[#16]-[#6]=[#6]-[#7]=[#6]-1", + "1,2,6-Oxathiazine": "[#8]1-[#16]-[#6]=[#6]-[#6]=[#7]-1", + "1,2,4-Oxadiazine": "[#8]1-[#7]-[#6]=[#7]-[#6]=[#6]-1", + "1,3,5-Oxadiazine": "[#8]1-[#6]=[#7]-[#6]=[#7]-[#6]-1", + "morpholine": "[#7]1-[#6]-[#6]-[#8]-[#6]-[#6]-1", + "azepine": "[#7]1-[#6]=[#6]-[#6]=[#6]-[#6]=[#6]-1", + "oxepin": "[#8]1-[#6]=[#6]-[#6]=[#6]-[#6]=[#6]-1", + "thiepin": "[#16]1-[#6]=[#6]-[#6]=[#6]-[#6]=[#6]-1", + "4H-1,2-diazepine": "[#7]1=[#6]-[#6]=[#6]-[#6]-[#6]=[#7]-1", + "indene": "[#6]12:[#6](-[#6]-[#6]=[#6]-1):[#6]:[#6]:[#6]:[#6]:2", + "2H-indene": "[#6]12=[#6]-[#6]-[#6]=[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "benzofuran": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#8]:2", + "isobenzofuran": "[#6]12:[#6]:[#8]:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "benzo[b]thiophene": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#16]:2", + "benzo[c]thiophene": "[#6]12:[#6]:[#16]:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "indole": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#7H]:2", + "3H-indole": "[#6]12:[#6](-[#7]=[#6]-[#6]-1):[#6]:[#6]:[#6]:[#6]:2", + 
"1H-indole": "[#6]12:[#6](:[#7H]:[#6]:[#6]:1):[#6]:[#6]:[#6]:[#6]:2", + "cyclopenta[b]pyridine": "[#6]12:[#6]:[#6]:[#6]:[#6]-1:[#6]:[#6]:[#6]:[#7H]:2", + "pyrano[3,4-b]-pyrrole": "[#6]12:[#6]:[#8]:[#6]:[#6]:[#6]-1:[#6]:[#6]:[#7]:2", + "indazole": "[#6]12:[#6](:[#7H]:[#7]:[#6]:1):[#6]:[#6]:[#6]:[#6]:2", + "benzisoxazole": "[#6]12:[#7]:[#8]:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "benzoxazole": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#8]:[#6]:[#7]:2", + "2,1-benzisoxazole": "[#6]12:[#6]:[#8]:[#7]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "naphthalene": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "1,2,3,4-tetrahydronaphthalene": "[#6]12:[#6](-[#6]-[#6]-[#6]-[#6]-1):[#6]:[#6]:[#6]:[#6]:2", + "octahydronaphthalene": "[#6]12-[#6]-[#6]-[#6]-[#6]-[#6]-1=[#6]-[#6]-[#6]-[#6]-2", + "2H-1-benzopyran": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#8]-[#6]-[#6]=[#6]-2", + "2H-1-benzopyran-2-one": "[#8]=[#6]1:[#6]:[#6]:[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2:[#8]:1", + "4H-1-benzopyran-4-one": "[#8]=[#6]1:[#6]:[#6]:[#8]:[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:1:2", + "1H-2-benzopyran-1-one": "[#8]=[#6]1:[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2:[#6]:[#6]:[#8]:1", + "3H-2-benzopyran-1-one": "[#8]=[#6]1-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2-[#6]-[#6]-[#8]-1", + "quinoline": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#7]:[#6]:[#6]:[#6]:2", + "isoquinoline": "[#6]12:[#6](:[#6]:[#7]:[#6]:[#6]:1):[#6]:[#6]:[#6]:[#6]:2", + "cinnoline": "[#6]12:[#6]:[#6]:[#7]:[#7]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "quinazoline": "[#6]12:[#6]:[#7]:[#6]:[#7]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "1,8-napthyhridine": "[#6]1:[#6]:[#6]2:[#6](:[#7]:[#6]:1):[#7]:[#6]:[#6]:[#6]:2", + "1,7-napththyridine": "[#6]1:[#6]:[#6]2:[#6](:[#6]:[#7]:[#6]:[#6]:2):[#7]:[#6]:1", + "1,5-napththridine": "[#6]1:[#6]:[#6]2:[#6](:[#6]:[#6]:[#6]:[#7]:2):[#7]:[#6]:1", + "1,6-napthyridine": "[#6]1:[#6]:[#6]2:[#6](:[#6]:[#6]:[#7]:[#6]:2):[#7]:[#6]:1", + "2H-1,3-benzoxazine": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#8]-[#6]-[#7]=[#6]-2", + "2H-1,4-benzoxazine": 
"[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#8]-[#6]-[#6]=[#7]-2", + "1H-2,3-benzoxazine": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]-[#8]-[#7]=[#6]-2", + "4H-3,1-benzoxazine": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#7]=[#6]-[#8]-[#6]-2", + "2H-1,2-benzoxazine": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#8]-[#7]-[#6]=[#6]-2", + "4H-1,3-benzoxazine": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#8]-[#6]=[#7]-[#6]-2", + "anthracene": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:2", + "phenanthrene": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:2:1", + "phenalene": "[#6]12:[#6]3:[#6](-[#6]-[#6]=[#6]-1):[#6]:[#6]:[#6]:[#6]:3:[#6]:[#6]:[#6]:2", + "fluorene": "[#6]12-[#6]-[#6]3:[#6](:[#6]:[#6]:[#6]:[#6]:3)-[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "carbazole": "[#6]12:[#7H]:[#6]3:[#6](:[#6]:[#6]:[#6]:[#6]:3):[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "xanthene": "[#6]12-[#6]-[#6]3:[#6](:[#6]:[#6]:[#6]:[#6]:3)-[#8]-[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "acridine": "[#6]12:[#7]:[#6]3:[#6]:[#6]:[#6]:[#6]:[#6]:3:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "norpinane": "[#6]12-[#6]-[#6](-[#6]-[#6]-[#6]-1)-[#6]-2", + "7H-purine": "[#6]12:[#7]:[#6]:[#7]:[#6]:[#6]:1:[#7H]:[#6]:[#7]:2", + "steroid_ring_system": "[#6]12-[#6]-[#6]-[#6]-[#6]-[#6]-1-[#6]1-[#6](-[#6]3-[#6]-[#6]-[#6]-[#6]-3-[#6]-[#6]-1)-[#6]-[#6]-2", + "imidazole": "[#6]1:[#6]:[#7]:[#6]:[#7H]:1", + "thiazol-2-amine": "[#7]-[#6]1:[#7]:[#6]:[#6]:[#16]:1", + "tetrazole": "[#6]1:[#7]:[#7]:[#7]:[#7H]:1", + "cytosine": "[#8]=[#6]1:[#7]:[#6](-[#7]):[#6]:[#6]:[#7H]:1", + "adenine": "[#7]-[#6]1:[#7]:[#6]:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7H]:2", + "5-methylindole": "[#6]-[#6]1:[#6]:[#6]:[#6]2:[#6](:[#6]:[#6]:[#7H]:2):[#6]:1", + "isocaffeine": "[#8]=[#6]1:[#7](-[#6]):[#6](:[#6]2:[#6](:[#7H]:1):[#7H]:[#6]:[#7]:2)=[#8]", + "tetrazolethiol": "[#16]-[#7]1:[#7]:[#7]:[#7]:[#6]:1", + "3-methylisoxazole": "[#6]1:[#6]:[#6]:[#7]:[#8]:1", + "1-methylimidazole": "[#6]-[#7]1:[#6]:[#7]:[#6]:[#6]:1", + "2-methylimidazole": 
"[#6]-[#6]1:[#7]:[#6]:[#6]:[#7H]:1", + "guanine": "[#7]-[#6]1:[#7H]:[#6](:[#6]2:[#6](:[#7]:1):[#7H]:[#6]:[#7]:2)=[#8]", + "tosufloxacin": "[#7]-[#6]1:[#6](-[#9]):[#6]:[#6]2:[#6](:[#7H]:[#6]:[#6](-[#6](-[#8])=[#8]):[#6]:2=[#8]):[#7]:1", + "acetamido": "[#8]=[#6](-[#7])-[#6]", + "acetoacetyl": "[#8]=[#6](-[#6])-[#6]-[#6](=[#8])-[#8]", + "acetyl": "[#6](-[#6])=[#8]", + "acryloyl": "[#6]=[#6]-[#6](-[#6])=[#8]", + "alanyl": "[#7]-[#6H](-[#6])-[#6](-[#6])=[#8]", + "beta-alanyl": "[#7]-[#6]-[#6]-[#6](-[#6])=[#8]", + "allylidene": "[#6H]-[#6]=[#6]", + "amidino": "[#7]-[#6]=[#7]", + "amino": "[#7]", + "amyl": "[#6H2]-[#6]-[#6]-[#6]-[#6]", + "anilino": "[#7]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "anisidino": "[#7]-[#6]1:[#6]:[#6]:[#6](-[#8]-[#6]):[#6]:[#6]:1", + "anthranoyl": "[#7]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6](-[#6])=[#8]", + "arsino": "[AsH3]", + "azelaoyl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "azido": "[#7]=[#7+]=[#7-]", + "azo": "[#6]/[#7]=[#7]/[#6]", + "azoxy": "[#6]/[#7]=[#7+](\\[#8-])-[#6]", + "benzal": "[#6H]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "benzamido": "[#8]=[#6](-[#7])-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "benzhydrol": "[#8]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "benzoxy": "[#8]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "benzoyl": "[#8]=[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "benzylidene": "[#6H]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "benzylidyne": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "biphenylyl": "[#6]1(-[#6]2:[#6]:[#6]:[#6]:[#6]:[#6]:2):[#6]:[#6]:[#6]:[#6]:[#6]:1", + "biphenylene": "[#6]12=[#6]3:[#6]:[#6]:[#6]:[#6]:[#6]:3=[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "butoxy": "[#8]-[#6]-[#6]-[#6]-[#6]", + "sec-butoxy": "[#8]-[#6](-[#6])-[#6]-[#6]", + "tert-butoxy": "[#8]-[#6](-[#6])(-[#6])-[#6]", + "butyl": "[#6H2]-[#6]-[#6]-[#6]", + "sec-butyl": "[#6]-[#6]-[#6H]-[#6]", + "butyryl": "[#8]=[#6]-[#6]-[#6]-[#6]", + "caproyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "capryl": 
"[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]", + "capryloyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "carbamido": "[#6](=[#8])(-[#7])-[#7]", + "carbamoyl": "[#7]-[#6]=[#8]", + "carbamyl": "[#7]-[#6]=[#8]", + "carbazoyl": "[#7]-[#7]-[#6]=[#8]", + "carbethoxy": "[#8]=[#6]-[#8]-[#6]-[#6]", + "carbonyl": "[CX3]=[OX1]", + "carboxy": "[#8]=[#6]-[#8]", + "cetyl": "[#6H2]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]", + "chloroformyl": "[#8]=[#6]-[#17]", + "cinnamoyl": "[#8]=[#6]-[#6]=[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "cinnamyl": "[#6H2]-[#6]=[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "cinnamylidene": "[#6H]-[#6]=[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "cresyl": "[#8]-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1", + "crotonoyl": "[#6]/[#6]=[#6]/[#6]=[#8]", + "crotyl": "[#6H2]/[#6]=[#6]/[#6]", + "cyanamido": "[#7H]-[#6]#[#7]", + "cyanato": "[#8]-[#6]#[#7]", + "cyano": "[#6]#[#7]", + "decanedioyl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "decanoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "diazo": "[#7+]=[#7-]", + "diazoamino": "[#7]=[#7]-[#7]", + "disilanyl": "[SiH2]-[SiH3]", + "disiloxanyloxy": "[#8]-[SiH2]-[#8]-[SiH3]", + "disulfinyl": "[#8]=[#16]-[#16]=[#8]", + "dithio": "[#16]-[#16]", + "enanthoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "epoxy": "[#8]", + "ethenyl": "[#6H]=[#6]", + "ethynyl": "[#6]#[#6]", + "ethoxy": "[#8]-[#6]-[#6]", + "ethylene": "[#6]=[#6]", + "ethylidene": "[#6H]-[#6]", + "ethylthio": "[#16]-[#6]-[#6]", + "formamido": "[#8]=[#6]-[#7H]", + "furmaroyl": "[#8]=[#6]-[#8]", + "furfuryl": "[#6H2]-[#6]1:[#6]:[#6]:[#6]:[#8]:1", + "furfurylidene": "[#6H]-[#6]1:[#6]:[#6]:[#6]:[#8]:1", + "glutamoyl": "[#7]-[#6@@H](-[#6]-[#6]-[#6]=[#8])-[#6]=[#8]", + "glutaryl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "glycylamino": "[#7H]-[#6](-[#6]-[#7])=[#8]", + "glycoloyl": "[#8]-[#6]-[#6]=[#8]", + "glycyl": "[#7]-[#6]-[#6]=[#8]", + "glyoxyoyl": "[#8]=[#6]-[#6]=[#8]", + "guanidino": 
"[#7H]-[#6](-[#7])=[#7]", + "guanyl": "[#7]=[#6]-[#7]", + "heptadecanoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "heptanamido": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6](-[#7H])=[#8]", + "heptanoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8].[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6](-[#7H])=[#8]", + "hexadecanoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8].[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8].[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6](-[#7H])=[#8]", + "hexamethylene": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]", + "hexanedioyl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "hippuryl": "[#6H2]-[#6]-[#7]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "hydrazino": "[#7]-[#7H]", + "hydrazo": "[#7]-[#7]", + "hydrocinnamoyl": "[#8]=[#6]-[#6]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "hydroperoxy": "[#8]-[#8]", + "hydroxyamino": "[#7H]-[#8]", + "imino": "[#7H]", + "iodoso": "[#53]=[#8]", + "iodyl": "[#8]=[#53]=[#8]", + "isoamyl": "[#6H2]-[#6]-[#6](-[#6])-[#6]", + "isobutenyl": "[#6H]=[#6](-[#6])-[#6]", + "isobutoxy": "[#8]-[#6]-[#6](-[#6])-[#6]", + "isobutyl": "[#6H2]-[#6](-[#6])-[#6]", + "isobutylidene": "[#6H]-[#6](-[#6])-[#6]", + "isobutyryl": "[#8]=[#6]-[#6](-[#6])-[#6]", + "isocyanato": "[#7]=[#6]=[#8]", + "isocyano": "[#7+]#[#6-]", + "isohexyl": "[#6H2]-[#6]-[#6]-[#6](-[#6])-[#6]", + "isoleucyl": "[#7]-[#6@@H](-[#6@@H](-[#6])-[#6]-[#6])-[#6]=[#8]", + "isonitroso": "[#7]-[#8]", + "isopentyl": "[#6H2]-[#6]-[#6](-[#6])-[#6]", + "isopentylidene": "[#6H]-[#6]-[#6](-[#6])-[#6]", + "isopropenyl": "[#6]=[#6]-[#6]", + "isopropoxy": "[#8]-[#6](-[#6])-[#6]", + "isopropyl": "[#6]-[#6H]-[#6]", + "isopropylidene": "[#6]-[#6]-[#6]", + "isothiocynato": "[#7]=[#6]=[#16]", + "isovaleryl": "[#8]=[#6]-[#6]-[#6](-[#6])-[#6]", + "lactoyl": "[#8]-[#6](-[#6])-[#6]=[#8]", + "lauroyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "lauryl": 
"[#6H2]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]", + "leucyl": "[#7]-[#6@@H](-[#6]-[#6](-[#6])-[#6])-[#6]=[#8]", + "levulinoyl": "[#8]=[#6](-[#6])-[#6]-[#6]-[#6]=[#8]", + "malonyl": "[#8]=[#6]-[#6]-[#6]=[#8]", + "mandeloyl": "[#8]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]=[#8]", + "mercapto": "[#16H]", + "mesityl": "[#6]-[#6]1:[#6]:[#6](-[#6]):[#6]:[#6](-[#6]):[#6]:1", + "methacryloyl": "[#6]-[#6](-[#6]=[#8])=[#6]", + "methallyl": "[#6H2]-[#6](-[#6])=[#6]", + "methionyl": "[#7]-[#6@@H](-[#6]-[#6]-[#16]-[#6])-[#6]=[#8]", + "methoxy": "[#8]-[#6]", + "methylene": "[#6H2]", + "methylthio": "[#16]-[#6]", + "myristoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "myristyl": "[#6H2]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]", + "naphthyl": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "naphthylene": "[#6]12:[#6]:[#6]:[#6]:[#6]:[#6]:1:[#6]:[#6]:[#6]:[#6]:2", + "neopentyl": "[#6H2]-[#6](-[#6])(-[#6])-[#6]", + "nitramino": "[#7H]-[#7+](-[#8-])=[#8]", + "nitrosamino": "[#7H]-[#7]=[#8]", + "nitroso": "[#7]=[#8]", + "nonanoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "oleoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]/[#6]=[#6]\\[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "oxalyl": "[#8]=[#6]-[#6]=[#8]", + "oxo": "[#8]", + "palmitoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "pentamethylene": "[#8]=[#6]1-[#6](-[#6]=[#6])-[#6@H]2-[#16]-[#6]-[#6]-[#7]-1-2", + "pentyl": "[#6H2]-[#6]-[#6]-[#6]-[#6]", + "tert-pentyl": "[#6]-[#6]-[#6](-[#6])-[#6]", + "phenacylidene": "[#6H]-[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "phenethyl": "[#6H2]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "phenoxy": "[#8]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "phenyl": "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "phenylene": "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "phosphino": "[#15H2]", + "phosphinyl": "[#15H2]=[#8]", + "phospho": 
"[#8]=[#15](-[#8])-[#8]", + "phosphono": "[#8]=[#15](-[#8])-[#8]", + "phthaloyl": "[#8]=[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]=[#8]", + "picryl": "[#8-]-[#7+](-[#6]1:[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:[#6](-[#7+](-[#8-])=[#8]):[#6]:1)=[#8]", + "pimeloyl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "piperidino": "[#7]1-[#6]-[#6]-[#6]-[#6]-[#6]-1", + "pivaloyl": "[#6]-[#6](-[#6])(-[#6])-[#6]=[#8]", + "prenyl": "[#6H2]-[#6]=[#6](-[#6])-[#6]", + "propargyl": "[#6H2]-[#6]#[#6]", + "1-propenyl": "[#6H]=[#6]-[#6]", + "2-propenyl": "[#6H2]-[#6]=[#6]", + "propionyl": "[#8]=[#6]-[#6]-[#6]", + "propoxy": "[#8]-[#6]-[#6]-[#6]", + "propyl": "[#6H2]-[#6]-[#6]", + "propylidene": "[#6H]-[#6]-[#6]", + "pyrryl": "[#7H]1:[#6]:[#6]:[#6]:[#6]:1", + "salicyloyl": "[#8]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#6]=[#8]", + "selenyl": "[SeH]", + "seryl": "[#7]-[#6@@H](-[#6]-[#8])-[#6]=[#8]", + "siloxy": "[#8]-[SiH3]", + "silyl": "[SiH3]", + "silyene": "[SiH2]", + "sorboyl": "[#6]-[#6]=[#6]-[#6]=[#6]-[#6](-[#8])=[#8]", + "stearoyl": "[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "stearyl": "[#6H2]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]", + "styryl": "[#6H]=[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "suberoyl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]=[#8]", + "succinyl": "[#8]=[#6]-[#6]-[#6]-[#6]=[#8]", + "sulfamino": "[#7H]-[#16](=[#8])(-[#8])=[#8]", + "sulfamoyl": "[#8]=[#16](-[#7])=[#8]", + "sulfanilyl": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6](-[#7]):[#6]:[#6]:1)=[#8]", + "sulfeno": "[#16]-[#8]", + "sulfhydryl": "[#16H]", + "sulfinyl": "[#16]=[#8]", + "sulfo": "[#8]=[#16](-[#8])=[#8]", + "sulfonyl": "[#8]=[#16]=[#8]", + "terephthaloyl": "[#8]=[#6]-[#6]1:[#6]:[#6]:[#6](-[#6]=[#8]):[#6]:[#6]:1", + "tetramethylene": "[#6]-[#6]-[#6]-[#6]", + "thienyl": "[#6]1:[#6]:[#6]:[#6]:[#16]:1", + "thiocarbonyl": "[#6H]=[#16]", + "thiocarboxy": "[#16]=[#6]-[#8]", + "thiocyanato": "[#16]-[#6]#[#7]", + 
"thionyl": "[#16]=[#8]", + "threonyl": "[#7]-[#6@@H](-[#6@H](-[#8])-[#6])-[#6]=[#8]", + "toluidino": "[#7H]-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1", + "toluoyl": "[#6]-[#6]1:[#6]:[#6]:[#6](-[#6]=[#8]):[#6]:[#6]:1", + "tolyl": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "alpha-tolyl": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "tolylene": "[#6H]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1", + "tosyl": "[#8]=[#16](-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1)=[#8]", + "triazano": "[#7H]-[#7]-[#7H]", + "trimethylene": "[#6]-[#6]-[#6]", + "valeryl": "[#8]=[#6]-[#6]-[#6]-[#6]-[#6]", + "valyl": "[#7]-[#6@@H](-[#6](-[#6])-[#6])-[#6]=[#8]", + "vinyl": "[#6H]=[#6]", + "vinylidene": "[#6]=[#6]", + "xylidino": "[#7H]-[#6]1:[#6]:[#6]:[#6](-[#6]):[#6]:[#6]:1-[#6]", + "xylyl": "[#6]-[#6]1:[#6]:[#6]:[#6]:[#6](-[#6]):[#6]:1", + "xylylene": "[#7]-[#6]-[#6]1:[#6]:[#6]:[#6]:[#6](-[#6]-[#7]):[#6]:1", + "propiolamide": "[#6]#[#6]-[#6](-[#7])=[#8]", + "fumarate ester": "[#7]-[#6](/[#6]=[#6]/[#6]-[#6](-[#8]-[#6])=[#8])=[#8]", + "allenamide": "[#7]-[#6](-[#6]=[#6]=[#6])=[#8]", + "propiolonitrile": "[#6]#[#6]-[#6]#[#7]", + "propargylamide": "[#6]#[#6]-[#6]-[#6](-[#7])=[#8]", + "arylsulfonyl bicyclobutane": "[#8]=[#16](-[#6]12-[#6]-[#6]-1-[#6]-2)(-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)=[#8]", + "haloalkane": "[#6]-[#35]", + "alpha-halomethyl": "[#6]-[#6](-[#6]-[#17])=[#8]", + "alpha-haloamide": "[#7]-[#6](-[#6]-[#17])=[#8]", + "alpha-haloester": "[#8]=[#6](-[#6]-[#17])-[#8]-[#6]", + "epoxide": "[#6]1-[#6]-[#8]-1", + "aziridine": "[#7]1-[#6]-[#6]-1", + "nitroalkane": "[#6]-[#6]-[#7+](-[#8-])=[#8]", + "acrylamide": "[#6]=[#6]-[#6](-[#7])=[#8]", + "cyanoenone": "[#8]=[#6](-[#6])-[#6](-[#6]#[#7])=[#6]", + "aldehyde": "[#8]=[#6H]-[#6]", + "ketone": "[#6][CX3](=O)[#6]", + "nitrile": "[#7]#[#6]-[#6]", + "cyanamide": "[#7]-[#6]#[#7]", + "isothicyanate": "[#7-]=[#6]=[#16]", + "sulfone": "[#6]-[#16]=[#8]", + "sulfonyl fluoride": "[#8]=[#16](-[#9])=[#8]", + "sulfonimidoyl fluoride": "[#7]=[#16](-[#9])(-[#9])=[#8]", + 
"aryl fluorosulfate": "[#8]=[#16](-[#8]-[#6]-[#6]-[#6]-[#6]-[#6])(-[#9])=[#8]", + "ester": "[#6]-[#6](-[#8]-[#6])=[#8]", + "sulfonamide": "[#8]=[#16](-[#7])=[#8]", + "2-carbonyl arylboronic acid": "[#8]=[#6](-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[#5](-[#8])-[#8])-[#6]", + "n-methyl isoxazolium": "[#6]-[#7+]1:[#6]:[#6]:[#6]:[#8]:1", + "oxaziridine": "[#8]1-[#7]-[#6]-1", + "carboxyl": "[CX3](=O)[OX2H1]", + "ether": "[OD2]([#6])[#6]", + "alkanol": "[#6][OX2H]", + "thiol": "[#16X2H]", + "halogen": "[F,Cl,Br,I]", + "amine": "[NX3;H2,H1;!$(NC=O)]", + "amide": "[NH2]", +} + + +def process(chemcaption_data): + df = pd.read_json(chemcaption_data, lines=True) + + trustworthy_smarts = [ + "carboxyl", + "carbonyl", + "ether", + "alkanol", + "thiol", + "halogen", + "amine", + "amide", + "ketone", + "cyclopropane", + "spiropentane", + "cyclobutane", + "cyclopentane", + "furan", + "thiophene", + "pyrrole", + "2H-pyrrole", + "3H-pyrrole", + "pyrazole", + "2H-imidazole", + "1,2,3-triazole", + "1,2,4-triazole", + "1,2-dithiole", + "1,3-dithiole", + "3H-1,2-oxathiole", + "isoxazole", + "oxazole", + "thiazole", + "isothiazole", + "1,2,3-oxadiazole", + "1,2,4-oxadiazole", + "1,2,5-oxadiazole", + "1,3,4-oxadiazole", + "1,2,3,4-oxatriazole", + "1,2,3,5-oxatriazole", + "3H-1,2,3-dioxazole", + "1,2,4-dioxazole", + "1,3,2-dioxazole", + "1,3,4-dioxazole", + "5H-1,2,5-oxathiazole", + "1,3-oxathiole", + "benzene", + "cyclohexane", + "2H-pyran", + "4H-pyran", + "2H-pyran-2-one", + "4H-pyran-4-one", + "1,2-dioxin", + "1,3-dioxin", + "pyridine", + "pyridazine", + "pyrimidine", + "pyrazine", + "piperazine", + "1,3,5-triazine", + "1,2,4-triazine", + "1,2,3-triazine", + "4H-1,2-Oxazine", + "2H-1,3-Oxazine", + "6H-1,3-Oxazine", + "6H-1,2-Oxazine", + "1,4-Oxazine", + "2H-1,2-Oxazine", + "4H-1,4-Oxazine", + "1,2,5-Oxathiazine", + "1,2,6-Oxathiazine", + "1,2,4-Oxadiazine", + "1,3,5-Oxadiazine", + "morpholine", + "azepine", + "oxepin", + "thiepin", + "4H-1,2-diazepine", + "indene", + "2H-indene", + 
"benzofuran", + "isobenzofuran", + "benzo[b]thiophene", + "benzo[c]thiophene", + "indole", + "3H-indole", + "1H-indole", + "cyclopenta[b]pyridine", + "pyrano[3,4-b]-pyrrole", + "indazole", + "benzisoxazole", + "benzoxazole", + "2,1-benzisoxazole", + "naphthalene", + "1,2,3,4-tetrahydronaphthalene", + "octahydronaphthalene", + "2H-1-benzopyran", + "2H-1-benzopyran-2-one", + "4H-1-benzopyran-4-one", + "1H-2-benzopyran-1-one", + "3H-2-benzopyran-1-one", + "quinoline", + "isoquinoline", + "cinnoline", + "quinazoline", + "1,8-napthyhridine", + "1,7-napththyridine", + "1,5-napththridine", + "1,6-napthyridine", + "2H-1,3-benzoxazine", + "2H-1,4-benzoxazine", + "1H-2,3-benzoxazine", + "4H-3,1-benzoxazine", + "2H-1,2-benzoxazine", + "4H-1,3-benzoxazine", + "anthracene", + "phenanthrene", + "phenalene", + "fluorene", + "carbazole", + "xanthene", + "acridine", + "norpinane", + "7H-purine", + "steroid_ring_system", + ] + + subset = df[df["filled_prompt"].str.contains("|".join(trustworthy_smarts))] + + subset["completion_labels"] = subset["completion_labels"].apply(lambda x: x[0]) + subset["completion"] = subset["completion"].apply(lambda x: x[0]) + subset["smarts"] = subset["completion_labels"].apply( + lambda x: NAME_SMARTS_MAP[x.replace("_count", "")] + ) + + relevant_frame = subset[ + [ + "representation", + "representation_type", + "completion", + "completion_labels", + "smarts", + ] + ] + + # subsample relevant frame such that completion 0 and completion 1 are approximately equal + # this is to avoid biasing the model towards 0 + completion_0 = relevant_frame[relevant_frame["completion"] == 0] + completion_1 = relevant_frame[relevant_frame["completion"] == 1] + completion_other = relevant_frame[relevant_frame["completion"] != 0] + + completion_0 = completion_0.sample(n=len(completion_1), random_state=42) + relevant_frame = pd.concat([completion_0, completion_1, completion_other]) + + return relevant_frame + relevant_frame.to_csv("data_clean.csv", index=False) + + +if 
__name__ == "__main__": + all_files = glob("*.jsonl") + all_data = [] + + for file in tqdm(all_files): + try: + all_data.append(process(file)) + except Exception as e: + print(file, e) + df = pd.concat(all_data) + + ds = Dataset.from_pandas(df) + ds.push_to_hub(repo_id="kjappelbaum/chemnlp-chem-caption", config_name="smarts") diff --git a/data/tabular/chem_caption_smarts/transform.py b/data/tabular/chem_caption_smarts/transform.py new file mode 100644 index 000000000..7259da626 --- /dev/null +++ b/data/tabular/chem_caption_smarts/transform.py @@ -0,0 +1,17 @@ +import pandas as pd + + +def process(): + # get the smarts config + df = pd.read_parquet( + "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/smarts/train-00000-of-00001-71cef18c6383b463.parquet" # noqa + ) + df["completion_labels"] = df["completion_labels"].astype(str) + df["completion_labels"] = df["completion_labels"].str.replace( + "_count", "", regex=True + ) + df.to_csv("data_clean.csv", index=False) + + +if __name__ == "__main__": + process()