Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update converters #47

Merged
merged 7 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions nerdd_module/config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,14 @@ class ResultProperty(BaseModel):
name: str
type: str
visible_name: Optional[str] = None
visible: bool = True
help_text: Optional[str] = None
sortable: bool = False
group: Optional[str] = None
level: Level = "molecule"
formats: Union[FormatSpec, IncludeExcludeFormatSpec, None] = None
representation: Optional[str] = None
from_property: Optional[str] = None
image_width: Optional[int] = None
image_height: Optional[int] = None

Expand Down
1 change: 1 addition & 0 deletions nerdd_module/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
from .mol_converter import *
from .mol_to_image_converter import *
from .problem_list_converter import *
from .representation_converter import *
from .void_converter import *
2 changes: 1 addition & 1 deletion nerdd_module/converters/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Converter(ABC):

def __init__(self, result_property: ResultProperty, output_format: str, **kwargs: Any) -> None:
super().__init__()
self.property = result_property
self.result_property = result_property
self.output_format = output_format

@classmethod
Expand Down
31 changes: 7 additions & 24 deletions nerdd_module/converters/mol_converter.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from typing import Any

from rdkit.Chem import MolToInchi, MolToSmiles

from ..config import ResultProperty
from .converter import Converter
from .converter_config import ALL, ConverterConfig
from .converter_config import ConverterConfig

__all__ = ["MolConverter"]

Expand All @@ -13,29 +11,14 @@ class MolConverter(Converter):
def __init__(self, result_property: ResultProperty, output_format: str, **kwargs: Any) -> None:
super().__init__(result_property, output_format, **kwargs)

if output_format == "sdf" and result_property.name == "input_mol":
# in an SDF, the main molecule (input_mol) can be a Mol object
self._serialize = lambda x: x
elif output_format in ["pandas", "record_list", "iterator"]:
self._serialize = lambda mol: mol
else:
representation = result_property.representation or "smiles"
if representation == "inchi":
self._serialize = MolToInchi
elif representation == "smiles":
self._serialize = MolToSmiles
else:
raise ValueError(f"Unsupported representation: {representation}")

def _convert(self, input: Any, context: dict) -> Any:
try:
representation = self._serialize(input)
except: # noqa: E722 (allow bare except, because RDKit is unpredictable)
representation = None

return representation
if self.output_format == "sdf" and self.result_property.name != "input_mol":
# in an SDF, the main molecule (input_mol) can be a Mol object
return Converter.HIDE
elif self.output_format in ["pandas", "record_list", "iterator"]:
return input

config = ConverterConfig(
data_types="mol",
output_formats=ALL,
output_formats=["sdf", "pandas", "record_list", "iterator"],
)
8 changes: 4 additions & 4 deletions nerdd_module/converters/mol_to_image_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@

__all__ = ["MolToImageConverter"]

default_width = 400
default_height = 300
default_width = 300
default_height = 180


class MolToImageConverter(Converter):
def _convert(self, input: Any, context: dict) -> Any:
width = self.property.image_width
height = self.property.image_height
width = self.result_property.image_width
height = self.result_property.image_height

if width is None:
width = default_width
Expand Down
42 changes: 42 additions & 0 deletions nerdd_module/converters/representation_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Any

from rdkit.Chem import MolToInchi, MolToSmiles

from ..config import ResultProperty
from .converter import Converter
from .converter_config import ALL, ConverterConfig

__all__ = ["RepresentationConverter"]


class RepresentationConverter(Converter):
    """Convert a molecule into a textual representation (SMILES or InChI).

    The representation is chosen once at construction time from
    ``result_property.representation`` (defaulting to ``"smiles"``). The
    molecule to serialize is either the value of the property itself or, when
    ``result_property.from_property`` is set, the value stored under that key
    in the record passed as ``context``.
    """

    def __init__(self, result_property: ResultProperty, output_format: str, **kwargs: Any) -> None:
        """Select the serializer matching the configured representation.

        Raises:
            ValueError: If the configured representation is neither
                ``"smiles"`` nor ``"inchi"``.
        """
        super().__init__(result_property, output_format, **kwargs)

        representation = result_property.representation or "smiles"
        if representation == "inchi":
            self._serialize = MolToInchi
        elif representation == "smiles":
            self._serialize = MolToSmiles
        else:
            raise ValueError(f"Unsupported representation: {representation}")

    def _convert(self, input: Any, context: dict) -> Any:
        """Return the serialized molecule, or ``None`` if it cannot be produced.

        Args:
            input: Value of this property in the current record. Used as the
                molecule when ``from_property`` is not configured.
            context: The record being converted. When ``from_property`` is
                configured, the molecule is looked up in it under that key.

        Returns:
            The string produced by the selected serializer, or ``None`` when
            the source molecule is missing or serialization fails.
        """
        from_property = self.result_property.from_property

        if from_property is None:
            actual_input = input
        else:
            # A record may lack the source property; treat that like a missing
            # molecule instead of failing the whole record with a KeyError.
            actual_input = context.get(from_property)

        if actual_input is None:
            return None

        try:
            return self._serialize(actual_input)
        except Exception:
            # RDKit raises a variety of exception types for invalid molecules,
            # so catch broadly -- but do not swallow KeyboardInterrupt or
            # SystemExit the way a bare ``except:`` would.
            return None

    # NOTE(review): indentation was lost in the scraped diff; ``config`` is
    # assumed to be a class attribute of the converter -- confirm against the
    # repository before applying.
    config = ConverterConfig(
        data_types="representation",
        output_formats=ALL,
    )
6 changes: 5 additions & 1 deletion nerdd_module/model/convert_representations_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,17 @@ def __init__(
self, result_properties: List[ResultProperty], output_format: str, **kwargs: Any
) -> None:
super().__init__()
self._result_properties = result_properties
self._converter_map = {
p.name: Converter.get_converter(p, output_format, **kwargs) for p in result_properties
}

def _process(self, record: dict) -> dict:
result = {
k: self._converter_map[k].convert(input=v, context=record) for k, v in record.items()
k.name: self._converter_map[k.name].convert(
input=record.get(k.name, None), context=record
)
for k in self._result_properties
}

return {k: v for k, v in result.items() if v is not Converter.HIDE}
50 changes: 42 additions & 8 deletions nerdd_module/model/simple_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,26 +113,60 @@ def _get_config(self) -> Configuration:
task_based_property = []
if task == "atom_property_prediction":
task_based_property = [
{"name": "atom_id", "type": "int"},
{"name": "atom_id", "type": "int", "visible": False},
]
elif task == "derivative_property_prediction":
task_based_property = [
{"name": "derivative_id", "type": "int"},
{"name": "derivative_id", "type": "int", "visible": False},
]

default_properties_start = [
{"name": "mol_id", "type": "int"},
{"name": "mol_id", "type": "int", "visible": False},
*task_based_property,
{"name": "input_text", "visible_name": "Input text", "type": "string"},
{"name": "input_type", "visible_name": "Input type", "type": "string"},
{"name": "source", "visible_name": "Source", "type": "string"},
{
"name": "input_text",
"visible_name": "Input text",
"type": "string",
"visible": False,
},
{
"name": "input_type",
"visible_name": "Input type",
"type": "string",
"visible": False,
},
{
"name": "source",
"visible_name": "Source",
"type": "string",
"visible": False,
},
{"name": "name", "visible_name": "Name", "type": "string"},
{"name": "input_mol", "visible_name": "Input SMILES", "type": "mol"},
{
"name": "input_mol",
"visible_name": "Input Structure",
"type": "mol",
"visible": False,
},
{
"name": "input_smiles",
"visible_name": "Input SMILES",
"type": "representation",
"from_property": "input_mol",
"visible": False,
},
{
"name": "preprocessed_mol",
"visible_name": "Preprocessed SMILES",
"visible_name": "Preprocessed Structure",
"type": "mol",
},
{
"name": "preprocessed_smiles",
"visible_name": "Preprocessed SMILES",
"type": "representation",
"from_property": "preprocessed_mol",
"visible": False,
},
]

default_properties_end = [
Expand Down
2 changes: 2 additions & 0 deletions nerdd_module/tests/models/AtomicMassModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,5 @@ def _get_base_config(self):
{"name": "mass", "type": "float", "level": "atom"},
],
}


5 changes: 3 additions & 2 deletions nerdd_module/tests/models/MolWeightModel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt

from nerdd_module import SimpleModel
from nerdd_module.preprocessing import Sanitize
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt

__all__ = ["MolWeightModel"]

Expand Down Expand Up @@ -47,4 +48,4 @@ def _get_base_config(self):
"result_properties": [
{"name": "weight", "type": "float"},
],
}
}
6 changes: 5 additions & 1 deletion nerdd_module/tests/representations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from hypothesis import given as hgiven
from hypothesis import seed, settings
from hypothesis import strategies as st

# TODO: remove "type: ignore" later
from hypothesis_rdkit import mols # type: ignore
from pytest_bdd import given, parsers
Expand Down Expand Up @@ -69,6 +68,11 @@ def generate(ms):

generate()

for m in result:
if m is None:
continue
m.SetProp("_Name", "mol")

# replace random entries with None
indices = np.random.choice(num, num_none, replace=False)
for i in indices:
Expand Down
Loading