From 5bc23cebde88ed73a2742a800863aadd2e186995 Mon Sep 17 00:00:00 2001
From: ZeYi Lin <944270057@qq.com>
Date: Thu, 18 Apr 2024 03:54:11 +0800
Subject: [PATCH 1/5] init molecule

---
 swanlab/data/modules/chart.py    |  2 +
 swanlab/data/modules/molecule.py | 78 ++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 swanlab/data/modules/molecule.py

diff --git a/swanlab/data/modules/chart.py b/swanlab/data/modules/chart.py
index 23bd67037..9dedceefe 100644
--- a/swanlab/data/modules/chart.py
+++ b/swanlab/data/modules/chart.py
@@ -24,3 +24,5 @@ class Chart:
     video = "video", [list, str]
     # 3D点云类型，list代表一步多3D点云
     object3d = "object3d", [list, str]
+    # 生化分子类型，list代表一步多生化分子
+    melocule = "melocule", [list, str]
diff --git a/swanlab/data/modules/molecule.py b/swanlab/data/modules/molecule.py
new file mode 100644
index 000000000..63a0fc68c
--- /dev/null
+++ b/swanlab/data/modules/molecule.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+"""
+Date: 2024-01-22 15:01:36
+IDE: VSCode
+FilePath: /SwanLab/swanlab/data/modules/audio.py
+Description:
+    音频数据解析
+"""
+from .base import BaseType
+from typing import Union, List
+
+
+class Molecule(BaseType):
+    """Molecule class constructor
+
+    Parameters
+    ----------
+    data_or_path: str
+    """
+
+    def __init__(
+        self,
+        data_or_path: Union[str, List["Molecule"]],
+        caption: str = None,
+    ):
+
+        super().__init__(data_or_path)
+        self.molecule_data = None
+        self.caption = self.__convert_caption(caption)
+
+    def get_data(self):
+        pass
+
+    def expect_types(self, *args, **kwargs) -> list:
+        return ["str"]
+
+    def __convert_caption(self, caption):
+        """将caption转换为字符串"""
+        # 如果类型是字符串，则不做转换
+        if isinstance(caption, str):
+            caption = caption
+        # 如果类型是数字，则转换为字符串
+        elif isinstance(caption, (int, float)):
+            caption = str(caption)
+        # 如果类型是None，则转换为默认字符串
+        elif caption is None:
+            caption = None
+        else:
+            raise TypeError("caption must be a string, int or float.")
+        return caption.strip() if caption else None
+
+    def __preprocess(self, data_or_path):
+        pass
+
+    def __save(self, save_path):
+        pass
+
+    def get_more(self, *args, **kwargs) -> dict:
+        """返回config数据"""
+        # 如果传入的是Audio类列表
+        if isinstance(self.value, list):
+            return self.get_more_list()
+        else:
+            return (
+                {
+                    "caption": self.caption,
+                }
+                if self.caption is not None
+                else None
+            )
+
+    def get_namespace(self, *args, **kwargs) -> str:
+        """设定分组名"""
+        return "Molecule"
+
+    def get_chart_type(self) -> str:
+        """设定图表类型"""
+        return self.chart.melocule

From 6d0bb43bb4cc449657ae9a87434e4a09051fb145 Mon Sep 17 00:00:00 2001
From: ZeYi Lin <944270057@qq.com>
Date: Thu, 18 Apr 2024 15:41:04 +0800
Subject: [PATCH 2/5] Update molecule.py

---
 swanlab/data/modules/molecule.py | 289 +++++++++++++++++++++++++++----
 1 file changed, 255 insertions(+), 34 deletions(-)

diff --git a/swanlab/data/modules/molecule.py b/swanlab/data/modules/molecule.py
index 63a0fc68c..01258bc77 100644
--- a/swanlab/data/modules/molecule.py
+++ b/swanlab/data/modules/molecule.py
@@ -1,63 +1,266 @@
 # -*- coding: utf-8 -*-
-"""
-Date: 2024-01-22 15:01:36
-IDE: VSCode
-FilePath: /SwanLab/swanlab/data/modules/audio.py
-Description:
-    音频数据解析
-"""
 from .base import BaseType
-from typing import Union, List
+from typing import Union, List, TYPE_CHECKING
+import os
+import shutil
+import secrets
+import string
+import io
+
+if TYPE_CHECKING:
+    from typing import TextIO
+    import rdkit.Chem
+
+    RDKitDataType = Union[str, "rdkit.Chem.rdchem.Mol"]
+
+
+def generate_id(length: int = 8) -> str:
+    """Generate a random base-36 string of `length` digits."""
+    # There are ~2.8T base-36 8-digit strings. If we generate 210k ids,
+    # we'll have a ~1% chance of collision.
+    alphabet = string.ascii_lowercase + string.digits
+    return "".join(secrets.choice(alphabet) for _ in range(length))
 
 
 class Molecule(BaseType):
-    """Molecule class constructor
+    """SwanLab class for 3D Molecular data.
 
-    Parameters
-    ----------
-    data_or_path: str
+    Arguments:
+        data_or_path: (string, io)
+            Molecule can be initialized from a file name or an io object.
+            Molecules 可以被一个文件路径或一个IO对象初始化, 如swanlab.Molecule("path/to/file")
+        caption: (string)
+            Caption associated with the molecule for display.
+            与Molecule相关的标题, 用于在GUI上显示。swanlab.Molecule("path/to/file", caption="Ethanol")
+        file_type: (string)
+            Type of the file. If not provided, the file type will be inferred from the file extension.
+            文件类型。如果未提供，文件类型将从文件扩展名中推断出来。
     """
 
+    # IO流支持的文件类型
+    SUPPORTED_TYPES = {
+        "pdb",
+        "pqr",
+        "mmcif",
+        "mcif",
+        "cif",
+        "sdf",
+        "sd",
+        "gro",
+        "mol2",
+        "mmtf",
+    }
+
+    # RDKIT支持的文件类型
+    SUPPORTED_RDKIT_TYPES = {"mol", "sdf"}
+
     def __init__(
         self,
-        data_or_path: Union[str, List["Molecule"]],
+        data_or_path: Union[str, "TextIO", List["Molecule"]],
         caption: str = None,
-    ):
+        file_type: str = None,
+    ) -> None:
 
         super().__init__(data_or_path)
         self.molecule_data = None
         self.caption = self.__convert_caption(caption)
+        self.file_type = file_type
 
     def get_data(self):
-        pass
+        # 如果传入的是Molecule类列表
+        if isinstance(self.value, list):
+            return self.get_data_list()
+        # 数据预处理
+        self.__preprocess(self.value)
+        random_id = generate_id()
 
-    def expect_types(self, *args, **kwargs) -> list:
-        return ["str"]
+        # 生成保存路径
+        save_dir = os.path.join(self.settings.static_dir, self.tag)
+        save_name = f"Molecule-step{self.step}-{random_id}.{self.file_type}"
+        if not os.path.exists(save_dir):
+            os.mkdir(save_dir)
+        save_path = os.path.join(save_dir, save_name)
 
-    def __convert_caption(self, caption):
-        """将caption转换为字符串"""
-        # 如果类型是字符串，则不做转换
-        if isinstance(caption, str):
-            caption = caption
-        # 如果类型是数字，则转换为字符串
-        elif isinstance(caption, (int, float)):
-            caption = str(caption)
-        # 如果类型是None，则转换为默认字符串
-        elif caption is None:
-            caption = None
-        else:
-            raise TypeError("caption must be a string, int or float.")
-        return caption.strip() if caption else None
+        # 保存分子数据到指定目录
+        self.__save(save_path)
+        return save_name
 
     def __preprocess(self, data_or_path):
-        pass
+        if hasattr(data_or_path, "name"):
+            data_or_path = data_or_path.name
+
+        # 如果传入的是IO对象
+        if hasattr(data_or_path, "read"):
+            if hasattr(data_or_path, "seek"):
+                data_or_path.seek(0)
+
+            ext = self.file_type
+
+            # 如果没有传入file_type参数
+            if ext is None:
+                raise ValueError("When using the io object, the file_type keyword argument must be passed.")
+
+            if ext not in Molecule.SUPPORTED_TYPES:
+                raise ValueError("Molecule 3D only supports files of the type: " + ", ".join(Molecule.SUPPORTED_TYPES))
+
+            # 分子数据为IO对象读取的内容
+            self.molecule_data = data_or_path
+
+        # 如果传入的是文件路径
+        elif isinstance(data_or_path, str):
+            ext = os.path.splitext(data_or_path)[1][1:]
+            if ext not in Molecule.SUPPORTED_TYPES:
+                raise ValueError("Molecule 3D only supports files of the type: " + ", ".join(Molecule.SUPPORTED_TYPES))
+            # 分子数据为文件路径
+            self.molecule_data = data_or_path
+        else:
+            raise ValueError("The data passed to Melocule must be a file name or file object.")
 
     def __save(self, save_path):
-        pass
+
+        try:
+            # 如果传入的是IO对象
+            if hasattr(self.molecule_data, "read"):
+                # 将分子数据写入临时文件
+                with open(save_path, "w") as f:
+                    f.write(self.molecule_data.read())
+            # 如果传入的是文件路径, 复制文件到指定目录
+            else:
+                shutil.copyfile(self.molecule_data, save_path)
+
+        except Exception as e:
+            raise TypeError(f"Could not save the Molecule to the path: {save_path}") from e
+
+    @classmethod
+    def from_rdkit(
+        cls,
+        data_or_path: "RDKitDataType",
+        caption: str = None,
+        convert_to_3d_and_optimize: bool = True,
+        mmff_optimize_molecule_max_iterations: int = 200,
+    ) -> "Molecule":
+        """Convert RDKit-supported file/object types to swanlab.Molecule.
+        将RDKit支持的文件/对象类型转换为swanlab.Molecule。
+
+        Arguments:
+            data_or_path: (string, rdkit.Chem.rdchem.Mol)
+                Molecule can be initialized from a file name or an rdkit.Chem.rdchem.Mol object.
+                Molecule可以从文件名或rdkit.Chem.rdchem.Mol对象初始化。
+                如swanlab.Molecule.from_rdkit("path/to/file")
+
+            caption: (string)
+                Caption associated with the molecule for display.
+                与Molecule相关的标题, 用于在GUI上显示。如swanlab.Molecule.from_rdkit("path/to/file", caption="Ethanol")
+
+            convert_to_3d_and_optimize: (bool)
+                Convert to rdkit.Chem.rdchem.Mol with 3D coordinates.
+                This is an expensive operation that may take a long time for complicated molecules.
+                将具有3D坐标的rdkit.Chem.rdchem.Mol转换。
+                这是一个比较耗时的操作，对于复杂分子可能需要很长时间。
+
+            mmff_optimize_molecule_max_iterations: (int)
+                Number of iterations to use in rdkit.Chem.AllChem.MMFFOptimizeMolecule
+                在rdkit.Chem.AllChem.MMFFOptimizeMolecule中要使用的迭代次数
+        """
+
+        try:
+            import rdkit
+        except ImportError as e:
+            raise TypeError("swanlab.Molecule requires the rdkit pypi package. Install with 'pip install rdkit'.")
+
+        from rdkit import Chem
+        from rdkit.Chem import AllChem
+        import pathlib
+
+        # 如果传入的是文件路径
+        if isinstance(data_or_path, str):
+            path = pathlib.Path(data_or_path)
+            ext = path.suffix.split(".")[-1]
+            if ext not in Molecule.SUPPORTED_RDKIT_TYPES:
+                raise ValueError(
+                    "swanlab.Molecule.from_rdkit only supports files of the type: "
+                    + ", ".join(Molecule.SUPPORTED_RDKIT_TYPES)
+                )
+            # 如果后缀是sdf, 则使用SDMolSupplier进行读取
+            if ext == "sdf":
+                with Chem.SDMolSupplier(data_or_path) as supplier:
+                    molecule = next(supplier)  # 只获取第一个分子
+            else:
+                molecule = getattr(Chem, f"MolFrom{ext.capitalize()}File")(data_or_path)
+        elif isinstance(data_or_path, Chem.rdchem.Mol):
+            molecule = data_or_path
+        else:
+            raise ValueError("Data must be file name or an rdkit.Chem.rdchem.Mol object")
+
+        if convert_to_3d_and_optimize:
+            molecule = Chem.AddHs(molecule)
+            AllChem.EmbedMolecule(molecule)
+            AllChem.MMFFOptimizeMolecule(
+                molecule,
+                maxIters=mmff_optimize_molecule_max_iterations,
+            )
+
+        # 转换为Molecule支持的pdb格式
+        pdb_block = Chem.rdmolfiles.MolToPDBBlock(molecule)
+
+        return cls(io.StringIO(pdb_block), caption=caption, file_type="pdb")
+
+    @classmethod
+    def from_smiles(
+        cls,
+        data: str,
+        caption: str = None,
+        sanitize: bool = True,
+        convert_to_3d_and_optimize: bool = True,
+        mmff_optimize_molecule_max_iterations: int = 200,
+    ) -> "Molecule":
+        """Convert SMILES string to swanlab.Molecule.
+        将SMILES字符串转换为swanlab.Molecule。
+
+        Arguments:
+            data: (string)
+                SMILES string. 如swanlab.Molecule.from_smiles("CCO")
+
+            caption: (string)
+                Caption associated with the molecule for display
+                与Molecule相关的标题, 用于在GUI上显示。
+
+            sanitize: (bool)
+                Check if the molecule is chemically reasonable by the RDKit's definition.
+                通过RDKit的定义检查分子是否在化学上合理。
+
+            convert_to_3d_and_optimize: (bool)
+                Convert to rdkit.Chem.rdchem.Mol with 3D coordinates.
+                This is an expensive operation that may take a long time for complicated molecules.
+                将具有3D坐标的rdkit.Chem.rdchem.Mol转换。
+                这是一个比较耗时的操作，对于复杂分子可能需要很长时间。
+
+            mmff_optimize_molecule_max_iterations: (int)
+                Number of iterations to use in rdkit.Chem.AllChem.MMFFOptimizeMolecule
+                在rdkit.Chem.AllChem.MMFFOptimizeMolecule中要使用的迭代次数
+        """
+
+        try:
+            import rdkit
+        except ImportError as e:
+            raise TypeError("swanlab.Molecule requires the rdkit pypi package. Install with 'pip install rdkit'.")
+
+        from rdkit import Chem
+
+        molecule = Chem.MolFromSmiles(data, sanitize=sanitize)
+        if molecule is None:
+            raise ValueError("Unable to parse the SMILES string.")
+
+        return cls.from_rdkit(
+            data_or_path=molecule,
+            caption=caption,
+            convert_to_3d_and_optimize=convert_to_3d_and_optimize,
+            mmff_optimize_molecule_max_iterations=mmff_optimize_molecule_max_iterations,
+        )
 
     def get_more(self, *args, **kwargs) -> dict:
         """返回config数据"""
-        # 如果传入的是Audio类列表
+        # 如果传入的是Molecule类列表
         if isinstance(self.value, list):
             return self.get_more_list()
         else:
@@ -69,6 +272,24 @@ def get_more(self, *args, **kwargs) -> dict:
                 else None
             )
 
+    def expect_types(self, *args, **kwargs) -> list:
+        return ["str", "TextIO"]
+
+    def __convert_caption(self, caption):
+        """将caption转换为字符串"""
+        # 如果类型是字符串，则不做转换
+        if isinstance(caption, str):
+            caption = caption
+        # 如果类型是数字，则转换为字符串
+        elif isinstance(caption, (int, float)):
+            caption = str(caption)
+        # 如果类型是None，则转换为默认字符串
+        elif caption is None:
+            caption = None
+        else:
+            raise TypeError("caption must be a string, int or float.")
+        return caption.strip() if caption else None
+
     def get_namespace(self, *args, **kwargs) -> str:
         """设定分组名"""
         return "Molecule"

From 625bc33109727a7b550dfa423bd4342b4225035a Mon Sep 17 00:00:00 2001
From: ZeYi Lin <944270057@qq.com>
Date: Thu, 18 Apr 2024 15:49:13 +0800
Subject: [PATCH 3/5] fixbug

---
 swanlab/data/__init__.py         | 1 +
 swanlab/data/modules/__init__.py | 2 ++
 swanlab/data/modules/molecule.py | 8 +++++---
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/swanlab/data/__init__.py b/swanlab/data/__init__.py
index 9559456c3..ac8430b00 100644
--- a/swanlab/data/__init__.py
+++ b/swanlab/data/__init__.py
@@ -14,6 +14,7 @@
     Text,
     Video,
     Object3D,
+    Molecule,
 )
 from .sdk import (
     login,
diff --git a/swanlab/data/modules/__init__.py b/swanlab/data/modules/__init__.py
index d74852e45..d79ec0572 100644
--- a/swanlab/data/modules/__init__.py
+++ b/swanlab/data/modules/__init__.py
@@ -4,6 +4,8 @@
 from .text import Text
 from .video import Video
 from .object_3d import Object3D
+from .molecule import Molecule
+
 # from .video import Video
 from typing import Protocol, Union, List
 
diff --git a/swanlab/data/modules/molecule.py b/swanlab/data/modules/molecule.py
index 01258bc77..881144e45 100644
--- a/swanlab/data/modules/molecule.py
+++ b/swanlab/data/modules/molecule.py
@@ -14,7 +14,7 @@
     RDKitDataType = Union[str, "rdkit.Chem.rdchem.Mol"]
 
 
-def generate_id(length: int = 8) -> str:
+def generate_id(length: int = 16) -> str:
     """Generate a random base-36 string of `length` digits."""
     # There are ~2.8T base-36 8-digit strings. If we generate 210k ids,
     # we'll have a ~1% chance of collision.
@@ -71,12 +71,12 @@ def get_data(self):
         if isinstance(self.value, list):
             return self.get_data_list()
         # 数据预处理
-        self.__preprocess(self.value)
+        ext = self.__preprocess(self.value)
         random_id = generate_id()
 
         # 生成保存路径
         save_dir = os.path.join(self.settings.static_dir, self.tag)
-        save_name = f"Molecule-step{self.step}-{random_id}.{self.file_type}"
+        save_name = f"Molecule-step{self.step}-{random_id}.{ext}"
         if not os.path.exists(save_dir):
             os.mkdir(save_dir)
         save_path = os.path.join(save_dir, save_name)
@@ -116,6 +116,8 @@ def __preprocess(self, data_or_path):
         else:
             raise ValueError("The data passed to Melocule must be a file name or file object.")
 
+        return ext
+
     def __save(self, save_path):
 
         try:

From fa246e36c1813bae4a02940903297147b85a8752 Mon Sep 17 00:00:00 2001
From: ZeYi Lin <944270057@qq.com>
Date: Thu, 18 Apr 2024 16:21:45 +0800
Subject: [PATCH 4/5] Update molecule.py

---
 swanlab/data/modules/molecule.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/swanlab/data/modules/molecule.py b/swanlab/data/modules/molecule.py
index 881144e45..bf2bd8443 100644
--- a/swanlab/data/modules/molecule.py
+++ b/swanlab/data/modules/molecule.py
@@ -98,10 +98,10 @@ def __preprocess(self, data_or_path):
 
             # 如果没有传入file_type参数
             if ext is None:
-                raise ValueError("When using the io object, the file_type keyword argument must be passed.")
+                raise TypeError("When using the io object, the file_type keyword argument must be passed.")
 
             if ext not in Molecule.SUPPORTED_TYPES:
-                raise ValueError("Molecule 3D only supports files of the type: " + ", ".join(Molecule.SUPPORTED_TYPES))
+                raise TypeError("Molecule 3D only supports files of the type: " + ", ".join(Molecule.SUPPORTED_TYPES))
 
             # 分子数据为IO对象读取的内容
             self.molecule_data = data_or_path
@@ -110,11 +110,11 @@ def __preprocess(self, data_or_path):
         elif isinstance(data_or_path, str):
             ext = os.path.splitext(data_or_path)[1][1:]
             if ext not in Molecule.SUPPORTED_TYPES:
-                raise ValueError("Molecule 3D only supports files of the type: " + ", ".join(Molecule.SUPPORTED_TYPES))
+                raise TypeError("Molecule 3D only supports files of the type: " + ", ".join(Molecule.SUPPORTED_TYPES))
             # 分子数据为文件路径
             self.molecule_data = data_or_path
         else:
-            raise ValueError("The data passed to Melocule must be a file name or file object.")
+            raise TypeError("The data passed to Melocule must be a file name or file object.")
 
         return ext
 

From 785cea0884135d39842fa399e17dffad957e09c9 Mon Sep 17 00:00:00 2001
From: ZeYi Lin <944270057@qq.com>
Date: Thu, 18 Apr 2024 16:22:16 +0800
Subject: [PATCH 5/5] ValueError -> TypeError

---
 swanlab/data/modules/molecule.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/swanlab/data/modules/molecule.py b/swanlab/data/modules/molecule.py
index bf2bd8443..be01ca743 100644
--- a/swanlab/data/modules/molecule.py
+++ b/swanlab/data/modules/molecule.py
@@ -179,7 +179,7 @@ def from_rdkit(
             path = pathlib.Path(data_or_path)
             ext = path.suffix.split(".")[-1]
             if ext not in Molecule.SUPPORTED_RDKIT_TYPES:
-                raise ValueError(
+                raise TypeError(
                     "swanlab.Molecule.from_rdkit only supports files of the type: "
                     + ", ".join(Molecule.SUPPORTED_RDKIT_TYPES)
                 )
@@ -192,7 +192,7 @@ def from_rdkit(
         elif isinstance(data_or_path, Chem.rdchem.Mol):
             molecule = data_or_path
         else:
-            raise ValueError("Data must be file name or an rdkit.Chem.rdchem.Mol object")
+            raise TypeError("Data must be file name or an rdkit.Chem.rdchem.Mol object")
 
         if convert_to_3d_and_optimize:
             molecule = Chem.AddHs(molecule)
@@ -251,7 +251,7 @@ def from_smiles(
 
         molecule = Chem.MolFromSmiles(data, sanitize=sanitize)
         if molecule is None:
-            raise ValueError("Unable to parse the SMILES string.")
+            raise TypeError("Unable to parse the SMILES string.")
 
         return cls.from_rdkit(
             data_or_path=molecule,