Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

实现mol文件转换成sdf文件格式修改爬取域名,更换为老域名; #4

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.venv
.DS_Store
*/.DS_Store

Expand All @@ -9,3 +10,7 @@ build/
.tox/
.env
*.log

# result files
data/spider_data/*.xlsx
data/spider_data/*.sdf
Empty file added data/spider_data/.files
Empty file.
Binary file removed data/spider_data/Baizhu_disease.xlsx
Binary file not shown.
Binary file removed data/spider_data/Baizhu_ingredients.xlsx
Binary file not shown.
Binary file removed data/spider_data/Baizhu_targets.xlsx
Binary file not shown.
Binary file removed data/spider_data/Chenpi_disease.xlsx
Binary file not shown.
Binary file removed data/spider_data/Chenpi_ingredients.xlsx
Binary file not shown.
Binary file removed data/spider_data/Chenpi_targets.xlsx
Binary file not shown.
Binary file removed data/spider_data/Mahuang_disease.xlsx
Binary file not shown.
Binary file removed data/spider_data/Mahuang_ingredients.xlsx
Binary file not shown.
Binary file removed data/spider_data/Mahuang_targets.xlsx
Binary file not shown.
Binary file removed data/spider_data/Mahuanggen_disease.xlsx
Binary file not shown.
Binary file removed data/spider_data/Mahuanggen_ingredients.xlsx
Binary file not shown.
Binary file removed data/spider_data/Mahuanggen_targets.xlsx
Binary file not shown.
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
beautifulsoup4==4.11.2
lxml==4.9.2
pandas==1.5.2
pandas
requests==2.28.1
rdkit
openpyxl
tqdm
2 changes: 1 addition & 1 deletion src/get_all_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
def get_data(type):

tcmsp = TcmspSpider()
url = f"https://tcmsp-e.com/browse.php?qc={type}"
url = f"https://old.tcmsp-e.com/browse.php?qc={type}"

# 获取页面
html = tcmsp.get_response(url)
Expand Down
51 changes: 44 additions & 7 deletions src/tcmsp.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/local/bin/python3
# -*- encoding: utf-8 -*-
'''
"""
@Brief : TCMSP数据库爬虫
@Time : 2023/02/09 19:39:55
@Author : https://github.com/shujuecn
'''
"""

import os
import re
Expand All @@ -13,12 +13,14 @@
import pandas as pd
from bs4 import BeautifulSoup as bs
import lxml.html

from rdkit import Chem
import tempfile
from tqdm import tqdm

class TcmspSpider:
def __init__(self):

self.root_url = "https://www.tcmsp-e.com/tcmspsearch.php"
self.root_url = "https://old.tcmsp-e.com/tcmspsearch.php"
self.headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; NCE-AL10 Build/HUAWEINCE-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
Expand Down Expand Up @@ -129,7 +131,7 @@ def get_herb_data(self, cn_name, en_name, pinyin_name):
ingredients_data,
file_path=f"{self.spider_file_path}",
file_name=f"{pinyin_name}_ingredients",
index="MOL_ID"
index="MOL_ID",
)

# 导出 Targets
Expand All @@ -139,7 +141,7 @@ def get_herb_data(self, cn_name, en_name, pinyin_name):
targets_data,
file_path=f"{self.spider_file_path}",
file_name=f"{pinyin_name}_targets",
index="MOL_ID"
index="MOL_ID",
)

# 导出 Disease
Expand All @@ -150,9 +152,11 @@ def get_herb_data(self, cn_name, en_name, pinyin_name):
disease_data,
file_path=f"{self.spider_file_path}",
file_name=f"{pinyin_name}_disease",
index=False
index=False,
)

print("正在导出sdf文件,过程较慢,耐心等待...")
self.mol2sdf(ingredients_data, f"{self.spider_file_path}{pinyin_name}.sdf")
print(f"{cn_name}下载完成!\n")

def get_json_data(self, html, num, pattern):
Expand Down Expand Up @@ -200,3 +204,36 @@ def text_to_excel(self, data, file_path, file_name, index):

else:
print(f"未查询到{file_name}的信息!")

def mol2sdf(self, data, output_path):
    """Download each ingredient's MOL2 structure and write them all to one SDF file.

    For every value in the ``MOL_ID`` column of ``data`` the matching
    ``.mol2`` file is fetched from old.tcmsp-e.com, parsed with RDKit and
    appended to the SDF file at ``output_path``. Failures for a single
    molecule (network error, non-200 response, unparsable MOL2) are
    reported on stdout and skipped so the rest of the export continues.

    Args:
        data: Ingredient records; anything ``pd.DataFrame`` accepts. It is
            expected to provide a ``MOL_ID`` column.
        output_path: Path of the SDF file to create.

    Returns:
        None. When ``data`` has no ``MOL_ID`` values, nothing is downloaded
        and no file is created.
    """
    df = pd.DataFrame(data)

    # The caller invokes this unconditionally, even when no ingredient
    # records were found; bail out instead of raising KeyError.
    if df.empty or "MOL_ID" not in df.columns:
        print("未查询到成分信息,跳过sdf文件导出!")
        return

    mol_ids = df["MOL_ID"].to_list()

    with Chem.SDWriter(output_path) as writer:
        for mol_id in tqdm(mol_ids, desc="导出进度"):
            mol2_url = f"https://old.tcmsp-e.com/tcmspmol/{mol_id}.mol2"

            # A timeout keeps one stalled connection from hanging the whole
            # export; a network error only skips the current molecule.
            try:
                response = requests.get(mol2_url, timeout=30)
            except requests.RequestException as e:
                print(f"Failed to download MOL2 file for MOL_ID: {mol_id} ({e})")
                continue

            if response.status_code != 200:
                print(f"Failed to download MOL2 file for MOL_ID: {mol_id}")
                continue

            # Parse the MOL2 text directly from memory; no temporary file
            # has to be written and cleaned up.
            try:
                mol = Chem.MolFromMol2Block(
                    response.text, sanitize=False, cleanupSubstructures=False
                )
                if mol is None:
                    print(f"Failed to convert MOL2 to SDF for MOL_ID: {mol_id}")
                    continue
                writer.write(mol)
            except Exception as e:
                # Deliberate best-effort: one bad molecule must not abort
                # the export of the remaining ones.
                print(f"Error processing MOL_ID {mol_id}: {e}")

    print(f"已保存:{output_path}")