-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_create.py
61 lines (51 loc) · 1.94 KB
/
index_create.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, DATETIME
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from jieba.analyse import ChineseAnalyzer
from database.db import get_session
from database.models.news import News
from sqlmodel import select
from langdetect import detect # 语言检测库
def create_search_index_from_mysql(index_dir):
    """Build a Whoosh full-text index from the News rows in the database.

    Every ``News`` row is read (ordered by ``News.time``) and written as one
    document into a newly created index located in ``index_dir``.

    Args:
        index_dir: Directory that receives the index files. It is created
            when it does not exist yet.

    Returns:
        None. Failures are not raised; they are reported on stdout as
        ``Error: <message>``.
    """
    try:
        with get_session() as session:
            statement = select(News).order_by(News.time)
            results = session.exec(statement).all()

            # Analyzers are bound to schema fields, not to individual
            # documents, so title/content are always tokenized with the
            # Chinese analyzer.
            schema = Schema(
                id=ID(stored=True, unique=True),
                title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                time=DATETIME(stored=True),
                click_num=TEXT(stored=True),
                url=TEXT(stored=True),
            )

            # exist_ok avoids the check-then-create race of os.path.exists().
            os.makedirs(index_dir, exist_ok=True)
            ix = create_in(index_dir, schema)

            # The writer context manager commits on success and cancels on
            # an exception, so a failed build does not leave the index
            # write lock held.
            #
            # NOTE: the former per-title language detection was dropped. Its
            # result was never handed to Whoosh, yet the detection call could
            # raise on empty or non-linguistic titles and abort the whole
            # build.
            with ix.writer() as writer:
                # Rows are consumed while the session is still open so that
                # any lazily loaded attribute can still be fetched.
                for item in results:
                    writer.add_document(
                        id=str(item.id),
                        title=item.title,
                        url=item.url,
                        content=item.content,
                        time=item.time,
                        click_num=str(item.click_num),
                    )
        print(f"Index created in {index_dir}")
    except Exception as e:
        # Top-level boundary of this script: report the failure and return
        # instead of propagating, matching the original contract.
        print(f"Error: {str(e)}")
def main():
    """Build the search index in the relative directory ``index_dir``."""
    target_dir = "index_dir"
    create_search_index_from_mysql(target_dir)


# Run the index build only when this file is executed as a script.
if __name__ == "__main__":
    main()