-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTestSegmentationZh.py
79 lines (60 loc) · 1.92 KB
/
TestSegmentationZh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
import jieba
import os
import codecs
# Directory that holds the data files used to build STOPWORDS.
# NOTE(review): absolute, machine-specific Windows path; the functions in this
# file read from the relative 'data/' directory instead -- confirm which one
# is intended.
FILE_PATH = "D:\\Documents\\PyCharm\\Native_Bayes\\data"

# Stopword list loaded once at import time. The context manager closes the
# handle, which the original one-liner left open.
# NOTE(review): splitting on '\r\n' only separates entries when the file uses
# CRLF line endings; with LF-only endings this yields a single element.
with codecs.open(os.path.join(FILE_PATH, 'stopwords_cn.txt'), 'r', 'UTF-8') as _stopwords_file:
    STOPWORDS = _stopwords_file.read().split('\r\n')
# Chinese-language tokenization test
def textParseZh(bigString):
    """Tokenize ``bigString`` with jieba and return cleaned, lowercased tokens.

    Every segment returned by ``jieba.lcut`` has its non-word characters
    removed; segments that become empty are dropped and the remaining ones
    are lowercased. Token order follows the segmentation order.
    """
    segments = jieba.lcut(bigString)
    # Strip punctuation and whitespace from each segment (lazy generator).
    stripped = (re.sub(r'\W*', '', segment) for segment in segments)
    # Empty strings are falsy, so ``if token`` filters them out.
    return [token.lower() for token in stripped if token]
# mySent = '你好,欢迎来到西安邮电大学。Hello,Welcome to XUPT'
# str = textParseZh(mySent)
# print(str)
# Build the stopword list
def stopwordslist(path='data/stopwords_cn.txt'):
    """Load stopwords from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stopword file. Defaults to
            ``data/stopwords_cn.txt`` (resolved against the current working
            directory), which is the path the original code hard-coded.

    Returns:
        A list with one whitespace-stripped string per line, in file order.
        Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle; the original never closed it.
    with open(path, encoding='UTF-8') as stopword_file:
        return [line.strip() for line in stopword_file]
def delStopwords(fullText, stopwords=None):
    """Return the tokens of ``fullText`` that are not stopwords.

    Args:
        fullText: Iterable of hashable tokens (strings in this file).
        stopwords: Optional iterable of words to remove. When ``None`` (the
            default) the words are loaded with ``stopwordslist()``, which is
            what the original implementation always did.

    Returns:
        A new list with the surviving tokens in their original order;
        ``fullText`` itself is not modified.
    """
    if stopwords is None:
        stopwords = stopwordslist()
    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list once per token.
    blocked = set(stopwords)
    return [word for word in fullText if word not in blocked]
# Module-level accumulators, each starting as its own empty list.
# docList receives one token list per document from the loading loop below.
# NOTE(review): classList and fullText are not populated anywhere in the
# visible part of this file -- confirm whether later code fills them.
docList, classList, fullText = [], [], []
'''
wordlist = textParseZh(open('data/email_zh/spam/demo.txt' ,encoding='UTF-8').read())
temp.extend(wordlist)
temp = delStopwords(temp)
docList.extend(temp)
print(docList)
'''
'''
demo.txt :我有很多论文
['我', '有', '很多', '论文']
['论文']
'''
'''
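append() adds a single new object to the end of a list.
extend() appends all the values of another sequence to the end of a list in one call (extending the original list with the new list).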
'''
# Load and parse the text files
for i in range(1, 26):  # iterate over the 25 numbered txt files
    # Read ham message number i and turn its text into a list of tokens.
    # The context manager closes the file; the original left every handle open.
    with open('data/email_zh/ham/%d.txt' % i, encoding='UTF-8') as ham_file:
        wordlist1 = textParseZh(ham_file.read())
    # delStopwords returns a new list, so no intermediate copy is needed.
    temp1 = delStopwords(wordlist1)
    print("temp1:\n", temp1)
    docList.append(temp1)

    # Same steps for spam message number i.
    with open('data/email_zh/spam/%d.txt' % i, encoding='UTF-8') as spam_file:
        wordlist2 = textParseZh(spam_file.read())
    temp2 = delStopwords(wordlist2)
    print("temp2\n", temp2)
    docList.append(temp2)

# docList now alternates ham and spam token lists: ham 1, spam 1, ham 2, ...
# NOTE(review): the source's indentation was lost; this summary print is
# assumed to run once after the loop -- confirm against the original file.
print("docList:\n", docList)