-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNBcategorization.py
94 lines (74 loc) · 3.07 KB
/
NBcategorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import jieba
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# 读取五个.csv文件
df_technology = pd.read_csv("./data/technology_news.csv", encoding='utf-8')
df_technology = df_technology.dropna()
df_car = pd.read_csv("./data/car_news.csv", encoding='utf-8')
df_car = df_car.dropna()
df_entertainment = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8')
df_entertainment = df_entertainment.dropna()
df_military = pd.read_csv("./data/military_news.csv", encoding='utf-8')
df_military = df_military.dropna()
df_sports = pd.read_csv("./data/sports_news.csv", encoding='utf-8')
df_sports = df_sports.dropna()
# 每个类别抽取20000个数据
technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]
# 进行分词和去除停用词
stopwords = pd.read_csv("data/stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values
# 处理文件数据 去除停用词,加入标签信息
def preprocess_text(content_lines, sentences, category):
for line in content_lines:
try:
segs = jieba.lcut(line)
segs = filter(lambda x: len(x) > 1, segs)
segs = filter(lambda x: x not in stopwords, segs)
sentences.append((" ".join(segs), category))
except Exception:
print(line)
continue
# 生成训练数据
sentences = []
preprocess_text(technology, sentences, 'technology')
preprocess_text(car, sentences, 'car')
preprocess_text(entertainment, sentences, 'entertainment')
preprocess_text(military, sentences, 'military')
preprocess_text(sports, sentences, 'sports')
# 打乱数据
random.shuffle(sentences)
# 利用sklearn划分数据集
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)
# 定义朴素贝叶斯分类模型
class NBClassifier():
def __init__(self, classifier=MultinomialNB()):
self.classifier = classifier
# 词袋模型
self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 4), max_features=2000)
def features(self, X):
# 将X转换为词向量
return self.vectorizer.transform(X)
def fit(self, X, y):
# 向量化X
self.vectorizer.fit(X)
# 训练模型
self.classifier.fit(self.features(X), y)
def predict(self, x): # 预测
return self.classifier.predict(self.features([x]))
def score(self, X, y): # 打分
return self.classifier.score(self.features(X), y)
# 应用模型
text_classifier = NBClassifier()
text_classifier.fit(x_train, y_train)
# 测试
print(text_classifier.predict('中国 男篮 什么 时候 才能 进军 奥运会 8强'))
# 测试集评分
print(text_classifier.score(x_test, y_test))