-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStep3_TFIDFSpace.py
executable file
·44 lines (36 loc) · 1.9 KB
/
Step3_TFIDFSpace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import readfile, readbunchobj, writebunchobj
def bunch2Space(bunch, stopword_path="train_word_bag/hlt_stop_words.txt"):
    """Build a TF-IDF space from an already-loaded bunch and return it.

    Args:
        bunch: Object exposing ``target_name``, ``label``, ``filenames`` and
            ``contents`` attributes. ``contents`` is handed to
            ``TfidfVectorizer.fit_transform``.
        stopword_path: Path of the stop-word file; its content is split into
            lines and each line is used as one stop word. Defaults to the
            path that was previously hard-coded in this function, so
            existing single-argument calls behave as before.

    Returns:
        A ``Bunch`` carrying the input's ``target_name``, ``label`` and
        ``filenames`` together with ``tdm`` (the ``fit_transform`` result)
        and ``vocabulary`` (the fitted vectoriser's ``vocabulary_``).
    """
    stop_words = readfile(stopword_path).splitlines()

    # NOTE: unlike vector_space(), no max_df cut-off is applied here.
    vectorizer = TfidfVectorizer(stop_words=stop_words, sublinear_tf=True)
    tdm = vectorizer.fit_transform(bunch.contents)

    return Bunch(
        target_name=bunch.target_name,
        label=bunch.label,
        filenames=bunch.filenames,
        tdm=tdm,
        vocabulary=vectorizer.vocabulary_,
    )
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    """Vectorise a stored bunch with TF-IDF and write the result to disk.

    Args:
        stopword_path: Path of the stop-word file; its content is split into
            lines and each line is used as one stop word.
        bunch_path: Path passed to ``readbunchobj``. The loaded object must
            expose ``target_name``, ``label``, ``filenames`` and ``contents``.
        space_path: Destination passed to ``writebunchobj`` for the
            resulting TF-IDF space.
        train_tfidf_path: Optional path of a previously written TF-IDF
            space. When given, that space's ``vocabulary`` is reused as the
            vectoriser's fixed vocabulary and stored in the output;
            otherwise the vocabulary is learned from ``bunch.contents``.

    Returns:
        None. The TF-IDF space is written to ``space_path`` and a
        confirmation line is printed.
    """
    stop_words = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)

    # ``vocabulary=None`` is the vectoriser's default, so a single
    # construction covers both the "learn" and the "reuse" case.
    fixed_vocabulary = None
    if train_tfidf_path is not None:
        fixed_vocabulary = readbunchobj(train_tfidf_path).vocabulary

    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        sublinear_tf=True,
        max_df=0.5,
        vocabulary=fixed_vocabulary,
    )
    # NOTE(review): fit_transform is called on this bunch even when the
    # vocabulary is reused, so the weighting is fitted on ``bunch.contents``
    # rather than taken from the training space - confirm this is intended.
    tdm = vectorizer.fit_transform(bunch.contents)

    # Keep storing the training vocabulary object itself when one was given.
    if fixed_vocabulary is not None:
        vocabulary = fixed_vocabulary
    else:
        vocabulary = vectorizer.vocabulary_

    tfidfspace = Bunch(
        target_name=bunch.target_name,
        label=bunch.label,
        filenames=bunch.filenames,
        tdm=tdm,
        vocabulary=vocabulary,
    )
    writebunchobj(space_path, tfidfspace)
    # Message previously read "if-idf success" (typo).
    print("tf-idf success")
def Step3_TFIDFSpace():
    """Run the TF-IDF step on the training set using the fixed file layout.

    Delegates to ``vector_space`` without a training-space path, so the
    vocabulary is learned from the training bunch itself.
    """
    # The output file name is reproduced exactly as spelled ("tfdifspace");
    # changing it here would change the path that gets written.
    vector_space(
        stopword_path="train_word_bag/hlt_stop_words.txt",
        bunch_path="train_word_bag/train_set.dat",
        space_path="train_word_bag/tfdifspace.dat",
    )