model 1.py · 120 lines (81 loc) · 3.08 KB
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string
import joblib
# Load the user data, then drop row 104, the unused Timestamp/C1 columns,
# and any rows with missing values.
data = pd.read_csv(r'data\user_data.csv')
data.drop(104, axis=0, inplace=True)
data.drop(['Timestamp', 'C1'], axis=1, inplace=True)
data.dropna(inplace=True)

# Rebuild the frame with a clean column name (the CSV column is 'Questions '
# with a trailing space) and a fresh 0..116 index.
lst_1 = list(data['Questions '])
lst_2 = list(data['Category'])
data = pd.DataFrame({'Questions': lst_1, 'Category': lst_2}, index=list(range(117)))

X = np.array(data['Questions'])
y = np.array(data['Category'])

# Map each category label to an integer class id.
ref_dict = {'UPSC': 0, 'Research': 1, 'Corporate': 2}

# NLTK resources needed for tokenization and stopword removal.
nltk.download('stopwords')
nltk.download('punkt')
class preprocessor:
    def __init__(self, sentences):
        self.sen = sentences

    def sent_tokenizer(self, sent):
        # Split the text into sentences, strip the trailing character
        # (the end punctuation) from each, and join them back together.
        sent = nltk.sent_tokenize(sent)
        for i in range(len(sent)):
            sent[i] = sent[i][:len(sent[i]) - 1]
        sent = ' '.join(sent)
        return sent

    def tokenize_it(self, sen):
        # Split the sentence into word tokens.
        sen_new = nltk.word_tokenize(sen)
        return sen_new

    def remove_stopW_and_punc(self, sen_tokens):
        stopwords_english = stopwords.words('english')
        sen_clean = []
        for word in sen_tokens:                       # go through every word in the token list
            if (word not in stopwords_english and     # remove stopwords
                    word not in string.punctuation):  # remove punctuation
                sen_clean.append(word)
        return sen_clean

    def sen_stemmer(self, sen_clean):
        # Instantiate the stemming class.
        stemmer = PorterStemmer()
        # Create an empty list to store the stems.
        sen_stem = []
        for word in sen_clean:
            stem_word = stemmer.stem(word)  # stem the word
            sen_stem.append(stem_word)      # append to the list
        return sen_stem

    # Controller that runs each method of the class in the correct
    # chronological order.
    def controller(self, sen):
        temp_sen = self.sent_tokenizer(sen)
        temp_sen = self.tokenize_it(temp_sen)
        temp_sen = self.remove_stopW_and_punc(temp_sen)
        temp_sen = self.sen_stemmer(temp_sen)
        return temp_sen

    def run(self):
        # Pop the sentences one by one and collect their processed token lists.
        processed_sens = []
        while self.sen:
            sen = self.sen.pop(0)
            processed_sen = self.controller(sen)
            processed_sens.append(processed_sen)
        return processed_sens
# Clean every question, then join each token list back into a single string.
processed_sentences = preprocessor(list(X)).run()
for i in range(len(processed_sentences)):
    processed_sentences[i] = ' '.join(processed_sentences[i])

# Vectorization: turn the cleaned questions into TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
Vectorizer = TfidfVectorizer(use_idf=True)
X = Vectorizer.fit_transform(processed_sentences)

# Encode the string labels as integer class ids.
y = [ref_dict[label] for label in y]
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# Convert the sparse TF-IDF matrix to a dense list of feature vectors.
X = X.toarray().tolist()

# Train a random forest classifier on the full dataset.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1234)
result = model.fit(X, y)
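
# Optional evaluation sketch using the train_test_split and f1_score imports
# above; the split size and macro averaging are illustrative choices.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=y)
eval_model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1234)
eval_model.fit(X_tr, y_tr)
print('hold-out macro F1:', f1_score(y_te, eval_model.predict(X_te), average='macro'))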

# Persist the trained model to disk.
joblib.dump(model, "model.sav")
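
# Inference sketch: the fitted TfidfVectorizer is needed to transform new
# questions, so it is assumed here to be persisted alongside the model
# ("vectorizer.sav" is a hypothetical filename); the example question below
# is purely illustrative.
joblib.dump(Vectorizer, "vectorizer.sav")

loaded_model = joblib.load("model.sav")
loaded_vectorizer = joblib.load("vectorizer.sav")
inv_ref_dict = {v: k for k, v in ref_dict.items()}

new_question = "How do I prepare for the preliminary examination?"
tokens = preprocessor([new_question]).run()[0]                 # reuse the same cleaning pipeline
features = loaded_vectorizer.transform([' '.join(tokens)]).toarray()
print(inv_ref_dict[loaded_model.predict(features)[0]])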