-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_analysis.py
98 lines (85 loc) · 3.38 KB
/
text_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#IMPORTS
import re
import os
import pandas as pd
from string import punctuation
from textblob import Word
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
# Downloading Additional NLTK Resources
# Fetch the tokenizer, stopword list, and lemmatizer data used below.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)
# Load and Explore Data
# List every file shipped in the IMDB data directory.
for data_dir, _, data_files in os.walk('./IMDB'):
    for data_file in data_files:
        print(data_file)

# Read the training split and take a first look at its contents.
train = pd.read_csv("./IMDB/Train.csv")
print(train.head())
print(train.info())
# Visualize Data
# Pie chart of the positive/negative class balance in the training set.
fig = plt.figure(figsize=(5, 5))
slice_colors = ["skyblue", 'pink']
positive_rows = train[train['label'] == 1]
negative_rows = train[train['label'] == 0]
class_counts = [positive_rows['label'].count(), negative_rows['label'].count()]
plt.pie(
    class_counts,
    labels=["Positive", "Negative"],
    autopct='%1.1f%%',
    shadow=True,
    colors=slice_colors,
    startangle=45,
    explode=(0, 0.1),
)
plt.show()
# Text Preprocessing
def transformations(dataframe):
    """Clean the 'text' column of *dataframe* in place and return it.

    Steps: strip HTML tags, tokenize, lowercase, drop punctuation tokens,
    drop pure-digit tokens, drop English stopwords, drop the 10 most
    frequent words of the corpus, then lemmatize and re-join each review
    into a single space-separated string.
    """
    # Strip leftover HTML tags such as <br /> from the raw reviews.
    dataframe['text'] = dataframe['text'].apply(lambda text: re.sub(r'<.*?>', '', text))
    dataframe['text'] = dataframe['text'].apply(word_tokenize)
    dataframe['text'] = dataframe['text'].apply(lambda words: [w.lower() for w in words])
    dataframe['text'] = dataframe['text'].apply(lambda words: [w for w in words if w not in punctuation])
    dataframe['text'] = dataframe['text'].apply(lambda words: [w for w in words if not w.isdigit()])
    # PERF FIX: build the stopword set once.  The original called
    # stopwords.words('english') for every row and scanned that list for
    # every token, which is quadratic in practice.
    stop_words = set(stopwords.words('english'))
    dataframe['text'] = dataframe['text'].apply(lambda words: [w for w in words if w not in stop_words])
    # BUG FIX: the original ran value_counts() over whole joined review
    # strings, so "top 10 frequent words" was really "top 10 duplicate
    # reviews" and the removal step below never matched a token.  Count
    # individual tokens across the corpus instead.
    all_words = pd.Series([w for row in dataframe['text'] for w in row])
    most_frequent = set(all_words.value_counts()[:10].index)
    dataframe['text'] = dataframe['text'].apply(lambda words: [w for w in words if w not in most_frequent])
    # Lemmatize each token and rebuild a plain string for the vectorizer.
    dataframe['text'] = dataframe['text'].apply(lambda words: " ".join(Word(w).lemmatize() for w in words))
    return dataframe
# Run the cleaning pipeline over all three splits.
train = transformations(train)
valid = transformations(pd.read_csv("./IMDB/Valid.csv"))
test = transformations(pd.read_csv("./IMDB/Test.csv"))
# Word Cloud Visualization
def wordcloud_draw(data, color='white'):
    """Render a word cloud for an iterable of review strings.

    Parameters
    ----------
    data : iterable of str
        The (already cleaned) review texts to visualize.
    color : str
        Background color of the generated image.
    """
    # BUG FIX: WordCloud was referenced without ever being imported
    # anywhere in this file, so every call raised NameError.  Import it
    # locally so the optional dependency is only required here.
    from wordcloud import WordCloud

    # Drop the two domain words that dominate every IMDB review.
    words = ' '.join(data)
    cleaned_word = " ".join(
        word for word in words.split() if word not in ('movie', 'film')
    )
    wordcloud = WordCloud(
        stopwords=stopwords.words('english'),
        background_color=color,
        width=2500,
        height=2000,
    ).generate(cleaned_word)
    plt.figure(1, figsize=(10, 7))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
# Draw a separate cloud for each sentiment class.
print("Positive Words")
positivedata = train.loc[train['label'] == 1, 'text']
wordcloud_draw(positivedata)

print("Negative Words")
negdata = train.loc[train['label'] == 0, 'text']
wordcloud_draw(negdata)
# Prepare Data for Model
# Split each dataframe into feature (cleaned text) and target (label) series.
X_train, Y_train = train['text'], train['label']
X_valid, Y_valid = valid['text'], valid['label']
X_test, Y_test = test['text'], test['label']
# Model Training
# Bag-of-words features feeding a logistic-regression classifier.
clf = Pipeline(steps=[
    ('preprocessing', CountVectorizer()),
    ('classifier', LogisticRegression(dual=False, max_iter=2000)),
])
clf.fit(X_train, Y_train)

# Model Evaluation
print("Validation Score:", clf.score(X_valid, Y_valid))
print("Test Score:", clf.score(X_test, Y_test))

# Predictions
# Convert once so each count() call scans a plain list.
predicted_labels = list(clf.predict(X_test))
print(f'Number of reviews classified as Positive: {predicted_labels.count(1)}')
print(f'Number of reviews classified as Negative: {predicted_labels.count(0)}')