training.py
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Parameters
VOCAB_SIZE = 10000
MAX_LEN = 250
EMBEDDING_DIM = 16
MODEL_PATH = 'sentiment_analysis_model.h5'

# Load and shuffle the dataset (the Sentiment140 CSV has no header row)
file_path = 'data.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
df_shuffled = data.sample(frac=1).reset_index(drop=True)
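# Note (assumes the Kaggle Sentiment140 layout): the six unnamed columns are
# target, id, date, flag, user, text - so the loop below reads the label from
# the first column and the tweet text from the last column.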
texts = []
labels = []
for _, row in df_shuffled.iterrows():
    texts.append(row.iloc[-1])   # tweet text is in the last column
    label = row.iloc[0]          # Sentiment140 target: 0=negative, 2=neutral, 4=positive
    labels.append(0 if label == 0 else 1 if label == 2 else 2)
texts = np.array(texts)
labels = np.array(labels)
# Tokenize and pad the sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, value=VOCAB_SIZE-1, padding='post')
# Save the tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
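# A minimal sketch of restoring this tokenizer in a separate inference script
# (same pickle module, reading the file written above):
#   with open('tokenizer.pickle', 'rb') as handle:
#       tokenizer = pickle.load(handle)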
# Split data into training and test sets (a stratified split, sketched below, would keep the class balance)
train_data = padded_sequences[:-5000]
test_data = padded_sequences[-5000:]
train_labels = labels[:-5000]
test_labels = labels[-5000:]
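# A minimal stratified-split sketch (assumes scikit-learn is available; it is
# not used anywhere else in this script):
#   from sklearn.model_selection import train_test_split
#   train_data, test_data, train_labels, test_labels = train_test_split(
#       padded_sequences, labels, test_size=5000, stratify=labels, random_state=42)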
# Check if saved model exists
if os.path.exists(MODEL_PATH):
    print("Loading saved model...")
    model = load_model(MODEL_PATH)
else:
    print("Training a new model...")
    # Define the model
    model = Sequential([
        Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
        GlobalAveragePooling1D(),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')  # 3 classes: negative, neutral, positive
    ])
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # Train the model
    model.fit(train_data, train_labels, epochs=10, batch_size=32, validation_split=0.2)
    # Save the trained model
    model.save(MODEL_PATH)
# Evaluate on test data
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test accuracy: {accuracy * 100:.2f}%")
# Interactive loop for predictions
def encode_text(text):
    # Reuse the fitted tokenizer; texts_to_sequences respects num_words, so
    # out-of-vocabulary words are dropped and every index stays below VOCAB_SIZE
    # (indexing word_index directly would not cap the indices).
    tokens = tokenizer.texts_to_sequences([text])[0]
    return pad_sequences([tokens], maxlen=MAX_LEN, padding='post', value=VOCAB_SIZE-1)
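# e.g. encode_text("great movie") returns an integer array of shape (1, MAX_LEN),
# ready to be passed straight to model.predict in the loop below.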
while True:
    user_input = input("Enter a sentence for sentiment analysis (or 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    encoded_input = encode_text(user_input)
    prediction = np.argmax(model.predict(encoded_input))
    if prediction == 0:
        print("Sentiment: Negative")
    elif prediction == 1:
        print("Sentiment: Neutral")
    else:
        print("Sentiment: Positive")
# Note: download the training data (Sentiment140) from https://www.kaggle.com/datasets/kazanova/sentiment140?resource=download
# and save it as data.csv next to this script.