dutch_as_eval_testing_TF-IDF.py
# KEYWORD-MATCHING APPROACH TRAINED ON NON-DUTCH DATA AND TESTED ON DUTCH DATA.
# THE N-GRAM RANGE AND THE KEYWORD-LIST SIZE ARE MEANT TO BE CHANGED MANUALLY DEPENDING ON THE DESIRED EXPERIMENT.
import re
import random

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# precision, recall, F1, the confusion matrix, and the classification report are computed at the end of the script
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

ps = PorterStemmer()  # instantiated here but not used in the rest of the script
# download the NLTK resources needed for tokenization, stopword removal, and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# read in the dataset
data = pd.read_csv('merged_training.txt')
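# (Assumption for readability: merged_training.txt is a CSV-style file that pandas can parse and
# that contains at least the 'description' and 'eqf_level_id' columns used below.)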
# preprocess text data
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# preprocess_text: tokenize, drop numbers, stopwords and the boilerplate words listed below, then lemmatize
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove numbers
    tokens = [token for token in tokens if not re.match(r'\d+', token)]
    # Remove stopwords
    tokens = [t for t in tokens if t not in stop_words]
    # Remove punctuation marks and boilerplate words that come from the data template
    additional_words = [':', '-', 'knowledge', 'available', 'information', 'competence',
                        'availablecompetences', 'availableknowledge', 'availableskills']
    # reconsider this list again after changing the data/stopwords list
    tokens = [t for t in tokens if t.strip() not in additional_words]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    # Re-join tokens into a string
    return ' '.join(tokens)
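# Illustrative example (hypothetical input, not taken from the dataset):
#   preprocess_text("The 5 available skills: basic programming")  ->  "skill basic programming"
# ("the" is a stopword, "5" is a number, ":" and "available" are in additional_words,
#  and the remaining tokens are lemmatized, e.g. "skills" -> "skill").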
data['description'] = data['description'].apply(preprocess_text)
# select the column containing the preprocessed text data
text_data = data['description']
# print(text_data)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# create tf-idf vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# fit and transform the text data to a tf-idf matrix
tfidf_matrix = vectorizer.fit_transform(text_data)
# get the feature names (words and bigrams)
feature_names = vectorizer.get_feature_names_out()
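# tfidf_matrix is a sparse (n_documents x n_features) matrix; because ngram_range=(1, 2),
# feature_names contains both single words and two-word phrases, e.g. 'analyse' and
# 'analyse data' (illustrative feature names, not actual output).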
# print(feature_names)
# create a dictionary to store the top words for each document class
top_words_by_class = {}
# iterate over each unique document class (eqf_level_id)
for eqf_level in data['eqf_level_id'].unique():
    # get the indices of the documents that belong to this class
    class_docs = data[data['eqf_level_id'] == eqf_level].index
    # create a list to store the tf-idf scores for each word
    word_scores = []
    # iterate over each document in this class
    # https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
    for doc_idx in class_docs:
        # print(doc_idx)
        doc = tfidf_matrix[doc_idx]
        feature_index = doc.indices
        tfidf_scores = doc.data
        # append the tf-idf scores for this document to the list
        word_scores.append(dict(zip([feature_names[i] for i in feature_index], tfidf_scores)))
    # combine the tf-idf scores of all documents in this class
    combined_scores = {}
    for doc_scores in word_scores:
        for word, score in doc_scores.items():
            # print(word)
            if word in combined_scores:
                combined_scores[word] += score
            else:
                combined_scores[word] = score
    # the block above calculates which classes have more words, and this explains the class 0s
    # sort the words by their combined tf-idf scores
    # By sorting the words on their tf-idf scores and keeping only the top ones, we prioritize
    # the words with the highest scores and therefore the greatest potential to contribute
    # to the classification of a document into a specific class.
    sorted_words = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    # store the top N (e.g. 50, 100, 1000) words for this class in the dictionary
    top_words_by_class[eqf_level] = [w[0] for w in sorted_words[:1000]]
    # print('TOP WORDS', top_words_by_class[eqf_level])
    # print(len(top_words_by_class[eqf_level]))
# test: check the number of words per TF-IDF level
# print the top words for each document class
# for eqf_level, top_words in top_words_by_class.items():
# print(f"Top words for documents in class {eqf_level}: {top_words}")
# print(tfidf_matrix.toarray())
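# At this point top_words_by_class maps every EQF level to (at most) its 1000 highest-scoring
# uni-/bigrams, e.g. roughly {1: ['basic', 'simple task', ...], 2: [...], ...}
# (illustrative shape only, not real output).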
# Read the Dutch test dataset
test_data = pd.read_csv('NEW_DUTCH')
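# (Assumption: NEW_DUTCH follows the same layout as the training file, i.e. it provides the
# 'description' and 'eqf_level_id' columns that are accessed below.)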
# Preprocess the text data in the test dataset
test_data['description'] = test_data['description'].apply(preprocess_text)
# Select the column containing the preprocessed text data from the test dataset
test_text_data = test_data['description']
# Transform the preprocessed text data of the test dataset into TF-IDF vectors
test_tfidf_matrix = vectorizer.transform(test_text_data)
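# Note: transform() reuses the vocabulary fitted on the (non-Dutch) training data, so test
# tokens that never occurred during fitting simply receive no TF-IDF weight.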
# Store the actual classes from the test dataset
actual_classes = test_data['eqf_level_id']
actual_classes = actual_classes.to_list()
# Initialize variables for accuracy calculation
correct_predictions = 0
total_predictions = 0
predicted_classes = []
# Classify the sentences from the test dataset and check accuracy (the classification-by-list part was fixed with help from ChatGPT)
for i, sentence in enumerate(test_text_data):
    sentence_tfidf = test_tfidf_matrix[i]
    predicted_class = None
    max_score = 0
    matching_words_by_class = {}
    for eqf_level in data['eqf_level_id'].unique():
        class_docs = data[data['eqf_level_id'] == eqf_level].index
        # average TF-IDF vector (centroid) of all training documents in this class
        class_tfidf = tfidf_matrix[class_docs].mean(axis=0)
        # similarity between the test sentence and the class centroid (dot product)
        score = (sentence_tfidf @ class_tfidf.T)[0, 0]
        # Get the top words for the current class
        top_words = top_words_by_class[eqf_level]
        # Calculate the number of matching words between the sentence and the top words of the class
        matching_words = len(set(preprocess_text(sentence).split()) & set(top_words))
        # print('MATCHINGS', matching_words)
        # Calculate the weighted score by multiplying the original score with the ratio of matching words to the total number of top words
        weighted_score = score * (matching_words / len(top_words))
        # Print the matching words for the current class
        # print(f"Sentence: {sentence}")
        # print(f"Matching words for class {eqf_level}: {matching_words}")
        matching_words_by_class[eqf_level] = weighted_score
        if weighted_score > max_score:
            max_score = weighted_score
            predicted_class = eqf_level
    if predicted_class is None:
        # fall back to a random EQF level (1-8) when no class scored above zero
        predicted_class = random.randint(1, 8)
    if predicted_class == actual_classes[i]:
        correct_predictions += 1
    total_predictions += 1
    predicted_classes.append(predicted_class)
    # if predicted_class == 2:
    #     print('x')
    # print(len(predicted_classes))
    # print(f"Sentence: {sentence}")
    # print(f"Predicted class: {predicted_class}")
    # print(f"Actual class: {actual_classes[i]}")
    # print("----------")
# Calculate accuracy
accuracy = correct_predictions / total_predictions
# Calculate precision, recall, and F-score
precision = precision_score(actual_classes, predicted_classes, average='weighted', zero_division=0)
recall = recall_score(actual_classes, predicted_classes, average='weighted', zero_division=0)
# f_score = f1_score(actual_classes, predicted_classes, average='weighted', zero_division=0)
f_score = 2 * (precision * recall) / (precision + recall)
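# Note: the harmonic mean of the weighted precision and recall is not necessarily equal to
# sklearn's weighted f1_score (the commented-out line above), which averages per-class F1
# values weighted by their support instead.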
# print('actual_classes','\n', actual_classes, 'predicted_classes',type( predicted_classes))
# print('PRE',predicted_classes, 'ACT',actual_classes)
print('list size 1000 - keyword-matching approach with uni-/bigrams on all data')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1_score}")
# fix the label order so the matrix is always 8x8 and matches the tick labels below
# (assumes eqf_level_id values are the integers 1-8, as the random fallback and tick labels suggest)
cm = confusion_matrix(actual_classes, predicted_classes, labels=[1, 2, 3, 4, 5, 6, 7, 8])
label_names = ['1', '2', '3', '4', '5', '6', '7', '8']
plt.figure(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix of Best Keyword-Matching system')
plt.show()
report = classification_report(actual_classes, predicted_classes, zero_division=0)
print(report)