Python script run feedback datasets#22 (#30)
* first

* update on assignment

* Modified task.py

* deleted floop_data_15k.json and word2vec11.model

* removed the commented

* word2vec11.model file added

* updated

* update line 83

* update csv file

* updates of

* Update on the code

* floop_data_15k.json deleted

* DataFromS3 to Field1 changed

* line 25 added

* Update on task.py and added info.md file
eyerusalemdani authored Feb 4, 2022
1 parent 4880bf7 commit 875b6bc
Showing 3 changed files with 137 additions and 0 deletions.
1 change: 1 addition & 0 deletions backend/.gitignore
@@ -0,0 +1 @@
word2vec11.model
7 changes: 7 additions & 0 deletions backend/info.md
@@ -0,0 +1,7 @@
python version == 3.9.5
gensim version == 3.6.0
nltk version == 3.6.7
sklearn version == 1.0.2
pandas version == 1.4.0
matplotlib version == 3.5.1
re version == 2.2.1
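
The pins above can be sanity-checked before running task.py. A minimal sketch (not part of the commit), assuming each package is importable:

# check_env.py (hypothetical helper): print installed versions to compare
# against the pins in info.md.
import sys
import re
import gensim, nltk, sklearn, pandas, matplotlib

print("python ==", sys.version.split()[0])
for name, mod in [("re", re), ("gensim", gensim), ("nltk", nltk),
                  ("sklearn", sklearn), ("pandas", pandas),
                  ("matplotlib", matplotlib)]:
    print(name, "==", mod.__version__)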
129 changes: 129 additions & 0 deletions backend/task.py
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import re
import nltk
import os

from gensim.models import Word2Vec
from time import time # To time our operations
from collections import defaultdict # For word frequency
import logging # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)

import matplotlib.pyplot as plt


# nltk.download()  # uncomment on first run if NLTK data is missing
path = input("Enter the file path: ")

# Convert the raw JSON feedback dump to CSV, then reload it as a DataFrame
# with a single column named Field1.
df = pd.read_json(path)
df.to_csv('floop_data_15k.csv', index=None)

data = pd.read_csv("floop_data_15k.csv")
data.columns = ["Field1"]

print(data.shape)


# Remove punctuation and special characters from the dataset.
from string import punctuation

punctuations = punctuation

def solution(sentence):
    """Strip every punctuation character from a sentence."""
    for p in punctuations:
        sentence = sentence.replace(p, '')
    return sentence

x = data["Field1"].apply(solution)

# Replace any remaining non-alphanumeric characters with spaces, then lowercase.
pattern = "[^a-zA-Z0-9]"
x_cleaned = [re.sub(pattern, " ", text) for text in x]

x_lowered = [text.lower() for text in x_cleaned]
print(x_lowered[0])

# Download the NLTK resources needed for tokenization and lemmatization.
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

x_tokenized = [nltk.word_tokenize(text) for text in x_lowered]

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

x_lemmatized = [[lemma.lemmatize(word) for word in text] for text in x_tokenized]

print(x_lemmatized[0])


# Train a Word2Vec model on the lemmatized feedback; its word vectors are
# clustered below to separate "good" (positive) from "bad" (negative) words.
w2v_model = Word2Vec(min_count=20, window=2, sample=6e-5, alpha=0.03,
                     min_alpha=0.0007, negative=20, workers=1)
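# Hyperparameter notes (gensim 3.x): min_count=20 ignores words with fewer
# than 20 occurrences, window=2 uses a narrow context window, sample=6e-5
# aggressively downsamples very frequent words, and negative=20 draws 20
# noise words per positive example during negative sampling.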

t = time()

w2v_model.build_vocab(x_lemmatized, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

w2v_model.train(x_lemmatized, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# Normalize the trained vectors in place (gensim 3.x API; removed in 4.x).
w2v_model.init_sims(replace=True)

w2v_model.save("word2vec11.model")
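# The model is saved and immediately reloaded below; only its KeyedVectors
# (word_vectors) are needed from here on. The saved file is deleted at exit.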

# Cluster all word vectors into two groups with KMeans. Treating cluster 0
# as the positive group and cluster 1 as the negative group is an assumption
# baked into this script.
word_vectors = Word2Vec.load("word2vec11.model").wv
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors.astype('double'))
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

print(model.cluster_centers_)

def cast_vector(row):
    """Cast each component of a word vector to double precision."""
    return np.array(list(map(lambda x: x.astype('double'), row)))

# Build a per-word sentiment table: each word gets its vector, a cluster
# label, a signed cluster value (+1 positive, -1 negative), a closeness
# score (inverse distance to the nearest cluster center), and the product
# of the two as its sentiment coefficient.
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[x])
words['vectorsmean'] = words.vectors.apply(lambda x: x.mean())
words['vectors_typed'] = words.vectors.apply(cast_vector)
words['cluster'] = words.vectors_typed.apply(lambda x: model.predict([np.array(x, dtype=np.double)]))
words.cluster = words.cluster.apply(lambda x: x[0])
words['cluster_value'] = [1 if i == 0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1 / (model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value
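# Worked example (illustrative numbers): a word at distance 0.25 from its
# nearest cluster center gets closeness_score 1 / 0.25 = 4.0; if that center
# is cluster 0 the sentiment_coeff is +4.0, if cluster 1 it is -4.0.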

print(words.head(10))

# Plot each word's sentiment coefficient against its mean vector component,
# colored by cluster.
colors = {1: 'black', -1: 'red'}
plt.scatter(words['sentiment_coeff'], words['vectorsmean'], c=words['cluster_value'].map(colors))

plt.show()

os.remove("floop_data_15k.csv")
os.remove("word2vec11.model")
