Skip to content

NarenSham/H2o-Review-Prediction-Engine

Repository files navigation

H2o-Review-Prediction-Engine

🔥 A review sentiment prediction analysis for any dataset using h2o

Classify Review Data

This application predicts whether your review comment is positive or negative, based on the dataset at hand and its previous classifications. Please run the code as one complete block so that it includes both the UI elements and the underlying text-mining methods. Please see Review App.py for the underlying h2o logic. For example purposes, you can use these UCI Machine Learning datasets, which include the following:

Datasets
Amazon Reviews
Yelp Restaurant Reviews
IMDB Movies reviews

A graphical interface to understand how your review will be classified

The code block contains both the UI elements and the underlying classification algorithm (in this case, h2o.ai). Please note that the prediction depends purely on the supervised dataset supplied to the application.

Screenshot

#using tkinter for UI 
from tkinter import *
from tkinter import filedialog

# Initial module state: no review file chosen yet, and a placeholder
# prediction value.
tsv_file = ""

output_value = 0.0


    
def get_pred():
    """Return the text currently held by the ``entry5`` comment entry widget."""
    return entry5.get()

#File :ask to open
def UploadAction(event=None):
    """Ask the user to pick a file and remember it as the active review file.

    Args:
        event: Optional Tk event; accepted so the function can be used as an
            event binding, but not otherwise used.

    Side effects:
        Updates the module-level ``tsv_file`` when a file was chosen and
        prints the selection to stdout.
    """
    global tsv_file

    filename = filedialog.askopenfilename()
    # askopenfilename() hands back a plain path string (empty when the dialog
    # is dismissed), so it is stored directly rather than via a widget-style
    # .get() call.
    if filename:
        tsv_file = filename
    print('Selected:', filename)

def AssignAction(event=None):
    """Show a file-open dialog and return whatever path it reports.

    ``event`` is accepted but not used.
    """
    return filedialog.askopenfilename()

#prediction algorithm
def predict(job_title, w2v, gbm):
    """Score one piece of text with the word2vec + GBM models and report it.

    Args:
        job_title: Data handed straight to ``h2o.H2OFrame(...)``; the callers
            in this file pass a one-element list holding the comment text.
        w2v: Model whose ``transform(words, aggregate_method="AVERAGE")``
            turns the tokenized words into an averaged vector.
        gbm: Model whose ``predict`` scores that vector.

    Returns:
        The first cell of the prediction frame (row 0, column 0).

    Side effects:
        Prints the prediction frame and a human-readable verdict.
    """
    import h2o

    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")

    # Score once and reuse the result instead of running the model twice.
    prediction = gbm.predict(job_title_vec)
    print(prediction)

    # Pull the single predicted value out of the frame: an H2OFrame comparison
    # produces another frame, and using that frame as an ``if`` condition does
    # not test the predicted value itself.
    score = prediction[0, 0]

    # NOTE(review): the direction of this mapping (> 0.5 => negative) is kept
    # as written; whether it is right depends on how "Response" is encoded in
    # the training file -- confirm against the dataset.
    if score > 0.5:
        print("Most likely Negative")
    else:
        print("Probably positive")
    return score

        
def submitted():
    """Predict the sentiment of the comment currently typed into the UI.

    Reads the comment through ``get_pred()`` and scores it with the
    module-level ``w2v_model`` and ``gbm_model``, which must exist at module
    scope by the time this runs (otherwise a ``NameError`` is raised).

    Side effects:
        Prints whatever ``predict`` prints, followed by its return value.
    """
    comment = get_pred()
    # Run the prediction a single time; calling it once for the assignment and
    # again inside print() doubled both the model scoring and the output.
    result = predict([comment], w2v_model, gbm_model)
    print(result)

# tokenize to split each word of review
# Default stop words removed by tokenize(). Kept as an immutable tuple so the
# default argument cannot be mutated between calls.
_DEFAULT_STOP_WORDS = (
    "ax", "i", "you", "edu", "s", "t", "m", "subject", "can", "lines", "re",
    "what", "there", "all", "we", "one", "the", "a", "an", "of", "or", "in",
    "for", "by", "on", "but", "is", "in", "a", "not", "with", "as", "was",
    "if", "they", "are", "this", "and", "it", "have", "from", "at", "my",
    "be", "by", "not", "that", "to", "from", "com", "org", "like", "likes",
    "so",
)


def tokenize(sentences, stop_word=_DEFAULT_STOP_WORDS):
    """Split review text into lower-cased words and drop uninformative ones.

    Args:
        sentences: Frame-like object exposing ``tokenize``; the callers in
            this file pass an H2OFrame string column.
        stop_word: Words to remove. Defaults to the built-in stop-word list;
            a caller-supplied collection is honoured.

    Returns:
        The filtered single-word-per-row frame. Rows that are NA are kept by
        the length and stop-word filters (presumably they mark sentence
        boundaries for word2vec -- confirm against the H2O tokenize docs).
    """
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Keep words of at least two characters, plus NA rows.
    tokenized_filtered = tokenized_lower[
        (tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()), :
    ]
    # Drop any token that contains a digit.
    tokenized_words = tokenized_filtered[
        tokenized_filtered.grep("[0-9]", invert=True, output_logical=True), :
    ]
    # Drop stop words (NA rows are kept). The stop_word argument is used here
    # so that callers can actually override the default list.
    tokenized_words = tokenized_words[
        (tokenized_words.isna()) | (~ tokenized_words.isin(list(stop_word))), :
    ]
    return tokenized_words


# The application interface
    
def analyze():
    """Train the models on a user-chosen review file and score the typed comment.

    Steps:
        1. Ask for a tab-separated file with two columns, read as
           ``Review`` and ``Response`` (no header row).
        2. Start or connect to H2O, tokenize the reviews, and train a
           word2vec model on the words.
        3. Train a GBM on the averaged review vectors with an 80/20
           train/validation split.
        4. Score the text currently in ``entry5`` via ``predict``.

    Returns:
        Whatever ``predict`` returns for the typed comment, or ``None`` when
        the file dialog was dismissed without a selection.

    Side effects:
        Writes ``reviews.csv`` to the working directory, adds a
        "Building Model" label to the window, and publishes the trained models
        as the module-level ``w2v_model`` and ``gbm_model``.
    """
    # submitted() looks these models up at module scope, so publish them there.
    global w2v_model, gbm_model

    import h2o
    import pandas as pd
    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    from h2o.estimators.word2vec import H2OWord2vecEstimator

    tsv_file = AssignAction()
    if not tsv_file:
        # The dialog was cancelled: there is nothing to read or train on.
        print("No review file selected; analysis cancelled.")
        return None

    h2o.init(max_mem_size="2G")

    label3 = Label(root, text="Building Model")
    label3.pack()
    # Training blocks the Tk event loop, so flush pending redraws now to make
    # the status label visible while the models build.
    root.update_idletasks()

    csv_table = pd.read_table(
        tsv_file, sep='\t', header=None, names=["Review", "Response"]
    )
    csv_table.to_csv('reviews.csv', index=False)
    reviews = h2o.H2OFrame(csv_table)

    words = tokenize(reviews["Review"])

    w2v_model = H2OWord2vecEstimator(vec_size=100, model_id="w2v.hex")
    w2v_model.train(training_frame=words)

    # One averaged word vector per review.
    review_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")
    ext_reviews = reviews.cbind(review_vecs)

    ext_train, ext_test = ext_reviews.split_frame(ratios=[0.8])

    gbm_model = H2OGradientBoostingEstimator()
    # Use only the word-vector columns as predictors; the raw text and the
    # response column are not features.
    gbm_model.train(x=review_vecs.names,
                    y="Response",
                    training_frame=ext_train,
                    validation_frame=ext_test)

    # predict() builds its own H2OFrame from the input, so hand it the raw
    # comment text in a one-element list (the same shape submitted() uses)
    # rather than an already-built frame wrapped in a list.
    comment = entry5.get()
    return predict([comment], w2v_model, gbm_model)


#root.mainloop()
    
# Main window: a heading, a comment entry box, and a button that starts the
# analysis.
root = Tk()
root.title("h2o Data Analytics")

# Heading
label1 = Label(root, text='Prediction of Comments', justify=CENTER)
label1.pack()

# Prompt and entry box for the comment to classify
label5 = Label(root, text="Enter a comment that you want to predict")
label5.pack()

entry5 = Entry(root, width=100)
entry5.pack()

# Prompt and button that starts the file selection and model build
label4 = Label(root, text="Upload review data file to start the analysis based on the data")
label4.pack()

button6 = Button(root, text='Start Now', command=analyze)
button6.pack()

# Hand control to the Tk event loop; this call blocks until the window closes.
root.mainloop()
Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: 4 hours 52 mins
H2O cluster timezone: America/New_York
H2O data parsing timezone: UTC
H2O cluster version: 3.22.0.1
H2O cluster version age: 15 days
H2O cluster name: H2O_from_python_narensham_b4de52
H2O cluster total nodes: 1
H2O cluster free memory: 1.472 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: locked, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.6.4 final
Parse progress: |█████████████████████████████████████████████████████████| 100%
word2vec Model Build progress: |██████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Review
What is this about, I have no clue
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
predict
0.338029
gbm prediction progress: |████████████████████████████████████████████████| 100%
Most likely Negative
The above statement specifies the likely classification based on the dataset provided.

About

A review sentiment prediction for any dataset using h2o

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published