diff --git a/README.md b/README.md
index 70636c15..1c1d9cd9 100755
--- a/README.md
+++ b/README.md
@@ -1,16 +1,49 @@
-# Title
+# NLP on Amazon Reviews to Aid Customers’ Decision Process
 
-# Abstract
+## Abstract
 A 150 word description of the project idea, goals, dataset used. What story you would like to tell and why? What's the motivation behind your project?
 
-# Research questions
+Popular Amazon products usually have thousands of reviews from various users explaining the pros and cons of the product along with the reviewer's personal opinion of it. While this data may be useful to Amazon customers, these comments are often time-consuming to sift through. Additionally, users may become more confused after reading many conflicting positive and negative reviews, and Amazon may lose a potential customer. To save users time while they shop online, we want to conduct a keyword analysis of these reviews, identifying positive and negative words that help a user quickly decide whether they want the product. After summarizing, we plan to create a visualization of these reviews. The dataset used for this study is Julian McAuley's (UCSD) “Amazon product data” dataset.
+
+## Research questions
 A list of research questions you would like to address during the project.
 
-# Dataset
+* Can we generate keywords from Amazon reviews through sentiment analysis?
+* Can we use sentiment analysis to improve similar word search?
+
+## Dataset
 List the dataset(s) you want to use, and some ideas on how do you expect to get, manage, process and enrich it/them. Show us you've read the docs and some examples, and you've a clear idea on what to expect. Discuss data size and format if relevant.
 
-# A list of internal milestones up until project milestone 2
+We would like to use the Amazon reviews dataset provided to us. We will be using the 5-core (9.9 GB) subset of the data, in which every item and user has at least 5 reviews. This subset contains 41.13 million reviews and filters out both users with multiple accounts and plagiarized reviews.
+
+Each JSON object contains the following fields: reviewerID, asin, reviewerName, helpful, reviewText, overall, summary, unixReviewTime, and reviewTime.
+
+Here is an example of one review:
+![picture alt](https://github.com/sdhar3/ADA-Project/blob/master/example.png "example of one review")
+
+Our intention is to conduct sentiment analysis on the data, and the Amazon product data website recommends this subset specifically for that purpose.
+
+We are going to take the review text and the “overall” rating from each review: anything with an “overall” rating above 3 will be labeled as positive, and anything below 3 as negative (a rough sketch of this step is shown below). We will then train a deep neural network to conduct sentiment analysis. We are also going to run a couple of noise-reduction steps to remove filler words and improve the accuracy of the analysis.
+
+We also plan to do data visualization to see how closely related certain words are to each other.
+
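+As a rough illustration of the labeling step described above (a minimal sketch, not the final pipeline), the snippet below loads one 5-core file with pandas, as in `model.ipynb`, and derives a binary sentiment label from the “overall” rating. Treating 3-star reviews as neutral and dropping them is our own assumption at this stage.
+
+```python
+import pandas as pd
+
+# Load one 5-core review file (same path convention as in model.ipynb).
+df = pd.read_json('../data/reviews_Cell_Phones_and_Accessories_5.json.gz', lines=True)
+df = df[['reviewText', 'overall']]
+
+# "overall" rating above 3 -> positive (1), below 3 -> negative (0).
+# Assumption: 3-star reviews are treated as neutral and dropped.
+df = df[df['overall'] != 3].copy()
+df['sentiment'] = (df['overall'] > 3).astype(int)
+
+print(df['sentiment'].value_counts(normalize=True))
+```
+
+For the similar-word search mentioned in the research questions and in the milestones below, one possible approach (an assumption at this point, not a committed design) is to treat each row of the learned input-to-hidden weight matrix `weights_0_1` of the `HelpfulnessNetwork` class in `model.ipynb` as a word vector and compare rows with cosine similarity:
+
+```python
+import numpy as np
+
+def most_similar_words(network, focus_word, top_n=10):
+    """Rank vocabulary words by cosine similarity of their learned
+    input->hidden weight rows (one possible notion of 'similar words')."""
+    focus_word = focus_word.lower()
+    if focus_word not in network.word2index:
+        return []
+    focus_vec = network.weights_0_1[network.word2index[focus_word]]
+    scores = {}
+    for word, idx in network.word2index.items():
+        vec = network.weights_0_1[idx]
+        denom = np.linalg.norm(focus_vec) * np.linalg.norm(vec)
+        if denom > 0:
+            scores[word] = float(np.dot(focus_vec, vec) / denom)
+    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
+
+# e.g., after training the network in model.ipynb:
+# most_similar_words(mlp, 'excellent')
+```
+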
+## A list of internal milestones up until project milestone 2
 Add here a sketch of your planning for the next project milestone.
 
-# Questions for TAa
+Tasks before November 11th:
+* Clean the data and combine the datasets
+* Run noise-reduction methods to remove filler words and improve the speed of the deep network
+
+Tasks before November 18th:
+* Write the code for the deep neural network and create a graph showing which words are indicative of positive reviews and which are indicative of negative reviews.
+* Look at the graphs and create a method to find similar words.
+
+Tasks before November 25th:
+* Apply the model to find keywords for the reviews so that it becomes easier for future customers to get a general idea of the reviews without having to read all of them.
+* Visualize these keywords for some of the reviews of particular items sold on Amazon.
+
+
+## Questions for TAs
 Add here some questions you have for us, in general or project-specific.
diff --git a/example.png b/example.png
new file mode 100644
index 00000000..a0f3214d
Binary files /dev/null and b/example.png differ
diff --git a/model.ipynb b/model.ipynb
new file mode 100644
index 00000000..7352ee40
--- /dev/null
+++ b/model.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "from collections import Counter\n",
+    "import enchant\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import sys\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Neural Network\n",
+    "class HelpfulnessNetwork:\n",
+    "    # initialize the network with parameters\n",
+    "    def __init__(self, reviews, labels, min_count, polarity_cutoff, hidden_nodes, learning_rate):\n",
+    "        # This is to get reproducible results each time we run the network\n",
+    "        np.random.seed(1)\n",
+    "\n",
+    "        # pre-process the data and initialize the network\n",
+    "        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)\n",
+    "        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)\n",
+    "\n",
+    "    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):\n",
+    "        # PART 1: get the count of words and the count of positive and negative words\n",
+    "        positive_counts = Counter()\n",
+    "        negative_counts = Counter()\n",
+    "        total_counts = Counter()\n",
+    "        \n",
+    "        d = enchant.Dict(\"en_US\")\n",
+    "        \n",
+    "        for i in range(len(reviews)):\n",
+    "            if(labels[i] == 1):\n",
+    "                for word in reviews[i].split(\" \"):\n",
+    "                    # This is used to check if the words are English dictionary words.\n",
+    "                    if word != '' and d.check(word):\n",
+    "                        positive_counts[word.lower()] += 1\n",
+    "                        total_counts[word.lower()] += 1\n",
+    "            else:\n",
+    "                for word in reviews[i].split(\" \"):\n",
+    "                    # This is used to check if the words are English dictionary words.\n",
+    "                    if word != '' and d.check(word):\n",
+    "                        negative_counts[word.lower()] += 1\n",
+    "                        total_counts[word.lower()] += 1\n",
+    "        \n",
+    "        # We use this to get a better understanding of which words are correlated with helpful reviews and\n",
+    "        # which ones are correlated with unhelpful reviews.\n",
+    "        pos_neg_ratios = Counter()\n",
+    "\n",
+    "        for term,count in list(total_counts.most_common()):\n",
+    "            if count > 25:\n",
+    "                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)\n",
+    "                pos_neg_ratios[term] = pos_neg_ratio\n",
+    "\n",
+    "        # We do this to get a log scale where 0 is a neutral word: the larger the value, the more the word is\n",
+    "        # associated with helpful reviews; the more negative, the more it is associated with unhelpful ones.\n",
+    "        for word,ratio in pos_neg_ratios.most_common():\n",
+    "            if(ratio > 1):\n",
+    "                pos_neg_ratios[word] = np.log(ratio)\n",
+    "            else:\n",
+    "                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))\n",
+    "        \n",
+    "        # This is used for testing and visualizing what is happening\n",
+    "        topPos = pos_neg_ratios.most_common()[:30]\n",
+    "        topNeg = list(reversed(pos_neg_ratios.most_common()))[:30]\n",
+    "        print(topPos)\n",
+    "        print(topNeg)\n",
+    "        \n",
+    "        for word in topPos:\n",
+    "            print(total_counts[word[0]])\n",
+    "        for word in topNeg:\n",
+    "            print(total_counts[word[0]])\n",
+    "        \n",
+    "        # populating the vocabulary that is going to be used\n",
+    "        review_vocab = set()\n",
+    "        for review in reviews:\n",
+    "            for word in review.split(\" \"):\n",
+    "                # check the min_count requirement\n",
+    "                if(total_counts[word.lower()] > min_count):\n",
+    "                    if(word.lower() in pos_neg_ratios.keys()):\n",
+    "                        # ensure the polarity cutoff\n",
+    "                        if((pos_neg_ratios[word.lower()] >= polarity_cutoff) or (pos_neg_ratios[word.lower()] <= -polarity_cutoff)):\n",
+    "                            review_vocab.add(word.lower())\n",
+    "                    else:\n",
+    "                        review_vocab.add(word.lower())\n",
+    "\n",
+    "        self.review_vocab = list(review_vocab)\n",
+    "        \n",
+    "        # populate the labels\n",
+    "        label_vocab = set()\n",
+    "        for label in labels:\n",
+    "            label_vocab.add(label)\n",
+    "        \n",
+    "        # Convert the label vocabulary set to a list so we can access labels via indices\n",
+    "        self.label_vocab = list(label_vocab)\n",
+    "        \n",
+    "        self.review_vocab_size = len(self.review_vocab)\n",
+    "        self.label_vocab_size = len(self.label_vocab)\n",
+    "        \n",
+    "        # Creating a dictionary where the words are mapped to the indices\n",
+    "        self.word2index = {}\n",
+    "        for i, word in enumerate(self.review_vocab):\n",
+    "            self.word2index[word] = i\n",
+    "        \n",
+    "        # Creating a dictionary where the labels are mapped to the indices\n",
+    "        self.label2index = {}\n",
+    "        for i, label in enumerate(self.label_vocab):\n",
+    "            self.label2index[label] = i\n",
+    "\n",
+    "    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):\n",
+    "        self.input_nodes = input_nodes\n",
+    "        self.hidden_nodes = hidden_nodes\n",
+    "        self.output_nodes = output_nodes\n",
+    "        self.learning_rate = learning_rate\n",
+    "\n",
+    "        # weights input layer -> hidden layer.\n",
+    "        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))\n",
+    "\n",
+    "        # weights hidden layer -> output layer.\n",
+    "        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))\n",
+    "        \n",
+    "        # The hidden layer (the input layer is represented implicitly by the word indices of a review)\n",
+    "        self.layer_1 = np.zeros((1,hidden_nodes))\n",
+    "        \n",
+    "    def sigmoid(self,x):\n",
+    "        return 1 / (1 + np.exp(-x))\n",
+    "    \n",
+    "    def sigmoid_output_2_derivative(self,output):\n",
+    "        return output * (1 - output)\n",
+    "    \n",
+    "    def 
train(self, training_reviews_raw, training_labels):\n", + " training_reviews = list()\n", + " for review in training_reviews_raw:\n", + " indices = set()\n", + " for word in review.split(\" \"):\n", + " if(word.lower() in self.word2index.keys()):\n", + " indices.add(self.word2index[word.lower()])\n", + " training_reviews.append(list(indices))\n", + " \n", + " correct_so_far = 0\n", + " start = time.time()\n", + " \n", + " # run a forward and backward pass and reviews and update weights\n", + " for i in range(len(training_reviews)):\n", + " \n", + " # Get the next review and its label\n", + " review = training_reviews[i]\n", + " label = training_labels[i]\n", + "\n", + " # Hidden layer\n", + " self.layer_1 *= 0\n", + " for index in review:\n", + " self.layer_1 += self.weights_0_1[index]\n", + "\n", + " # Output layer\n", + " layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2)) \n", + " \n", + " \n", + " ### Backward pass ###\n", + " # Output error\n", + " layer_2_error = layer_2 - label \n", + " layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)\n", + "\n", + " # Backpropagated error\n", + " layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # to the hidden layer\n", + " layer_1_delta = layer_1_error\n", + "\n", + " # Update the weights\n", + " self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update weights with gradient descent step\n", + " \n", + " for index in review:\n", + " self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update weights with gradient descent step\n", + "\n", + " if(layer_2 >= 0.5 and label == 1):\n", + " correct_so_far += 1\n", + " elif(layer_2 < 0.5 and label == 0):\n", + " correct_so_far += 1\n", + " \n", + " elapsed_time = float(time.time() - start)\n", + " reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0\n", + " \n", + " sys.stdout.write(\"\\rProgress:\" + str(100 * i/float(len(training_reviews)))[:4] \\\n", + " + \"% Speed(reviews/sec):\" + str(reviews_per_second)[0:5] \\\n", + " + \" #Correct:\" + str(correct_so_far) + \" #Trained:\" + str(i+1) \\\n", + " + \" Training Accuracy:\" + str(correct_so_far * 100 / float(i+1))[:4] + \"%\")\n", + " if(i % 2500 == 0):\n", + " print(\"\")\n", + " \n", + " def test(self, testing_reviews, testing_labels):\n", + " correct = 0\n", + " start = time.time()\n", + "\n", + " # Predict for the review \n", + " for i in range(len(testing_reviews)):\n", + " pred = self.run(testing_reviews[i])\n", + " if(pred == testing_labels[i]):\n", + " correct += 1\n", + "\n", + " elapsed_time = float(time.time() - start)\n", + " reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0\n", + " \n", + " sys.stdout.write(\"\\rProgress:\" + str(100 * i/float(len(testing_reviews)))[:4] \\\n", + " + \"% Speed(reviews/sec):\" + str(reviews_per_second)[0:5] \\\n", + " + \" #Correct:\" + str(correct) + \" #Tested:\" + str(i+1) \\\n", + " + \" Testing Accuracy:\" + str(correct * 100 / float(i+1))[:4] + \"%\")\n", + " \n", + " def run(self, review):\n", + " # forward pass\n", + " # Hidden layer \n", + " self.layer_1 *= 0\n", + "\n", + " unique_indices = set()\n", + " for word in review.split(\" \"):\n", + " if word.lower() in self.word2index.keys():\n", + " unique_indices.add(self.word2index[word.lower()])\n", + " \n", + " for index in unique_indices:\n", + " self.layer_1 += self.weights_0_1[index]\n", + " \n", + " # Output layer\n", + " layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))\n", + " \n", + " if(layer_2[0] >= 0.5):\n", + " return 1\n", + 
" else:\n", + " return 0" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/ada/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " # This is added back by InteractiveShellApp.init_path()\n", + "/anaconda3/envs/ada/lib/python3.6/site-packages/ipykernel_launcher.py:12: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " if sys.path[0] == '':\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "53129\n", + "0.7065320112943302\n", + "0.38052283401772596\n", + "0.6267763368405203\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/ada/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('mhz', 3.6109179126442243), ('keys.', 3.5263605246161616), ('drone', 3.332204510175204), ('2.0', 2.772588722239781), ('cruiser', 2.740840023925201), ('frame', 2.70805020110221), ('legs', 2.70805020110221), ('accident', 2.6741486494265287), ('drive.', 2.6390573296152584), ('recessed', 2.6026896854443837), ('gloves', 2.6026896854443837), ('epic.', 2.6026896854443837), ('pleasant', 2.5902671654458267), ('controls.', 2.5649493574615367), ('announce', 2.5257286443082556), ('cheek', 2.5257286443082556), ('parrot', 2.5257286443082556), ('lightning', 2.4277482359480516), ('marked', 2.367123614131617), ('hp', 2.3353749158170367), ('chrome', 2.3353749158170367), ('remaining', 2.302585092994046), ('joint', 2.302585092994046), ('zoom', 2.277267285009756), ('switches', 2.277267285009756), ('roadster', 2.2735975561207935), ('spoken', 2.268683541318364), ('compromise', 2.268683541318364), ('digital', 2.2300144001592104), ('defender', 2.217843864538955)]\n", + "[('t', -0.9844722442902273), ('clipper', -0.9156811616023082), ('la', -0.8242325874562075), ('el', -0.7585749858824007), ('y', -0.6733445532637655), ('radiation', -0.6733445532637655), ('inspire', -0.6316022662300688), ('retro', -0.3905764956144137), ('wasted', -0.3225693314522215), ('es', -0.2552718283010265), ('basics', -0.1065958462658192), ('ok.', -0.050055670467200504), ('horrible.', -0.043566986636409726), ('location.', 0.009950330853168092), ('union', 0.009950330853168092), ('computers', 0.009950330853168092), ('cancel', 0.009950330853168092), ('extreme', 0.009950330853168092), ('option.', 0.033901551675681416), ('sweet', 0.04652001563489291), ('idea.', 0.04879016416943205), ('natural', 0.06062462181643484), ('safety', 0.06899287148695142), ('country', 0.09531017980432493), ('waste', 0.09884583463663264), ('luck.', 0.1112256351102244), ('microsoft', 0.11778303565638346), ('signal.', 0.11778303565638346), 
('period.', 0.125163142954006), ('person.', 0.14310084364067324)]\n", + "37\n", + "34\n", + "28\n", + "33\n", + "32\n", + "47\n", + "31\n", + "30\n", + "29\n", + "28\n", + "28\n", + "28\n", + "42\n", + "27\n", + "26\n", + "26\n", + "26\n", + "36\n", + "34\n", + "33\n", + "33\n", + "54\n", + "32\n", + "42\n", + "42\n", + "74\n", + "31\n", + "31\n", + "102\n", + "162\n", + "29\n", + "56\n", + "29\n", + "34\n", + "26\n", + "32\n", + "34\n", + "44\n", + "35\n", + "29\n", + "33\n", + "32\n", + "36\n", + "31\n", + "33\n", + "33\n", + "35\n", + "113\n", + "58\n", + "42\n", + "40\n", + "32\n", + "28\n", + "41\n", + "161\n", + "35\n", + "67\n", + "84\n", + "31\n", + "27\n", + "Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%\n", + "Progress:23.5% Speed(reviews/sec):2824. #Correct:1810 #Trained:2501 Training Accuracy:72.3%\n", + "Progress:47.0% Speed(reviews/sec):2674. #Correct:3519 #Trained:5001 Training Accuracy:70.3%\n", + "Progress:70.5% Speed(reviews/sec):2735. #Correct:5136 #Trained:7501 Training Accuracy:68.4%\n", + "Progress:94.1% Speed(reviews/sec):2767. #Correct:6815 #Trained:10001 Training Accuracy:68.1%\n", + "Progress:99.9% Speed(reviews/sec):3247. #Correct:6510 #Tested:10625 Testing Accuracy:61.2%0%" + ] + } + ], + "source": [ + "file1 = 'reviews_Cell_Phones_and_Accessories_5.json.gz'\n", + "#file2 = 'reviews_Kindle_Store_5.json.gz'\n", + "#file3 = 'reviews_Home_and_Kitchen_5.json.gz'\n", + "all_data1 = pd.read_json('../data/'+file1, lines=True)\n", + "#all_data2 = pd.read_json('../data/'+file2, lines=True)\n", + "#all_data3 = pd.read_json('../data/'+file,lines=True)\n", + "#all_data = pd.concat([all_data1, all_data2])\n", + "data = all_data1[['reviewText','helpful']]\n", + "\n", + "# We are calculating the helpful value here and dropping reviews with no ratings\n", + "data['helpful'] = data['helpful'].map(lambda x: x[0]/x[1] if x[1] != 0 else np.nan)#np.log(x[0]+1)/x[1] if x[1] != 0 else np.nan)\n", + "data.dropna(inplace=True)\n", + "\n", + "# Used for testing purposes\n", + "print(len(data['helpful']))\n", + "print(np.mean(data['helpful'].values))\n", + "print(np.std(data['helpful'].values))\n", + "\n", + "# from looking at the values above use to the mean for the comparison value. it is different for each data set.\n", + "data['helpful'] = data['helpful'].map(lambda x: 1 if x > .7 else 0)\n", + "\n", + "reviews = data['reviewText'].values\n", + "labels = data['helpful'].values\n", + "\n", + "# baseline for randomly guessing what is helpful and what is not helpful\n", + "print(sum(labels)/len(reviews))\n", + "\n", + "# initializing. training and running the neural network\n", + "mlp = HelpfulnessNetwork(reviews[:int(-1*(.8*len(reviews)))],labels[:int(-1*(.8*len(reviews)))],min_count=25,polarity_cutoff=.5,hidden_nodes=10,learning_rate=.01)\n", + "mlp.train(reviews[:int(-1*(.8*len(reviews)))],labels[:int(-1*(.8*len(reviews)))])\n", + "mlp.test(reviews[int(-1*(.2*len(reviews))):],labels[int(-1*(.2*len(reviews))):])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (ada)", + "language": "python", + "name": "ada" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}