diff --git a/README.md b/README.md
index 70636c15..1c1d9cd9 100755
--- a/README.md
+++ b/README.md
@@ -1,16 +1,49 @@
-# Title
+# NLP on Amazon Reviews to Aid Customers’ Decision Process
 
-# Abstract
+## Abstract
 A 150 word description of the project idea, goals, dataset used. What story you would like to tell and why? What's the motivation behind your project?
 
-# Research questions
+Popular Amazon products usually have thousands of reviews from various users explaining the pros and cons of the product along with the reviewer's personal opinion of it. While this data may be useful to Amazon customers, these comments are often time-consuming to sift through. Additionally, users may become more confused after reading many conflicting positive and negative reviews, and Amazon may lose a potential customer. To save users time while they shop online, we want to conduct a keyword analysis of these reviews, identifying positive and negative words that help a user quickly decide whether they want the product. After summarizing, we plan to create a visualization of these reviews. The dataset used for this study is Julian McAuley's (UCSD) “Amazon product data” dataset.
+
+## Research questions
 A list of research questions you would like to address during the project.
 
-# Dataset
+* Can we generate keywords from Amazon reviews through sentiment analysis?
+* Can we use sentiment analysis to improve similar word search?
+
+## Dataset
 List the dataset(s) you want to use, and some ideas on how do you expect to get, manage, process and enrich it/them. Show us you've read the docs and some examples, and you've a clear idea on what to expect. Discuss data size and format if relevant.
 
-# A list of internal milestones up until project milestone 2
+We would like to use the Amazon reviews dataset provided to us. We will be using the 5-core (9.9 GB) subset of the data, in which every item and user has at least 5 reviews. This subset contains 41.13 million reviews and filters out both users with multiple accounts and plagiarized reviews.
+
+Each JSON object contains the following fields: reviewerID, asin, reviewerName, helpful, reviewText, overall, summary, unixReviewTime, and reviewTime.
+
+Here is an example of one review:
+![picture alt](https://github.com/sdhar3/ADA-Project/blob/master/example.png "example of one review")
+
+Our intention is to conduct sentiment analysis on the data, and the Amazon product data website recommends this subset specifically for that purpose.
+
+We are going to take the review text and the “overall” rating from each review: anything with an “overall” rating above 3 will be labeled as positive, and anything below 3 as negative (a rough sketch of this step is shown below). We will then train a deep neural network to conduct sentiment analysis. We are also going to run a couple of noise-reduction steps to remove filler words and improve the accuracy of the analysis.
+
+We also plan to do data visualization to see how closely related certain words are to each other.
+
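+As a rough illustration of the labeling step described above (a minimal sketch, not the final pipeline), the snippet below loads one 5-core file with pandas, as in `model.ipynb`, and derives a binary sentiment label from the “overall” rating. Treating 3-star reviews as neutral and dropping them is our own assumption at this stage.
+
+```python
+import pandas as pd
+
+# Load one 5-core review file (same path convention as in model.ipynb).
+df = pd.read_json('../data/reviews_Cell_Phones_and_Accessories_5.json.gz', lines=True)
+df = df[['reviewText', 'overall']]
+
+# "overall" rating above 3 -> positive (1), below 3 -> negative (0).
+# Assumption: 3-star reviews are treated as neutral and dropped.
+df = df[df['overall'] != 3].copy()
+df['sentiment'] = (df['overall'] > 3).astype(int)
+
+print(df['sentiment'].value_counts(normalize=True))
+```
+
+For the similar-word search mentioned in the research questions and in the milestones below, one possible approach (an assumption at this point, not a committed design) is to treat each row of the learned input-to-hidden weight matrix `weights_0_1` of the `HelpfulnessNetwork` class in `model.ipynb` as a word vector and compare rows with cosine similarity:
+
+```python
+import numpy as np
+
+def most_similar_words(network, focus_word, top_n=10):
+    """Rank vocabulary words by cosine similarity of their learned
+    input->hidden weight rows (one possible notion of 'similar words')."""
+    focus_word = focus_word.lower()
+    if focus_word not in network.word2index:
+        return []
+    focus_vec = network.weights_0_1[network.word2index[focus_word]]
+    scores = {}
+    for word, idx in network.word2index.items():
+        vec = network.weights_0_1[idx]
+        denom = np.linalg.norm(focus_vec) * np.linalg.norm(vec)
+        if denom > 0:
+            scores[word] = float(np.dot(focus_vec, vec) / denom)
+    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
+
+# e.g., after training the network in model.ipynb:
+# most_similar_words(mlp, 'excellent')
+```
+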
+## A list of internal milestones up until project milestone 2
 Add here a sketch of your planning for the next project milestone.
 
-# Questions for TAa
+Tasks before November 11th:
+* Clean the data and combine the datasets
+* Run noise-reduction methods to remove filler words and improve the speed of the deep network
+
+Tasks before November 18th:
+* Write the code for the deep neural network and create a graph showing which words are indicative of positive reviews and which are indicative of negative reviews.
+* Look at the graphs and create a method to find similar words.
+
+Tasks before November 25th:
+* Apply the model to find keywords for the reviews so that it becomes easier for future customers to get a general idea of the reviews without having to read all of them.
+* Visualize these keywords for some of the reviews of particular items sold on Amazon.
+
+
+## Questions for TAs
 Add here some questions you have for us, in general or project-specific.
diff --git a/example.png b/example.png
new file mode 100644
index 00000000..a0f3214d
Binary files /dev/null and b/example.png differ
diff --git a/model.ipynb b/model.ipynb
new file mode 100644
index 00000000..7352ee40
--- /dev/null
+++ b/model.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "from collections import Counter\n",
+    "import enchant\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import sys\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Neural Network\n",
+    "class HelpfulnessNetwork:\n",
+    "    # initialize the network with parameters\n",
+    "    def __init__(self, reviews, labels, min_count, polarity_cutoff, hidden_nodes, learning_rate):\n",
+    "        # This is to get reproducible results each time we run the network\n",
+    "        np.random.seed(1)\n",
+    "\n",
+    "        # pre-process the data and initialize the network\n",
+    "        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)\n",
+    "        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)\n",
+    "\n",
+    "    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):\n",
+    "        # PART 1: get the count of words and the count of positive and negative words\n",
+    "        positive_counts = Counter()\n",
+    "        negative_counts = Counter()\n",
+    "        total_counts = Counter()\n",
+    "        \n",
+    "        d = enchant.Dict(\"en_US\")\n",
+    "        \n",
+    "        for i in range(len(reviews)):\n",
+    "            if(labels[i] == 1):\n",
+    "                for word in reviews[i].split(\" \"):\n",
+    "                    # This is used to check if the words are English dictionary words.\n",
+    "                    if word != '' and d.check(word):\n",
+    "                        positive_counts[word.lower()] += 1\n",
+    "                        total_counts[word.lower()] += 1\n",
+    "            else:\n",
+    "                for word in reviews[i].split(\" \"):\n",
+    "                    # This is used to check if the words are English dictionary words.\n",
+    "                    if word != '' and d.check(word):\n",
+    "                        negative_counts[word.lower()] += 1\n",
+    "                        total_counts[word.lower()] += 1\n",
+    "        \n",
+    "        # We use this to get a better understanding of which words are correlated with helpful reviews and\n",
+    "        # which ones are correlated with unhelpful reviews.\n",
+    "        pos_neg_ratios = Counter()\n",
+    "\n",
+    "        for term,count in list(total_counts.most_common()):\n",
+    "            if count > 25:\n",
+    "                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)\n",
+    "                pos_neg_ratios[term] = pos_neg_ratio\n",
+    "\n",
+    "        # We do this to get a log scale where 0 is a neutral word: the larger the value, the more the word is\n",
+    "        # associated with helpful reviews; the more negative, the more it is associated with unhelpful ones.\n",
+    "        for word,ratio in pos_neg_ratios.most_common():\n",
+    "            if(ratio > 1):\n",
+    "                pos_neg_ratios[word] = np.log(ratio)\n",
+    "            else:\n",
+    "                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))\n",
+    "        \n",
+    "        # This is used for testing and visualizing what is happening\n",
+    "        topPos = pos_neg_ratios.most_common()[:30]\n",
+    "        topNeg = list(reversed(pos_neg_ratios.most_common()))[:30]\n",
+    "        print(topPos)\n",
+    "        print(topNeg)\n",
+    "        \n",
+    "        for word in topPos:\n",
+    "            print(total_counts[word[0]])\n",
+    "        for word in topNeg:\n",
+    "            print(total_counts[word[0]])\n",
+    "        \n",
+    "        # populating the vocabulary that is going to be used\n",
+    "        review_vocab = set()\n",
+    "        for review in reviews:\n",
+    "            for word in review.split(\" \"):\n",
+    "                # check the min_count requirement\n",
+    "                if(total_counts[word.lower()] > min_count):\n",
+    "                    if(word.lower() in pos_neg_ratios.keys()):\n",
+    "                        # ensure the polarity cutoff\n",
+    "                        if((pos_neg_ratios[word.lower()] >= polarity_cutoff) or (pos_neg_ratios[word.lower()] <= -polarity_cutoff)):\n",
+    "                            review_vocab.add(word.lower())\n",
+    "                    else:\n",
+    "                        review_vocab.add(word.lower())\n",
+    "\n",
+    "        self.review_vocab = list(review_vocab)\n",
+    "        \n",
+    "        # populate the labels\n",
+    "        label_vocab = set()\n",
+    "        for label in labels:\n",
+    "            label_vocab.add(label)\n",
+    "        \n",
+    "        # Convert the label vocabulary set to a list so we can access labels via indices\n",
+    "        self.label_vocab = list(label_vocab)\n",
+    "        \n",
+    "        self.review_vocab_size = len(self.review_vocab)\n",
+    "        self.label_vocab_size = len(self.label_vocab)\n",
+    "        \n",
+    "        # Creating a dictionary where the words are mapped to the indices\n",
+    "        self.word2index = {}\n",
+    "        for i, word in enumerate(self.review_vocab):\n",
+    "            self.word2index[word] = i\n",
+    "        \n",
+    "        # Creating a dictionary where the labels are mapped to the indices\n",
+    "        self.label2index = {}\n",
+    "        for i, label in enumerate(self.label_vocab):\n",
+    "            self.label2index[label] = i\n",
+    "\n",
+    "    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):\n",
+    "        self.input_nodes = input_nodes\n",
+    "        self.hidden_nodes = hidden_nodes\n",
+    "        self.output_nodes = output_nodes\n",
+    "        self.learning_rate = learning_rate\n",
+    "\n",
+    "        # weights input layer -> hidden layer.\n",
+    "        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))\n",
+    "\n",
+    "        # weights hidden layer -> output layer.\n",
+    "        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))\n",
+    "        \n",
+    "        # The hidden layer (the input layer is represented implicitly by the word indices of a review)\n",
+    "        self.layer_1 = np.zeros((1,hidden_nodes))\n",
+    "        \n",
+    "    def sigmoid(self,x):\n",
+    "        return 1 / (1 + np.exp(-x))\n",
+    "    \n",
+    "    def sigmoid_output_2_derivative(self,output):\n",
+    "        return output * (1 - output)\n",
+    "    \n",
+    "    def 
train(self, training_reviews_raw, training_labels):\n", + " training_reviews = list()\n", + " for review in training_reviews_raw:\n", + " indices = set()\n", + " for word in review.split(\" \"):\n", + " if(word.lower() in self.word2index.keys()):\n", + " indices.add(self.word2index[word.lower()])\n", + " training_reviews.append(list(indices))\n", + " \n", + " correct_so_far = 0\n", + " start = time.time()\n", + " \n", + " # run a forward and backward pass and reviews and update weights\n", + " for i in range(len(training_reviews)):\n", + " \n", + " # Get the next review and its label\n", + " review = training_reviews[i]\n", + " label = training_labels[i]\n", + "\n", + " # Hidden layer\n", + " self.layer_1 *= 0\n", + " for index in review:\n", + " self.layer_1 += self.weights_0_1[index]\n", + "\n", + " # Output layer\n", + " layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2)) \n", + " \n", + " \n", + " ### Backward pass ###\n", + " # Output error\n", + " layer_2_error = layer_2 - label \n", + " layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)\n", + "\n", + " # Backpropagated error\n", + " layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # to the hidden layer\n", + " layer_1_delta = layer_1_error\n", + "\n", + " # Update the weights\n", + " self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update weights with gradient descent step\n", + " \n", + " for index in review:\n", + " self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update weights with gradient descent step\n", + "\n", + " if(layer_2 >= 0.5 and label == 1):\n", + " correct_so_far += 1\n", + " elif(layer_2 < 0.5 and label == 0):\n", + " correct_so_far += 1\n", + " \n", + " elapsed_time = float(time.time() - start)\n", + " reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0\n", + " \n", + " sys.stdout.write(\"\\rProgress:\" + str(100 * i/float(len(training_reviews)))[:4] \\\n", + " + \"% Speed(reviews/sec):\" + str(reviews_per_second)[0:5] \\\n", + " + \" #Correct:\" + str(correct_so_far) + \" #Trained:\" + str(i+1) \\\n", + " + \" Training Accuracy:\" + str(correct_so_far * 100 / float(i+1))[:4] + \"%\")\n", + " if(i % 2500 == 0):\n", + " print(\"\")\n", + " \n", + " def test(self, testing_reviews, testing_labels):\n", + " correct = 0\n", + " start = time.time()\n", + "\n", + " # Predict for the review \n", + " for i in range(len(testing_reviews)):\n", + " pred = self.run(testing_reviews[i])\n", + " if(pred == testing_labels[i]):\n", + " correct += 1\n", + "\n", + " elapsed_time = float(time.time() - start)\n", + " reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0\n", + " \n", + " sys.stdout.write(\"\\rProgress:\" + str(100 * i/float(len(testing_reviews)))[:4] \\\n", + " + \"% Speed(reviews/sec):\" + str(reviews_per_second)[0:5] \\\n", + " + \" #Correct:\" + str(correct) + \" #Tested:\" + str(i+1) \\\n", + " + \" Testing Accuracy:\" + str(correct * 100 / float(i+1))[:4] + \"%\")\n", + " \n", + " def run(self, review):\n", + " # forward pass\n", + " # Hidden layer \n", + " self.layer_1 *= 0\n", + "\n", + " unique_indices = set()\n", + " for word in review.split(\" \"):\n", + " if word.lower() in self.word2index.keys():\n", + " unique_indices.add(self.word2index[word.lower()])\n", + " \n", + " for index in unique_indices:\n", + " self.layer_1 += self.weights_0_1[index]\n", + " \n", + " # Output layer\n", + " layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))\n", + " \n", + " if(layer_2[0] >= 0.5):\n", + " return 1\n", + 
" else:\n", + " return 0" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/ada/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " # This is added back by InteractiveShellApp.init_path()\n", + "/anaconda3/envs/ada/lib/python3.6/site-packages/ipykernel_launcher.py:12: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " if sys.path[0] == '':\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "53129\n", + "0.7065320112943302\n", + "0.38052283401772596\n", + "0.6267763368405203\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/ada/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('mhz', 3.6109179126442243), ('keys.', 3.5263605246161616), ('drone', 3.332204510175204), ('2.0', 2.772588722239781), ('cruiser', 2.740840023925201), ('frame', 2.70805020110221), ('legs', 2.70805020110221), ('accident', 2.6741486494265287), ('drive.', 2.6390573296152584), ('recessed', 2.6026896854443837), ('gloves', 2.6026896854443837), ('epic.', 2.6026896854443837), ('pleasant', 2.5902671654458267), ('controls.', 2.5649493574615367), ('announce', 2.5257286443082556), ('cheek', 2.5257286443082556), ('parrot', 2.5257286443082556), ('lightning', 2.4277482359480516), ('marked', 2.367123614131617), ('hp', 2.3353749158170367), ('chrome', 2.3353749158170367), ('remaining', 2.302585092994046), ('joint', 2.302585092994046), ('zoom', 2.277267285009756), ('switches', 2.277267285009756), ('roadster', 2.2735975561207935), ('spoken', 2.268683541318364), ('compromise', 2.268683541318364), ('digital', 2.2300144001592104), ('defender', 2.217843864538955)]\n", + "[('t', -0.9844722442902273), ('clipper', -0.9156811616023082), ('la', -0.8242325874562075), ('el', -0.7585749858824007), ('y', -0.6733445532637655), ('radiation', -0.6733445532637655), ('inspire', -0.6316022662300688), ('retro', -0.3905764956144137), ('wasted', -0.3225693314522215), ('es', -0.2552718283010265), ('basics', -0.1065958462658192), ('ok.', -0.050055670467200504), ('horrible.', -0.043566986636409726), ('location.', 0.009950330853168092), ('union', 0.009950330853168092), ('computers', 0.009950330853168092), ('cancel', 0.009950330853168092), ('extreme', 0.009950330853168092), ('option.', 0.033901551675681416), ('sweet', 0.04652001563489291), ('idea.', 0.04879016416943205), ('natural', 0.06062462181643484), ('safety', 0.06899287148695142), ('country', 0.09531017980432493), ('waste', 0.09884583463663264), ('luck.', 0.1112256351102244), ('microsoft', 0.11778303565638346), ('signal.', 0.11778303565638346), 
('period.', 0.125163142954006), ('person.', 0.14310084364067324)]\n", + "37\n", + "34\n", + "28\n", + "33\n", + "32\n", + "47\n", + "31\n", + "30\n", + "29\n", + "28\n", + "28\n", + "28\n", + "42\n", + "27\n", + "26\n", + "26\n", + "26\n", + "36\n", + "34\n", + "33\n", + "33\n", + "54\n", + "32\n", + "42\n", + "42\n", + "74\n", + "31\n", + "31\n", + "102\n", + "162\n", + "29\n", + "56\n", + "29\n", + "34\n", + "26\n", + "32\n", + "34\n", + "44\n", + "35\n", + "29\n", + "33\n", + "32\n", + "36\n", + "31\n", + "33\n", + "33\n", + "35\n", + "113\n", + "58\n", + "42\n", + "40\n", + "32\n", + "28\n", + "41\n", + "161\n", + "35\n", + "67\n", + "84\n", + "31\n", + "27\n", + "Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%\n", + "Progress:23.5% Speed(reviews/sec):2824. #Correct:1810 #Trained:2501 Training Accuracy:72.3%\n", + "Progress:47.0% Speed(reviews/sec):2674. #Correct:3519 #Trained:5001 Training Accuracy:70.3%\n", + "Progress:70.5% Speed(reviews/sec):2735. #Correct:5136 #Trained:7501 Training Accuracy:68.4%\n", + "Progress:94.1% Speed(reviews/sec):2767. #Correct:6815 #Trained:10001 Training Accuracy:68.1%\n", + "Progress:99.9% Speed(reviews/sec):3247. #Correct:6510 #Tested:10625 Testing Accuracy:61.2%0%" + ] + } + ], + "source": [ + "file1 = 'reviews_Cell_Phones_and_Accessories_5.json.gz'\n", + "#file2 = 'reviews_Kindle_Store_5.json.gz'\n", + "#file3 = 'reviews_Home_and_Kitchen_5.json.gz'\n", + "all_data1 = pd.read_json('../data/'+file1, lines=True)\n", + "#all_data2 = pd.read_json('../data/'+file2, lines=True)\n", + "#all_data3 = pd.read_json('../data/'+file,lines=True)\n", + "#all_data = pd.concat([all_data1, all_data2])\n", + "data = all_data1[['reviewText','helpful']]\n", + "\n", + "# We are calculating the helpful value here and dropping reviews with no ratings\n", + "data['helpful'] = data['helpful'].map(lambda x: x[0]/x[1] if x[1] != 0 else np.nan)#np.log(x[0]+1)/x[1] if x[1] != 0 else np.nan)\n", + "data.dropna(inplace=True)\n", + "\n", + "# Used for testing purposes\n", + "print(len(data['helpful']))\n", + "print(np.mean(data['helpful'].values))\n", + "print(np.std(data['helpful'].values))\n", + "\n", + "# from looking at the values above use to the mean for the comparison value. it is different for each data set.\n", + "data['helpful'] = data['helpful'].map(lambda x: 1 if x > .7 else 0)\n", + "\n", + "reviews = data['reviewText'].values\n", + "labels = data['helpful'].values\n", + "\n", + "# baseline for randomly guessing what is helpful and what is not helpful\n", + "print(sum(labels)/len(reviews))\n", + "\n", + "# initializing. training and running the neural network\n", + "mlp = HelpfulnessNetwork(reviews[:int(-1*(.8*len(reviews)))],labels[:int(-1*(.8*len(reviews)))],min_count=25,polarity_cutoff=.5,hidden_nodes=10,learning_rate=.01)\n", + "mlp.train(reviews[:int(-1*(.8*len(reviews)))],labels[:int(-1*(.8*len(reviews)))])\n", + "mlp.test(reviews[int(-1*(.2*len(reviews))):],labels[int(-1*(.2*len(reviews))):])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (ada)", + "language": "python", + "name": "ada" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}