From b9c65ac076c6871486c4cb2055257e3b666fe71a Mon Sep 17 00:00:00 2001
From: Alex Bondar
Date: Thu, 17 Feb 2022 23:56:17 +0000
Subject: [PATCH 1/3] Connected SageMaker notebook instance with our repo

---
 backend/jupyter/nltk-json.ipynb | 282 ++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 backend/jupyter/nltk-json.ipynb

diff --git a/backend/jupyter/nltk-json.ipynb b/backend/jupyter/nltk-json.ipynb
new file mode 100644
index 0000000..e970b12
--- /dev/null
+++ b/backend/jupyter/nltk-json.ipynb
@@ -0,0 +1,282 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a1e9638",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install textblob\n",
+    "%pip install -U pip setuptools wheel\n",
+    "%pip install -U spacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e41ec177",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from os import path\n",
+    "from os.path import exists\n",
+    "import string\n",
+    "from collections import Counter\n",
+    "import nltk\n",
+    "import matplotlib.pyplot as plt\n",
+    "from textblob import TextBlob\n",
+    "from pathlib import Path\n",
+    "from textblob.sentiments import NaiveBayesAnalyzer\n",
+    "import nltk.corpus\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.probability import FreqDist\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+    "from nltk.tokenize import RegexpTokenizer\n",
+    "from nltk.data import load\n",
+    "fdist = FreqDist()\n",
+    "\n",
+    "# downloading required packages\n",
+    "nltk.download('nps_chat')\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('brown')\n",
+    "nltk.download('movie_reviews')\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('vader_lexicon')\n",
+    "nltk.download('tagsets')\n",
+    "nltk.download('averaged_perceptron_tagger')\n",
+    "# nltk.help.upenn_tagset('MD')\n",
+    "\n",
+    "print('\n'+'******************************************************************')\n",
+    "print('This application is about Natural Language Processing (NLP).')\n",
+    "print('The user enters sentence(s) and, using NLP and other algorithms, the application')\n",
+    "print('will analyze that entry and give some feedback.')\n",
+    "print('\n'+'***** THIS APPLICATION DOES NOT SUPPORT CONTRACTED FORMS *****'+'\n')\n",
+    "print('******************************************************************')\n",
+    "\n",
+    "\n",
+    "def main():\n",
+    "\n",
+    "    \n",
+    "    import spacy\n",
+    "    !python -m spacy download en_core_web_sm\n",
+    "    \n",
+    "    import json\n",
+    "    nlp = spacy.load('en_core_web_sm')\n",
+    "    # floop-conv-data.json is assumed to hold a JSON array of feedback strings\n",
+    "    with open('floop-conv-data.json', 'r') as data:\n",
+    "        jsondata = json.load(data)\n",
+    "    tokens = nlp.pipe(jsondata)\n",
+    "    tokenlist = ''\n",
+    "    for s in tokens:\n",
+    "        tokenlist += s.text + ' '\n",
+    "    \n",
+    "    \n",
+    "    myFile = tokenlist\n",
+    "\n",
+    "# check whether the feedback contains any question marks\n",
+    "    print('\n'+'******************************************************************')\n",
+    "print('                      RESULT for this entry                       ')\n",
+    "    print('******************************************************************')\n",
+    "\n",
+    "    sentList = nltk.sent_tokenize(myFile)\n",
+    "    sentences_qnt = len(sentList)\n",
+    "    print('\n'+'You entered ', str(sentences_qnt), ' sentences.')\n",
+    "\n",
+    "    if '?' in myFile:\n",
+    "        qnt = myFile.count('?')\n",
+    "        print('This entry includes: ' + str(qnt)+' questions!')\n",
+    "        print('\n'+'******************************************************************')\n",
+    "    else:\n",
+    "        print('\n'+'******************************************************************')\n",
+    "        print('There is no question mark in this feedback but wait for more results!')\n",
+    "        print('\n'+'******************************************************************')\n",
+    "    print('Here are more analyses about this entry:'+'\n')\n",
+    "\n",
+    "# lists of words that indicate a question\n",
+    "\n",
+    "    wh_question = ['what', 'when', 'where', 'who',\n",
+    "                   'whom', 'which', 'whose', 'why', 'how']\n",
+    "    yN_question = [\"am\", \"is\", \"are\", \"do\", \"does\", \"did\", \"have\", \"has\", \"was\", \"were\", \"can\", \"cannot\", \"could\",\n",
+    "                   \"couldn't\", \"dare\", \"may\", \"might\", \"must\", \"need\", \"ought\", \"shall\", \"should\", \"shouldn't\", \"will\", \"would\"]\n",
+    "\n",
+    "# initialize\n",
+    "\n",
+    "    first_words = []\n",
+    "    endings = []\n",
+    "    list_of_question_words = []\n",
+    "    type_sentence = []\n",
+    "\n",
+    "# collect the first word of each sentence and its final character\n",
+    "\n",
+    "    for items in sentList:\n",
+    "        first_words.append(items.split(' ')[0])\n",
+    "        endings.append(items[-1])\n",
+    "# classify each sentence by its final character\n",
+    "    for item in endings:\n",
+    "        if item == '.':\n",
+    "            type_sentence.append('Sentence')\n",
+    "        elif item == '!':\n",
+    "            type_sentence.append('Exclamatory')\n",
+    "        elif item == '?':\n",
+    "            type_sentence.append('Interrogative')\n",
+    "        else:\n",
+    "            type_sentence.append('Unknown or Numbers')\n",
+    "    for items in first_words:\n",
+    "        if items in wh_question:\n",
+    "            list_of_question_words.append('WH question')\n",
+    "        elif items in yN_question:\n",
+    "            list_of_question_words.append('Y/N question')\n",
+    "\n",
+    "    for i in range(len(endings)):\n",
+    "        print('Sentence #', str(i+1), 'Type:', type_sentence[i])\n",
+    "        if type_sentence[i] == 'Unknown or Numbers':\n",
+    "            print(\"This sentence ends with a character we did not recognize: \", endings[i])\n",
+    "    for i in range(len(list_of_question_words)):\n",
+    "        print('Question #', str(i+1), 'Type:', list_of_question_words[i])\n",
+    "\n",
+    "    countQuestionWords = Counter(list_of_question_words)\n",
+    "    print('Summary of question words: ', countQuestionWords)\n",
+    "\n",
+    "# clean up the text and turn it into a list of words\n",
+    "\n",
+    "    cleanedText = myFile.translate(str.maketrans('', '', string.punctuation))\n",
+    "    tokenizedList = word_tokenize(cleanedText, \"english\")\n",
+    "\n",
+    "# creating a list of stop words in the feedback\n",
+    "\n",
+    "    listOfWords = [\n",
+    "        item for item in tokenizedList if item in stopwords.words('english')]\n",
+    "    countstopwords = Counter(listOfWords)\n",
+    "\n",
+    "# creating a list of words in the feedback excluding all the stop words\n",
+    "\n",
+    "    finalList = [\n",
+    "        item for item in tokenizedList if item not in stopwords.words('english')]\n",
+    "\n",
+    "    count = 0\n",
+    "    totalCount = 0\n",
+    "    itemsCount = 0\n",
+    "    for item in tokenizedList:\n",
+    "        itemsCount += 1\n",
+    "        if item in stopwords.words('english'):\n",
+    "            count += 1\n",
+    "\n",
+    "# creating a list of emotion words and the words that have emotion according to our predefined list\n",
+    "\n",
+    "    emotionList = []\n",
+    "    wordList = []\n",
+    "\n",
+    "# list of words with their emotions\n",
+    "\n",
+    "    with open('emotions.txt', 'r') as file:\n",
+    "        for line in file:\n",
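+    "            # each emotions.txt line is assumed to look like 'word': emotion, before this cleanup\n",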
+    "            clearLine = line.replace('\n', '').replace(\n",
+    "                ',', '').replace(\"'\", '').replace(' ', '').strip()\n",
+    "            word, emotion = clearLine.split(':')\n",
+    "\n",
+    "            if word in finalList:\n",
+    "                emotionList.append(emotion)\n",
+    "                wordList.append(word)\n",
+    "    countEmotions = Counter(emotionList)\n",
+    "    countwords = Counter(finalList)\n",
+    "    totalCount = count+len(set(tokenizedList) -\n",
+    "                           set(wordList)-set(stopwords.words('english')))\n",
+    "\n",
+    "\n",
+    "# creating a blob text\n",
+    "\n",
+    "    blob = TextBlob(myFile)\n",
+    "    wordCount = blob.word_counts\n",
+    "\n",
+    "# sentiment analysis with TextBlob's NaiveBayes analyzer (trained on movie reviews)\n",
+    "\n",
+    "    blob = TextBlob(myFile, analyzer=NaiveBayesAnalyzer())\n",
+    "    sen_sub = blob.sentiment\n",
+    "\n",
+    "    def sentimentAnalyse(text):\n",
+    "        # VADER scores: neg/neu/pos sum to 1.0, compound lies in [-1, 1]\n",
+    "        score = SentimentIntensityAnalyzer().polarity_scores(text)\n",
+    "        for k, v in score.items():\n",
+    "            print(f\"{k}:{v:.2f}\")\n",
+    "        neg = score['neg']\n",
+    "        pos = score['pos']\n",
+    "        if neg > pos:\n",
+    "            print(\"\n\"+\"In general: Negative sentiment!\")\n",
+    "        elif pos > neg:\n",
+    "            print(\"\n\"+\"In general: Positive sentiment!\")\n",
+    "        else:\n",
+    "            print(\"\n\"+\"In general: Neutral vibe!\")\n",
+    "\n",
+    "    sentimentAnalyse(myFile)\n",
+    "    reg = RegexpTokenizer(r'(?u)\w+|\$[\d\.]+|\s+')\n",
+    "\n",
+    "    print('\n'+'******************************************************************')\n",
+    "    print('Summary of sentiments:')\n",
+    "    print('\n'+'list of words with emotions: ', wordList)\n",
+    "    print('Summary of emotions count: ', countEmotions)\n",
+    "    print('\n'+'******************************************************************')\n",
+    "    print('                       End of Application                        ')\n",
+    "    print('******************************************************************')\n",
+    "\n",
+    "# Drawing the bar chart\n",
+    "\n",
+    "    fig, axl = plt.subplots()\n",
+    "    axl.bar(countEmotions.keys(), countEmotions.values())\n",
+    "    fig.autofmt_xdate()\n",
+    "    plt.savefig('graph.png')\n",
+    "    plt.show()\n",
+    "\n",
+    "\n",
+    "\n",
+    "main()\n",
+    "\n",
+    "\n",
+    "##############################################################\n",
+    "# Symbol\tMeaning\t Example #\n",
+    "#------------------------------------------------------------#\n",
+    "# S\t sentence\t the man walked #\n",
+    "# NP\t noun phrase\t a dog #\n",
+    "# VP \t verb phrase\t saw a park #\n",
+    "# PP\t prepositional phrase\twith a telescope #\n",
+    "# Det\t determiner\t the #\n",
+    "# N\t noun\t dog #\n",
+    "# V\t verb\t walked #\n",
+    "# P\t preposition \t in #\n",
+    "##############################################################\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc0b44ea",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From a3067dfff17ebebf55bb5aafeb34835079fe539d Mon Sep 17 00:00:00 2001
From: Alex Bondar
Date: Sat, 19 Feb 2022 21:21:19 +0000
Subject: [PATCH 2/3] Removed outputs

---
 backend/jupyter/nltk-json.ipynb | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/backend/jupyter/nltk-json.ipynb b/backend/jupyter/nltk-json.ipynb
index e970b12..2e7fb2f 100644
--- a/backend/jupyter/nltk-json.ipynb
+++ b/backend/jupyter/nltk-json.ipynb
@@ -3,8 +3,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8a1e9638",
-   "metadata": {},
+   "id": "6ada69a3",
+   "metadata": {
+    "scrolled": false
+   },
    "outputs": [],
    "source": [
     "%pip install textblob\n",
@@ -15,7 +17,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e41ec177",
+   "id": "477c1dc5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -252,7 +254,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "bc0b44ea",
+   "id": "c0fe54e4",
    "metadata": {},
    "outputs": [],
    "source": []

From 98eb98cd52f23c0f3d3899f9ec4433def7ca7c45 Mon Sep 17 00:00:00 2001
From: Alex Bondar
Date: Sat, 19 Feb 2022 13:28:24 -0800
Subject: [PATCH 3/3] Adding a test comment

---
 backend/jupyter/nltk-json.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/jupyter/nltk-json.ipynb b/backend/jupyter/nltk-json.ipynb
index 2e7fb2f..b1dddae 100644
--- a/backend/jupyter/nltk-json.ipynb
+++ b/backend/jupyter/nltk-json.ipynb
@@ -9,6 +9,7 @@
    },
    "outputs": [],
    "source": [
+    "# Resolving dependencies\n",
     "%pip install textblob\n",
     "%pip install -U pip setuptools wheel\n",
     "%pip install -U spacy"