From b9c65ac076c6871486c4cb2055257e3b666fe71a Mon Sep 17 00:00:00 2001
From: Alex Bondar
Date: Thu, 17 Feb 2022 23:56:17 +0000
Subject: [PATCH 1/3] Connected SageMaker notebook instance with our repo

---
 backend/jupyter/nltk-json.ipynb | 282 ++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 backend/jupyter/nltk-json.ipynb

diff --git a/backend/jupyter/nltk-json.ipynb b/backend/jupyter/nltk-json.ipynb
new file mode 100644
index 0000000..e970b12
--- /dev/null
+++ b/backend/jupyter/nltk-json.ipynb
@@ -0,0 +1,282 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a1e9638",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install textblob\n",
+    "%pip install -U pip setuptools wheel\n",
+    "%pip install -U spacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e41ec177",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from os import path\n",
+    "from os.path import exists\n",
+    "import string\n",
+    "from collections import Counter\n",
+    "import nltk\n",
+    "import matplotlib.pyplot as plt\n",
+    "from textblob import TextBlob\n",
+    "from pathlib import Path\n",
+    "from textblob.sentiments import NaiveBayesAnalyzer\n",
+    "import nltk.corpus\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.probability import FreqDist\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+    "from nltk.tokenize import RegexpTokenizer\n",
+    "from nltk.data import load\n",
+    "fdist = FreqDist()\n",
+    "\n",
+    "# downloading required packages\n",
+    "nltk.download('nps_chat')\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('brown')\n",
+    "nltk.download('movie_reviews')\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('vader_lexicon')\n",
+    "nltk.download('tagsets')\n",
+    "nltk.download('averaged_perceptron_tagger')\n",
+    "# nltk.help.upenn_tagset('MD')\n",
+    "\n",
+    "print('\n'+'******************************************************************')\n",
+    "print('This application is about Natural Language Processing (NLP).')\n",
+    "print('The user enters sentence(s) and, using NLP and other algorithms, the application')\n",
+    "print('will analyze that entry and give some feedback.')\n",
+    "print('\n'+'***** THIS APPLICATION DOES NOT SUPPORT CONTRACTED FORMS *****'+'\n')\n",
+    "print('******************************************************************')\n",
+    "\n",
+    "\n",
+    "def main():\n",
+    "\n",
+    "    \n",
+    "    import spacy\n",
+    "    !python -m spacy download en_core_web_sm\n",
+    "    \n",
+    "    import json\n",
+    "    nlp = spacy.load('en_core_web_sm')\n",
+    "    # floop-conv-data.json is assumed to hold a JSON array of feedback strings\n",
+    "    with open('floop-conv-data.json', 'r') as data:\n",
+    "        jsondata = json.load(data)\n",
+    "    tokens = nlp.pipe(jsondata)\n",
+    "    tokenlist = ''\n",
+    "    for s in tokens:\n",
+    "        tokenlist += s.text + ' '\n",
+    "    \n",
+    "    \n",
+    "    myFile = tokenlist\n",
+    "\n",
+    "# check whether the feedback contains any question marks\n",
+    "    print('\n'+'******************************************************************')\n",
+    "print('                      RESULT for this entry                       ')\n",
+    "    print('******************************************************************')\n",
+    "\n",
+    "    sentList = nltk.sent_tokenize(myFile)\n",
+    "    sentences_qnt = len(sentList)\n",
+    "    print('\n'+'You entered ', str(sentences_qnt), ' sentences.')\n",
+    "\n",
+    "    if '?' in myFile:\n",
+    "        qnt = myFile.count('?')\n",
+    "        print('This entry includes: ' + str(qnt)+' questions!')\n",
+    "        print('\n'+'******************************************************************')\n",
+    "    else:\n",
+    "        print('\n'+'******************************************************************')\n",
+    "        print('There is no question mark in this feedback but wait for more results!')\n",
+    "        print('\n'+'******************************************************************')\n",
+    "    print('Here are more analyses about this entry:'+'\n')\n",
+    "\n",
+    "# lists of words that indicate a question\n",
+    "\n",
+    "    wh_question = ['what', 'when', 'where', 'who',\n",
+    "                   'whom', 'which', 'whose', 'why', 'how']\n",
+    "    yN_question = [\"am\", \"is\", \"are\", \"do\", \"does\", \"did\", \"have\", \"has\", \"was\", \"were\", \"can\", \"cannot\", \"could\",\n",
+    "                   \"couldn't\", \"dare\", \"may\", \"might\", \"must\", \"need\", \"ought\", \"shall\", \"should\", \"shouldn't\", \"will\", \"would\"]\n",
+    "\n",
+    "# initialize\n",
+    "\n",
+    "    first_words = []\n",
+    "    endings = []\n",
+    "    list_of_question_words = []\n",
+    "    type_sentence = []\n",
+    "\n",
+    "# collect the first word of each sentence and its final character\n",
+    "\n",
+    "    for items in sentList:\n",
+    "        first_words.append(items.split(' ')[0])\n",
+    "        endings.append(items[-1])\n",
+    "# classify each sentence by its final character\n",
+    "    for item in endings:\n",
+    "        if item == '.':\n",
+    "            type_sentence.append('Sentence')\n",
+    "        elif item == '!':\n",
+    "            type_sentence.append('Exclamatory')\n",
+    "        elif item == '?':\n",
+    "            type_sentence.append('Interrogative')\n",
+    "        else:\n",
+    "            type_sentence.append('Unknown or Numbers')\n",
+    "    for items in first_words:\n",
+    "        if items in wh_question:\n",
+    "            list_of_question_words.append('WH question')\n",
+    "        elif items in yN_question:\n",
+    "            list_of_question_words.append('Y/N question')\n",
+    "\n",
+    "    for i in range(len(endings)):\n",
+    "        print('Sentence #', str(i+1), 'Type:', type_sentence[i])\n",
+    "        if type_sentence[i] == 'Unknown or Numbers':\n",
+    "            print(\"This sentence ends with a character we did not recognize: \", endings[i])\n",
+    "    for i in range(len(list_of_question_words)):\n",
+    "        print('Question #', str(i+1), 'Type:', list_of_question_words[i])\n",
+    "\n",
+    "    countQuestionWords = Counter(list_of_question_words)\n",
+    "    print('Summary of question words: ', countQuestionWords)\n",
+    "\n",
+    "# clean up the text and turn it into a list of words\n",
+    "\n",
+    "    cleanedText = myFile.translate(str.maketrans('', '', string.punctuation))\n",
+    "    tokenizedList = word_tokenize(cleanedText, \"english\")\n",
+    "\n",
+    "# creating a list of stop words in the feedback\n",
+    "\n",
+    "    listOfWords = [\n",
+    "        item for item in tokenizedList if item in stopwords.words('english')]\n",
+    "    countstopwords = Counter(listOfWords)\n",
+    "\n",
+    "# creating a list of words in the feedback excluding all the stop words\n",
+    "\n",
+    "    finalList = [\n",
+    "        item for item in tokenizedList if item not in stopwords.words('english')]\n",
+    "\n",
+    "    count = 0\n",
+    "    totalCount = 0\n",
+    "    itemsCount = 0\n",
+    "    for item in tokenizedList:\n",
+    "        itemsCount += 1\n",
+    "        if item in stopwords.words('english'):\n",
+    "            count += 1\n",
+    "\n",
+    "# creating a list of emotion words and the words that have emotion according to our predefined list\n",
+    "\n",
+    "    emotionList = []\n",
+    "    wordList = []\n",
+    "\n",
+    "# list of words with their emotions\n",
+    "\n",
+    "    with open('emotions.txt', 'r') as file:\n",
+    "        for line in file:\n",
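+    "            # each emotions.txt line is assumed to look like 'word': emotion, before this cleanup\n",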
+    "            clearLine = line.replace('\n', '').replace(\n",
+    "                ',', '').replace(\"'\", '').replace(' ', '').strip()\n",
+    "            word, emotion = clearLine.split(':')\n",
+    "\n",
+    "            if word in finalList:\n",
+    "                emotionList.append(emotion)\n",
+    "                wordList.append(word)\n",
+    "    countEmotions = Counter(emotionList)\n",
+    "    countwords = Counter(finalList)\n",
+    "    totalCount = count+len(set(tokenizedList) -\n",
+    "                           set(wordList)-set(stopwords.words('english')))\n",
+    "\n",
+    "\n",
+    "# creating a blob text\n",
+    "\n",
+    "    blob = TextBlob(myFile)\n",
+    "    wordCount = blob.word_counts\n",
+    "\n",
+    "# sentiment analysis with TextBlob's NaiveBayes analyzer (trained on movie reviews)\n",
+    "\n",
+    "    blob = TextBlob(myFile, analyzer=NaiveBayesAnalyzer())\n",
+    "    sen_sub = blob.sentiment\n",
+    "\n",
+    "    def sentimentAnalyse(text):\n",
+    "        # VADER scores: neg/neu/pos sum to 1.0, compound lies in [-1, 1]\n",
+    "        score = SentimentIntensityAnalyzer().polarity_scores(text)\n",
+    "        for k, v in score.items():\n",
+    "            print(f\"{k}:{v:.2f}\")\n",
+    "        neg = score['neg']\n",
+    "        pos = score['pos']\n",
+    "        if neg > pos:\n",
+    "            print(\"\n\"+\"In general: Negative sentiment!\")\n",
+    "        elif pos > neg:\n",
+    "            print(\"\n\"+\"In general: Positive sentiment!\")\n",
+    "        else:\n",
+    "            print(\"\n\"+\"In general: Neutral vibe!\")\n",
+    "\n",
+    "    sentimentAnalyse(myFile)\n",
+    "    reg = RegexpTokenizer(r'(?u)\w+|\$[\d\.]+|\s+')\n",
+    "\n",
+    "    print('\n'+'******************************************************************')\n",
+    "    print('Summary of sentiments:')\n",
+    "    print('\n'+'list of words with emotions: ', wordList)\n",
+    "    print('Summary of emotions count: ', countEmotions)\n",
+    "    print('\n'+'******************************************************************')\n",
+    "    print('                       End of Application                        ')\n",
+    "    print('******************************************************************')\n",
+    "\n",
+    "# Drawing the bar chart\n",
+    "\n",
+    "    fig, axl = plt.subplots()\n",
+    "    axl.bar(countEmotions.keys(), countEmotions.values())\n",
+    "    fig.autofmt_xdate()\n",
+    "    plt.savefig('graph.png')\n",
+    "    plt.show()\n",
+    "\n",
+    "\n",
+    "\n",
+    "main()\n",
+    "\n",
+    "\n",
+    "##############################################################\n",
+    "# Symbol\tMeaning\t Example #\n",
+    "#------------------------------------------------------------#\n",
+    "# S\t sentence\t the man walked #\n",
+    "# NP\t noun phrase\t a dog #\n",
+    "# VP \t verb phrase\t saw a park #\n",
+    "# PP\t prepositional phrase\twith a telescope #\n",
+    "# Det\t determiner\t the #\n",
+    "# N\t noun\t dog #\n",
+    "# V\t verb\t walked #\n",
+    "# P\t preposition \t in #\n",
+    "##############################################################\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc0b44ea",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From a3067dfff17ebebf55bb5aafeb34835079fe539d Mon Sep 17 00:00:00 2001
From: Alex Bondar
Date: Sat, 19 Feb 2022 21:21:19 +0000
Subject: [PATCH 2/3] Removed outputs

---
 backend/jupyter/nltk-json.ipynb | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/backend/jupyter/nltk-json.ipynb b/backend/jupyter/nltk-json.ipynb
index e970b12..2e7fb2f 100644
--- a/backend/jupyter/nltk-json.ipynb
+++ b/backend/jupyter/nltk-json.ipynb
@@ -3,8 +3,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8a1e9638",
-   "metadata": {},
+   "id": "6ada69a3",
+   "metadata": {
+    "scrolled": false
+   },
    "outputs": [],
    "source": [
     "%pip install textblob\n",
@@ -15,7 +17,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e41ec177",
+   "id": "477c1dc5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -252,7 +254,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "bc0b44ea",
+   "id": "c0fe54e4",
    "metadata": {},
    "outputs": [],
    "source": []

From 98eb98cd52f23c0f3d3899f9ec4433def7ca7c45 Mon Sep 17 00:00:00 2001
From: Alex Bondar
Date: Sat, 19 Feb 2022 13:28:24 -0800
Subject: [PATCH 3/3] Adding a test comment

---
 backend/jupyter/nltk-json.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/jupyter/nltk-json.ipynb b/backend/jupyter/nltk-json.ipynb
index 2e7fb2f..b1dddae 100644
--- a/backend/jupyter/nltk-json.ipynb
+++ b/backend/jupyter/nltk-json.ipynb
@@ -9,6 +9,7 @@
    },
    "outputs": [],
    "source": [
+    "# Resolving dependencies\n",
     "%pip install textblob\n",
     "%pip install -U pip setuptools wheel\n",
     "%pip install -U spacy"