Linking SageMaker notebook to the GitHub repository #173

Open: wants to merge 4 commits into base: development
285 changes: 285 additions & 0 deletions backend/jupyter/nltk-json.ipynb
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "6ada69a3",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Resolving depedencies\n",
"%pip install textblob\n",
"%pip install -U pip setuptools wheel\n",
"%pip install -U spacy"
]
},
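{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (not part of the original commit): confirm the\n",
"# packages installed above import cleanly before running the main cell.\n",
"import textblob\n",
"import spacy\n",
"print('textblob', textblob.__version__, '/ spacy', spacy.__version__)\n"
]
},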
{
"cell_type": "code",
"execution_count": null,
"id": "477c1dc5",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from os import path\n",
"from os.path import exists\n",
"import string\n",
"from collections import Counter\n",
"from tokenize import maybe\n",
"import nltk\n",
"import matplotlib.pyplot as plt\n",
"from textblob import TextBlob\n",
"from pathlib import Path\n",
"from textblob.sentiments import NaiveBayesAnalyzer\n",
"import nltk.corpus\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.probability import FreqDist\n",
"from nltk.corpus import stopwords\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"import nltk.corpus\n",
"from nltk.tokenize import RegexpTokenizer\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.data import load\n",
"fdist = FreqDist()\n",
"\n",
"# downloading required packages\n",
"nltk.download('nps_chat')\n",
"nltk.download('punkt')\n",
"nltk.download('brown')\n",
"nltk.download('movie_reviews')\n",
"nltk.download('stopwords')\n",
"nltk.download('vader_lexicon')\n",
"nltk.download('tagsets')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"# nltk.help.upenn_tagset('MD')\n",
"\n",
"print('\\n'+'******************************************************************')\n",
"print('This application is about Natural Language Processing (NLP).')\n",
"print('The user enters sentence(s) add using NLP and other algorithm the application')\n",
"print('will analyze that entry and gives some feesback.')\n",
"print('\\n'+'***** THIS APPLICATION DOES NOT SUPPORT CONTRACTED FORM *****'+'\\n')\n",
"print('******************************************************************')\n",
"\n",
"\n",
"def main():\n",
"\n",
" \n",
" import spacy\n",
" !python -m spacy download en_core_web_sm\n",
" \n",
" import json\n",
" nlp = spacy.load('en_core_web_sm')\n",
" data = open('floop-conv-data.json', 'r')\n",
" jsondata = json.load(data)\n",
" tokens = nlp.pipe(jsondata)\n",
" tokenlist = ''\n",
" for s in tokens:\n",
" tokenlist += s.text\n",
" \n",
" \n",
" myFile = tokenlist\n",
"\n",
"# to see if the feedback cointains any ? mark\n",
" print('\\n'+'******************************************************************')\n",
" print(' RESULT for this entry ')\n",
" print('******************************************************************')\n",
"\n",
" sentList = nltk.sent_tokenize(myFile)\n",
" sentences_qnt = len(sentList)\n",
" print('\\n'+'You entered ', str(sentences_qnt), ' sentences.')\n",
"\n",
" if '?' in myFile:\n",
" qnt = myFile.count('?')\n",
" print('This entry includes: ' + str(qnt)+' questions!')\n",
" print('\\n'+'******************************************************************')\n",
" else:\n",
" print('\\n'+'******************************************************************')\n",
" print('There is no question mark in this feedback but wait for more result!')\n",
" print('\\n'+'******************************************************************')\n",
" print('Here are more analyses about this entry:'+'\\n')\n",
"\n",
"# lists of words that indicates question\n",
"\n",
" wh_question = ['what', 'when', 'where', 'who',\n",
" 'whom', 'which', 'whose', 'why', 'how']\n",
" yN_question = [\"am\", \"is\", \"are\", \"do\", \"does\", \"did\", \"have\", \"has\", \"was\", \"were\", \"can\", \"cannot\", \"could\",\n",
" \"couldn't\", \"dare\", \"may\", \"might\", \"must\", \"need\", \"ought\", \"shall\", \"should\", \"shouldn't\", \"will\", \"would\"]\n",
"\n",
"# initialize\n",
"\n",
" first_words = []\n",
" type = []\n",
" list_of_question_words = []\n",
" type_sentence = []\n",
"\n",
"# adding first words of all the sentences in one list and adding the last item in one sentence in another list\n",
"\n",
" for items in sentList:\n",
" first_words.append(items.split(' ')[0])\n",
" type.append(items[-1][-1])\n",
"# validatin of words\n",
" for item in type:\n",
" if item == '.':\n",
" type_sentence.append('Sentence')\n",
" elif item == '!':\n",
" type_sentence.append('Exclamatory')\n",
" elif item == '?':\n",
" type_sentence.append('Interrogative')\n",
" else:\n",
" type_sentence.append('Unknown or Numbers')\n",
" for items in first_words:\n",
" if items in wh_question:\n",
" list_of_question_words.append('WH question')\n",
" elif items in yN_question:\n",
" list_of_question_words.append('Y/N question')\n",
"\n",
" for i in range(len(type)):\n",
" print('Sentence #', str(i+1), 'Type:', type_sentence[i])\n",
" if type_sentence[i] == 'Unknown or Numbers':\n",
" print(\"This is the type that we did not catch it: \", type[i])\n",
" for i in range(len(list_of_question_words)):\n",
" print('Question #', str(i+1), 'Type:', list_of_question_words[i])\n",
"\n",
" countQuestionWords = Counter(list_of_question_words)\n",
" print('Summary of question words: ', countQuestionWords)\n",
"\n",
"# cleanup the text and make it as words in a list\n",
"\n",
" cleanedText = myFile.translate(str.maketrans('', '', string.punctuation))\n",
" tokenizedList = word_tokenize(cleanedText, \"english\")\n",
"\n",
"# creating a list of stop words in the feedback\n",
"\n",
" listOfWords = [\n",
" item for item in tokenizedList if item in stopwords.words('english')]\n",
" countstopwords = Counter(listOfWords)\n",
"\n",
"# creating a list of words in the feedback excluded all the stop words\n",
"\n",
" finalList = [\n",
" item for item in tokenizedList if item not in stopwords.words('english')]\n",
"\n",
" count = 0\n",
" totalCount = 0\n",
" itemsCount = 0\n",
" for item in tokenizedList:\n",
" itemsCount += 1\n",
" if item in stopwords.words('english'):\n",
" count += 1\n",
"\n",
"# creating a list of emotion words and the words that have emotion according to our pre defined list\n",
"\n",
" emotionList = []\n",
" wordList = []\n",
"\n",
"# list of words with their emotions\n",
"\n",
" with open('emotions.txt', 'r') as file:\n",
" for line in file:\n",
" clearLine = line.replace('\\n', '').replace(\n",
" ',', '').replace(\"'\", '').replace(' ', '').strip()\n",
" word, emotion = clearLine.split(':')\n",
"\n",
" if word in finalList:\n",
" emotionList.append(emotion)\n",
" wordList.append(word)\n",
" countEmotions = Counter(emotionList)\n",
" countwords = Counter(finalList)\n",
" totalCount = count+len(set(tokenizedList) -\n",
" set(wordList)-set(stopwords.words('english')))\n",
"\n",
"\n",
"# creating a blob text\n",
"\n",
" blob = TextBlob(myFile)\n",
" wordCount = blob.word_counts\n",
"\n",
"# make all the typos correct\n",
"\n",
" blob = TextBlob(myFile, analyzer=NaiveBayesAnalyzer())\n",
" sen_sub = blob.sentiment\n",
"\n",
" def sentimentAnalyse(text):\n",
" score = SentimentIntensityAnalyzer().polarity_scores(text)\n",
" for k,v in score.items():\n",
" print(f\"{k}:{v:.2f}\")\n",
" neg = score['neg']\n",
" pos = score['pos']\n",
" if neg > pos:\n",
" print(\"\\n\"+\"In general: Negative sentiment!\")\n",
" elif pos > neg:\n",
" print(\"\\n\"+\"In general: Positive sentiment!\")\n",
" else:\n",
" print(\"\\n\"+\"In general: Neutral vibe!\")\n",
"\n",
" sentimentAnalyse(myFile)\n",
" reg = RegexpTokenizer('(?u)\\w+|\\$[\\d\\.]+|\\s+')\n",
"\n",
" print('\\n'+'******************************************************************')\n",
" print('Summary of sentiments:')\n",
" print('\\n'+'list of words with emotions: ', wordList)\n",
" print('Summary of emotions count: ', countEmotions)\n",
" print('\\n'+'******************************************************************')\n",
" print(' End of Application ')\n",
" print('******************************************************************')\n",
"\n",
"# Drawing the bar chart\n",
"\n",
" fig, axl = plt.subplots()\n",
" axl.bar(countEmotions.keys(), countEmotions.values())\n",
" fig.autofmt_xdate()\n",
" plt.savefig('graph.png')\n",
" plt.show()\n",
"\n",
"\n",
"\n",
"main()\n",
"\n",
"\n",
"##############################################################\n",
"# Symbol\tMeaning\t Example #\n",
"#------------------------------------------------------------#\n",
"# S\t sentence\t the man walked #\n",
"# NP\t noun phrase\t a dog #\n",
"# VP \t verb phrase\t saw a park #\n",
"# PP\t prepositional phrase\twith a telescope #\n",
"# Det\t determiner\t the #\n",
"# N\t noun\t dog #\n",
"# V\t verb\t walked #\n",
"# P\t preposition \t in #\n",
"##############################################################\n"
]
},
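{
"cell_type": "code",
"execution_count": null,
"id": "b7e91f02",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch (not part of this PR) of how the grammar symbols in the\n",
"# table above are used in an nltk context-free grammar; the grammar and the\n",
"# sample sentence follow the classic example from the NLTK book.\n",
"grammar = nltk.CFG.fromstring(\"\"\"\n",
"S -> NP VP\n",
"VP -> V NP | V NP PP\n",
"PP -> P NP\n",
"NP -> Det N | Det N PP\n",
"Det -> 'the' | 'a'\n",
"N -> 'man' | 'dog' | 'park' | 'telescope'\n",
"V -> 'saw' | 'walked'\n",
"P -> 'in' | 'with'\n",
"\"\"\")\n",
"parser = nltk.ChartParser(grammar)\n",
"for tree in parser.parse('the dog saw a man in the park'.split()):\n",
"    print(tree)\n"
]
},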
{
"cell_type": "code",
"execution_count": null,
"id": "c0fe54e4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}