From 29d04d94f8fe56ec3d3378112d472734c5ed7ef1 Mon Sep 17 00:00:00 2001 From: Jen Looper Date: Thu, 24 Jun 2021 22:06:15 -0400 Subject: [PATCH] lesson 4 NLP --- 6-NLP/3-Translation-Sentiment/README.md | 4 +- .../3-Translation-Sentiment/solution/book.py | 23 ----- .../solution/notebook.ipynb | 87 +++++++++++++++++++ 6-NLP/4-Hotel-Reviews-1/README.md | 24 ++--- 9-Real-World/1-Applications/README.md | 2 +- 9-Real-World/README.md | 2 +- 6 files changed, 105 insertions(+), 37 deletions(-) delete mode 100644 6-NLP/3-Translation-Sentiment/solution/book.py create mode 100644 6-NLP/3-Translation-Sentiment/solution/notebook.ipynb diff --git a/6-NLP/3-Translation-Sentiment/README.md b/6-NLP/3-Translation-Sentiment/README.md index 1072753592..6d3b1325d0 100644 --- a/6-NLP/3-Translation-Sentiment/README.md +++ b/6-NLP/3-Translation-Sentiment/README.md @@ -6,7 +6,7 @@ In the previous lessons you learned how to build a basic bot using `TextBlob`, a Translation is a very hard problem compounded by the fact that there are thousands of languages and each can have very different grammar rules. One approach is to convert the formal grammar rules for one language, such as English, into a non-language dependent structure, and then translate it by converting back to another language. This approach means that you would take the following steps: -1. **Identification**, identify or tag the words in input language into nouns, verbs etc. +1. **Identification**. Identify or tag the words in input language into nouns, verbs etc. 2. **Create translation**. Produce a direct translation of each word in the target language format. ### Example sentence, English to Irish @@ -139,7 +139,7 @@ Your task is to determine, using sentiment polarity, if *Pride and Prejudice* ha 1. If the polarity is 1 or -1 store the sentence in an array or list of positive or negative messages 5. At the end, print out all the positive sentences and negative sentences (separately) and the number of each. 
-Here is a sample [solution](solutions/book.py). +Here is a sample [solution](solution/notebook.ipynb). ✅ Knowledge Check diff --git a/6-NLP/3-Translation-Sentiment/solution/book.py b/6-NLP/3-Translation-Sentiment/solution/book.py deleted file mode 100644 index a0189ba1b5..0000000000 --- a/6-NLP/3-Translation-Sentiment/solution/book.py +++ /dev/null @@ -1,23 +0,0 @@ -from textblob import TextBlob - -# You should download the book text, clean it, and import it here -with open("pride.txt", encoding="utf8") as f: - file_contents = f.read() - -book_pride = TextBlob(file_contents) -positive_sentiment_sentences = [] -negative_sentiment_sentences = [] - -for sentence in book_pride.sentences: - if sentence.sentiment.polarity == 1: - positive_sentiment_sentences.append(sentence) - if sentence.sentiment.polarity == -1: - negative_sentiment_sentences.append(sentence) - -print("The " + str(len(positive_sentiment_sentences)) + " most positive sentences:") -for sentence in positive_sentiment_sentences: - print("+ " + str(sentence.replace("\n", "").replace(" ", " "))) - - print("The " + str(len(negative_sentiment_sentences)) + " most negative sentences:") -for sentence in negative_sentiment_sentences: - print("- " + str(sentence.replace("\n", "").replace(" ", " "))) diff --git a/6-NLP/3-Translation-Sentiment/solution/notebook.ipynb b/6-NLP/3-Translation-Sentiment/solution/notebook.ipynb new file mode 100644 index 0000000000 --- /dev/null +++ b/6-NLP/3-Translation-Sentiment/solution/notebook.ipynb @@ -0,0 +1,87 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from
textblob import TextBlob\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# You should download the book text, clean it, and import it here\n", + "with open(\"pride.txt\", encoding=\"utf8\") as f:\n", + " file_contents = f.read()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "book_pride = TextBlob(file_contents)\n", + "positive_sentiment_sentences = []\n", + "negative_sentiment_sentences = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for sentence in book_pride.sentences:\n", + " if sentence.sentiment.polarity == 1:\n", + " positive_sentiment_sentences.append(sentence)\n", + " if sentence.sentiment.polarity == -1:\n", + " negative_sentiment_sentences.append(sentence)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The \" + str(len(positive_sentiment_sentences)) + \" most positive sentences:\")\n", + "for sentence in positive_sentiment_sentences:\n", + " print(\"+ \" + str(sentence.replace(\"\\n\", \"\").replace(\" \", \" \")))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The \" + str(len(negative_sentiment_sentences)) + \" most negative sentences:\")\n", + "for sentence in negative_sentiment_sentences:\n", + " print(\"- \" + str(sentence.replace(\"\\n\", \"\").replace(\" \", \" \")))" + ] + } + ] +} \ No newline at end of file diff --git a/6-NLP/4-Hotel-Reviews-1/README.md b/6-NLP/4-Hotel-Reviews-1/README.md index c6247a176f..78b9cce782 100644 --- a/6-NLP/4-Hotel-Reviews-1/README.md +++ b/6-NLP/4-Hotel-Reviews-1/README.md @@ -1,6 +1,10 @@ # Sentiment analysis with hotel reviews - processing the data -In this section you will use the techniques in the previous lessons to do some exploratory data 
analysis of a large dataset. Once you have a good understanding of the usefulness of the various columns, you will learn how to remove the unneeded columns, calculate some new data based on the existing columns, and save the resulting dataset for use in the final challenge. +In this section you will use the techniques in the previous lessons to do some exploratory data analysis of a large dataset. Once you have a good understanding of the usefulness of the various columns, you will learn: + +- how to remove the unneeded columns +- how to calculate some new data based on the existing columns +- how to save the resulting dataset for use in the final challenge ## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/37/) @@ -12,10 +16,10 @@ So far you've learned about how text data is quite unlike numerical types of dat You will need: -* Python 3 +* The ability to run .ipynb notebooks using Python 3 * pandas * NLTK, [which you should install locally](https://www.nltk.org/install.html) -* The data set which is available on Kaggle [515K Hotel Reviews Data in Europe](https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe). It is around 230 MB unzipped. Download it to the `/data` folder associated with these NLP lessons. +* The data set which is available on Kaggle [515K Hotel Reviews Data in Europe](https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe). It is around 230 MB unzipped. Download it to the root `/data` folder associated with these NLP lessons. ## Exploratory data analysis @@ -29,7 +33,7 @@ Using Python, a dataset of hotel reviews, and NLTK's sentiment analysis you coul #### Dataset -Let's explore the dataset first. Remember to download it and save it locally. Open the file in an editor like VS Code or even Excel. As it's a text-based CSV file, any editor that can handle large text files should be able to open it. +Let's explore the dataset that you've downloaded and saved locally. 
Open the file in an editor like VS Code or even Excel. The headers in the dataset are as follows: @@ -86,7 +90,7 @@ Here they are grouped in a way that might be easier to examine: | -------------- | ---------------------- | ---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | ----------------------------------------------------------------------------------------- | | 7.8 | 1945 | 2.5 | This is currently not a hotel but a construction site I was terroized from early morning and all day with unacceptable building noise while resting after a long trip and working in the room People were working all day i e with jackhammers in the adjacent rooms I asked for a room change but no silent room was available To make thinks worse I was overcharged I checked out in the evening since I had to leave very early flight and received an appropiate bill A day later the hotel made another charge without my concent in excess of booked price It s a terrible place Don t punish yourself by booking here | Nothing Terrible place Stay away | Business trip Couple Standard Double Room Stayed 2 nights | -As you can see from this guest, they did not have a happy stay at this hotel. The hotel has a good average score of 7.8 and 1945 reviews, but this reviewer gave it 2.5 and wrote 115 words about how negative their stay was. 
If they wrote nothing at all in the Positive_Review column, you might surmise there was nothing positive, but alas they wrote 7 words of warning. If we just counted words instead of the meaning, or sentiment of the words, we might have a skewed view of the reviewers intent. Strangely, their score of 2.5 is confusing, because if that hotel stay was so bad, why give it any points at all? Investigating the dataset closely, you'll see that the lowest possible score is 2.5, not 0. The highest possible score is 10. +As you can see, this guest did not have a happy stay at this hotel. The hotel has a good average score of 7.8 and 1945 reviews, but this reviewer gave it 2.5 and wrote 115 words about how negative their stay was. If they wrote nothing at all in the Positive_Review column, you might surmise there was nothing positive, but alas they wrote 7 words of warning. If we just counted words instead of the meaning, or sentiment of the words, we might have a skewed view of the reviewer's intent. Strangely, their score of 2.5 is confusing, because if that hotel stay was so bad, why give it any points at all? Investigating the dataset closely, you'll see that the lowest possible score is 2.5, not 0. The highest possible score is 10. ##### Tags @@ -126,8 +130,7 @@ On the possibility that these hotel might be an outlier, and that maybe most of > > When working with this dataset you will write code that calculates something from the text without having to read or analyse the text yourself. This is the essence of NLP, interpreting meaning or sentiment without having to have a human do it. However, it is possible that you will read some of the negative reviews. I would urge you not to, because you don't have to. Some of them are silly, or irrelevant negative hotel reviews, such as "The weather wasn't great", something beyond the control of the hotel, or indeed, anyone. But there is a dark side to some reviews too. Sometimes the negative reviews are racist, sexist, or ageist.
This is unfortunate but to be expected in a dataset scraped off a public website. Some reviewers leave reviews that you would find distasteful, uncomfortable, or upsetting. Better to let the code measure the sentiment than read them yourself and be upset. That said, it is a minority that write such things, but they exist all the same. -## Exercise - +## Exercise - Data exploration ### Load the data That's enough examining the data visually, now you'll write some code and get some answers! This section uses the pandas library. Your very first task is to ensure you can load and read the CSV data. The pandas library has a fast CSV loader, and the result is placed in a dataframe, as in previous lessons. The CSV we are loading has over half a million rows, but only 17 columns. Pandas gives you lots of powerful ways to interact with a dataframe, including the ability to perform operations on every row. @@ -180,7 +183,6 @@ Treat the following questions as coding tasks and attempt to answer them without 7. Calculate and print out how many rows have column `Negative_Review` values of "No Negative" 8. Calculate and print out how many rows have column `Positive_Review` values of "No Positive" 9. Calculate and print out how many rows have column `Positive_Review` values of "No Positive" **and** `Negative_Review` values of "No Negative" - ### Code answers 1. 
Print out the *shape* of the data frame you have just loaded (the shape is the number of rows and columns) @@ -352,10 +354,12 @@ Treat the following questions as coding tasks and attempt to answer them without Number of No Negative reviews: 127890 Number of No Positive reviews: 35946 Number of both No Negative and No Positive reviews: 127 - Lamdas took 9.64 seconds + Lambdas took 9.64 seconds ``` - Another way to do that one is without Lambdas, and use sum to count the rows: +## Another way + +Another way to count the items is to skip the Lambdas and use sum to count the rows: ```python # without lambdas (using a mixture of notations to show you can use both) diff --git a/9-Real-World/1-Applications/README.md b/9-Real-World/1-Applications/README.md index fe6a1794d9..87be29d314 100644 --- a/9-Real-World/1-Applications/README.md +++ b/9-Real-World/1-Applications/README.md @@ -1,4 +1,4 @@ -# Machine learning in the real world - a postscript +# Postscript: Machine learning in the real world In this curriculum, you have learned many ways to prepare data for training and create machine learning models. You built a series of classic regression, clustering, classification, natural language processing, and time series models. Congratulations! Now, you might be wondering what it's all for... what are the real world applications for these models? diff --git a/9-Real-World/README.md b/9-Real-World/README.md index e814e0532b..96dd3d016d 100644 --- a/9-Real-World/README.md +++ b/9-Real-World/README.md @@ -1,4 +1,4 @@ -# Real World Applications of Classic Machine Learning +# Postscript: Real world applications of classic machine learning In this section of the curriculum, you will be introduced to some real-world applications of classical ML. We have scoured the internet to find whitepapers and articles about applications that have used these strategies, avoiding neural networks, deep learning and AI as much as possible.
Learn about how ML is used in business systems, ecological applications, finance, arts and culture, and more.