From 05dcd88fa248898ff6288c195a35ef18a2618a16 Mon Sep 17 00:00:00 2001
From: Jen Looper
Date: Thu, 24 Jun 2021 21:28:56 -0400
Subject: [PATCH] moving files from py to notebook

---
 .../Hotel_Dataset_Filtering.py              |  57 -------
 .../Hotel_Reviews_Sentiment_Analysis.py     |  56 -------
 6-NLP/5-Hotel-Reviews-2/notebook.ipynb      |   0
 .../solution/notebook-filtering.ipynb       | 143 ++++++++++++++++
 .../notebook-sentiment-analysis.ipynb       | 155 ++++++++++++++++++
 5 files changed, 298 insertions(+), 113 deletions(-)
 delete mode 100644 6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py
 delete mode 100644 6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
 create mode 100644 6-NLP/5-Hotel-Reviews-2/notebook.ipynb
 create mode 100644 6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
 create mode 100644 6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb

diff --git a/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py b/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py
deleted file mode 100644
index d11f159487..0000000000
--- a/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import pandas as pd
-import time
-import ast
-
-def replace_address(row):
-    if "Netherlands" in row["Hotel_Address"]:
-        return "Amsterdam, Netherlands"
-    elif "Barcelona" in row["Hotel_Address"]:
-        return "Barcelona, Spain"
-    elif "United Kingdom" in row["Hotel_Address"]:
-        return "London, United Kingdom"
-    elif "Milan" in row["Hotel_Address"]:
-        return "Milan, Italy"
-    elif "France" in row["Hotel_Address"]:
-        return "Paris, France"
-    elif "Vienna" in row["Hotel_Address"]:
-        return "Vienna, Austria"
-    else:
-        return row.Hotel_Address
-
-# Load the hotel reviews from CSV
-start = time.time()
-df = pd.read_csv('Hotel_Reviews.csv')
-
-# dropping columns we will not use:
-df.drop(["lat", "lng"], axis = 1, inplace=True)
-
-# Replace all the addresses with a shortened, more useful form
-df["Hotel_Address"] = df.apply(replace_address, axis = 1)
-
-# Drop `Additional_Number_of_Scoring`
-df.drop(["Additional_Number_of_Scoring"], axis = 1, inplace=True)
-# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values
-df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')
-df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)
-
-# Process the Tags into new columns
-# The file Hotel_Reviews_Tags.py, identifies the most important tags
-# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends,
-# Family with young children, Family with older children, With a pet
-df["Leisure_trip"] = df.Tags.apply(lambda tag: 1 if "Leisure trip" in tag else 0)
-df["Couple"] = df.Tags.apply(lambda tag: 1 if "Couple" in tag else 0)
-df["Solo_traveler"] = df.Tags.apply(lambda tag: 1 if "Solo traveler" in tag else 0)
-df["Business_trip"] = df.Tags.apply(lambda tag: 1 if "Business trip" in tag else 0)
-df["Group"] = df.Tags.apply(lambda tag: 1 if "Group" in tag or "Travelers with friends" in tag else 0)
-df["Family_with_young_children"] = df.Tags.apply(lambda tag: 1 if "Family with young children" in tag else 0)
-df["Family_with_older_children"] = df.Tags.apply(lambda tag: 1 if "Family with older children" in tag else 0)
-df["With_a_pet"] = df.Tags.apply(lambda tag: 1 if "With a pet" in tag else 0)
-
-# No longer need any of these columns
-df.drop(["Tags", "Review_Date", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "days_since_review", "Total_Number_of_Reviews_Reviewer_Has_Given"], axis = 1, inplace=True)
-
-# Saving new data file with calculated columns
-print("Saving results to Hotel_Reviews_Filtered.csv")
-df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)
-end = time.time()
-print("Filtering took " + str(round(end - start, 2)) + " seconds")
diff --git a/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py b/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
deleted file mode 100644
index c53df2b39a..0000000000
--- a/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import time
-import pandas as pd
-from nltk.corpus import stopwords
-from nltk.sentiment.vader import SentimentIntensityAnalyzer
-
-# Create the vader sentiment analyser (there are others in NLTK you can try too)
-vader_sentiment = SentimentIntensityAnalyzer()
-# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
-# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
-
-# There are 3 possibilities of input for a review:
-# It could be "No Negative", in which case, return 0
-# It could be "No Positive", in which case, return 0
-# It could be a review, in which case calculate the sentiment
-def calc_sentiment(review):
-    if review == "No Negative" or review == "No Positive":
-        return 0
-    return vader_sentiment.polarity_scores(review)["compound"]
-
-# Load the hotel reviews from CSV
-df = pd.read_csv("Hotel_Reviews_Filtered.csv")
-
-# Remove stop words - can be slow for a lot of text!
-# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
-# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
-start = time.time()
-cache = set(stopwords.words("english"))
-def remove_stopwords(review):
-    text = " ".join([word for word in review.split() if word not in cache])
-    return text
-
-# Remove the stop words from both columns
-df.Negative_Review = df.Negative_Review.apply(remove_stopwords)
-df.Positive_Review = df.Positive_Review.apply(remove_stopwords)
-
-end = time.time()
-print("Removing stop words took " + str(round(end - start, 2)) + " seconds")
-
-# Add a negative sentiment and positive sentiment column
-print("Calculating sentiment columns for both positive and negative reviews")
-start = time.time()
-df["Negative_Sentiment"] = df.Negative_Review.apply(calc_sentiment)
-df["Positive_Sentiment"] = df.Positive_Review.apply(calc_sentiment)
-end = time.time()
-print("Calculating sentiment took " + str(round(end - start, 2)) + " seconds")
-
-df = df.sort_values(by=["Negative_Sentiment"], ascending=True)
-print(df[["Negative_Review", "Negative_Sentiment"]])
-df = df.sort_values(by=["Positive_Sentiment"], ascending=True)
-print(df[["Positive_Review", "Positive_Sentiment"]])
-
-# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)
-df = df.reindex(["Hotel_Name", "Hotel_Address", "Total_Number_of_Reviews", "Average_Score", "Reviewer_Score", "Negative_Sentiment", "Positive_Sentiment", "Reviewer_Nationality", "Leisure_trip", "Couple", "Solo_traveler", "Business_trip", "Group", "Family_with_young_children", "Family_with_older_children", "With_a_pet", "Negative_Review", "Positive_Review"], axis=1)
-
-print("Saving results to Hotel_Reviews_NLP.csv")
-df.to_csv(r"Hotel_Reviews_NLP.csv", index = False)
diff --git a/6-NLP/5-Hotel-Reviews-2/notebook.ipynb b/6-NLP/5-Hotel-Reviews-2/notebook.ipynb
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
new file mode 100644
index 0000000000..3baa7fc146
--- /dev/null
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
@@ -0,0 +1,143 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": 3
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import time\n",
+    "import ast"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def replace_address(row):\n",
+    "    if \"Netherlands\" in row[\"Hotel_Address\"]:\n",
+    "        return \"Amsterdam, Netherlands\"\n",
+    "    elif \"Barcelona\" in row[\"Hotel_Address\"]:\n",
+    "        return \"Barcelona, Spain\"\n",
+    "    elif \"United Kingdom\" in row[\"Hotel_Address\"]:\n",
+    "        return \"London, United Kingdom\"\n",
+    "    elif \"Milan\" in row[\"Hotel_Address\"]: \n",
+    "        return \"Milan, Italy\"\n",
+    "    elif \"France\" in row[\"Hotel_Address\"]:\n",
+    "        return \"Paris, France\"\n",
+    "    elif \"Vienna\" in row[\"Hotel_Address\"]:\n",
+    "        return \"Vienna, Austria\" \n",
+    "    else:\n",
+    "        return row.Hotel_Address\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the hotel reviews from CSV\n",
+    "start = time.time()\n",
+    "df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# dropping columns we will not use:\n",
+    "df.drop([\"lat\", \"lng\"], axis = 1, inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace all the addresses with a shortened, more useful form\n",
+    "df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop `Additional_Number_of_Scoring`\n",
+    "df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace=True)\n",
+    "# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values\n",
+    "df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')\n",
+    "df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Process the Tags into new columns\n",
+    "# The file Hotel_Reviews_Tags.py, identifies the most important tags\n",
+    "# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n",
+    "# Family with young children, Family with older children, With a pet\n",
+    "df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n",
+    "df[\"Couple\"] = df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n",
+    "df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n",
+    "df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n",
+    "df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n",
+    "df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n",
+    "df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n",
+    "df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# No longer need any of these columns\n",
+    "df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Saving new data file with calculated columns\n",
+    "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
+    "df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n",
+    "end = time.time()\n",
+    "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
+   ]
+  }
+ ]
+}
\ No newline at end of file
diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
new file mode 100644
index 0000000000..9cc9970375
--- /dev/null
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
@@ -0,0 +1,155 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": 3
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "import pandas as pd\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
+    "vader_sentiment = SentimentIntensityAnalyzer()\n",
+    "# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
+    "# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# There are 3 possibilities of input for a review:\n",
+    "# It could be \"No Negative\", in which case, return 0\n",
+    "# It could be \"No Positive\", in which case, return 0\n",
+    "# It could be a review, in which case calculate the sentiment\n",
+    "def calc_sentiment(review): \n",
+    "    if review == \"No Negative\" or review == \"No Positive\":\n",
+    "        return 0\n",
+    "    return vader_sentiment.polarity_scores(review)[\"compound\"] \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the hotel reviews from CSV\n",
+    "df = pd.read_csv(\"../../data/Hotel_Reviews_Filtered.csv\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove stop words - can be slow for a lot of text!\n",
+    "# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches\n",
+    "# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends\n",
+    "start = time.time()\n",
+    "cache = set(stopwords.words(\"english\"))\n",
+    "def remove_stopwords(review):\n",
+    "    text = \" \".join([word for word in review.split() if word not in cache])\n",
+    "    return text\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove the stop words from both columns\n",
+    "df.Negative_Review = df.Negative_Review.apply(remove_stopwords) \n",
+    "df.Positive_Review = df.Positive_Review.apply(remove_stopwords)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "end = time.time()\n",
+    "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add a negative sentiment and positive sentiment column\n",
+    "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
+    "start = time.time()\n",
+    "df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",
+    "df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",
+    "end = time.time()\n",
+    "print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" seconds\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
+    "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
+    "df = df.sort_values(by=[\"Positive_Sentiment\"], ascending=True)\n",
+    "print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)\n",
+    "df = df.reindex([\"Hotel_Name\", \"Hotel_Address\", \"Total_Number_of_Reviews\", \"Average_Score\", \"Reviewer_Score\", \"Negative_Sentiment\", \"Positive_Sentiment\", \"Reviewer_Nationality\", \"Leisure_trip\", \"Couple\", \"Solo_traveler\", \"Business_trip\", \"Group\", \"Family_with_young_children\", \"Family_with_older_children\", \"With_a_pet\", \"Negative_Review\", \"Positive_Review\"], axis=1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
+    "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
+   ]
+  }
+ ]
+}
\ No newline at end of file