From af0461a289cda4190f7acca4cf4d6fcc3b4f43c7 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Tue, 30 Apr 2019 03:49:16 -0400 Subject: [PATCH] Delete Hotel Description Generation LSTM.ipynb --- Hotel Description Generation LSTM.ipynb | 482 ------------------------ 1 file changed, 482 deletions(-) delete mode 100644 Hotel Description Generation LSTM.ipynb diff --git a/Hotel Description Generation LSTM.ipynb b/Hotel Description Generation LSTM.ipynb deleted file mode 100644 index 9d18b53..0000000 --- a/Hotel Description Generation LSTM.ipynb +++ /dev/null @@ -1,482 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], - "source": [ - "from keras.preprocessing.sequence import pad_sequences\n", - "from keras.layers import Embedding, LSTM, Dense, Dropout\n", - "from keras.preprocessing.text import Tokenizer\n", - "from keras.callbacks import EarlyStopping\n", - "from keras.models import Sequential\n", - "import keras.utils as ku \n", - "import pandas as pd\n", - "import numpy as np\n", - "import string, os \n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "hotel_df = pd.read_csv('Seattle_Hotels.csv', encoding=\"latin-1\")\n", - "all_descriptions = list(hotel_df.desc.values)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "152" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(all_descriptions)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[\"Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \\nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive. Unwind in the bar, and enjoy American cuisine for breakfast, lunch and dinner in our restaurant. The 24-hour Pavilion Pantry? stocks a variety of snacks, drinks and sundries.\"]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corpus = [x for x in all_descriptions]\n", - "corpus[:1]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "t = Tokenizer(num_words=None, filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)\n", - "t.fit_on_texts(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# A dictionary of words and their counts.\n", - "print(t.word_counts)\n", - "\n", - "# A dictionary of words and how many documents each appeared in.\n", - "print(t.word_docs)\n", - "\n", - "# An integer count of the total number of documents that were used to fit the Tokenizer (i.e. total number of documents)\n", - "print(t.document_count)\n", - "\n", - "# A dictionary of words and their uniquely assigned integers.\n", - "print(t.word_index)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 3420 unique tokens.\n" - ] - } - ], - "source": [ - "print('Found %s unique tokens.' % len(t.word_index))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Tokenization\n", - "t = Tokenizer(num_words=None, filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)\n", - "\n", - "def get_sequence_of_tokens(corpus):\n", - " t.fit_on_texts(corpus)\n", - " total_words = len(t.word_index) + 1\n", - " \n", - " input_sequences = []\n", - " for line in corpus:\n", - " token_list = t.texts_to_sequences([line])[0]\n", - " for i in range(1, len(token_list)):\n", - " n_gram_sequence = token_list[:i+1]\n", - " input_sequences.append(n_gram_sequence)\n", - " \n", - " return input_sequences, total_words\n", - "input_sequences, total_words = get_sequence_of_tokens(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[24, 21],\n", - " [24, 21, 1],\n", - " [24, 21, 1, 1734],\n", - " [24, 21, 1, 1734, 1735],\n", - " [24, 21, 1, 1734, 1735, 4],\n", - " [24, 21, 1, 1734, 1735, 4, 81],\n", - " [24, 21, 1, 1734, 1735, 4, 81, 111],\n", - " [24, 21, 1, 1734, 1735, 4, 81, 111, 1],\n", - " [24, 21, 1, 1734, 1735, 4, 81, 111, 1, 330],\n", - " [24, 21, 1, 1734, 1735, 4, 81, 111, 1, 330, 331]]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "input_sequences[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3421" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "total_words" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# pad sequences \n", - "def generate_padded_sequences(input_sequences):\n", - " max_sequence_len = max([len(x) for x in input_sequences])\n", - " input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_len, padding = 'pre'))\n", - " predictors, label = input_sequences[:,:-1],input_sequences[:,-1]\n", - " label = ku.to_categorical(label, num_classes = total_words)\n", - " \n", - " return predictors, label, max_sequence_len\n", - "\n", - "predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Colocations handled automatically by placer.\n", - "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "embedding_1 (Embedding) (None, 491, 10) 34640 \n", - "_________________________________________________________________\n", - "lstm_1 (LSTM) (None, 100) 44400 \n", - "_________________________________________________________________\n", - "dropout_1 (Dropout) (None, 100) 0 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) (None, 3464) 349864 \n", - "=================================================================\n", - "Total params: 428,904\n", - "Trainable params: 428,904\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "def create_model(max_sequence_len, total_words):\n", - " model = Sequential()\n", - " \n", - " # Add Input Embedding Layer\n", - " model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))\n", - " \n", - " # Add Hidden Layer 1 - LSTM Layer\n", - " model.add(LSTM(100))\n", - " model.add(Dropout(0.1))\n", - " \n", - " # Add Output Layer\n", - " model.add(Dense(total_words, activation='softmax'))\n", - "\n", - " model.compile(loss='categorical_crossentropy', optimizer='adam')\n", - " \n", - " return model\n", - "\n", - "model = create_model(max_sequence_len, total_words)\n", - "model.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.cast instead.\n", - "Epoch 1/100\n", - "Epoch 2/100\n", - "Epoch 3/100\n", - "Epoch 4/100\n", - "Epoch 5/100\n", - "Epoch 6/100\n", - "Epoch 7/100\n", - "Epoch 8/100\n", - "Epoch 9/100\n", - "Epoch 10/100\n", - "Epoch 11/100\n", - "Epoch 12/100\n", - "Epoch 13/100\n", - "Epoch 14/100\n", - "Epoch 15/100\n", - "Epoch 16/100\n", - "Epoch 17/100\n", - "Epoch 18/100\n", - "Epoch 19/100\n", - "Epoch 20/100\n", - "Epoch 21/100\n", - "Epoch 22/100\n", - "Epoch 23/100\n", - "Epoch 24/100\n", - "Epoch 25/100\n", - "Epoch 26/100\n", - "Epoch 27/100\n", - "Epoch 28/100\n", - "Epoch 29/100\n", - "Epoch 30/100\n", - "Epoch 31/100\n", - "Epoch 32/100\n", - "Epoch 33/100\n", - "Epoch 34/100\n", - "Epoch 35/100\n", - "Epoch 36/100\n", - "Epoch 37/100\n", - "Epoch 38/100\n", - "Epoch 39/100\n", - "Epoch 40/100\n", - "Epoch 41/100\n", - "Epoch 42/100\n", - "Epoch 43/100\n", - "Epoch 44/100\n", - "Epoch 45/100\n", - "Epoch 46/100\n", - "Epoch 47/100\n", - "Epoch 48/100\n", - "Epoch 49/100\n", - "Epoch 50/100\n", - "Epoch 51/100\n", - "Epoch 52/100\n", - "Epoch 53/100\n", - "Epoch 54/100\n", - "Epoch 55/100\n", - "Epoch 56/100\n", - "Epoch 57/100\n", - "Epoch 58/100\n", - "Epoch 59/100\n", - "Epoch 60/100\n", - "Epoch 61/100\n", - "Epoch 62/100\n", - "Epoch 63/100\n", - "Epoch 64/100\n", - "Epoch 65/100\n", - "Epoch 66/100\n", - "Epoch 67/100\n", - "Epoch 68/100\n", - "Epoch 69/100\n", - "Epoch 70/100\n", - "Epoch 71/100\n", - "Epoch 72/100\n", - "Epoch 73/100\n", - "Epoch 74/100\n", - "Epoch 75/100\n", - "Epoch 76/100\n", - "Epoch 77/100\n", - "Epoch 78/100\n", - "Epoch 79/100\n", - "Epoch 80/100\n", - "Epoch 81/100\n", - "Epoch 82/100\n", - "Epoch 83/100\n", - "Epoch 84/100\n", - "Epoch 85/100\n", - "Epoch 86/100\n", - "Epoch 87/100\n", - "Epoch 88/100\n", - "Epoch 89/100\n", - "Epoch 90/100\n", - "Epoch 91/100\n", - "Epoch 92/100\n", - "Epoch 93/100\n", - "Epoch 94/100\n", - "Epoch 95/100\n", - "Epoch 96/100\n", - "Epoch 97/100\n", - "Epoch 98/100\n", - "Epoch 99/100\n", - "Epoch 100/100\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.fit(predictors, label, epochs=100, verbose=5)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_text(seed_text, next_words, model, max_seq_len):\n", - " for _ in range(next_words):\n", - " token_list = t.texts_to_sequences([seed_text])[0]\n", - " token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')\n", - " \n", - " predicted = model.predict_classes(token_list, verbose=0)\n", - " \n", - " output_word = ''\n", - " \n", - " for word,index in t.word_index.items():\n", - " if index == predicted:\n", - " output_word = word\n", - " break\n", - " \n", - " seed_text = seed_text + \" \" + output_word\n", - " \n", - " return seed_text.desc()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hilton Seattle Downtown Hotel Is Located In The Heart Of Downtown Seattle The Waterfront Inn Is A Contemporary Haven Near The Hotel At El Gaucho With The Simple Food Of Featured With A Large Inviting Of Featured In The Quiet Gym Sound Features A Local Views In The Side Floor Rooms And Harbor From The Seattle From Hotel In Seattle At Our Downtown Seattle Hotel Hotel Is The Friendliest Inn Seattle Airport And Enjoy Us In The Heart Of Seattle And Enjoy A Extended Old Baseball Experience That Within Directly From The Market And A Fullservice Hotel Leisure For A Range Of Upscale\n", - "\n", - "Best Western Seattle Airport Hotel Is A Leading 119 Guestroom Boutique Hotel In Seattle As Conveniently Located Across The Street From The Emp An Interactive Music Music Service With A Balcony 37Inch Airport With A Large Views Of Seattle Seattle Style And A Variety Of The Living Area Rooms To Offer The Unique Views Of The City And Take In The Heart Of The City This Legendary Hotel Offers A Oneofakind Place To The Emerald City Of Boastingunique Accommodationsinspired By The City It It Including Social Room Rooms To Staypineapple Many Of Our Valued Hotel Is The Best Western Of Seattle And South Lake Union And The Light Rail Link Proximity Of Downtown Seattle And Bars The Seattle Mariners Is The Market Is A Fullservice Restaurant Relax And A Business Center To Mt Days And Residents For An Artsy Core And Historic Chefs Northwest Along And A Interior On The Fresh New Of An Renowned World Ride On The Interior And Necessitated New East Our Spacious Guest Rooms Feature A Treasured And Cool Food Got An Library Library Within The 42 Hdtv Cozy Corner With Local Dining Options And A Courtyard Of Our Snack At The 24Hour Luxury Breakfast In Our Guestrooms Desk And Enjoy\n", - "\n", - "Located In The Heart Of Downtown Seattle The Seattle Hotel In Seattle Wa Is Just Steps Away From The Seattle Tacoma International Airport Our Allsuite Hotel Offers Guests Stress Thanks To Easy Access To The Citys Vibrant Sites And Sites And Nightlife The Renovated Music Scene With A Variety Of Rooms And Mind Amenities Like Home In Our Lobby Let The Menu Room That Will Be A Dip In The Indoor Swimming Pool Before Relaxing In Our Hot Tub Or Suites In The Back And Relax In The Bar And Luxury Gym Delicious Cuisine And A Cup Of Coffee Onsite Coffee Tea And Access To Our Lobby Lobby With The Kids Dining Dining Enjoy A Separated Drink And The Market And Relax In Your Convenience Of The Wac Fully Heated Pool And Breakfast Available In The Lobby At The Gaslight Room Need To Area The Room At The Seattle Hotel Also Also Offer A Fellow Urban Experience The Postklondike Gold Rush Days Of The Early 1900S It The Best Fullservice Seattle Area Featuring Spacious Options Rooms With A Warm Satellite Satellite Satellite Cable Tv Today And A Mini Culinary And Menu Featuring Our Plug Bed And All Plus Our Day With A Screen Flat Screen Desk That Several A Variety Of Programming And Beverage Options Including A Fastcasual Italianstyle Trattoria Andare Kitchen Bar Locally Renowned Steakhouse Daniels Broiler And The Market Open 24 Hours In The 24Hour Fitness Center And State Views From The Seattle Area We Offer With Free Wifi And Free Wifi Free Wifi And Free Wifi All Our Rooms Feature Movers And Shakers Such As Amazoncom Tommy Bahama The Cancer Care Alliance Washingtons History Industry Museum And A Variety Of The 24Hour Fitness Center Tastefully Decorated And Flooded With Our Meeting Room And Suites To A Dip In The Fitness Center Take On The\n" - ] - } - ], - "source": [ - "print(generate_text(\"hilton seattle downtown\", 100, model, max_sequence_len))\n", - "print()\n", - "print(generate_text(\"best western seattle airport hotel\", 200, model, max_sequence_len))\n", - "print()\n", - "print(generate_text('located in the heart of downtown seattle', 300, model, max_sequence_len))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}