airbnb_nn_dc.py

# -*- coding: utf-8 -*-
"""AirBnB_NN_DC.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Id4UMB0W-21MUQXhXxlEs3KZPPkaL-y7

# Data Loading and Cleaning

This data is very large, and must be cleaned.

* NaN values must be replaced

* Each column must be converted to a single type (float, string, or integer); see the DtypeWarning

## Data Cleaning
"""

# Mount Google Drive inside the Colab runtime; the listings CSV is later read
# from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import joblib
from joblib import dump
import scipy
!pip install category_encoders

import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

"""### Loading Data"""

# Load the raw listings export from the mounted Drive folder.
# NOTE(review): the notebook header mentions a DtypeWarning, so some columns
# presumably arrive with mixed types; the sections below coerce them one by one.
df = pd.read_csv('/content/drive/My Drive/airbnb_listings_usa.csv')

"""### Bathroom Cleaning(**DONE**)"""

# How many entries are missing before any cleaning (206 in the original run).
df['bathrooms'].isna().sum()

# Raw values, which include a stray "t" marker among the numbers.
df['bathrooms'].unique()

# Missing entries and the "t" marker both become 0, after which every value is
# numeric and the column can be cast to float in a single pass.
df['bathrooms'] = (
    df['bathrooms']
    .fillna(0)
    .replace('t', 0)
    .astype('float')
)

# Post-conditions: no NaNs remain and the dtype is float.
df['bathrooms'].isna().sum()
df['bathrooms'].unique()
df['bathrooms'].dtypes

"""### Zip Cleaning(**DONE**)"""

# Positional lookup of the column being cleaned.
# NOTE(review): presumably the index reported by the DtypeWarning -- confirm.
df.columns[34]

df['zipcode'].unique()

# US ZIP codes have five digits. Keep the leading five-digit group, which also
# trims ZIP+4 suffixes such as "90210-1234". The earlier four-digit pattern
# truncated every code and merged unrelated areas. Casting to str first lets
# entries that were parsed as numbers go through the same pattern instead of
# being turned into NaN by the .str accessor.
zip_prefix = df['zipcode'].astype(str).str.extract(r'^(\d{5})', expand=False)

# Missing or malformed ZIPs are encoded as 0. The int cast drops leading
# zeros, which is acceptable because the value is only used as a category.
df['zipcode'] = zip_prefix.fillna(0).astype('int')

df['zipcode'].unique()

df['zipcode'].dtypes

"""### Square Feet Cleaning(**DONE**)"""

# Positional lookup of the column being cleaned.
df.columns[50]

# Date strings that ended up in this column; each one is replaced by 100.
things_to_drop = [
    '2020-05-16',
    '2020-05-08',
    '2020-05-18',
    '2020-06-09',
    '2020-06-08',
    '2020-05-21',
]

# Missing -> 0, stray dates -> 100, then cast the whole column to float.
df['square_feet'] = (
    df['square_feet']
    .fillna(0)
    .replace(to_replace=things_to_drop, value=100)
    .astype('float')
)

df['square_feet'].dtypes

"""### Number of reviews Cleaning(**DONE**)"""

# Positional lookup of the column being cleaned.
df.columns[66]

df['number_of_reviews'].unique()

# Parse every entry as a number. The earlier pattern r'^(\d{4})' only matched
# strings starting with four digits, so every count below 1000 (and every
# entry already parsed as a number) was turned into 0, and longer counts were
# truncated. Anything that is not a number (e.g. misplaced text) becomes NaN
# and is then recorded as 0 reviews.
df['number_of_reviews'] = (
    pd.to_numeric(df['number_of_reviews'], errors='coerce')
    .fillna(0)
    .astype('int')
)

df['number_of_reviews'].dtypes

"""### Number of reviews LTM cleaning(**DONE**)"""

# Positional lookup of the column being cleaned.
df.columns[67]

# Same treatment as number_of_reviews: parse each entry as a number rather
# than requiring a four-digit prefix, which zeroed every count below 1000.
# Non-numeric entries become NaN and are recorded as 0.
df['number_of_reviews_ltm'] = (
    pd.to_numeric(df['number_of_reviews_ltm'], errors='coerce')
    .fillna(0)
    .astype('int')
)

df['number_of_reviews_ltm'].dtypes

"""### Converting the % columns to floats(**DONE**)"""

# --- host_response_rate -------------------------------------------------
# Missing rates are treated as 0.
df['host_response_rate'] = df['host_response_rate'].fillna(0)

df['host_response_rate'].unique()

# City names that ended up in this column; they carry no rate, so they are
# replaced by 0 as well.
things_to_drop2 = ['Los Angeles', 'Austin', 'Nashville', 'Queens', 'New York', 'San Diego',
                   'Seattle']

df['host_response_rate'] = df['host_response_rate'].replace(to_replace=things_to_drop2,
                                                            value=0)

df['host_response_rate'].unique()

# "95%" -> 0.95; values that are already numeric (the zeros inserted above)
# pass through unchanged.
df['host_response_rate'] = df['host_response_rate'].apply(
    lambda x: float(x.strip('%')) / 100 if isinstance(x, str) else x)

# --- host_acceptance_rate -----------------------------------------------
# Parse "NN%" into a fraction, matching host_response_rate. The earlier
# pattern r'^(\d{4})' could never match a percentage (at most three digits),
# so the whole column collapsed to 0. Entries without a leading "NN%" (missing
# values, placeholders, misplaced text) become 0.
acceptance_pct = df['host_acceptance_rate'].astype(str).str.extract(
    r'^(\d+(?:\.\d+)?)%', expand=False)
df['host_acceptance_rate'] = pd.to_numeric(acceptance_pct, errors='coerce').fillna(0) / 100

print(df['host_response_rate'].dtypes)

print(df['host_acceptance_rate'].dtypes)

"""### Convert the price to float(**DONE**)"""

# Missing prices are replaced by 0 so the column can be cast to float below.
# NOTE(review): price is later used as the regression target, so these rows
# train the model towards a price of 0 -- confirm this is intended.
df['price'] = df['price'].fillna(0)

def clean_currency(x):
    """Strip currency formatting from a single price value.

    String inputs have every ``$`` and ``,`` removed and are returned as
    strings (the numeric cast is left to the caller). Any non-string value is
    handed back untouched.
    """
    if not isinstance(x, str):
        return x
    # One pass that deletes both characters.
    return x.translate(str.maketrans('', '', '$,'))

# Remove "$" and "," from string prices, then cast the whole column to float.
price_without_symbols = df['price'].apply(clean_currency)
df['price'] = price_without_symbols.astype('float')

df['price'].dtypes

"""### Converting text columns to word count columns(**DONE**)"""

# Free-text columns that are summarised by their length and then dropped.
long_text_columns = ['name','summary','space','description','neighborhood_overview','notes','transit','access','interaction','house_rules','host_about']

# Add one "<column>_len" feature per text column holding the number of
# characters in the text (a character count, not a word count). Missing text
# is treated as an empty string so it gets length 0; casting NaN straight to
# str would yield the literal "nan" and a misleading length of 3.
for text_col in long_text_columns:
    df[text_col + '_len'] = df[text_col].fillna('').astype(str).str.len()

# The raw text is no longer needed once its length has been recorded.
df = df.drop(long_text_columns, axis=1)

"""### Dropping Columns due to Unique values, leaky values, Dtype errors, etc."""

# Columns removed before modelling (identifiers, URLs, leaky or redundant
# values, and columns with dtype problems).
var_drop = [
    'Unnamed: 0', 'host_listings_count', 'neighbourhood_cleansed',
    'neighbourhood_group_cleansed', 'latitude', 'longitude', 'weekly_price',
    'monthly_price', 'availability_60', 'availability_90', 'availability_365',
    'review_scores_communication', 'jurisdiction_names',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'require_guest_profile_picture', 'id', 'picture_url', 'host_since',
    'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
    'security_deposit', 'cleaning_fee', 'extra_people', 'has_availability',
    'availability_30', 'calendar_last_scraped', 'first_review', 'last_review',
    'host_location', 'smart_location', 'country_code', 'country',
    'is_location_exact',
]


def wrangle(X):
    """Return a copy of ``X`` without the columns listed in ``var_drop``.

    Args:
        X: DataFrame that contains every column named in ``var_drop``.

    Returns:
        A new DataFrame; ``X`` itself is left unmodified.

    Raises:
        KeyError: if any column in ``var_drop`` is absent from ``X``.
    """
    # Work on a copy so later column assignments on the result never touch
    # the caller's frame (and do not trigger SettingWithCopyWarning).
    X = X.copy()

    X = X.drop(columns=var_drop)

    return X

# Cleaned frame used from here on; df keeps the full column set.
df2 = wrangle(df)

# List the surviving columns, one per line, followed by the frame's shape.
for column_name in df2.columns:
    print(column_name)
print(df2.shape)

df2.head()

"""At this point, the Dtype warning has been taken care of. The dataframe (DF2) should now only consist of single dtype objects per column. This DF must now be passed through some transformations to do the following:

* Remove NaN values
* Encode Categorical Variables
* Make "t" = 1 and "f" = 0

## Transformers for the DF.

### Changing the Trues and Falses(**DONE**)
"""

# Columns stored as "t"/"f" text. Each is rewritten as an integer flag:
# "t" becomes 1 and every other value (including missing) becomes 0.
flag_columns = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'requires_license',
    'instant_bookable',
    'is_business_travel_ready',
    'require_guest_phone_verification',
]

for flag in flag_columns:
    # Boolean mask first, then multiply by 1 to turn True/False into 1/0.
    df2[flag] = df2[flag].eq('t').mul(1)

df2

df2.info()

"""### Transform the Data

### Create the Feature matrix and Target vector(**DONE**)
"""

# Target vector: the nightly price.
y = df2['price']

# Full feature frame (everything except the target) ...
x = df2.drop(columns='price')

# ... and the subset of columns that is actually fed to the model. Taking a
# copy keeps later edits to x3 from touching x.
x3 = x.loc[:, [
    'zipcode', 'city', 'bathrooms', 'bedrooms', 'beds', 'state',
    'property_type', 'square_feet', 'neighbourhood', 'number_of_reviews',
    'review_scores_rating', 'instant_bookable', 'guests_included',
    'is_business_travel_ready', 'cancellation_policy', 'transit_len',
    'description_len', 'host_about_len', 'accommodates',
]].copy()

"""### Column Transformer with mixed types"""

x3.head()

# Numeric inputs: fill gaps with the column median, then standardise.
numeric_features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet', 'number_of_reviews', 'review_scores_rating',
                     'guests_included', 'transit_len', 'description_len', 'host_about_len']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical inputs: fill gaps with the literal "missing", then map each
# category to an integer code. Categories that appear only at prediction time
# (very likely for zipcode / neighbourhood) are encoded as -1 instead of
# raising, which is what allows the fitted encoder to be reused on the test set.
categorical_features = ['zipcode', 'city', 'state', 'property_type', 'neighbourhood', 'instant_bookable',
                         'is_business_travel_ready', 'cancellation_policy']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

X_train, X_test, y_train, y_test = train_test_split(x3, y, test_size=0.2)

# Learn medians, scaling statistics and category codes from the training
# split only ...
X_tra = preprocessor.fit_transform(X_train)

# ... and apply exactly those to the test split. Re-fitting here (as before)
# gave the test data its own scaling and its own category-to-integer mapping,
# so the model was evaluated on features encoded differently from training.
X_tes = preprocessor.transform(X_test)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

"""## Model Implementation, Training, Testing, Accuracy Scoring.

Mean Absolute Error of about $176 when comparing y_test to preds.

This model works, but we are still having trouble connecting it to the live Heroku app.
"""

# Width of the input layer is taken from the preprocessed training matrix
# (19 with the current feature lists), so adding or removing a feature does
# not leave a stale hard-coded size behind.
n_inputs = X_tra.shape[1]

# Small fully connected regressor: two hidden ReLU layers of 64 units and a
# single linear output for the predicted price.
model = keras.Sequential()
model.add(layers.Dense(64, activation="relu", input_shape=[n_inputs]))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(1, activation='linear'))

optimizer = tf.keras.optimizers.RMSprop(0.001)

# Optimise mean squared error; also report MAE and MSE during training.
model.compile(loss='mse',
              optimizer=optimizer,
              metrics=['mae', 'mse'])

model.summary()

# Show the container type of each split before handing it to Keras.
for split in (X_tra, X_tes, y_train, y_test):
    print(type(split))

# Train on the preprocessed training split for 10 epochs.
model.fit(X_tra, y_train, epochs=10, verbose=1)

# Predicted prices for the held-out split.
preds = model.predict(X_tes)

from sklearn.metrics import mean_absolute_error as mae, r2_score as r2

# Container types of the true and predicted targets.
type(y_test)
type(preds)

# Hold-out error (in the same units as price) and explained variance.
mae(y_test, preds)
r2(y_test, preds)

# Shape of one preprocessed row, then a single-row prediction; the double
# brackets keep the row two-dimensional as predict expects a batch.
X_tes[0].shape
model.predict(X_tes[[0]])

"""## Save and load"""