-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,393 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "7a6f6fd1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Import required packages\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"from sklearn.linear_model import LinearRegression\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.metrics import mean_squared_error # MSE\n", | ||
"from sklearn.metrics import mean_absolute_error # MAE\n", | ||
"from sklearn.metrics import mean_absolute_percentage_error # MAPE\n", | ||
"from sklearn.metrics import mean_squared_log_error # MSLE" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "d01a91f2", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>id</th>\n", | ||
" <th>date</th>\n", | ||
" <th>price</th>\n", | ||
" <th>bedrooms</th>\n", | ||
" <th>bathrooms</th>\n", | ||
" <th>sqft_living</th>\n", | ||
" <th>sqft_lot</th>\n", | ||
" <th>floors</th>\n", | ||
" <th>waterfront</th>\n", | ||
" <th>view</th>\n", | ||
" <th>...</th>\n", | ||
" <th>grade</th>\n", | ||
" <th>sqft_above</th>\n", | ||
" <th>sqft_basement</th>\n", | ||
" <th>yr_built</th>\n", | ||
" <th>yr_renovated</th>\n", | ||
" <th>zipcode</th>\n", | ||
" <th>lat</th>\n", | ||
" <th>long</th>\n", | ||
" <th>sqft_living15</th>\n", | ||
" <th>sqft_lot15</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>7129300520</td>\n", | ||
" <td>20141013T000000</td>\n", | ||
" <td>221900.0</td>\n", | ||
" <td>3</td>\n", | ||
" <td>1.00</td>\n", | ||
" <td>1180</td>\n", | ||
" <td>5650</td>\n", | ||
" <td>1.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>...</td>\n", | ||
" <td>7</td>\n", | ||
" <td>1180</td>\n", | ||
" <td>0</td>\n", | ||
" <td>1955</td>\n", | ||
" <td>0</td>\n", | ||
" <td>98178</td>\n", | ||
" <td>47.5112</td>\n", | ||
" <td>-122.257</td>\n", | ||
" <td>1340</td>\n", | ||
" <td>5650</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>6414100192</td>\n", | ||
" <td>20141209T000000</td>\n", | ||
" <td>538000.0</td>\n", | ||
" <td>3</td>\n", | ||
" <td>2.25</td>\n", | ||
" <td>2570</td>\n", | ||
" <td>7242</td>\n", | ||
" <td>2.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>...</td>\n", | ||
" <td>7</td>\n", | ||
" <td>2170</td>\n", | ||
" <td>400</td>\n", | ||
" <td>1951</td>\n", | ||
" <td>1991</td>\n", | ||
" <td>98125</td>\n", | ||
" <td>47.7210</td>\n", | ||
" <td>-122.319</td>\n", | ||
" <td>1690</td>\n", | ||
" <td>7639</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>5631500400</td>\n", | ||
" <td>20150225T000000</td>\n", | ||
" <td>180000.0</td>\n", | ||
" <td>2</td>\n", | ||
" <td>1.00</td>\n", | ||
" <td>770</td>\n", | ||
" <td>10000</td>\n", | ||
" <td>1.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>...</td>\n", | ||
" <td>6</td>\n", | ||
" <td>770</td>\n", | ||
" <td>0</td>\n", | ||
" <td>1933</td>\n", | ||
" <td>0</td>\n", | ||
" <td>98028</td>\n", | ||
" <td>47.7379</td>\n", | ||
" <td>-122.233</td>\n", | ||
" <td>2720</td>\n", | ||
" <td>8062</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>2487200875</td>\n", | ||
" <td>20141209T000000</td>\n", | ||
" <td>604000.0</td>\n", | ||
" <td>4</td>\n", | ||
" <td>3.00</td>\n", | ||
" <td>1960</td>\n", | ||
" <td>5000</td>\n", | ||
" <td>1.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>...</td>\n", | ||
" <td>7</td>\n", | ||
" <td>1050</td>\n", | ||
" <td>910</td>\n", | ||
" <td>1965</td>\n", | ||
" <td>0</td>\n", | ||
" <td>98136</td>\n", | ||
" <td>47.5208</td>\n", | ||
" <td>-122.393</td>\n", | ||
" <td>1360</td>\n", | ||
" <td>5000</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>1954400510</td>\n", | ||
" <td>20150218T000000</td>\n", | ||
" <td>510000.0</td>\n", | ||
" <td>3</td>\n", | ||
" <td>2.00</td>\n", | ||
" <td>1680</td>\n", | ||
" <td>8080</td>\n", | ||
" <td>1.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>...</td>\n", | ||
" <td>8</td>\n", | ||
" <td>1680</td>\n", | ||
" <td>0</td>\n", | ||
" <td>1987</td>\n", | ||
" <td>0</td>\n", | ||
" <td>98074</td>\n", | ||
" <td>47.6168</td>\n", | ||
" <td>-122.045</td>\n", | ||
" <td>1800</td>\n", | ||
" <td>7503</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"<p>5 rows × 21 columns</p>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" id date price bedrooms bathrooms sqft_living \\\n", | ||
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", | ||
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", | ||
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", | ||
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", | ||
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", | ||
"\n", | ||
" sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n", | ||
"0 5650 1.0 0 0 ... 7 1180 0 \n", | ||
"1 7242 2.0 0 0 ... 7 2170 400 \n", | ||
"2 10000 1.0 0 0 ... 6 770 0 \n", | ||
"3 5000 1.0 0 0 ... 7 1050 910 \n", | ||
"4 8080 1.0 0 0 ... 8 1680 0 \n", | ||
"\n", | ||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n", | ||
"0 1955 0 98178 47.5112 -122.257 1340 \n", | ||
"1 1951 1991 98125 47.7210 -122.319 1690 \n", | ||
"2 1933 0 98028 47.7379 -122.233 2720 \n", | ||
"3 1965 0 98136 47.5208 -122.393 1360 \n", | ||
"4 1987 0 98074 47.6168 -122.045 1800 \n", | ||
"\n", | ||
" sqft_lot15 \n", | ||
"0 5650 \n", | ||
"1 7639 \n", | ||
"2 8062 \n", | ||
"3 5000 \n", | ||
"4 7503 \n", | ||
"\n", | ||
"[5 rows x 21 columns]" | ||
] | ||
}, | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Load the house sales dataset\n", | ||
"# Source: https://www.kaggle.com/datasets/harlfoxem/housesalesprediction\n", | ||
"df = pd.read_csv(filepath_or_buffer=\"datasets/kc_house_data.csv\")\n", | ||
"\n", | ||
"# Preview the first five rows\n", | ||
"df.head(5)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "8e6c73dc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Split the frame into predictors (x) and the target (y).\n", | ||
"# 'id' and 'date' act as record keys, so they are left out of the predictors;\n", | ||
"# 'zipcode', 'lat' and 'long' are not selected either.\n", | ||
"feature_columns = [\n", | ||
"    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',\n", | ||
"    'waterfront', 'view', 'condition', 'grade', 'sqft_above',\n", | ||
"    'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15',\n", | ||
"]\n", | ||
"x = df[feature_columns]\n", | ||
"# Double brackets keep y as a single-column DataFrame\n", | ||
"y = df[['price']]\n", | ||
"\n", | ||
"# Train/test split at 6:4. The seed is fixed so that Restart & Run All\n", | ||
"# reproduces the same split and therefore the same metrics.\n", | ||
"x_train, x_test, y_train, y_test = train_test_split(\n", | ||
"    x, y, train_size=0.6, test_size=0.4, random_state=42\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "ffb7e63a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fit a multiple linear regression (with intercept) on the training split\n", | ||
"mreg = LinearRegression(fit_intercept=True).fit(x_train, y_train)\n", | ||
"\n", | ||
"# Apply the fitted model to the test split\n", | ||
"y_predict = mreg.predict(X=x_test)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "5548daf8", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"RMSE : 214125.86\n", | ||
"MAE : 139072.03\n", | ||
"MAPE : 0.29\n", | ||
"RMSLE : 0.69\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# RMSE: square root of the mean squared error\n", | ||
"MSE = mean_squared_error(y_test, y_predict)\n", | ||
"RMSE = np.sqrt(MSE)\n", | ||
"print(f\"RMSE : {RMSE:.2f}\")\n", | ||
"\n", | ||
"# MAE: mean absolute error\n", | ||
"MAE = mean_absolute_error(y_test, y_predict)\n", | ||
"print(f\"MAE : {MAE:.2f}\")\n", | ||
"\n", | ||
"# MAPE: mean absolute percentage error (reported as a fraction, not in %)\n", | ||
"MAPE = mean_absolute_percentage_error(y_test, y_predict)\n", | ||
"print(f\"MAPE : {MAPE:.2f}\")\n", | ||
"\n", | ||
"# RMSLE: square root of the mean squared log error\n", | ||
"# The linear model can output negative prices, which the log-based metric\n", | ||
"# cannot handle, so negative predictions are floored at 0 first.\n", | ||
"y_predict_rmsle = np.clip(y_predict, 0, None)\n", | ||
"\n", | ||
"MSLE = mean_squared_log_error(y_test, y_predict_rmsle)\n", | ||
"RMSLE = np.sqrt(MSLE)\n", | ||
"print(f\"RMSLE : {RMSLE:.2f}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "3ad066b9", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"C:\\Users\\angry\\AppData\\Local\\Temp\\ipykernel_28972\\1962255028.py:8: RuntimeWarning: invalid value encountered in log\n", | ||
" log_y_predict = np.log(y_predict + 1)\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"price 0.3703\n", | ||
"dtype: float64" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# RMSLE ver. 2 (hand-rolled implementation)\n", | ||
"\n", | ||
"def rmsle(predicted_values, actual_values):\n", | ||
"    \"\"\"Return the root mean squared logarithmic error as a float.\n", | ||
"\n", | ||
"    Both inputs are flattened to 1-D float arrays, so arrays, Series and\n", | ||
"    single-column DataFrames of equal length are accepted.\n", | ||
"    Negative predictions are floored at 0 before the log transform, because\n", | ||
"    log(1 + p) is undefined for p <= -1 and would otherwise become NaN.\n", | ||
"    Actual values are used as given.\n", | ||
"    \"\"\"\n", | ||
"    predicted = np.clip(np.asarray(predicted_values, dtype=float).ravel(), 0, None)\n", | ||
"    actual = np.asarray(actual_values, dtype=float).ravel()\n", | ||
"\n", | ||
"    # log1p(v) == log(v + 1); difference of the log-transformed values\n", | ||
"    log_diff = np.log1p(predicted) - np.log1p(actual)\n", | ||
"\n", | ||
"    # Mean of the squared differences, then the square root\n", | ||
"    return float(np.sqrt(np.mean(np.square(log_diff))))\n", | ||
"\n", | ||
"rmsle(y_predict, y_test)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |