diff --git a/data_science/parkinsons_disease/Pakinsonsmodel2.ipynb b/data_science/parkinsons_disease/Pakinsonsmodel2.ipynb new file mode 100644 index 00000000..2c13ef57 --- /dev/null +++ b/data_science/parkinsons_disease/Pakinsonsmodel2.ipynb @@ -0,0 +1,902 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "acXt4XKngePt" + }, + "outputs": [], + "source": [ + "# Importing necessary libraries\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.ensemble import GradientBoostingClassifier\n", + "from xgboost import XGBClassifier\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import recall_score\n", + "from sklearn.metrics import f1_score\n" + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Location of the dataset workbook. Kept in one named constant so the notebook\n", + "# can be pointed at another copy of the file without editing the load call.\n", + "DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/Parkinsons Dataset.xlsx'\n", + "\n", + "# Load the dataset\n", + "df = pd.read_excel(DATA_PATH)\n" + ], + "metadata": { + "id": "Dm8SnGgAgixk" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.info()\n", + "df.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 873 + }, + "id": "c0yOQDn0qJlg", + "outputId": "4cdcc109-d6d7-4746-b5e0-45a889f663d8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ +
"\n", + "RangeIndex: 195 entries, 0 to 194\n", + "Data columns (total 24 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 name 195 non-null object \n", + " 1 MDVP:Fo(Hz) 195 non-null float64\n", + " 2 MDVP:Fhi(Hz) 195 non-null float64\n", + " 3 MDVP:Flo(Hz) 195 non-null float64\n", + " 4 MDVP:Jitter(%) 195 non-null float64\n", + " 5 MDVP:Jitter(Abs) 195 non-null float64\n", + " 6 MDVP:RAP 195 non-null float64\n", + " 7 MDVP:PPQ 195 non-null float64\n", + " 8 Jitter:DDP 195 non-null float64\n", + " 9 MDVP:Shimmer 195 non-null float64\n", + " 10 MDVP:Shimmer(dB) 195 non-null float64\n", + " 11 Shimmer:APQ3 195 non-null float64\n", + " 12 Shimmer:APQ5 195 non-null float64\n", + " 13 MDVP:APQ 195 non-null float64\n", + " 14 Shimmer:DDA 195 non-null float64\n", + " 15 NHR 195 non-null float64\n", + " 16 HNR 195 non-null float64\n", + " 17 status 195 non-null int64 \n", + " 18 RPDE 195 non-null float64\n", + " 19 DFA 195 non-null float64\n", + " 20 spread1 195 non-null float64\n", + " 21 spread2 195 non-null float64\n", + " 22 D2 195 non-null float64\n", + " 23 PPE 195 non-null float64\n", + "dtypes: float64(22), int64(1), object(1)\n", + "memory usage: 36.7+ KB\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) \\\n", + "count 195.000000 195.000000 195.000000 195.000000 \n", + "mean 154.228641 197.104918 116.324631 0.006220 \n", + "std 41.390065 91.491548 43.521413 0.004848 \n", + "min 88.333000 102.145000 65.476000 0.001680 \n", + "25% 117.572000 134.862500 84.291000 0.003460 \n", + "50% 148.790000 175.829000 104.315000 0.004940 \n", + "75% 182.769000 224.205500 140.018500 0.007365 \n", + "max 260.105000 592.030000 239.170000 0.033160 \n", + "\n", + " MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer \\\n", + "count 195.000000 195.000000 195.000000 195.000000 195.000000 \n", + "mean 0.000044 0.003306 0.003446 0.009920 
0.029709 \n", + "std 0.000035 0.002968 0.002759 0.008903 0.018857 \n", + "min 0.000007 0.000680 0.000920 0.002040 0.009540 \n", + "25% 0.000020 0.001660 0.001860 0.004985 0.016505 \n", + "50% 0.000030 0.002500 0.002690 0.007490 0.022970 \n", + "75% 0.000060 0.003835 0.003955 0.011505 0.037885 \n", + "max 0.000260 0.021440 0.019580 0.064330 0.119080 \n", + "\n", + " MDVP:Shimmer(dB) ... Shimmer:DDA NHR HNR status \\\n", + "count 195.000000 ... 195.000000 195.000000 195.000000 195.000000 \n", + "mean 0.282251 ... 0.046993 0.024847 21.885974 0.753846 \n", + "std 0.194877 ... 0.030459 0.040418 4.425764 0.431878 \n", + "min 0.085000 ... 0.013640 0.000650 8.441000 0.000000 \n", + "25% 0.148500 ... 0.024735 0.005925 19.198000 1.000000 \n", + "50% 0.221000 ... 0.038360 0.011660 22.085000 1.000000 \n", + "75% 0.350000 ... 0.060795 0.025640 25.075500 1.000000 \n", + "max 1.302000 ... 0.169420 0.314820 33.047000 1.000000 \n", + "\n", + " RPDE DFA spread1 spread2 D2 PPE \n", + "count 195.000000 195.000000 195.000000 195.000000 195.000000 195.000000 \n", + "mean 0.498536 0.718099 -5.684397 0.226510 2.381826 0.206552 \n", + "std 0.103942 0.055336 1.090208 0.083406 0.382799 0.090119 \n", + "min 0.256570 0.574282 -7.964984 0.006274 1.423287 0.044539 \n", + "25% 0.421306 0.674758 -6.450096 0.174351 2.099125 0.137451 \n", + "50% 0.495954 0.722254 -5.720868 0.218885 2.361532 0.194052 \n", + "75% 0.587562 0.761881 -5.046192 0.279234 2.636456 0.252980 \n", + "max 0.685151 0.825288 -2.434031 0.450493 3.671155 0.527367 \n", + "\n", + "[8 rows x 23 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
MDVP:Fo(Hz)MDVP:Fhi(Hz)MDVP:Flo(Hz)MDVP:Jitter(%)MDVP:Jitter(Abs)MDVP:RAPMDVP:PPQJitter:DDPMDVP:ShimmerMDVP:Shimmer(dB)...Shimmer:DDANHRHNRstatusRPDEDFAspread1spread2D2PPE
count195.000000195.000000195.000000195.000000195.000000195.000000195.000000195.000000195.000000195.000000...195.000000195.000000195.000000195.000000195.000000195.000000195.000000195.000000195.000000195.000000
mean154.228641197.104918116.3246310.0062200.0000440.0033060.0034460.0099200.0297090.282251...0.0469930.02484721.8859740.7538460.4985360.718099-5.6843970.2265102.3818260.206552
std41.39006591.49154843.5214130.0048480.0000350.0029680.0027590.0089030.0188570.194877...0.0304590.0404184.4257640.4318780.1039420.0553361.0902080.0834060.3827990.090119
min88.333000102.14500065.4760000.0016800.0000070.0006800.0009200.0020400.0095400.085000...0.0136400.0006508.4410000.0000000.2565700.574282-7.9649840.0062741.4232870.044539
25%117.572000134.86250084.2910000.0034600.0000200.0016600.0018600.0049850.0165050.148500...0.0247350.00592519.1980001.0000000.4213060.674758-6.4500960.1743512.0991250.137451
50%148.790000175.829000104.3150000.0049400.0000300.0025000.0026900.0074900.0229700.221000...0.0383600.01166022.0850001.0000000.4959540.722254-5.7208680.2188852.3615320.194052
75%182.769000224.205500140.0185000.0073650.0000600.0038350.0039550.0115050.0378850.350000...0.0607950.02564025.0755001.0000000.5875620.761881-5.0461920.2792342.6364560.252980
max260.105000592.030000239.1700000.0331600.0002600.0214400.0195800.0643300.1190801.302000...0.1694200.31482033.0470001.0000000.6851510.825288-2.4340310.4504933.6711550.527367
\n", + "

8 rows × 23 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe" + } + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']\n", + "categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']\n", + "numeric_features\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iHIy7dsmqRgh", + "outputId": "31af90b8-3d1a-4528-f283-6be90dad92b1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['MDVP:Fo(Hz)',\n", + " 'MDVP:Fhi(Hz)',\n", + " 'MDVP:Flo(Hz)',\n", + " 'MDVP:Jitter(%)',\n", + " 'MDVP:Jitter(Abs)',\n", + " 'MDVP:RAP',\n", + " 'MDVP:PPQ',\n", + " 'Jitter:DDP',\n", + " 'MDVP:Shimmer',\n", + " 'MDVP:Shimmer(dB)',\n", + " 'Shimmer:APQ3',\n", + " 'Shimmer:APQ5',\n", + " 'MDVP:APQ',\n", + " 'Shimmer:DDA',\n", + " 'NHR',\n", + " 'HNR',\n", + " 'status',\n", + " 'RPDE',\n", + " 'DFA',\n", + " 'spread1',\n", + " 'spread2',\n", + " 'D2',\n", + " 'PPE']" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Preprocessing: Drop the 'status' column and select only numeric columns for scaling\n", + "x = df.drop('status', axis=1)\n", + "x_numeric = x.select_dtypes(include=[np.number])\n", + "y = df['status'] # Target variable\n", + "\n" + ], + "metadata": { + "id": "yTz9C9j4gmu_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Count the number of instances in each class, ordered by class value (0 first, then 1)\n", + "class_counts = y.value_counts().sort_index()\n", + "\n", + "# Map the numeric target to readable names instead of relabelling tick positions by hand.\n", + "# Per the UCI Parkinsons dataset description: status 0 = healthy, status 1 = Parkinson's.\n", + "status_names = {0: 'Healthy', 1: 'Parkinson\'s Disease'}\n", + "class_labels = [status_names[value] for value in class_counts.index]\n", + "\n", + "# Visualize the class distribution using a bar plot\n", + "fig, ax = plt.subplots(figsize=(6, 6))\n", + "# hue=<x variable> with legend=False replaces the deprecated palette-without-hue call\n", + "sns.barplot(x=class_labels, y=class_counts.values, hue=class_labels, palette=\"viridis\", legend=False, ax=ax)\n", + "ax.set_title('Healthy vs Parkinson\'s Disease Distribution')\n", +
"ax.set_xlabel('Status')\n", + "ax.set_ylabel('Count')\n", + "plt.show()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 669 + }, + "id": "rJCiq22dqlEa", + "outputId": "22a4f560-7b08-43e1-8bff-e492a2315d7c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":6: FutureWarning: \n", + "\n", + "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n", + "\n", + " sns.barplot(x=class_counts.index, y=class_counts.values, palette=\"viridis\")\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Split the data into training and test sets BEFORE scaling, so the test rows\n", + "# never contribute to the scaler's mean/variance (avoids train/test leakage)\n", + "x_train_raw, x_test_raw, y_train, y_test = train_test_split(x_numeric, y, test_size=0.3, stratify=y, random_state=2)\n", + "\n", + "# Fit the scaler on the training split only, then apply the same transform to both splits\n", + "scaler = StandardScaler()\n", + "x_train = scaler.fit_transform(x_train_raw)\n", + "x_test = scaler.transform(x_test_raw)\n", + "\n", + "\n" + ], + "metadata": { + "id": "dIQB8HCYg-Ix" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Define new models to evaluate. Estimators that accept a seed get a fixed\n", + "# random_state (same value as the split) so a fresh run reproduces the same scores.\n", + "new_models = {\n", + "    'K-Nearest Neighbors': KNeighborsClassifier(),\n", + "    'Gradient Boosting': GradientBoostingClassifier(random_state=2),\n", + "    'XGBoost': XGBClassifier(random_state=2),\n", + "    'Naive Bayes': GaussianNB(),\n", + "    'Neural Network (MLPClassifier)': MLPClassifier(max_iter=500, random_state=2)\n", + "}\n", + "\n", + "# Initialize lists to store performance metrics (one entry per model, in dict order)\n", + "accuracy = []\n", + "precision = []\n", + "recall = []\n", + "f1 = []\n", + "\n", + "for name, model in new_models.items():\n", + "    classifier = model.fit(x_train, y_train)\n", + "    y_pred = classifier.predict(x_test)\n", + "\n", + "    # Compute each metric once and reuse it for both storage and printing\n", + "    acc_value = accuracy_score(y_test, y_pred)\n", + "    precision_value = precision_score(y_test, y_pred)\n", + "    recall_value = recall_score(y_test, y_pred)\n", + "    f1_value = f1_score(y_test, y_pred)\n", + "\n", + "    # Append performance metrics to corresponding lists\n", + "    accuracy.append(acc_value)\n", + "    precision.append(precision_value)\n", + "    recall.append(recall_value)\n", + "    f1.append(f1_value)\n", + "\n", + "    # Print performance for each model\n", + "    print(f\"{name}:\")\n", + "    print(f\" Accuracy: {acc_value:.4f}\")\n", + "    print(f\" Precision: {precision_value:.4f}\")\n", + "    print(f\" Recall: {recall_value:.4f}\")\n", + "    print(f\" F1 Score: {f1_value:.4f}\")\n", + "    print(\"-\" * 40)\n", + "\n", + "# Create a DataFrame to store the results\n", + "results_new_models = pd.DataFrame({\n", + " \"Model\": list(new_models.keys()),\n", + " \"Accuracy\": 
[round(a, 4) for a in accuracy],\n", + " \"Precision\": [round(p, 4) for p in precision],\n", + " \"Recall\": [round(r, 4) for r in recall],\n", + " \"F1 Score\": [round(f, 4) for f in f1]\n", + "})\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0Z9ICuT2n2UI", + "outputId": "b684971a-eac0-40b7-a5de-9ee0b909ed4e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "K-Nearest Neighbors:\n", + " Accuracy: 0.9492\n", + " Precision: 0.9362\n", + " Recall: 1.0000\n", + " F1 Score: 0.9670\n", + "----------------------------------------\n", + "Gradient Boosting:\n", + " Accuracy: 0.9153\n", + " Precision: 0.9333\n", + " Recall: 0.9545\n", + " F1 Score: 0.9438\n", + "----------------------------------------\n", + "XGBoost:\n", + " Accuracy: 0.8983\n", + " Precision: 0.9130\n", + " Recall: 0.9545\n", + " F1 Score: 0.9333\n", + "----------------------------------------\n", + "Naive Bayes:\n", + " Accuracy: 0.6949\n", + " Precision: 0.9643\n", + " Recall: 0.6136\n", + " F1 Score: 0.7500\n", + "----------------------------------------\n", + "Neural Network (MLPClassifier):\n", + " Accuracy: 0.9831\n", + " Precision: 0.9778\n", + " Recall: 1.0000\n", + " F1 Score: 0.9888\n", + "----------------------------------------\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (500) reached and the optimization hasn't converged yet.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Build a Model-indexed copy for display and plotting. The frame created in the\n", + "# previous cell is left untouched, so this cell can be re-run safely (an in-place\n", + "# set_index would raise KeyError on the second run because 'Model' is already the index).\n", + "results_by_model = results_new_models.set_index('Model')\n", + "\n", + "# Display the results DataFrame\n", + "print(results_by_model)\n", + "\n", + "# Visualize the model comparison\n", +
"ax = results_by_model.plot(kind='bar', figsize=(12, 8), colormap='viridis')\n", + "ax.set_title('Model Performance Comparison')\n", + "ax.set_ylabel('Score')\n", + "plt.xticks(rotation=45)\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 985 + }, + "id": "eVa35CLPoDEF", + "outputId": "5a00d8a0-97db-42df-e95b-082f6b0d5a54" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Accuracy Precision Recall F1 Score\n", + "Model \n", + "K-Nearest Neighbors 0.9492 0.9362 1.0000 0.9670\n", + "Gradient Boosting 0.9322 0.9545 0.9545 0.9545\n", + "XGBoost 0.8983 0.9130 0.9545 0.9333\n", + "Naive Bayes 0.6949 0.9643 0.6136 0.7500\n", + "Neural Network (MLPClassifier) 0.9661 0.9565 1.0000 0.9778\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + } + ] +} \ No newline at end of file