diff --git a/Feature Selection using Genetic ALgorithm/Feature_Selection_for_Machine_Learning_with_Genetic_Algorithm.ipynb b/Feature Selection using Genetic ALgorithm/Feature_Selection_for_Machine_Learning_with_Genetic_Algorithm.ipynb new file mode 100644 index 0000000000..33bea66749 --- /dev/null +++ b/Feature Selection using Genetic ALgorithm/Feature_Selection_for_Machine_Learning_with_Genetic_Algorithm.ipynb @@ -0,0 +1,696 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "9f4c0HZLL8lT", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 + }, + "outputId": "53a5719a-25d2-40ea-aa2a-a549fd4b7d08" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Feature_1 Feature_2 Feature_3 Feature_4 Feature_5 Feature_6 \\\n", + "0 1.470848 -0.360450 -0.591602 -0.728228 0.941690 1.065964 \n", + "1 4.513369 -2.227103 -1.140747 2.018263 -2.238358 -0.497370 \n", + "2 -2.355643 2.218601 -1.603269 0.873394 0.401483 0.717264 \n", + "3 -1.596198 -0.857427 1.772434 -0.639361 1.419409 -0.438525 \n", + "4 2.840049 -2.489600 -0.844902 -1.594362 -4.688517 0.459637 \n", + "\n", + " Feature_7 Feature_8 Feature_9 Feature_10 ... Feature_12 Feature_13 \\\n", + "0 0.017832 -0.596184 1.840712 -1.497093 ... -0.603968 2.899256 \n", + "1 0.714550 0.938883 -2.395169 0.159837 ... 1.461499 3.954171 \n", + "2 -0.859399 -1.042190 -2.175965 0.980231 ... 0.544434 -2.466258 \n", + "3 0.281949 2.345145 1.006230 0.389135 ... -1.025051 -2.422975 \n", + "4 0.913607 -1.143505 1.263937 -2.040928 ... 
4.176424 1.341742 \n", + "\n", + " Feature_14 Feature_15 Feature_16 Feature_17 Feature_18 Feature_19 \\\n", + "0 0.037567 -1.249523 0.257963 0.416628 1.408208 -1.838041 \n", + "1 0.309054 0.538184 -7.157865 -4.532216 -0.081800 -9.325362 \n", + "2 -0.470256 0.073018 -2.203531 -2.299263 -1.742761 -0.271579 \n", + "3 1.579807 -0.300713 4.267120 2.893775 1.236697 6.034785 \n", + "4 0.133565 1.743819 1.531188 2.269808 0.053489 -3.151109 \n", + "\n", + " Feature_20 Target \n", + "0 -0.833142 1 \n", + "1 0.574386 1 \n", + "2 -0.359285 0 \n", + "3 -0.045711 0 \n", + "4 1.603702 0 \n", + "\n", + "[5 rows x 21 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Feature_1Feature_2Feature_3Feature_4Feature_5Feature_6Feature_7Feature_8Feature_9Feature_10...Feature_12Feature_13Feature_14Feature_15Feature_16Feature_17Feature_18Feature_19Feature_20Target
01.470848-0.360450-0.591602-0.7282280.9416901.0659640.017832-0.5961841.840712-1.497093...-0.6039682.8992560.037567-1.2495230.2579630.4166281.408208-1.838041-0.8331421
14.513369-2.227103-1.1407472.018263-2.238358-0.4973700.7145500.938883-2.3951690.159837...1.4614993.9541710.3090540.538184-7.157865-4.532216-0.081800-9.3253620.5743861
2-2.3556432.218601-1.6032690.8733940.4014830.717264-0.859399-1.042190-2.1759650.980231...0.544434-2.466258-0.4702560.073018-2.203531-2.299263-1.742761-0.271579-0.3592850
3-1.596198-0.8574271.772434-0.6393611.419409-0.4385250.2819492.3451451.0062300.389135...-1.025051-2.4229751.579807-0.3007134.2671202.8937751.2366976.034785-0.0457110
42.840049-2.489600-0.844902-1.594362-4.6885170.4596370.913607-1.1435051.263937-2.040928...4.1764241.3417420.1335651.7438191.5311882.2698080.053489-3.1511091.6037020
\n", + "

5 rows × 21 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
def fitness_function(individual, X, y, test_size=0.2, random_state=42):
    """Score a feature subset by the hold-out accuracy of a random forest.

    Parameters
    ----------
    individual : array-like of 0/1
        One gene per column of ``X``; a gene equal to 1 keeps that column.
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target labels, aligned with the rows of ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for scoring.
    random_state : int, default 42
        Seed shared by the train/test split and the classifier, so the same
        mask always receives the same score.

    Returns
    -------
    float
        Accuracy on the held-out rows, or ``0.0`` when no feature is selected.

    Raises
    ------
    ValueError
        If ``individual`` does not have exactly one gene per column of ``X``.
    """
    mask = np.asarray(individual) == 1
    # A mask of the wrong length would otherwise be silently truncated and
    # score a different subset than the one the chromosome describes.
    if mask.shape != (X.shape[1],):
        raise ValueError(
            f"individual has shape {mask.shape}, expected ({X.shape[1]},)"
        )
    if not mask.any():
        # Nothing to train on: give the empty subset the worst possible score.
        return 0.0

    X_selected = X.loc[:, mask]
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=test_size, random_state=random_state
    )
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


def initialize_population(pop_size, num_features, seed=None):
    """Create a random binary population of feature masks.

    Parameters
    ----------
    pop_size : int
        Number of individuals (rows).
    num_features : int
        Number of genes per individual (columns).
    seed : int, optional
        When given, the population is drawn from a dedicated generator seeded
        with this value, so the result is reproducible. When omitted, NumPy's
        global random state is used.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(pop_size, num_features)`` holding 0s and 1s.
    """
    shape = (pop_size, num_features)
    if seed is None:
        return np.random.randint(2, size=shape)
    return np.random.default_rng(seed).integers(0, 2, size=shape)


# Driver statements are guarded so the definitions above can be imported
# (for example by tests) without reading the dataset.
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv('feature_selection.csv')
    print(df.head())

    pop_size = 50
    num_features = df.shape[1] - 1  # Exclude the target column
    population = initialize_population(pop_size, num_features)
"source": [ + "Define Genetic Algorithm Operations" + ], + "metadata": { + "id": "DA63bSjQLEO-" + } + }, + { + "cell_type": "code", + "source": [ + "# Selection: Select the best-performing individuals\n", + "def selection(population, fitness_scores, num_parents):\n", + " parents = np.empty((num_parents, population.shape[1]))\n", + " for i in range(num_parents):\n", + " max_fitness_idx = np.where(fitness_scores == np.max(fitness_scores))\n", + " max_fitness_idx = max_fitness_idx[0][0]\n", + " parents[i, :] = population[max_fitness_idx, :]\n", + " fitness_scores[max_fitness_idx] = -999999 # Avoid selecting the same individual\n", + " return parents\n", + "\n", + "# Crossover: Combine pairs of parents to create offspring\n", + "def crossover(parents, offspring_size):\n", + " offspring = np.empty(offspring_size)\n", + " crossover_point = np.uint8(offspring_size[1] / 2)\n", + "\n", + " for k in range(offspring_size[0]):\n", + " parent1_idx = k % parents.shape[0]\n", + " parent2_idx = (k + 1) % parents.shape[0]\n", + " offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]\n", + " offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]\n", + " return offspring\n", + "\n", + "# Mutation: Introduce random changes to some individuals\n", + "def mutation(offspring, mutation_rate=0.01):\n", + " for idx in range(offspring.shape[0]):\n", + " for gene_idx in range(offspring.shape[1]):\n", + " if np.random.rand() < mutation_rate:\n", + " offspring[idx, gene_idx] = 1 - offspring[idx, gene_idx]\n", + " return offspring\n" + ], + "metadata": { + "id": "z7JxcxlWK-aQ" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Run the Genetic Algorithm" + ], + "metadata": { + "id": "oFKyWCb3LLCn" + } + }, + { + "cell_type": "code", + "source": [ + "def genetic_algorithm(X, y, num_generations, pop_size, num_parents, mutation_rate):\n", + " num_features = X.shape[1]\n", + " population = 
initialize_population(pop_size, num_features)\n", + " for generation in range(num_generations):\n", + " fitness_scores = np.array([fitness_function(individual, X, y) for individual in population])\n", + " parents = selection(population, fitness_scores, num_parents)\n", + " offspring_size = (pop_size - parents.shape[0], num_features)\n", + " offspring = crossover(parents, offspring_size)\n", + " offspring = mutation(offspring, mutation_rate)\n", + " population[0:parents.shape[0], :] = parents\n", + " population[parents.shape[0]:, :] = offspring\n", + " best_fitness = np.max(fitness_scores)\n", + " print(f\"Generation {generation}: Best Fitness = {best_fitness}\")\n", + " return population, fitness_scores\n", + "\n", + "X = df.drop(columns='Target')\n", + "y = df['Target']\n", + "\n", + "num_generations = 30\n", + "num_parents = 10\n", + "population, fitness_scores = genetic_algorithm(X, y, num_generations, pop_size, num_parents, mutation_rate=0.01)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1YnloIwTLPI1", + "outputId": "8d80a4f3-4c1f-49d5-cc8c-84552eedfe5d" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Generation 0: Best Fitness = 0.88\n", + "Generation 1: Best Fitness = 0.91\n", + "Generation 2: Best Fitness = 0.92\n", + "Generation 3: Best Fitness = 0.93\n", + "Generation 4: Best Fitness = 0.93\n", + "Generation 5: Best Fitness = 0.93\n", + "Generation 6: Best Fitness = 0.935\n", + "Generation 7: Best Fitness = 0.935\n", + "Generation 8: Best Fitness = 0.935\n", + "Generation 9: Best Fitness = 0.94\n", + "Generation 10: Best Fitness = 0.94\n", + "Generation 11: Best Fitness = 0.94\n", + "Generation 12: Best Fitness = 0.94\n", + "Generation 13: Best Fitness = 0.94\n", + "Generation 14: Best Fitness = 0.94\n", + "Generation 15: Best Fitness = 0.94\n", + "Generation 16: Best Fitness = 0.94\n", + "Generation 17: Best Fitness = 0.94\n", + "Generation 18: 
def selected_feature_names(individual, feature_names):
    """Return the names whose gene in ``individual`` equals 1, in input order.

    Parameters
    ----------
    individual : iterable of 0/1
        One gene per feature name.
    feature_names : iterable
        Feature names, e.g. ``X.columns``.

    Returns
    -------
    list
        The selected feature names.
    """
    return [name for name, include in zip(feature_names, individual) if include == 1]


# Driver statements are guarded so the helper above can be imported
# (for example by tests) without needing the trained population.
if __name__ == "__main__":
    import warnings

    best_idx = np.argmax(fitness_scores)
    best_individual = population[best_idx, :]
    selected_features = selected_feature_names(best_individual, X.columns)

    print(f"Selected Features: {selected_features}")

    if not selected_features:
        raise ValueError("The best individual selects no features; there is nothing to evaluate.")

    # Evaluate performance using the best feature subset
    X_selected = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with selected features: {final_accuracy}")

    # The notebook's fitness function uses the same split, seed and model as
    # this cell, so the accuracy above should reproduce the score recorded for
    # this individual. A mismatch means `fitness_scores` does not describe
    # `population`, and `best_idx` may not point at the fittest individual.
    if not np.isclose(final_accuracy, fitness_scores[best_idx]):
        warnings.warn(
            f"fitness_scores[{best_idx}] = {fitness_scores[best_idx]} does not match the "
            f"re-evaluated accuracy {final_accuracy}; the scores may be stale."
        )
ALgorithm/readme.md b/Feature Selection using Genetic ALgorithm/readme.md new file mode 100644 index 0000000000..e894b581eb --- /dev/null +++ b/Feature Selection using Genetic ALgorithm/readme.md @@ -0,0 +1,26 @@ +## Introduction +Feature selection is a critical step in machine learning that involves selecting a subset of relevant features for model building. This project demonstrates how to use Genetic Algorithms (GA) to perform feature selection, optimizing the performance of machine learning models. + +## Project Overview +The primary objective of this project is to leverage Genetic Algorithms to select the best features for a given machine learning task. The project involves: + +1. Implementing a Genetic Algorithm for feature selection. +2. Comparing the performance of models trained with all features versus the selected features. +3. Analyzing the results to determine the effectiveness of the Genetic Algorithm in feature selection. + +## Methodology +The Genetic Algorithm (GA) follows these steps: + +1. Initialization: Generate an initial population of feature subsets. +2. Selection: Evaluate the fitness of each subset using a predefined fitness function (e.g., model accuracy) and keep the fittest subsets as parents. +3. Crossover: Combine pairs of feature subsets to produce new offspring. +4. Mutation: Introduce random changes to feature subsets to maintain genetic diversity. +5. Replacement: Replace less fit subsets with new offspring. +6. Termination: Stop the algorithm after a set number of generations or if convergence criteria are met. + +## Usage +1. Prepare the dataset as a CSV file. +2. Load the dataset and preprocess it if necessary. +3. Configure the Genetic Algorithm parameters. +4. Run the Genetic Algorithm to perform feature selection. +5. Evaluate the selected features using a machine learning model.
\ No newline at end of file diff --git a/Feature Selection using Genetic ALgorithm/requirement.txt b/Feature Selection using Genetic ALgorithm/requirement.txt new file mode 100644 index 0000000000..36dd3e911d --- /dev/null +++ b/Feature Selection using Genetic ALgorithm/requirement.txt @@ -0,0 +1,9 @@ +## Requirements +Python 3.x +NumPy +pandas +OpenAI Gym +Stable Baselines3 +TensorFlow/Keras +Matplotlib +Jupyter Notebook (optional, for interactive exploration) \ No newline at end of file