diff --git a/unsupervised_ml.ipynb b/unsupervised_ml.ipynb
new file mode 100644
index 0000000..8118a2c
--- /dev/null
+++ b/unsupervised_ml.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "90484ea3-113d-4578-a654-1f80d22d49e6",
+   "metadata": {},
+   "source": [
+    "# Unsupervised ML\n",
+    "\n",
+    "TEXT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "241961fd-69dd-4036-839c-d5ff609e034a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports and Spark session setup.\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql import functions as F\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# Spark properties requested for this notebook's application.\n",
+    "SPARK_SETTINGS = [\n",
+    "    ('spark.executor.memory', '4g'),\n",
+    "    ('spark.app.name', 'Spark Updated Conf'),\n",
+    "    ('spark.executor.cores', '4'),\n",
+    "    ('spark.cores.max', '4'),\n",
+    "    ('spark.driver.memory', '4g'),\n",
+    "]\n",
+    "\n",
+    "# Memory and core settings are deploy-time properties: Spark reads them when the\n",
+    "# application starts. They are therefore handed to the builder before\n",
+    "# getOrCreate() instead of being written into the private `_conf` of a context\n",
+    "# that is already running, where they would show up in getAll() without\n",
+    "# changing the resources actually allocated.\n",
+    "# NOTE: the 'spark.app.name' entry overrides the appName() given below.\n",
+    "# NOTE(review): if this kernel already has an active session, getOrCreate()\n",
+    "# returns it and these settings may not take effect until it is restarted.\n",
+    "builder = SparkSession.builder.appName('supervised')\n",
+    "for key, value in SPARK_SETTINGS:\n",
+    "    builder = builder.config(key, value)\n",
+    "spark = builder.getOrCreate()\n",
+    "\n",
+    "# Show the configuration of the running context.\n",
+    "conf = spark.sparkContext.getConf()\n",
+    "conf.getAll()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6e8338f7-2c56-4e05-b6b3-78577485dae4",
+   "metadata": {},
+   "source": [
+    "### Reading in cleaned data, partitioning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b98781e4-a2cd-4da2-aa73-70de31956265",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the cleaned rideshare data for each year and stack it into one DataFrame.\n",
+    "# 2020 is left out because covid will affect the performance of our model.\n",
+    "# NOTE: no explicit repartitioning is done in this cell yet.\n",
+    "\n",
+    "RIDES_DIR = \"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data\"\n",
+    "\n",
+    "\n",
+    "def read_rides(year):\n",
+    "    \"\"\"Load the processed rides CSV for `year`, using the header row and an inferred schema.\"\"\"\n",
+    "    return spark.read.csv(f\"{RIDES_DIR}/rides_{year}.csv\", inferSchema=True, header=True)\n",
+    "\n",
+    "\n",
+    "df_2018 = read_rides(2018)\n",
+    "df_2019 = read_rides(2019)\n",
+    "df_2021 = read_rides(2021)\n",
+    "df_2022 = read_rides(2022)\n",
+    "\n",
+    "# Drop the columns that are new in 2023 before combining the years.\n",
+    "df_2023 = read_rides(2023).drop('Shared Trip Match', 'Percent Time Chicago', 'Percent Distance Chicago')\n",
+    "\n",
+    "# union() matches columns by position; unionByName() matches them by name and\n",
+    "# raises if a year's columns do not line up, rather than silently mixing values\n",
+    "# from different columns.\n",
+    "df_all = (\n",
+    "    df_2018\n",
+    "    .unionByName(df_2019)\n",
+    "    .unionByName(df_2021)\n",
+    "    .unionByName(df_2022)\n",
+    "    .unionByName(df_2023)\n",
+    ")\n",
+    "df_all.show(5)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PySpark",
+   "language": "python",
+   "name": "pyspark"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}