eda first draft

rok12003 · Nov 10, 2023 · 7c0b35a · 7c0b35a
1 parent 789ee59
commit 7c0b35a
Showing 1 changed file with 165 additions and 0 deletions.
diff --git a/eda_2021.ipynb b/eda_2021.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "201288da-86ac-4db0-a56b-4d75e26e1753",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql import functions as F\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3443992c-4530-48f2-a133-fb1dacf4b84f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Executor memory/cores are fixed when the SparkContext starts, so changing the conf\n",
+    "# of a live context has no effect on the running application. Build the desired conf,\n",
+    "# stop the context that getOrCreate() returned, then start a new session from that conf.\n",
+    "spark = SparkSession.builder.appName('2021EDA').getOrCreate()\n",
+    "\n",
+    "#change configuration settings on Spark\n",
+    "# getConf() returns a copy of the current configuration (public API, no private _conf).\n",
+    "# Note: 'spark.app.name' below replaces the '2021EDA' name once the session is recreated.\n",
+    "conf = spark.sparkContext.getConf().setAll([\n",
+    "    ('spark.executor.memory', '4g'),\n",
+    "    ('spark.app.name', 'Spark Updated Conf'),\n",
+    "    ('spark.executor.cores', '4'),\n",
+    "    ('spark.cores.max', '4'),\n",
+    "    # NOTE(review): driver memory may still be ignored if the driver JVM is already\n",
+    "    # running (client mode) - confirm against the cluster setup.\n",
+    "    ('spark.driver.memory', '4g'),\n",
+    "])\n",
+    "\n",
+    "# Restart so the settings above are actually applied to the new context\n",
+    "spark.sparkContext.stop()\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
+    "#print spark configuration settings\n",
+    "spark.sparkContext.getConf().getAll()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a10a9fef-7517-4947-a7a5-b17db05dbb79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2021 rideshare trips: first row is the header, column types are inferred from the data\n",
+    "df_2021 = (\n",
+    "    spark.read\n",
+    "    .option(\"inferSchema\", True)\n",
+    "    .option(\"header\", True)\n",
+    "    .csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/2021\")\n",
+    ")\n",
+    "# TODO: figure out how to read in shp file msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/shp files\n",
+    "# Chicago weather file, read with the same header/schema-inference options\n",
+    "df_weather = (\n",
+    "    spark.read\n",
+    "    .option(\"inferSchema\", True)\n",
+    "    .option(\"header\", True)\n",
+    "    .csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\")\n",
+    ")\n",
+    "\n",
+    "# Show the inferred schema of both frames\n",
+    "df_2021.printSchema()\n",
+    "df_weather.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8138c57a-26d6-44c4-b765-c7b137277044",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def displaypartitions(df):\n",
+    "    \"\"\"Print the number of partitions of `df` and show the record count per partition.\n",
+    "\n",
+    "    Each row is tagged with `F.spark_partition_id()`, rows are counted per id, and\n",
+    "    the counts are shown in ascending order (at most one output row per partition).\n",
+    "    \"\"\"\n",
+    "    partition_total = df.rdd.getNumPartitions()\n",
+    "    print(\"Partitions:\", partition_total)\n",
+    "\n",
+    "    tagged = df.withColumn(\"partitionId\", F.spark_partition_id())\n",
+    "    records_per_partition = tagged.groupBy(\"partitionId\").count()\n",
+    "    # smallest partitions first; show() is capped at the partition count\n",
+    "    records_per_partition.orderBy(F.col(\"count\").asc()).show(partition_total)\n",
+    "\n",
+    "# partition count of the freshly loaded 2021 data\n",
+    "df_2021.rdd.getNumPartitions()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e70c86dd-041c-4967-b726-c058e32a76b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "displaypartitions(df_2021)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe004162-5b22-4a11-9fad-665fa5cdecc0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_2021 = df_2021.repartition(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f34f9ec5-1a72-42ed-8bbe-3b54683a8bf4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "displaypartitions(df_2021)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22a6039e-9848-4717-98b6-bc915540357b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_2021.describe().show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c78e4618-8383-4df2-862b-4cb9dbeb20ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Find the number of missing values for each column\n",
+    "# Uses `F` (pyspark.sql.functions) imported in the first cell instead of re-importing here.\n",
+    "def count_missing(df):\n",
+    "    \"\"\"Return a one-row DataFrame with the number of missing values per column of `df`.\n",
+    "\n",
+    "    A value counts as missing when it is null, or NaN for float/double columns, so\n",
+    "    the result lines up with what `DataFrame.dropna()` removes. `isnan` is applied\n",
+    "    only to float/double columns; other column types get the null check alone.\n",
+    "    \"\"\"\n",
+    "    float_cols = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}\n",
+    "    exprs = []\n",
+    "    for c in df.columns:\n",
+    "        is_missing = df[c].isNull()\n",
+    "        if c in float_cols:\n",
+    "            is_missing = is_missing | F.isnan(df[c])\n",
+    "        # when() without otherwise() yields null for non-matching rows, which count() skips\n",
+    "        exprs.append(F.count(F.when(is_missing, c)).alias(c))\n",
+    "    return df.select(exprs)\n",
+    "\n",
+    "count_missing(df_2021).show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2dd6ea75-5417-4d27-92bb-4d9a24808545",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# number of observations with all the data in each column\n",
+    "df_2021.dropna().count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "46e2e9e5-3581-444c-b149-827a5cbc62f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only rows that have a value in every column, then remove exact duplicate rows\n",
+    "df_2021 = df_2021.dropna().dropDuplicates()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PySpark",
+   "language": "python",
+   "name": "pyspark"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}