Skip to content

Commit

Permalink
eda first draft
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Nov 10, 2023
1 parent 789ee59 commit 7c0b35a
Showing 1 changed file with 165 additions and 0 deletions.
165 changes: 165 additions & 0 deletions eda_2021.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "201288da-86ac-4db0-a56b-4d75e26e1753",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3443992c-4530-48f2-a133-fb1dacf4b84f",
"metadata": {},
"outputs": [],
"source": [
"# Executor/driver resources are fixed when the SparkContext starts, so they are\n",
"# supplied to the builder up front. Writing them into spark.sparkContext._conf\n",
"# after getOrCreate() is echoed back by getConf() but is not applied to the\n",
"# application that is already running.\n",
"# NOTE: if a session already exists in this kernel, getOrCreate() returns it and\n",
"# these resource settings are not applied -- stop it first or set them at launch.\n",
"spark = (\n",
"    SparkSession.builder\n",
"    .appName('2021EDA')\n",
"    .config('spark.executor.memory', '4g')\n",
"    .config('spark.executor.cores', '4')\n",
"    .config('spark.cores.max', '4')\n",
"    .config('spark.driver.memory', '4g')\n",
"    .getOrCreate()\n",
")\n",
"\n",
"# show the configuration of the running context\n",
"spark.sparkContext.getConf().getAll()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a10a9fef-7517-4947-a7a5-b17db05dbb79",
"metadata": {},
"outputs": [],
"source": [
"# Both sources are CSV files with a header row; column types are inferred from the data\n",
"csv_reader = spark.read.options(header=True, inferSchema=True)\n",
"\n",
"df_2021 = csv_reader.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/2021\")\n",
"# TODO: figure out how to read in shp file msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/shp files\n",
"df_weather = csv_reader.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\")\n",
"\n",
"for frame in (df_2021, df_weather):\n",
"    frame.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8138c57a-26d6-44c4-b765-c7b137277044",
"metadata": {},
"outputs": [],
"source": [
"def displaypartitions(df):\n",
"    \"\"\"Print the partition count of ``df`` and show the number of records per partition.\n",
"\n",
"    Each row is tagged with its Spark partition id, rows are counted per id,\n",
"    and the counts are listed from the smallest partition to the largest.\n",
"    \"\"\"\n",
"    partition_total = df.rdd.getNumPartitions()\n",
"    print(\"Partitions:\", partition_total)\n",
"\n",
"    tagged = df.withColumn(\"partitionId\", F.spark_partition_id())\n",
"    rows_per_partition = tagged.groupBy(\"partitionId\").count()\n",
"    # show up to one row per partition so nothing is truncated\n",
"    rows_per_partition.orderBy(F.asc(\"count\")).show(partition_total)\n",
"\n",
"\n",
"# partition count of the freshly loaded 2021 data\n",
"df_2021.rdd.getNumPartitions()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e70c86dd-041c-4967-b726-c058e32a76b7",
"metadata": {},
"outputs": [],
"source": [
"displaypartitions(df_2021)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe004162-5b22-4a11-9fad-665fa5cdecc0",
"metadata": {},
"outputs": [],
"source": [
"df_2021 = df_2021.repartition(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f34f9ec5-1a72-42ed-8bbe-3b54683a8bf4",
"metadata": {},
"outputs": [],
"source": [
"displaypartitions(df_2021)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22a6039e-9848-4717-98b6-bc915540357b",
"metadata": {},
"outputs": [],
"source": [
"df_2021.describe().show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c78e4618-8383-4df2-862b-4cb9dbeb20ab",
"metadata": {},
"outputs": [],
"source": [
"# Find the number of missing values for each column. Every column is checked for\n",
"# nulls; float/double columns are also checked for NaN, because dropna() removes\n",
"# rows holding either and these counts should agree with what it drops.\n",
"nan_capable = {name for name, dtype in df_2021.dtypes if dtype in ('float', 'double')}\n",
"\n",
"def missing_count(name):\n",
"    \"\"\"Build the aggregate that counts null (and, for float/double, NaN) entries of one column.\"\"\"\n",
"    is_missing = df_2021[name].isNull()\n",
"    if name in nan_capable:\n",
"        # isnan is only applied to floating-point columns\n",
"        is_missing = is_missing | F.isnan(df_2021[name])\n",
"    return F.count(F.when(is_missing, name)).alias(name)\n",
"\n",
"df_2021.select([missing_count(name) for name in df_2021.columns]).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2dd6ea75-5417-4d27-92bb-4d9a24808545",
"metadata": {},
"outputs": [],
"source": [
"# number of observations with all the data in each column\n",
"df_2021.dropna().count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46e2e9e5-3581-444c-b149-827a5cbc62f5",
"metadata": {},
"outputs": [],
"source": [
"# Keep only rows with complete information, then remove exact duplicate rows\n",
"df_2021 = df_2021.dropna().dropDuplicates()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "python",
"name": "pyspark"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 7c0b35a

Please sign in to comment.