Skip to content

Commit

Permalink
Initial EDA for 2022
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Nov 10, 2023
1 parent 84095dc commit 96129b7
Showing 1 changed file with 71 additions and 70 deletions.
141 changes: 71 additions & 70 deletions eda_2022.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1154,75 +1154,6 @@
"df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], \"yyyy-mm-dd\"))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "4dddc1a4-19f0-44e1-812c-e7f21c22fa55",
"metadata": {},
"outputs": [],
"source": [
"hp_census_tracts_2010_2020 = [17031411000,17031410900,17031410100,17031411100,17031410800,17031410200,17031410700,17031411200,17031836200,17031410600,17031836300,17031410500,\n",
" 17031410300,17031410400,17031410600,17031410800,17031411300,17031411400]\n",
"df_hp = df_2022.filter((df_2022.pickup_tract.isin(hp_census_tracts_2010_2020)) & (df_2022.dropoff_tract.isin(hp_census_tracts_2010_2020)))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "0adf9b58-cacb-4529-b0d9-fc44f952d5e7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Stage 106:============================================> (7 + 2) / 9]\r"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+\n",
"|pickup_tract|\n",
"+------------+\n",
"| 17031832600|\n",
"| 17031837400|\n",
"| 17031831300|\n",
"| 17031560300|\n",
"| 17031530503|\n",
"| 17031650500|\n",
"| 17031710700|\n",
"| 17031271300|\n",
"| 17031062200|\n",
"| 17031843200|\n",
"| 17031430500|\n",
"| 17031020602|\n",
"| 17031130200|\n",
"| 17031230200|\n",
"| 17031241300|\n",
"| 17031838000|\n",
"| 17031150501|\n",
"| 17031351500|\n",
"| 17031440800|\n",
"| 17031291200|\n",
"+------------+\n",
"only showing top 20 rows\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
}
],
"source": [
"df_2022.select(['pickup_tract']).distinct().show()"
]
},
{
"cell_type": "code",
"execution_count": 48,
Expand Down Expand Up @@ -1291,9 +1222,79 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 58,
"id": "51b0e6b0-1fa9-431c-bdb5-d63c77751768",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
}
],
"source": [
"df_2019 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/2019\", inferSchema=True, header=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "12953493-455d-43e7-8369-380ee0a0f538",
"metadata": {},
"outputs": [],
"source": [
"df_2019 = df_2019.withColumn(\"Trip Start Timestamp\", F.to_timestamp(F.col(\"Trip Start Timestamp\"), \"MM/dd/yyyy hh:mm:ss a\"))\n",
"df_2019 = df_2019.withColumnRenamed(\"Trip Start Timestamp\",\"start_timestamp\")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "ae7d7643-5fab-4d4a-8d97-caa2a4ffe3c5",
"metadata": {},
"outputs": [],
"source": [
"df_2019 = df_2019.withColumn('year', F.year(df_2019.start_timestamp))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "b032e6c0-885f-43a9-96b2-e32233db8af1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----+\n",
"|year|\n",
"+----+\n",
"|null|\n",
"|2019|\n",
"+----+\n",
"\n"
]
}
],
"source": [
"df_2019.select('year').distinct().show(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "370cc6d5-861b-48c3-ae35-b03679f03f5f",
"metadata": {},
"outputs": [],
"source": []
}
Expand Down

0 comments on commit 96129b7

Please sign in to comment.