Skip to content

Commit

Permalink
trying other features
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Nov 24, 2023
1 parent 53b1857 commit 4363f41
Showing 1 changed file with 82 additions and 39 deletions.
121 changes: 82 additions & 39 deletions clustering_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1936,22 +1936,32 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"id": "3a1943f1-e3fd-44c6-a125-30268d45b7a6",
"metadata": {},
"outputs": [],
"source": [
"df_all = df_all.na.drop()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "04f8d859-1872-4496-b211-6f8a72469c2c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Stage 135:====================================================>(592 + 8) / 600]\r"
"[Stage 294:====================================================>(597 + 3) / 600]\r"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Silhouette with squared euclidean distance = 0.8605552355933582\n"
"Silhouette with squared euclidean distance = 0.6724456252751287\n"
]
},
{
Expand All @@ -1963,8 +1973,8 @@
}
],
"source": [
"# Clustering by pick_up area. Understanding the most-popular spots in the city to call a rideshare:\n",
"feature_cols = [\"pickup_area\"]\n",
"# Clustering by pick_up area. Understanding the most-popular spots in the city to call a rideshare and their locations:\n",
"feature_cols = [\"pickup_area\", \"pickup_lat\", \"pickup_lon\"]\n",
"\n",
"# Step 1: Vector Assembly:\n",
"feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"feature_vector\")\n",
Expand All @@ -1991,7 +2001,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 20,
"id": "86b58900-850f-4a83-82be-91ffc6f62be7",
"metadata": {},
"outputs": [
Expand All @@ -2000,9 +2010,9 @@
"output_type": "stream",
"text": [
"Cluster Centers: \n",
"[1.49670524]\n",
"[0.35388964]\n",
"[3.43428613]\n"
"[ 1.80161164 869.01481622 -1588.97018395]\n",
"[ 4.18828318 871.29960529 -1593.60879614]\n",
"[ 5.21132153e-01 8.70095495e+02 -1.58910819e+03]\n"
]
}
],
Expand All @@ -2016,56 +2026,56 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 24,
"id": "b7acfa60-e70a-4d9c-a120-a9e71c037148",
"metadata": {},
"outputs": [],
"source": [
"# Display cluster assignments\n",
"chicago_clustering = predictions.select(\"pickup_area\", \"features\", \"scaled_features\", \"prediction\")"
"chicago_clustering = predictions.select(\"pickup_area\", \"pickup_lat\", \"pickup_lon\", \"features\", \"scaled_features\", \"prediction\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 25,
"id": "ed1bed2e-98ab-4391-82a9-e98353394258",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Stage 141:> (0 + 1) / 1]\r"
"[Stage 303:> (0 + 1) / 1]\r"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+--------+--------------------+----------+\n",
"|pickup_area|features| scaled_features|prediction|\n",
"+-----------+--------+--------------------+----------+\n",
"| 19| [19.0]|[0.9455276609584078]| 0|\n",
"| 19| [19.0]|[0.9455276609584078]| 0|\n",
"| 19| [19.0]|[0.9455276609584078]| 0|\n",
"| 19| [19.0]|[0.9455276609584078]| 0|\n",
"| 67| [67.0]|[3.3342291202217536]| 2|\n",
"| 67| [67.0]|[3.3342291202217536]| 2|\n",
"| 26| [26.0]| [1.293879957100979]| 0|\n",
"| 61| [61.0]|[3.0356414378138354]| 2|\n",
"| 61| [61.0]|[3.0356414378138354]| 2|\n",
"| 61| [61.0]|[3.0356414378138354]| 2|\n",
"| 43| [43.0]|[2.1398783905900807]| 0|\n",
"| 43| [43.0]|[2.1398783905900807]| 0|\n",
"| 43| [43.0]|[2.1398783905900807]| 0|\n",
"| 43| [43.0]|[2.1398783905900807]| 0|\n",
"| 11| [11.0]|[0.5474107510811834]| 1|\n",
"| 11| [11.0]|[0.5474107510811834]| 1|\n",
"| 66| [66.0]|[3.2844645064871005]| 2|\n",
"| 66| [66.0]|[3.2844645064871005]| 2|\n",
"| 22| [22.0]|[1.0948215021623668]| 0|\n",
"| 22| [22.0]|[1.0948215021623668]| 0|\n",
"+-----------+--------+--------------------+----------+\n",
"+-----------+-------------+--------------+--------------------+--------------------+----------+\n",
"|pickup_area| pickup_lat| pickup_lon| features| scaled_features|prediction|\n",
"+-----------+-------------+--------------+--------------------+--------------------+----------+\n",
"| 41|41.8016710371|-87.5942656985|[41.0,41.80167103...|[2.26101319928869...| 0|\n",
"| 5|41.9448137543|-87.6907750098|[5.0,41.944813754...|[0.27573331698642...| 2|\n",
"| 6|41.9359889065|-87.6709663837|[6.0,41.935988906...|[0.33087998038371...| 2|\n",
"| 6| 41.936159071|-87.6612652184|[6.0,41.936159071...|[0.33087998038371...| 2|\n",
"| 23|41.9066839592|-87.7103539349|[23.0,41.90668395...|[1.26837325813756...| 2|\n",
"| 41|41.8012268363|-87.5853031602|[41.0,41.80122683...|[2.26101319928869...| 0|\n",
"| 6| 41.942577185|-87.6470785093|[6.0,41.942577185...|[0.33087998038371...| 2|\n",
"| 6| 41.942577185|-87.6470785093|[6.0,41.942577185...|[0.33087998038371...| 2|\n",
"| 7|41.9217781876|-87.6510618838|[7.0,41.921778187...|[0.38602664378099...| 2|\n",
"| 7|41.9217781876|-87.6510618838|[7.0,41.921778187...|[0.38602664378099...| 2|\n",
"| 7|41.9217781876|-87.6510618838|[7.0,41.921778187...|[0.38602664378099...| 2|\n",
"| 3|41.9724370811|-87.6711095263|[3.0,41.972437081...|[0.16543999019185...| 2|\n",
"| 28|41.8773834707|-87.6806541164|[28.0,41.87738347...|[1.54410657512398...| 0|\n",
"| 16|41.9595267906|-87.7020985409|[16.0,41.95952679...|[0.88234661435656...| 2|\n",
"| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n",
"| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n",
"| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n",
"| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n",
"| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n",
"| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n",
"+-----------+-------------+--------------+--------------------+--------------------+----------+\n",
"only showing top 20 rows\n",
"\n"
]
Expand All @@ -2079,6 +2089,7 @@
}
],
"source": [
"# Seeing 20 of the results:\n",
"chicago_clustering.show()"
]
},
Expand All @@ -2087,10 +2098,42 @@
"execution_count": null,
"id": "1a2f3163-097d-4ca3-8105-d891019a7bad",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
}
],
"source": [
"# Plotting chicago_clustering:\n"
"# Convert Spark DataFrame to Pandas DataFrame\n",
"pandas_df = chicago_clustering.toPandas()\n",
"\n",
"# Plotting chicago_clustering:\n",
"gdf = gpd.GeoDataFrame(pandas_df, geometry=gpd.points_from_xy(pandas_df['pickup_lon'].astype(float), pandas_df['pickup_lat'].astype(float)))\n",
"\n",
"# Plot the clusters\n",
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"gdf.plot(ax=ax, column='prediction', legend=True, markersize=50, cmap='viridis', legend_kwds={'label': \"Cluster\"})\n",
"\n",
"# Add labels and title\n",
"plt.xlabel('Longitude')\n",
"plt.ylabel('Latitude')\n",
"plt.title('Clusters of Pickup Areas')\n",
"\n",
"# Show the plot\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a614bd22-b64e-4ced-827d-a7f5e7e613d0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 4363f41

Please sign in to comment.