From 4363f41cb4781e0b97e8797e42f0c029e07e7917 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 24 Nov 2023 22:07:52 +0000 Subject: [PATCH] trying other features --- clustering_analysis.ipynb | 121 ++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 39 deletions(-) diff --git a/clustering_analysis.ipynb b/clustering_analysis.ipynb index 5e9e0e2..eb0fae1 100644 --- a/clustering_analysis.ipynb +++ b/clustering_analysis.ipynb @@ -1936,7 +1936,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, + "id": "3a1943f1-e3fd-44c6-a125-30268d45b7a6", + "metadata": {}, + "outputs": [], + "source": [ + "df_all = df_all.na.drop()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "id": "04f8d859-1872-4496-b211-6f8a72469c2c", "metadata": {}, "outputs": [ @@ -1944,14 +1954,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 135:====================================================>(592 + 8) / 600]\r" + "[Stage 294:====================================================>(597 + 3) / 600]\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Silhouette with squared euclidean distance = 0.8605552355933582\n" + "Silhouette with squared euclidean distance = 0.6724456252751287\n" ] }, { @@ -1963,8 +1973,8 @@ } ], "source": [ - "# Clustering by pick_up area. Understanding the most-popular spots in the city to call a rideshare:\n", - "feature_cols = [\"pickup_area\"]\n", + "# Clustering by pick_up area. Understanding the most-popular spots in the city to call a rideshare and their locations:\n", + "feature_cols = [\"pickup_area\", \"pickup_lat\", \"pickup_lon\"]\n", "\n", "# Step 1: Vector Assembly:\n", "feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"feature_vector\")\n", @@ -1991,7 +2001,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 20, "id": "86b58900-850f-4a83-82be-91ffc6f62be7", "metadata": {}, "outputs": [ @@ -2000,9 +2010,9 @@ "output_type": "stream", "text": [ "Cluster Centers: \n", - "[1.49670524]\n", - "[0.35388964]\n", - "[3.43428613]\n" + "[ 1.80161164 869.01481622 -1588.97018395]\n", + "[ 4.18828318 871.29960529 -1593.60879614]\n", + "[ 5.21132153e-01 8.70095495e+02 -1.58910819e+03]\n" ] } ], @@ -2016,18 +2026,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 24, "id": "b7acfa60-e70a-4d9c-a120-a9e71c037148", "metadata": {}, "outputs": [], "source": [ "# Display cluster assignments\n", - "chicago_clustering = predictions.select(\"pickup_area\", \"features\", \"scaled_features\", \"prediction\")" + "chicago_clustering = predictions.select(\"pickup_area\", \"pickup_lat\", \"pickup_lon\", \"features\", \"scaled_features\", \"prediction\")" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "id": "ed1bed2e-98ab-4391-82a9-e98353394258", "metadata": {}, "outputs": [ @@ -2035,37 +2045,37 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 141:> (0 + 1) / 1]\r" + "[Stage 303:> (0 + 1) / 1]\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+--------+--------------------+----------+\n", - "|pickup_area|features| scaled_features|prediction|\n", - "+-----------+--------+--------------------+----------+\n", - "| 19| [19.0]|[0.9455276609584078]| 0|\n", - "| 19| [19.0]|[0.9455276609584078]| 0|\n", - "| 19| [19.0]|[0.9455276609584078]| 0|\n", - "| 19| [19.0]|[0.9455276609584078]| 0|\n", - "| 67| [67.0]|[3.3342291202217536]| 2|\n", - "| 67| [67.0]|[3.3342291202217536]| 2|\n", - "| 26| [26.0]| [1.293879957100979]| 0|\n", - "| 61| [61.0]|[3.0356414378138354]| 2|\n", - "| 61| [61.0]|[3.0356414378138354]| 2|\n", - "| 61| [61.0]|[3.0356414378138354]| 2|\n", - "| 43| [43.0]|[2.1398783905900807]| 0|\n", - "| 43| [43.0]|[2.1398783905900807]| 0|\n", - "| 43| [43.0]|[2.1398783905900807]| 0|\n", - "| 43| [43.0]|[2.1398783905900807]| 0|\n", - "| 11| [11.0]|[0.5474107510811834]| 1|\n", - "| 11| [11.0]|[0.5474107510811834]| 1|\n", - "| 66| [66.0]|[3.2844645064871005]| 2|\n", - "| 66| [66.0]|[3.2844645064871005]| 2|\n", - "| 22| [22.0]|[1.0948215021623668]| 0|\n", - "| 22| [22.0]|[1.0948215021623668]| 0|\n", - "+-----------+--------+--------------------+----------+\n", + "+-----------+-------------+--------------+--------------------+--------------------+----------+\n", + "|pickup_area| pickup_lat| pickup_lon| features| scaled_features|prediction|\n", + "+-----------+-------------+--------------+--------------------+--------------------+----------+\n", + "| 41|41.8016710371|-87.5942656985|[41.0,41.80167103...|[2.26101319928869...| 0|\n", + "| 5|41.9448137543|-87.6907750098|[5.0,41.944813754...|[0.27573331698642...| 2|\n", + "| 6|41.9359889065|-87.6709663837|[6.0,41.935988906...|[0.33087998038371...| 2|\n", + "| 6| 41.936159071|-87.6612652184|[6.0,41.936159071...|[0.33087998038371...| 2|\n", + "| 23|41.9066839592|-87.7103539349|[23.0,41.90668395...|[1.26837325813756...| 2|\n", + "| 41|41.8012268363|-87.5853031602|[41.0,41.80122683...|[2.26101319928869...| 0|\n", + "| 6| 41.942577185|-87.6470785093|[6.0,41.942577185...|[0.33087998038371...| 2|\n", + "| 6| 41.942577185|-87.6470785093|[6.0,41.942577185...|[0.33087998038371...| 2|\n", + "| 7|41.9217781876|-87.6510618838|[7.0,41.921778187...|[0.38602664378099...| 2|\n", + "| 7|41.9217781876|-87.6510618838|[7.0,41.921778187...|[0.38602664378099...| 2|\n", + "| 7|41.9217781876|-87.6510618838|[7.0,41.921778187...|[0.38602664378099...| 2|\n", + "| 3|41.9724370811|-87.6711095263|[3.0,41.972437081...|[0.16543999019185...| 2|\n", + "| 28|41.8773834707|-87.6806541164|[28.0,41.87738347...|[1.54410657512398...| 0|\n", + "| 16|41.9595267906|-87.7020985409|[16.0,41.95952679...|[0.88234661435656...| 2|\n", + "| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n", + "| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n", + "| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n", + "| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n", + "| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n", + "| 32|41.8809944707|-87.6327464887|[32.0,41.88099447...|[1.76469322871312...| 0|\n", + "+-----------+-------------+--------------+--------------------+--------------------+----------+\n", "only showing top 20 rows\n", "\n" ] @@ -2079,6 +2089,7 @@ } ], "source": [ + "# Seeing 20 of the results:\n", "chicago_clustering.show()" ] }, @@ -2087,10 +2098,42 @@ "execution_count": null, "id": "1a2f3163-097d-4ca3-8105-d891019a7bad", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ - "# Plotting chicago_clustering:\n" + "# Convert Spark DataFrame to Pandas DataFrame\n", + "pandas_df = chicago_clustering.toPandas()\n", + "\n", + "# Plotting chicago_clustering:\n", + "gdf = gpd.GeoDataFrame(pandas_df, geometry=gpd.points_from_xy(pandas_df['pickup_lon'].astype(float), pandas_df['pickup_lat'].astype(float)))\n", + "\n", + "# Plot the clusters\n", + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "gdf.plot(ax=ax, column='prediction', legend=True, markersize=50, cmap='viridis', legend_kwds={'label': \"Cluster\"})\n", + "\n", + "# Add labels and title\n", + "plt.xlabel('Longitude')\n", + "plt.ylabel('Latitude')\n", + "plt.title('Clusters of Pickup Areas')\n", + "\n", + "# Show the plot\n", + "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a614bd22-b64e-4ced-827d-a7f5e7e613d0", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {