From a0530bb81d7742284f59495b440a1c79ae3c405b Mon Sep 17 00:00:00 2001
From: root
 <root@hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal>
Date: Wed, 22 Nov 2023 23:46:50 +0000
Subject: [PATCH] included weather data and created ML pseudocode

---
 supervised_ml.ipynb | 134 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 130 insertions(+), 4 deletions(-)

diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb
index 5a32969..8d8c884 100644
--- a/supervised_ml.ipynb
+++ b/supervised_ml.ipynb
@@ -1852,8 +1852,8 @@
     "df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n",
     "df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n",
     "df_weather_3 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv\", inferSchema=True, header=True)\n",
-    "# add 2023 data\n",
-    "df_weather = df_weather_1.union(df_weather_2).union(df_weather_3)"
+    "df_weather_4 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2023-01-01 to 2023-11-22.csv\", inferSchema=True, header=True)\n",
+    "df_weather = df_weather_1.union(df_weather_2).union(df_weather_3).union(df_weather_4)"
    ]
   },
   {
@@ -1995,13 +1995,139 @@
     "pivoted_df_with_date.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "4e6dadc9-db41-4492-adaa-be633c0c4afa",
+   "metadata": {},
+   "source": [
+    "# ML Model Pseudocode\n",
+    "\n",
+    "Assumes a dataframe df_merged whose columns are the daily program ride counts, followed by the daily ride counts for the other community areas, followed by the daily weather columns (with datetime dropped after the merge). \n",
+    "\n",
+    "I followed the ML code from one of Ashish's notebooks fairly closely. I didn't do cross-validation yet because I first wanted to know how long it takes to run a single regression. "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "aa29201c-1d9b-493b-8378-558ae294f38e",
+   "id": "975dadc7-cb0c-4e75-89fb-b73b8e7ba743",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "from pyspark.ml.regression import LinearRegression\n",
+    "from pyspark.ml.feature import StringIndexer,OneHotEncoder, IndexToString, VectorAssembler\n",
+    "from pyspark.ml.feature import VectorAssembler\n",
+    "from pyspark.sql.types import FloatType\n",
+    "from pyspark.ml.evaluation import RegressionEvaluator\n",
+    "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0b37ea7-3f52-4698-8d63-114d69164046",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Features = every CA_* ride-count column + weather columns. Assumes the other community areas are all named CA_<n> (TODO confirm); \"datetime\" is dropped after the merge, so it is not listed.\n",
+    "feature_cols = [c for c in df_merged.columns if c.startswith(\"CA_\")] + [\"temp\",\"precip\",\"snow\",\"snowdepth\",\"windspeed\",\"sunset\"]  # NOTE(review): confirm \"sunset\" is numeric\n",
+    "vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
+    "df = vectorAssembler.transform(df_merged)\n",
+    "train_df, test_df = df.randomSplit([0.7, 0.3], seed=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd8be7aa-1cc7-4753-bf08-6142797364c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train Model\n",
+    "# LinearRegression is already imported in the imports cell of this section.\n",
+    "\n",
+    "# Elastic net: elasticNetParam=0.8 mixes the penalty towards L1 (1.0 = pure L1, 0.0 = pure L2); regParam scales it.\n",
+    "lr = LinearRegression(featuresCol='features', labelCol='program_area', regParam=0.3, elasticNetParam=0.8, maxIter=10)\n",
+    "lrm = lr.fit(train_df)\n",
+    "\n",
+    "# Fitted coefficients and intercept\n",
+    "print(\"Coefficients: \" + str(lrm.coefficients))\n",
+    "print(\"Intercept: \" + str(lrm.intercept))\n",
+    "\n",
+    "# Model summary (metrics on the training data the model was fit on)\n",
+    "print(\"RMSE: %f\" % lrm.summary.rootMeanSquaredError)\n",
+    "print(\"r2: %f\" % lrm.summary.r2)\n",
+    "\n",
+    "# Run the fitted regression model on the held-out test set\n",
+    "predictions = lrm.transform(test_df)\n",
+    "predictions.select('features', 'program_area', 'prediction').show(5, truncate=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e904f1bc-2143-439c-a6fd-e30ef8cfac28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print evaluation metrics for the test-set predictions; the evaluator compares \"program_area\" with \"prediction\"\n",
+    "e = RegressionEvaluator(labelCol=\"program_area\", predictionCol=\"prediction\", metricName=\"rmse\")\n",
+    "\n",
+    "# Root Mean Square Error (the evaluator's configured metric)\n",
+    "rmse = e.evaluate(predictions)\n",
+    "print(\"RMSE: %.3f\" % rmse)\n",
+    "\n",
+    "# Mean Square Error (metricName overridden for this call only)\n",
+    "mse = e.evaluate(predictions, {e.metricName: \"mse\"})\n",
+    "print(\"MSE: %.3f\" % mse)\n",
+    "\n",
+    "# Mean Absolute Error\n",
+    "mae = e.evaluate(predictions, {e.metricName: \"mae\"})\n",
+    "print(\"MAE: %.3f\" % mae)\n",
+    "\n",
+    "# r2 - coefficient of determination\n",
+    "r2 = e.evaluate(predictions, {e.metricName: \"r2\"})\n",
+    "print(\"r2: %.3f\" % r2)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4a8f0db1-4c39-4a3f-a340-303747b3ae29",
+   "metadata": {},
+   "source": [
+    "# Pseudocode if we want to do cross-validation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "115c0a40-ec37-419a-ae05-4115adad625f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Open question: Ashish's code and online examples fit and test the model on the same full dataset, which is confusing to me.\n",
+    "# Here we fit on train_df only (CrossValidator builds its folds from the data passed to fit) and transform the held-out test_df. Something to look into.\n",
+    "\n",
+    "paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build() # look to adjust the parameters regparam and elastic net\n",
+    "\n",
+    "crossval = CrossValidator(estimator=lr,\n",
+    "                          estimatorParamMaps=paramGrid,\n",
+    "                          evaluator=RegressionEvaluator(labelCol=\"program_area\", predictionCol=\"prediction\", metricName=\"rmse\"),  # default labelCol is \"label\", which this data does not have\n",
+    "                          numFolds=3)\n",
+    "\n",
+    "# Run cross-validation, and choose the best set of parameters.\n",
+    "cvModel = crossval.fit(train_df)\n",
+    "\n",
+    "# Save the model in a location so we don't have to rerun this; overwrite() lets the cell be re-run when the path already exists\n",
+    "model_path = \"gs://msca-bdp-student-gcs/bdp-rideshare-project/models/supervised_model/\"\n",
+    "cvModel.write().overwrite().save(model_path)\n",
+    "\n",
+    "# Read the model back in\n",
+    "cvModelRead = CrossValidatorModel.read().load(model_path)\n",
+    "\n",
+    "# Make predictions on the held-out test set. cvModel uses the best model found during cross-validation.\n",
+    "predictions = cvModel.transform(test_df)"
+   ]
   }
  ],
  "metadata": {