Commit

included weather data and created ML pseudocode
root committed Nov 22, 2023
1 parent d6ddc5e commit a0530bb
Showing 1 changed file with 130 additions and 4 deletions.
134 changes: 130 additions & 4 deletions supervised_ml.ipynb
@@ -1852,8 +1852,8 @@
"df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n",
"df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n",
"df_weather_3 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv\", inferSchema=True, header=True)\n",
"# add 2023 data\n",
"df_weather = df_weather_1.union(df_weather_2).union(df_weather_3)"
"df_weather_4 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2023-01-01 to 2023-11-22.csv\", inferSchema=True, header=True)\n",
"df_weather = df_weather_1.union(df_weather_2).union(df_weather_3).union(df_weather_4)"
]
},
{
@@ -1995,13 +1995,139 @@
"pivoted_df_with_date.show()"
]
},
{
"cell_type": "markdown",
"id": "4e6dadc9-db41-4492-adaa-be633c0c4afa",
"metadata": {},
"source": [
"# ML Model Psuedo Code\n",
"\n",
"Assumes a dataframe df_merged where the the columns are daily program ride counts followed by daily ride counts for other community areas. Then the daily weather columns with datetime dropped after the merge. \n",
"\n",
"I folowed fairly closely the ML from one of Ashish's notebooks. I didn't do cross validation yet because I wanted to know how long it takes to run a single regression. "
]
},
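{
"cell_type": "markdown",
"id": "b1c2d3e4-0000-4000-8000-a1b2c3d4e5f1",
"metadata": {},
"source": [
"A minimal sketch, not from the original analysis, of how df_merged could be built from pivoted_df_with_date and df_weather; the join-key column names (`date` on the pivoted ride counts, `datetime` on the weather data) are assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1c2d3e4-0000-4000-8000-a1b2c3d4e5f2",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (join-key names assumed): join daily ride counts to daily weather,\n",
"# then drop the join keys so df_merged keeps just count and weather columns\n",
"df_merged = (\n",
"    pivoted_df_with_date\n",
"    .join(df_weather, pivoted_df_with_date[\"date\"] == df_weather[\"datetime\"], \"inner\")\n",
"    .drop(\"date\", \"datetime\")\n",
")\n",
"df_merged.printSchema()"
]
},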
{
"cell_type": "code",
"execution_count": null,
"id": "aa29201c-1d9b-493b-8378-558ae294f38e",
"id": "975dadc7-cb0c-4e75-89fb-b73b8e7ba743",
"metadata": {},
"outputs": [],
"source": []
"source": [
"from pyspark.ml.regression import LinearRegression\n",
"from pyspark.ml.feature import StringIndexer,OneHotEncoder, IndexToString, VectorAssembler\n",
"from pyspark.ml.feature import VectorAssembler\n",
"from pyspark.sql.types import FloatType\n",
"from pyspark.ml.evaluation import RegressionEvaluator\n",
"from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0b37ea7-3f52-4698-8d63-114d69164046",
"metadata": {},
"outputs": [],
"source": [
"vectorAssembler = VectorAssembler(inputCols=[ ”CA_1”,”CA_2”,”…”,\n",
" \"datetime\",\"temp\",\"precip\",\"snow\",\"snowdepth\",\"windspeed\",\"sunset\"],\n",
" outputCol=\"features\")\n",
"df = vectorAssembler.transform(df_merged)\n",
"train_df, test_df = df.randomSplit([0.7, 0.3],0.0)"
]
},
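{
"cell_type": "markdown",
"id": "c2d3e4f5-0000-4000-8000-a1b2c3d4e5f3",
"metadata": {},
"source": [
"StringIndexer and OneHotEncoder are imported above but not used yet. A hedged sketch in case we decide to add a categorical weather column to the features: the column name `conditions` is an assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2d3e4f5-0000-4000-8000-a1b2c3d4e5f4",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: encode a hypothetical categorical column before assembling features\n",
"indexed = StringIndexer(inputCol=\"conditions\", outputCol=\"conditions_idx\",\n",
"                        handleInvalid=\"keep\").fit(df_merged).transform(df_merged)\n",
"encoded = OneHotEncoder(inputCols=[\"conditions_idx\"],\n",
"                        outputCols=[\"conditions_vec\"]).fit(indexed).transform(indexed)\n",
"# \"conditions_vec\" would then be added to the VectorAssembler inputCols above"
]
},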
{
"cell_type": "code",
"execution_count": null,
"id": "bd8be7aa-1cc7-4753-bf08-6142797364c3",
"metadata": {},
"outputs": [],
"source": [
"# Train Model\n",
"from pyspark.ml.regression import LinearRegression\n",
"\n",
"#Elastic Net\n",
"lr = LinearRegression(featuresCol = 'features', labelCol=‘program_area’, regParam=0.3, elasticNetParam=0.8, maxIter=10)\n",
"lrm = lr.fit(train_df)\n",
"\n",
"#coefficients\n",
"print(\"Coefficients: \" + str(lrm.coefficients))\n",
"print(\"Intercept: \" + str(lrm.intercept))\n",
"\n",
"#model summary\n",
"print(\"RMSE: %f\" % lrm.summary.rootMeanSquaredError)\n",
"print(\"r2: %f\" % lrm.summary.r2)\n",
"\n",
"# Run the classifier on the test set\n",
"predictions = lrm.transform(test_df)\n",
"predictions.select('features','program_area’,’prediction').show(5,truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e904f1bc-2143-439c-a6fd-e30ef8cfac28",
"metadata": {},
"outputs": [],
"source": [
"#print evaluation metrics\n",
"e = RegressionEvaluator(labelCol=\"program_area”, predictionCol=\"prediction\", metricName=\"rmse\")\n",
"\n",
"# Root Mean Square Error\n",
"rmse = e.evaluate(predictions)\n",
"print(\"RMSE: %.3f\" % rmse)\n",
"\n",
"# Mean Square Error\n",
"mse = e.evaluate(predictions, {e.metricName: \"mse\"})\n",
"print(\"MSE: %.3f\" % mse)\n",
"\n",
"# Mean Absolute Error\n",
"mae = e.evaluate(predictions, {e.metricName: \"mae\"})\n",
"print(\"MAE: %.3f\" % mae)\n",
"\n",
"# r2 - coefficient of determination\n",
"r2 = e.evaluate(predictions, {e.metricName: \"r2\"})\n",
"print(\"r2: %.3f\" %r2)\n"
]
},
{
"cell_type": "markdown",
"id": "4a8f0db1-4c39-4a3f-a340-303747b3ae29",
"metadata": {},
"source": [
"# pseudocode if we want to do cross validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "115c0a40-ec37-419a-ae05-4115adad625f",
"metadata": {},
"outputs": [],
"source": [
"# an open question for me is that in Ashish's code and online examples, the model is fit and tested on the same full dataset. which is confusing to me\n",
"# I would think you would fit on training and transform the test set. Something to look into.\n",
"\n",
"paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build() # look to adjust the parameters regparam and elastic net\n",
"\n",
"crossval = CrossValidator(estimator=lr,\n",
" estimatorParamMaps=paramGrid,\n",
" evaluator=RegressionEvaluator(),\n",
" numFolds=3) \n",
"\n",
"# Run cross-validation, and choose the best set of parameters.\n",
"cvModel = crossval.fit(train_df)\n",
"\n",
"# Save the model in a location so we don't have to rerun this\n",
"model_path = \"gs://msca-bdp-student-gcs/bdp-rideshare-project/models/supervised_model/\"\n",
"cvModel.write().save(model_path)\n",
"\n",
"# Read the model back in\n",
"cvModelRead = CrossValidatorModel.read().load(model_path)\n",
"\n",
"# Make predictions on test documents. cvModel uses the best model found (lrModel).\n",
"predictions = cvModel.transform(test_df)"
]
}
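,
{
"cell_type": "markdown",
"id": "d3e4f5a6-0000-4000-8000-a1b2c3d4e5f5",
"metadata": {},
"source": [
"A sketch, not from the original notebook, of how the cross-validation results could be inspected once cvModel exists; bestModel, avgMetrics, and getEstimatorParamMaps() are standard CrossValidatorModel members, and the evaluator e comes from the metrics cell above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3e4f5a6-0000-4000-8000-a1b2c3d4e5f6",
"metadata": {},
"outputs": [],
"source": [
"# Average metric for each parameter combination tried during cross validation\n",
"for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):\n",
"    print({p.name: v for p, v in params.items()}, \"->\", metric)\n",
"\n",
"# The winning model, refit on the full training set\n",
"best_lr = cvModel.bestModel\n",
"print(\"chosen regParam:\", best_lr.getOrDefault(\"regParam\"))\n",
"\n",
"# Score the tuned model on the held-out test set\n",
"print(\"tuned RMSE: %.3f\" % e.evaluate(cvModel.transform(test_df)))"
]
}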
],
"metadata": {