Commit

included weather data and created ML pseudocode
root committed Nov 22, 2023
1 parent d6ddc5e commit a0530bb
Showing 1 changed file with 130 additions and 4 deletions.
134 changes: 130 additions & 4 deletions supervised_ml.ipynb
@@ -1852,8 +1852,8 @@
"df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n",
"df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n",
"df_weather_3 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv\", inferSchema=True, header=True)\n",
"# add 2023 data\n",
"df_weather = df_weather_1.union(df_weather_2).union(df_weather_3)"
"df_weather_4 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2023-01-01 to 2023-11-22.csv\", inferSchema=True, header=True)\n",
"df_weather = df_weather_1.union(df_weather_2).union(df_weather_3).union(df_weather_4)"
]
},
{
@@ -1995,13 +1995,139 @@
"pivoted_df_with_date.show()"
]
},
{
"cell_type": "markdown",
"id": "4e6dadc9-db41-4492-adaa-be633c0c4afa",
"metadata": {},
"source": [
"# ML Model Psuedo Code\n",
"\n",
"Assumes a dataframe df_merged where the the columns are daily program ride counts followed by daily ride counts for other community areas. Then the daily weather columns with datetime dropped after the merge. \n",
"\n",
"I folowed fairly closely the ML from one of Ashish's notebooks. I didn't do cross validation yet because I wanted to know how long it takes to run a single regression. "
]
},
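{
"cell_type": "markdown",
"id": "b1c2d3e4-0000-4000-8000-a1b2c3d4e5f1",
"metadata": {},
"source": [
"A minimal sketch, not from the original analysis, of how df_merged could be built from pivoted_df_with_date and df_weather; the join-key column names (`date` on the pivoted ride counts, `datetime` on the weather data) are assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1c2d3e4-0000-4000-8000-a1b2c3d4e5f2",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (join-key names assumed): join daily ride counts to daily weather,\n",
"# then drop the join keys so df_merged keeps just count and weather columns\n",
"df_merged = (\n",
"    pivoted_df_with_date\n",
"    .join(df_weather, pivoted_df_with_date[\"date\"] == df_weather[\"datetime\"], \"inner\")\n",
"    .drop(\"date\", \"datetime\")\n",
")\n",
"df_merged.printSchema()"
]
},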
{
"cell_type": "code",
"execution_count": null,
"id": "aa29201c-1d9b-493b-8378-558ae294f38e",
"id": "975dadc7-cb0c-4e75-89fb-b73b8e7ba743",
"metadata": {},
"outputs": [],
"source": []
"source": [
"from pyspark.ml.regression import LinearRegression\n",
"from pyspark.ml.feature import StringIndexer,OneHotEncoder, IndexToString, VectorAssembler\n",
"from pyspark.ml.feature import VectorAssembler\n",
"from pyspark.sql.types import FloatType\n",
"from pyspark.ml.evaluation import RegressionEvaluator\n",
"from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0b37ea7-3f52-4698-8d63-114d69164046",
"metadata": {},
"outputs": [],
"source": [
"vectorAssembler = VectorAssembler(inputCols=[ ”CA_1”,”CA_2”,”…”,\n",
" \"datetime\",\"temp\",\"precip\",\"snow\",\"snowdepth\",\"windspeed\",\"sunset\"],\n",
" outputCol=\"features\")\n",
"df = vectorAssembler.transform(df_merged)\n",
"train_df, test_df = df.randomSplit([0.7, 0.3],0.0)"
]
},
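{
"cell_type": "markdown",
"id": "c2d3e4f5-0000-4000-8000-a1b2c3d4e5f3",
"metadata": {},
"source": [
"StringIndexer and OneHotEncoder are imported above but not used yet. A hedged sketch in case we decide to add a categorical weather column to the features: the column name `conditions` is an assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2d3e4f5-0000-4000-8000-a1b2c3d4e5f4",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: encode a hypothetical categorical column before assembling features\n",
"indexed = StringIndexer(inputCol=\"conditions\", outputCol=\"conditions_idx\",\n",
"                        handleInvalid=\"keep\").fit(df_merged).transform(df_merged)\n",
"encoded = OneHotEncoder(inputCols=[\"conditions_idx\"],\n",
"                        outputCols=[\"conditions_vec\"]).fit(indexed).transform(indexed)\n",
"# \"conditions_vec\" would then be added to the VectorAssembler inputCols above"
]
},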
{
"cell_type": "code",
"execution_count": null,
"id": "bd8be7aa-1cc7-4753-bf08-6142797364c3",
"metadata": {},
"outputs": [],
"source": [
"# Train Model\n",
"from pyspark.ml.regression import LinearRegression\n",
"\n",
"#Elastic Net\n",
"lr = LinearRegression(featuresCol = 'features', labelCol=‘program_area’, regParam=0.3, elasticNetParam=0.8, maxIter=10)\n",
"lrm = lr.fit(train_df)\n",
"\n",
"#coefficients\n",
"print(\"Coefficients: \" + str(lrm.coefficients))\n",
"print(\"Intercept: \" + str(lrm.intercept))\n",
"\n",
"#model summary\n",
"print(\"RMSE: %f\" % lrm.summary.rootMeanSquaredError)\n",
"print(\"r2: %f\" % lrm.summary.r2)\n",
"\n",
"# Run the classifier on the test set\n",
"predictions = lrm.transform(test_df)\n",
"predictions.select('features','program_area’,’prediction').show(5,truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e904f1bc-2143-439c-a6fd-e30ef8cfac28",
"metadata": {},
"outputs": [],
"source": [
"#print evaluation metrics\n",
"e = RegressionEvaluator(labelCol=\"program_area”, predictionCol=\"prediction\", metricName=\"rmse\")\n",
"\n",
"# Root Mean Square Error\n",
"rmse = e.evaluate(predictions)\n",
"print(\"RMSE: %.3f\" % rmse)\n",
"\n",
"# Mean Square Error\n",
"mse = e.evaluate(predictions, {e.metricName: \"mse\"})\n",
"print(\"MSE: %.3f\" % mse)\n",
"\n",
"# Mean Absolute Error\n",
"mae = e.evaluate(predictions, {e.metricName: \"mae\"})\n",
"print(\"MAE: %.3f\" % mae)\n",
"\n",
"# r2 - coefficient of determination\n",
"r2 = e.evaluate(predictions, {e.metricName: \"r2\"})\n",
"print(\"r2: %.3f\" %r2)\n"
]
},
{
"cell_type": "markdown",
"id": "4a8f0db1-4c39-4a3f-a340-303747b3ae29",
"metadata": {},
"source": [
"# pseudocode if we want to do cross validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "115c0a40-ec37-419a-ae05-4115adad625f",
"metadata": {},
"outputs": [],
"source": [
"# an open question for me is that in Ashish's code and online examples, the model is fit and tested on the same full dataset. which is confusing to me\n",
"# I would think you would fit on training and transform the test set. Something to look into.\n",
"\n",
"paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build() # look to adjust the parameters regparam and elastic net\n",
"\n",
"crossval = CrossValidator(estimator=lr,\n",
" estimatorParamMaps=paramGrid,\n",
" evaluator=RegressionEvaluator(),\n",
" numFolds=3) \n",
"\n",
"# Run cross-validation, and choose the best set of parameters.\n",
"cvModel = crossval.fit(train_df)\n",
"\n",
"# Save the model in a location so we don't have to rerun this\n",
"model_path = \"gs://msca-bdp-student-gcs/bdp-rideshare-project/models/supervised_model/\"\n",
"cvModel.write().save(model_path)\n",
"\n",
"# Read the model back in\n",
"cvModelRead = CrossValidatorModel.read().load(model_path)\n",
"\n",
"# Make predictions on test documents. cvModel uses the best model found (lrModel).\n",
"predictions = cvModel.transform(test_df)"
]
}
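,
{
"cell_type": "markdown",
"id": "d3e4f5a6-0000-4000-8000-a1b2c3d4e5f5",
"metadata": {},
"source": [
"A sketch, not from the original notebook, of how the cross-validation results could be inspected once cvModel exists; bestModel, avgMetrics, and getEstimatorParamMaps() are standard CrossValidatorModel members, and the evaluator e comes from the metrics cell above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3e4f5a6-0000-4000-8000-a1b2c3d4e5f6",
"metadata": {},
"outputs": [],
"source": [
"# Average metric for each parameter combination tried during cross validation\n",
"for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):\n",
"    print({p.name: v for p, v in params.items()}, \"->\", metric)\n",
"\n",
"# The winning model, refit on the full training set\n",
"best_lr = cvModel.bestModel\n",
"print(\"chosen regParam:\", best_lr.getOrDefault(\"regParam\"))\n",
"\n",
"# Score the tuned model on the held-out test set\n",
"print(\"tuned RMSE: %.3f\" % e.evaluate(cvModel.transform(test_df)))"
]
}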
],
"metadata": {