diff --git a/notebooks/deepmc/mc_forecast.ipynb b/notebooks/deepmc/mc_forecast.ipynb index d41b43b7..7fbfa798 100755 --- a/notebooks/deepmc/mc_forecast.ipynb +++ b/notebooks/deepmc/mc_forecast.ipynb @@ -15,7 +15,9 @@ "```bash\n", "$ micromamba env create -f ./deepmc_env.yaml\n", "$ micromamba activate deepmc-pytorch\n", - "```\n" + "```\n", + "\n", + "**We currently only support Unix-based systems (Linux and MacOS) for running this notebook.**" ] }, { @@ -55,33 +57,22 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/azureuser/.conda/envs/deepmc-pytorch/lib/python3.8/site-packages/torchvision/io/image.py:11: UserWarning: Failed to load image Python extension: /home/azureuser/.conda/envs/deepmc-pytorch/lib/python3.8/site-packages/torchvision/image.so: undefined symbol: _ZNK3c1010TensorImpl36is_contiguous_nondefault_policy_implENS_12MemoryFormatE\n", - " warn(f\"Failed to load image Python extension: {e}\")\n" - ] - } - ], + "outputs": [], "source": [ - "import pandas as pd\n", - "import numpy as np\n", + "import warnings\n", + "from datetime import datetime\n", "\n", - "from datetime import datetime, timedelta\n", + "import numpy as np\n", + "import pandas as pd\n", "from matplotlib import pyplot as plt\n", - "\n", - "from shapely import geometry\n", - "\n", - "from notebook_lib import utils\n", - "from notebook_lib import prediction\n", "from notebook_lib import train\n", - "from notebook_lib.forecast import Forecast\n", + "from shapely import geometry\n", + "\n", + "from vibe_notebook.deepmc import prediction, utils\n", + "from vibe_notebook.deepmc.forecast import Forecast\n", "\n", - "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, @@ -90,7 +81,7 @@ "metadata": {}, "source": [ "### Workflows\n", - "The notebook utilize below workflows available in farmvibes" + "The notebook utilizes the workflow below, which is 
available in FarmVibes.AI: " ] }, { @@ -107,9 +98,9 @@ "metadata": {}, "source": [ "### Data\n", - "The notebook utilizing two types of datasets\n", + "The notebook utilizes two types of datasets:\n", "\n", - "1. The historical observations recorded by weather stations\n", + "1. The historical observations recorded by weather stations.\n", "2. The forecast observations downloaded using the [herbie package](https://blaylockbk.github.io/Herbie/_build/html/). This package helps to download recent and archived numerical weather prediction (NWP) model output from different cloud archive sources. Its most popular capability is to download HRRR model data.\n" ] }, @@ -118,7 +109,7 @@ "metadata": {}, "source": [ "### AGWeatherNet\n", - "In this notebook, we utilize historical observations downloaded from AGWeatherNet for a station \\\"Palouse\\\". The data used for training range from May 2020 to June 2022. For more information check [AGWeatherNet documentation](http://weather.wsu.edu/?p=92850&desktop)." + "In this notebook, we utilize historical observations downloaded from AGWeatherNet for the station `Palouse`. The data used for training range from May 2020 to June 2022. For more information check [AGWeatherNet documentation](http://weather.wsu.edu/?p=92850&desktop)." 
] }, { @@ -146,8 +137,8 @@ "metadata": {}, "outputs": [], "source": [ - "PREDICT=\"%s\"\n", - "RELEVANT=\"%s\"\n", + "PREDICT = \"%s\"\n", + "RELEVANT = \"%s\"\n", "ROOT_PATH = f\"./data/model_{PREDICT}/\"\n", "DATA_EXPORT_PATH = ROOT_PATH + f\"{STATION_NAME}/{RELEVANT}/train_data.pkl\"" ] @@ -166,7 +157,7 @@ "outputs": [], "source": [ "# weather dataset filtered and model training limited to train features.\n", - "HISTORICAL_MODEL_TRAIN_FEATURES = ['humidity', 'wind_speed', 'temperature']\n", + "HISTORICAL_MODEL_TRAIN_FEATURES = [\"humidity\", \"wind_speed\", \"temperature\"]\n", "\n", "# Historical data aligned using INDEX variable\n", "INDEX = \"date\"" @@ -203,7 +194,7 @@ "outputs": [], "source": [ "# Models trained to predict out features\n", - "OUT_FEATURES = ['wind_speed' , 'temperature']" + "OUT_FEATURES = [\"wind_speed\", \"temperature\"]" ] }, { @@ -211,14 +202,16 @@ "metadata": {}, "source": [ "### Relevant vs Not Relevant\n", - "The notebook support performing micro climate predictions with below approaches. \n", + "The scenario is considered relevant when there is a close match between historical data and forecasts, with minimal discrepancies else it's not relevant.\n", + "\n", + "The notebook supports performing micro climate predictions with the following approaches: \n", "\n", "1. Utilizing both Historical & Forecast observations. This approach is suggested to use if both observations are relevant.

\n", "\n", "2. Utilizing only Historical dataset. This approach is suggested to use if both Historical & Forecast observations are not relevant or Forecast dataset doesn't exist.

\n", "\n", "\n", - "In next cells, demonstrated training & prediction process for both relevant and not relevant scenarios. " + "In next cells, we demonstrate the training and prediction processes for both relevant and non-relevant scenarios. " ] }, { @@ -270,7 +263,7 @@ "1. The index variable is converted to datetime\n", "2. The input data is interpolated to fill the missing values using their neighbors\n", "3. The script focuses on training the model with a 60-minute frequency, hence the data is grouped for this frequency.\n", - "4. The data is scaled using the scikit-learn StandardScalar. For more information check [scikit-learn documentaion](https://github.com/scikit-learn/scikit-learn)" + "4. The data is scaled using the scikit-learn StandardScalar. For more information check [scikit-learn documentation](https://github.com/scikit-learn/scikit-learn)" ] }, { @@ -296,7 +289,7 @@ "metadata": {}, "outputs": [], "source": [ - "historical_dataset = utils.get_csv_data(path=file_path)" + "historical_dataset = utils.get_csv_data(path=file_path, interpolate=False, fill_na=False)" ] }, { @@ -319,7 +312,7 @@ "  - humidity - \"RH:2 m\"
\n", "  - wind speed - The forecast observations of wind speed are derived using data downloaded for u & v components. The algebraic expression used to calculate wind speed is
\n", " $$ \n", - " ws = \\sqrt{u^2 + v^2}\n", + " ws(u, v) = \\sqrt{u^2 + v^2}\n", " $$\n", "   i. u component - \"UGRD:10 m\"
\n", "   ii. v component - \"VGRD:10 m\"" @@ -348,24 +341,14 @@ "start_date = datetime(year=2020, month=5, day=31)\n", "end_date = datetime(year=2022, month=8, day=2)\n", "time_range = (start_date, end_date)\n", - "date_column=\"date\"\n", + "date_column = \"date\"\n", "\n", - "parameters = [{\n", - " \"weather_type\": \"temperature\",\n", - " \"search_text\": \"TMP:2 m\"\n", - " },\n", - " {\n", - " \"weather_type\": \"humidity\",\n", - " \"search_text\": \"RH:2 m\"\n", - " },\n", - " {\n", - " \"weather_type\": \"u-component\",\n", - " \"search_text\": \"UGRD:10 m\"\n", - " },\n", - " {\n", - " \"weather_type\": \"v-component\",\n", - " \"search_text\": \"VGRD:10 m\"\n", - " }]" + "parameters = [\n", + " {\"weather_type\": \"temperature\", \"search_text\": \"TMP:2 m\"},\n", + " {\"weather_type\": \"humidity\", \"search_text\": \"RH:2 m\"},\n", + " {\"weather_type\": \"u-component\", \"search_text\": \"UGRD:10 m\"},\n", + " {\"weather_type\": \"v-component\", \"search_text\": \"VGRD:10 m\"},\n", + "]" ] }, { @@ -373,7 +356,7 @@ "metadata": {}, "source": [ "### Submit Request to Worker\n", - "Download forecast observations by submitting request to worker running in background. If more than one worker instance running in background, it process the request in parallel for each parameter. Workflow execution utilize below parameters while processing requests, this can be overwritten using the parameter argument.\n", + "We download forecast observations by submitting a request to the worker running in the background. If more than one worker instance is running in the background, the request is processed in parallel for each parameter. 
Workflow execution utilizes the parameters below while processing requests, this can be overwritten using the parameter argument.\n", "\n", "- fxx: [1, 25, 1] # start, stop, step\n", "- search_text: \"TMP:2 m\"\n", @@ -388,24 +371,48 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "'VibeWorkflowRun'(id='d7c0dc6a-339f-45b9-81d1-2fb93d2938f6', name='forecast_temperature', workflow='data_ingestion/weather/herbie_forecast', status='done')\n", - "'VibeWorkflowRun'(id='61d952d1-b068-4c2c-b522-a680efed450f', name='forecast_humidity', workflow='data_ingestion/weather/herbie_forecast', status='running')\n", - "'VibeWorkflowRun'(id='8c95f7ab-6d6b-40e8-a3bd-c12b854d0a7b', name='forecast_u-component', workflow='data_ingestion/weather/herbie_forecast', status='running')\n", - "'VibeWorkflowRun'(id='7490cd70-9731-4cac-ab36-051d3903776a', name='forecast_v-component', workflow='data_ingestion/weather/herbie_forecast', status='running')\n" - ] + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c00b48cb983f4c2184d411cd346f2bdb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "forecast_ = Forecast(\n",
-    "                workflow_name=HERBIE_DOWNLOAD_WORKFLOW,\n",
-    "                geometry=STATION_GEOMETRY,\n",
-    "                time_range=time_range,\n",
-    "                parameters=parameters,\n",
-    "                )\n",
-    "run_list = forecast_.submit_download_request()"
+    "forecast = Forecast(\n",
+    "    workflow_name=HERBIE_DOWNLOAD_WORKFLOW,\n",
+    "    geometry=STATION_GEOMETRY,\n",
+    "    time_range=time_range,\n",
+    "    parameters=parameters,\n",
+    ")\n",
+    "run_list = forecast.submit_download_request()"
    ]
   },
   {
@@ -413,12 +420,14 @@
    "metadata": {},
    "source": [
     "### Monitor download of Forecast observations\n",
-    "Check the download status and fetch the downloaded data from the cluster running in backend. The execution time of download depends on time_range. The downloaded data undergoes below changes.\n",
+    "Check the download status and fetch the downloaded data from FarmVibes.AI. The execution time of the download depends on the time range. \n",
+    "\n",
+    "The downloaded data undergoes the following changes:\n",
     "\n",
-    "1. concatenate the output of all requests submitted.\n",
+    "1. Concatenate the output of all submitted requests.\n",
     "2. Set index on date column.\n",
-    "3. Does interpolate to derive the missing data.\n",
-    "4. The data downloaded follows the utc timezone. It's required to transform the data to the timezone of historical observations. The historical observations used in this notebook follows pst timezone, hence the data offset by -8 hours."
+    "3. Interpolate to derive the missing data.\n",
+    "4. The data downloaded follows the UTC timezone. It's required to transform the data to the timezone of historical observations. The historical observations used in this notebook follow the PST timezone, hence the data is offset by -8 hours."
    ]
   },
   {
@@ -501,14 +510,14 @@
        "2020-05-30 17:00:00             -2.861307              1.178179  "
       ]
      },
-     "execution_count": 15,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# transform downloaded data from utc to pst timezone\n",
-    "forecast_dataset = forecast_.get_downloaded_data(run_list=run_list, offset_hours=-8)\n",
+    "forecast_dataset = forecast.get_downloaded_data(run_list=run_list, offset_hours=-8)\n",
     "forecast_dataset.to_csv(f\"{STATION_NAME}_forecast.csv\")\n",
     "forecast_dataset.head(2)"
    ]
@@ -517,109 +526,40 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Preprocess Forecast Observations\n",
-    "Below preprocessing performed on downloaded data before performing model training.\n",
-    "\n",
-    "- Temperature: The downloaded data has units \"kelvin\". It will be converted to Fahrenheit.\n",
-    "- wind_speed: Using the u-component & v-component values downloaded, the wind_speed values derived. The derived values multiplied by 2.23 to convert from m/sec to mph\n",
-    "- drop u-component & v-component"
+    "### Preprocess Forecast Observations"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 16,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# Temperature\n",
-    "# convert kelvin to celsius\n",
-    "forecast_dataset[\"temperature_forecast\"] = forecast_dataset[\"temperature_forecast\"]-273.15\n",
+    "We perform the following preprocessing on the downloaded data before training the model.\n",
     "\n",
-    "# convert celsius to Fahrenheit\n",
-    "forecast_dataset[\"temperature_forecast\"] = forecast_dataset[\"temperature_forecast\"].apply(lambda x: (x * 9/5) + 32)"
+    "- `temperature`: The downloaded data is in Kelvin. It will be converted to Fahrenheit.\n",
+    "- `wind_speed`: Using the u-component & v-component values downloaded, the `wind_speed` values are derived. The derived values are multiplied by 2.23 to convert from m/sec to mph.\n",
+    "- Drop the u-component & v-component columns."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
temperature_forecasthumidity_forecastwind_speed_forecast
date
2020-05-30 16:00:0084.17363349.2999997.025768
2020-05-30 17:00:0082.14973154.5999986.900466
\n", - "
" - ], - "text/plain": [ - " temperature_forecast humidity_forecast \\\n", - "date \n", - "2020-05-30 16:00:00 84.173633 49.299999 \n", - "2020-05-30 17:00:00 82.149731 54.599998 \n", - "\n", - " wind_speed_forecast \n", - "date \n", - "2020-05-30 16:00:00 7.025768 \n", - "2020-05-30 17:00:00 6.900466 " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# wind_speed\n", - "# multiplying with 2.23 to convert wind speed from m/sec to mph\n", - "forecast_dataset[\"wind_speed_forecast\"] = forecast_dataset.apply(lambda x: np.sqrt(np.square(x[\"u-component_forecast\"]) + \n", - " np.square(x[\"v-component_forecast\"]))*2.23, axis=1)\n", - "\n", - "forecast_dataset.drop(columns=[\"u-component_forecast\", \"v-component_forecast\"], inplace=True)\n", + "forecast_dataset = utils.convert_forecast_data(forecast_dataset)\n", "forecast_dataset.head(2)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also clean the input data with the following operations:\n", + "- Exclude input data outside the time_range of interest.\n", + "- Shift forecast data by number of hours\n", + "- Fill missing data with neighboring data points using pandas interpolate techniques." 
+ ] + }, { "cell_type": "code", "execution_count": 18, @@ -698,18 +638,21 @@ "2020-07-06 01:00:00 57.220984 3.85 10.642863 " ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "input_df = utils.clean_relevant_data(\n", - " actual_df=historical_dataset, \n", - " forecast_df=forecast_dataset, \n", - " out_variables=RELEVANT_FEATURES,\n", - " freq_hours=frequency_hour,\n", - " num_of_indices=number_of_hours)\n", + "input_df = utils.clean_relevant_data_using_hrrr(\n", + " actual_df=historical_dataset,\n", + " forecast_df=forecast_dataset,\n", + " out_variables=RELEVANT_FEATURES,\n", + " freq_hours=frequency_hour,\n", + " num_of_indices=number_of_hours,\n", + " start_date=start_date,\n", + " end_date=end_date,\n", + ")\n", "input_df.head(2)" ] }, @@ -717,12 +660,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Verifying the forecast observations are relevant or not relevant" + "### Verifying if the forecast observations are relevant or not relevant" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -731,7 +674,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" }, @@ -747,7 +690,7 @@ } ], "source": [ - "plot_df = input_df[(input_df.index.month==7) & (input_df.index.year==2020)]\n", + "plot_df = input_df[(input_df.index.month == 7) & (input_df.index.year == 2020)]\n", "\n", "plt.figure(figsize=(20, 4))\n", "plt.plot(plot_df.index.values, plot_df[\"temperature_forecast\"].values, label=\"forecast\")\n", @@ -759,7 +702,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Based on the distribution of observation in above plot, the forecast observations are relevant. In this scenario continue with model training process using relevant dataset." + "Based on the distribution of observation in above plot, the forecast observations are relevant. 
In this scenario, we will continue with model training using the relevant dataset." ] }, { @@ -768,7 +711,7 @@ "source": [ "### Training\n", "\n", - "The script is configured to train the Micro Climate prediction model for 24 hours and the historical weather station data has points with a 60-minute frequency. Below inputs vary based on number of hours of prediction and frequency of weather station data points.\n", + "The script is configured to train the Micro Climate prediction model for 24 hours and the historical weather station data has points with a 60-minute frequency. Below inputs vary based on the number of hours of prediction and frequency of weather station data points.\n", "\n", "1. `chunk_size` - The value of the chunk size is based on the frequency of the weather station data points. For a frequency of 60 minutes, the minimum required data points are 528. If the data frequency is 15 minutes, the minimum number of data points required is 528*4 = 2112. These are the minimum number of data points need to be provided as input during the inference.\n", "2. `ts_lookahead` - The value used during the data preprocessing. It is the value used to consider weather data points ahead for a given time period while grouping the data.\n", @@ -791,7 +734,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "tags": [] }, @@ -803,7 +746,8 @@ " root_path=ROOT_PATH,\n", " data_export_path=DATA_EXPORT_PATH,\n", " station_name=STATION_NAME,\n", - " relevant=True)\n", + " relevant=True,\n", + ")\n", "\n", "train_weather.train_model(input_df)" ] @@ -820,26 +764,27 @@ "metadata": {}, "source": [ "### Current\n", - "Predict weather for the next 24 hours. To predict weather for next 24 hours it is required to certain hours of historical forecast observations, the default size called chunk size of historical forecast observations is 528. 
Choosing start time of prediction is important, if historical observations used to train model has the start time of 12:00:00 then the historical observations used for prediction should start at the same time." + "To predict the weather for next 24 hours, we need certain hours of historical forecast observations beforehand. The default size (chunk size) of historical forecast observations is 528. Choosing a start time to predict is important, if historical observations used to train model have a starting time of 12:00:00, then the historical observations used for prediction should start at the same time." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "weather_forecast = prediction.InferenceWeather(\n", - " root_path=ROOT_PATH,\n", - " data_export_path=DATA_EXPORT_PATH,\n", - " station_name=STATION_NAME,\n", - " predicts=OUT_FEATURES,\n", - " relevant=True)" + " root_path=ROOT_PATH,\n", + " data_export_path=DATA_EXPORT_PATH,\n", + " station_name=STATION_NAME,\n", + " predicts=OUT_FEATURES,\n", + " relevant=True,\n", + ")" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -848,55 +793,69 @@ "p_end_date = datetime(year=2022, month=6, day=3, hour=0, minute=0, second=0)\n", "\n", "time_range = (p_start_date, p_end_date)\n", - "date_column=\"date\"\n", + "date_column = \"date\"\n", "\n", - "parameters = [{\n", - " \"weather_type\": \"temperature\",\n", - " \"search_text\": \"TMP:2 m\"\n", - " },\n", - " {\n", - " \"weather_type\": \"humidity\",\n", - " \"search_text\": \"RH:2 m\"\n", - " },\n", - " {\n", - " \"weather_type\": \"u-component\",\n", - " \"search_text\": \"UGRD:10 m\"\n", - " },\n", - " {\n", - " \"weather_type\": \"v-component\",\n", - " \"search_text\": \"VGRD:10 m\"\n", - " }]" + "parameters = [\n", + " {\"weather_type\": \"temperature\", \"search_text\": \"TMP:2 m\"},\n", + " {\"weather_type\": \"humidity\", 
\"search_text\": \"RH:2 m\"},\n", + " {\"weather_type\": \"u-component\", \"search_text\": \"UGRD:10 m\"},\n", + " {\"weather_type\": \"v-component\", \"search_text\": \"VGRD:10 m\"},\n", + "]" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "'VibeWorkflowRun'(id='ea662424-c9c5-4d1f-8d1c-ed907f0176ea', name='forecast_temperature', workflow='data_ingestion/weather/herbie_forecast', status='done')\n", - "'VibeWorkflowRun'(id='e93b52c2-9c20-4bf2-b647-a7dc04ff4947', name='forecast_humidity', workflow='data_ingestion/weather/herbie_forecast', status='done')\n", - "'VibeWorkflowRun'(id='5cbd7199-626c-43dc-aa73-17639c97bc30', name='forecast_u-component', workflow='data_ingestion/weather/herbie_forecast', status='done')\n", - "'VibeWorkflowRun'(id='e9373c58-5730-4b03-aee4-83015ab08848', name='forecast_v-component', workflow='data_ingestion/weather/herbie_forecast', status='done')\n" - ] + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "04e890c9992c4710acdbb661b49c1e56",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "forecast_ = Forecast(\n",
-    "                workflow_name=HERBIE_DOWNLOAD_WORKFLOW,\n",
-    "                geometry=STATION_GEOMETRY,\n",
-    "                time_range=time_range,\n",
-    "                parameters=parameters,\n",
-    "                )\n",
-    "run_list = forecast_.submit_download_request()"
+    "forecast = Forecast(\n",
+    "    workflow_name=HERBIE_DOWNLOAD_WORKFLOW,\n",
+    "    geometry=STATION_GEOMETRY,\n",
+    "    time_range=time_range,\n",
+    "    parameters=parameters,\n",
+    ")\n",
+    "run_list = forecast.submit_download_request()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -974,21 +933,21 @@
        "2022-03-17 17:00:00              4.563419              1.176411  "
       ]
      },
-     "execution_count": 23,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# transform downloaded data from utc to pst timezone\n",
-    "p_forecast_dataset = forecast_.get_downloaded_data(run_list=run_list, offset_hours=-8)\n",
+    "p_forecast_dataset = forecast.get_downloaded_data(run_list=run_list, offset_hours=-8)\n",
     "p_forecast_dataset.to_csv(f\"{STATION_NAME}_forecast.csv\")\n",
     "p_forecast_dataset.head(2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -1047,14 +1006,14 @@
        "2022-03-18 14:00:00    66.300      16.175       50.075"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "predict_file_path = f\"./data/{STATION_NAME}/prediction.csv\"\n",
-    "p_historical_dataset = utils.get_csv_data(path=predict_file_path)\n",
+    "p_historical_dataset = utils.get_csv_data(path=predict_file_path, interpolate=False, fill_na=False)\n",
     "p_historical_dataset = p_historical_dataset[HISTORICAL_MODEL_TRAIN_FEATURES]\n",
     "\n",
     "p_historical_dataset.head(2)"
@@ -1062,35 +1021,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Temperature\n",
-    "# convert kelvin to celsius\n",
-    "p_forecast_dataset[\"temperature_forecast\"] = p_forecast_dataset[\"temperature_forecast\"]-273.15\n",
-    "\n",
-    "# convert celsius to Fahrenheit\n",
-    "p_forecast_dataset[\"temperature_forecast\"] = p_forecast_dataset[\"temperature_forecast\"].apply(lambda x: (x * 9/5) + 32)"
+    "p_forecast_dataset = utils.convert_forecast_data(p_forecast_dataset)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 26,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# wind_speed\n",
-    "# multiplying with 2.23 to convert wind speed from m/sec to mph\n",
-    "p_forecast_dataset[\"wind_speed_forecast\"] = p_forecast_dataset.apply(lambda x: np.sqrt(np.square(x[\"u-component_forecast\"]) + \n",
-    "                                    np.square(x[\"v-component_forecast\"]))*2.23, axis=1)\n",
-    "\n",
-    "p_forecast_dataset.drop(columns=[\"u-component_forecast\", \"v-component_forecast\"], inplace=True)"
+    "We clean the input data with the following operations:\n",
+    "- Exclude input data outside the time range of interest.\n",
+    "- Shift forecast data by a given number of hours.\n",
+    "- Fill missing data with neighboring data points using pandas interpolate techniques."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
@@ -1166,19 +1116,21 @@
        "2022-03-18 14:00:00             45.456384      16.175            17.855009  "
       ]
      },
-     "execution_count": 27,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "input_df = utils.clean_relevant_data(\n",
-    "                    actual_df=p_historical_dataset.copy(),\n",
-    "                    forecast_df= p_forecast_dataset.copy(),\n",
-    "                    out_variables= RELEVANT_FEATURES,\n",
-    "                    freq_hours=frequency_hour,\n",
-    "                    num_of_indices=number_of_hours\n",
-    "                )\n",
+    "input_df = utils.clean_relevant_data_using_hrrr(\n",
+    "    actual_df=p_historical_dataset.copy(),\n",
+    "    forecast_df=p_forecast_dataset.copy(),\n",
+    "    out_variables=RELEVANT_FEATURES,\n",
+    "    freq_hours=frequency_hour,\n",
+    "    num_of_indices=number_of_hours,\n",
+    "    start_date=start_date,\n",
+    "    end_date=end_date,\n",
+    ")\n",
     "\n",
     "base_data_df = input_df[RELEVANT_FEATURES]\n",
     "base_data_df.head(2)"
@@ -1186,7 +1138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1197,7 +1149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [
     {
@@ -1239,7 +1191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [
     {
@@ -1315,45 +1267,45 @@
        "2022-03-16 17:00:00             44.783197      14.325            10.509131  "
       ]
      },
-     "execution_count": 30,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "predict_file_path = f\"./data/{STATION_NAME}/training.csv\"\n",
-    "p_historical_dataset = utils.get_csv_data(path=predict_file_path)\n",
+    "p_historical_dataset = utils.get_csv_data(path=predict_file_path, interpolate=False, fill_na=False)\n",
     "p_historical_dataset = p_historical_dataset[HISTORICAL_MODEL_TRAIN_FEATURES]\n",
     "p_historical_dataset.head(5)\n",
     "\n",
     "input_df = utils.clean_relevant_data(\n",
-    "                    p_historical_dataset.copy(), \n",
-    "                    p_forecast_dataset.copy(), \n",
-    "                    RELEVANT_FEATURES,\n",
-    "                    freq_hours=frequency_hour,\n",
-    "                    num_of_indices=number_of_hours)\n",
+    "    p_historical_dataset.copy(),\n",
+    "    p_forecast_dataset.copy(),\n",
+    "    RELEVANT_FEATURES,\n",
+    "    freq_hours=frequency_hour,\n",
+    "    num_of_indices=number_of_hours,\n",
+    ")\n",
     "base_data_df = input_df[RELEVANT_FEATURES]\n",
     "base_data_df.head(2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
     "predict_start_datetime = datetime(year=2022, month=4, day=30, hour=13, minute=0, second=0)\n",
     "predict_end_datetime = datetime(year=2022, month=5, day=21, hour=13, minute=0, second=0)\n",
     "\n",
-    "df_out = weather_forecast.inference_historical(base_data_df.copy(),\n",
-    "            start_datetime=predict_start_datetime,\n",
-    "            end_datetime=predict_end_datetime\n",
-    "            )"
+    "df_out = weather_forecast.inference_historical(\n",
+    "    base_data_df.copy(), start_datetime=predict_start_datetime, end_datetime=predict_end_datetime\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
@@ -1378,20 +1330,22 @@
     }
    ],
    "source": [
-    "base_data_df = base_data_df[(base_data_df.index >= predict_start_datetime) & (base_data_df.index <= predict_end_datetime)]\n",
+    "base_data_df = base_data_df[\n",
+    "    (base_data_df.index >= predict_start_datetime) & (base_data_df.index <= predict_end_datetime)\n",
+    "]\n",
     "\n",
     "for predict in OUT_FEATURES:\n",
     "    plt.figure(figsize=(18, 6))\n",
-    "    plt.plot(df_out[\"date\"].values, utils.smooth(df_out[predict].values, 2), label=\"Predict\")\n",
+    "    plt.plot(df_out[\"date\"].values, utils.smooth(df_out[predict].values, 2), label=\"Prediction\")\n",
     "    plt.plot(base_data_df.index.values, base_data_df[predict].values, label=\"Ground Truth\")\n",
-    "    # plt.plot(base_data_df.index.values, base_data_df[predict+\"_forecast\"].values, label=\"Forecast\")\n",
+    "\n",
     "    plt.title(f\"24 Models {predict} Ground Truth Vs Predict\")\n",
     "    plt.legend()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [
     {
@@ -1411,31 +1365,28 @@
     }
    ],
    "source": [
-    "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
-    "import math\n",
-    "\n",
-    "def calculate_KPI(y, yhat):\n",
-    "    print(\"RMSE: {}\".format(round(mean_squared_error(y,yhat,squared=False),2)))\n",
-    "    print(\"MAE: {}\".format(round(mean_absolute_error(y,yhat),2)))\n",
-    "    print(\"MAE%: {}%\".format(round(100*sum(abs(y-yhat))/sum(y),2)))\n",
-    "\n",
     "print(\"temperature\")\n",
-    "calculate_KPI(utils.smooth(df_out[\"temperature\"].values, 1),base_data_df[\"temperature\"].values)\n",
+    "utils.calculate_KPI(\n",
+    "    utils.smooth(list(df_out[\"temperature\"].values), 1),\n",
+    "    np.array(base_data_df[\"temperature\"].values),\n",
+    ")\n",
     "\n",
     "print(\"\\n\", \"wind_speed\")\n",
-    "calculate_KPI(utils.smooth(df_out[\"wind_speed\"].values, 1),base_data_df[\"wind_speed\"].values)"
+    "utils.calculate_KPI(\n",
+    "    utils.smooth(list(df_out[\"wind_speed\"].values), 1), np.array(base_data_df[\"wind_speed\"].values)\n",
+    ")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Training model using not relevant dataset or without forecast observations"
+    "### Training model using non-relevant dataset or without forecast observations"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1445,7 +1396,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1455,7 +1406,8 @@
     "    root_path=ROOT_PATH,\n",
     "    data_export_path=DATA_EXPORT_PATH,\n",
     "    station_name=STATION_NAME,\n",
-    "    relevant=False)\n",
+    "    relevant=False,\n",
+    ")\n",
     "\n",
     "train_weather.train_model(historical_df, start=0, epochs=1)"
    ]
@@ -1470,15 +1422,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
     "weather_forecast = prediction.InferenceWeather(\n",
-    "                        root_path=ROOT_PATH,\n",
-    "                        data_export_path=DATA_EXPORT_PATH,\n",
-    "                        station_name=STATION_NAME,\n",
-    "                        predicts=OUT_FEATURES)"
+    "    root_path=ROOT_PATH,\n",
+    "    data_export_path=DATA_EXPORT_PATH,\n",
+    "    station_name=STATION_NAME,\n",
+    "    predicts=OUT_FEATURES,\n",
+    ")"
    ]
   },
   {
@@ -1491,7 +1444,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1501,7 +1454,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1511,14 +1464,12 @@
     "\n",
     "df_output_merge = pd.DataFrame(columns=base_data_df.columns)\n",
     "\n",
-    "df_out = weather_forecast.inference(base_data_df,\n",
-    "            start_datetime=predict_start_datetime\n",
-    "            )"
+    "df_out = weather_forecast.inference(base_data_df, start_datetime=predict_start_datetime)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
@@ -1560,7 +1511,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1571,15 +1522,14 @@
     "predict_start_datetime = datetime(year=2022, month=4, day=30, hour=13, minute=0, second=0)\n",
     "predict_end_datetime = datetime(year=2022, month=5, day=21, hour=13, minute=0, second=0)\n",
     "\n",
-    "df_out = weather_forecast.inference_historical(base_data_df,\n",
-    "            start_datetime=predict_start_datetime,\n",
-    "            end_datetime=predict_end_datetime\n",
-    "            )"
+    "df_out = weather_forecast.inference_historical(\n",
+    "    base_data_df, start_datetime=predict_start_datetime, end_datetime=predict_end_datetime\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [
     {
@@ -1604,12 +1554,14 @@
     }
    ],
    "source": [
-    "base_data_df = base_data_df[(base_data_df.index >= predict_start_datetime) & (base_data_df.index <= predict_end_datetime)]\n",
+    "base_data_df = base_data_df[\n",
+    "    (base_data_df.index >= predict_start_datetime) & (base_data_df.index <= predict_end_datetime)\n",
+    "]\n",
     "for predict in OUT_FEATURES:\n",
     "    plt.figure(figsize=(20, 5))\n",
     "    plt.plot(df_out[\"date\"].values, df_out[predict].values)\n",
     "    plt.plot(base_data_df.index.values, base_data_df[predict].values)\n",
-    "    plt.title(f\"24 Models {predict} Ground Truth Vs Predict\")\n",
+    "    plt.title(f\"24 Models {predict} Ground Truth Vs Prediction\")\n",
     "    plt.legend([\"Predict\", \"Ground Truth\"])"
    ]
   }
@@ -1632,7 +1584,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.16"
+   "version": "3.8.18"
   },
   "name": "Micro climate prediction",
   "running_time": "",
diff --git a/notebooks/deepmc/notebook_lib/forecast.py b/notebooks/deepmc/notebook_lib/forecast.py
index 645b8e8d..993a6617 100644
--- a/notebooks/deepmc/notebook_lib/forecast.py
+++ b/notebooks/deepmc/notebook_lib/forecast.py
@@ -1,4 +1,3 @@
-import time
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Tuple, cast
 
@@ -8,7 +7,7 @@
 from shapely.geometry import Point
 
 from vibe_core.client import FarmvibesAiClient, get_default_vibe_client
-from vibe_core.datamodel import RunConfig, RunConfigUser, RunDetails, SpatioTemporalJson
+from vibe_core.datamodel import RunConfig, RunConfigUser, SpatioTemporalJson
 
 
 class Forecast:
@@ -31,7 +30,8 @@ def submit_download_request(self):
         """
         Submit request to worker to download forecast data
         """
-        run_list = []
+        run_metadata_list = []
+        runs = []
         for parameter in self.parameters:
             run_name = f"forecast_{parameter['weather_type']}"
             run = self.client.run(
@@ -42,57 +42,40 @@ def submit_download_request(self):
                 parameters=parameter,
             )
 
-            try:
-                run.block_until_complete(5)
-            except RuntimeError:
-                print(run)
-
-            run_list.append(
+            run_metadata_list.append(
                 {
                     "id": run.id,
                     "weather_type": parameter["weather_type"],
                 }
             )
+            runs.append(run)
+
+        self.client.monitor(runs, 5)
 
-        return run_list
+        return run_metadata_list
 
     def get_run_status(self, run_list: List[Dict[str, str]]):
         clear_output(wait=True)
-        all_done = True
-        out_ = []
+        out = []
         for run_item in run_list:
             o = self.client.describe_run(run_item["id"])
             print(f"Execution status for {run_item['weather_type']}: {o.details.status}")
 
             if o.details.status == "done":
-                out_.append(o)
-            elif o.details.status == "failed":
-                print(o.details)
+                out.append(o)
             else:
-                all_done = False
-                cnt_complete = 0
-                for key, value in o.task_details.items():
-                    value = cast(RunDetails, value)
-                    assert value.subtasks is not None, "Subtasks don't exist"
-                    for subtask in value.subtasks:
-                        if subtask.status == "done":
-                            cnt_complete += 1
-                    print(
-                        "\t",
-                        f"Subtask {key}",
-                        cnt_complete,
-                        "/",
-                        len(value.subtasks),
-                    )
-                    cnt_complete = 0
-        return all_done, out_
+                raise Exception(
+                    f"Execution status for {run_item['weather_type']}: {o.details.status}"
+                )
+
+        return out
 
     def get_all_assets(self, details: RunConfigUser):
         asset_files = []
         output = details.output["weather_forecast"]
         record: Dict[str, Any]
         for record in cast(List[Dict[str, Any]], output):
-            for _, value in record["assets"].items():
+            for value in record["assets"].values():
                 asset_files.append(value["href"])
         df_assets = [pd.read_csv(f, index_col=False) for f in asset_files]
         df_out = pd.concat(df_assets)
@@ -104,21 +87,15 @@ def get_downloaded_data(self, run_list: List[Dict[str, str]], offset_hours: int
         check the download status. If status is done, fetch the downloaded data
         """
         forecast_dataset = pd.DataFrame()
-        status = False
-        out_ = []
-        while status is False:
-            status, out_ = self.get_run_status(run_list)
-            time.sleep(10)
-
-        if status:
-            for detail in out_:
-                df = self.get_all_assets(detail)
+        out = self.get_run_status(run_list)
+        for detail in out:
+            df = self.get_all_assets(detail)
 
-                # Offset from UTC to specified timezone
-                df.index = df.index + pd.offsets.Hour(offset_hours)
+            # Offset from UTC to specified timezone
+            df.index = df.index + pd.offsets.Hour(offset_hours)
 
-                if not df.empty:
-                    forecast_dataset = pd.concat([forecast_dataset, df], axis=1)
+            if not df.empty:
+                forecast_dataset = pd.concat([forecast_dataset, df], axis=1)
 
         return forecast_dataset
 
diff --git a/notebooks/deepmc/notebook_lib/modules.py b/notebooks/deepmc/notebook_lib/modules.py
index 5fbfe012..9be52ab4 100644
--- a/notebooks/deepmc/notebook_lib/modules.py
+++ b/notebooks/deepmc/notebook_lib/modules.py
@@ -59,14 +59,14 @@ def training_step(self, train_batch: Tensor, _):
         x, y = train_batch[:6], train_batch[6]
         y_hat = self.deepmc(x)
         loss = self.loss(y_hat, y)
-        self.log("train_loss/total", loss)
+        self.log("train_loss/total", loss, on_epoch=True, prog_bar=True, logger=True, on_step=True)
         return loss
 
     def validation_step(self, validation_batch: Tensor, _):
         x, y = validation_batch[:6], validation_batch[6]
         y_hat = self.deepmc(x)
         loss = self.loss(y_hat, y)
-        self.log("val_loss/total", loss, on_epoch=True)
+        self.log("val_loss/total", loss, on_epoch=True, prog_bar=True, logger=True, on_step=True)
         return loss
 
 
diff --git a/notebooks/deepmc/notebook_lib/post_models.py b/notebooks/deepmc/notebook_lib/post_models.py
deleted file mode 100644
index 224be6fd..00000000
--- a/notebooks/deepmc/notebook_lib/post_models.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from keras.layers import BatchNormalization, Dense, Input
-from keras.models import Sequential
-from keras.utils.vis_utils import plot_model
-
-
-def simple_mixture_model(inshape: int):
-    model = Sequential()
-    model.add(Input(shape=(inshape,)))
-
-    model.add(Dense(inshape * 2, activation="relu"))
-    model.add(BatchNormalization())
-    model.add(Dense(inshape * 4, activation="relu"))
-    model.add(BatchNormalization())
-    model.add(Dense(inshape))
-
-    model.compile(loss="mae", optimizer="adam")
-    return model
-
-
-def fit_model(model, train_X, train_y, test_X, test_y, batch_size: int):
-    batch_size = batch_size
-    validation_data = (test_X, test_y)
-
-    # fit network
-    history = model.fit(
-        train_X,
-        train_y,
-        epochs=20,
-        batch_size=batch_size,
-        validation_data=validation_data,
-        verbose=1,
-    )
-
-    return model, history
diff --git a/notebooks/deepmc/notebook_lib/train.py b/notebooks/deepmc/notebook_lib/train.py
index 6a6f2242..4b9fa8b7 100644
--- a/notebooks/deepmc/notebook_lib/train.py
+++ b/notebooks/deepmc/notebook_lib/train.py
@@ -14,8 +14,8 @@
 from torch import Tensor
 from torch.utils.data import DataLoader, TensorDataset
 
-from . import utils
-from .preprocess import Preprocess
+from vibe_notebook.deepmc import utils
+from vibe_notebook.deepmc.preprocess import Preprocess
 
 MODEL_SUFFIX = "deepmc."
 
@@ -35,7 +35,7 @@ def __init__(
         wavelet: str = "bior3.5",
         mode: str = "periodic",
         level: int = 5,
-        batch_size: int = 256,
+        batch_size: int = 24,
         relevant: bool = False,
     ):
         if relevant:
@@ -67,6 +67,7 @@ def train_model(
         start: int = 0,
         end: int = -1,
         epochs: int = 20,
+        reset_preprocess: bool = False,
     ):
         end = self.total_models if end == -1 else end
 
@@ -80,12 +81,22 @@ def train_model(
             input_order_df[out_feature] = out_feature_df
 
             # data preprocessing
-            (train_scaler, output_scaler, train_df, test_df,) = utils.get_split_scaled_data(
+            (
+                train_scaler,
+                output_scaler,
+                train_df,
+                test_df,
+            ) = utils.get_split_scaled_data(
                 data=input_order_df, out_feature=out_feature, split_ratio=0.92
             )
+            if reset_preprocess and os.path.exists(
+                self.data_export_path % (out_feature, self.relevant_text)
+            ):
+                os.remove(self.data_export_path % (out_feature, self.relevant_text))
 
             if os.path.exists(self.data_export_path % (out_feature, self.relevant_text)):
-                with open(self.data_export_path % (out_feature, self.relevant_text), "rb") as f:
+                exp_path = self.data_export_path.replace("train_data.pkl", "train_data_dates.pkl")
+                with open(exp_path % (out_feature, self.relevant_text), "rb") as f:
                     (
                         train_X,
                         train_y,
@@ -93,6 +104,10 @@ def train_model(
                         test_y,
                         train_scaler,
                         output_scaler,
+                        train_dates_X,
+                        train_dates_y,
+                        test_dates_X,
+                        test_dates_y,
                     ) = pickle.load(f)
 
                 self.preprocess = Preprocess(
@@ -128,6 +143,10 @@ def train_model(
                     train_y,
                     test_X,
                     test_y,
+                    train_dates_X,
+                    train_dates_y,
+                    test_dates_X,
+                    test_dates_y,
                 ) = self.preprocess.wavelet_transform_train(train_df, test_df, out_feature)
 
                 with open(self.data_export_path % (out_feature, self.relevant_text), "wb") as f:
@@ -136,6 +155,25 @@ def train_model(
                         f,
                     )
 
+                exp_path = self.data_export_path.replace("train_data.pkl", "train_data_dates.pkl")
+
+                with open(exp_path % (out_feature, self.relevant_text), "wb") as f:
+                    pickle.dump(
+                        [
+                            train_X,
+                            train_y,
+                            test_X,
+                            test_y,
+                            train_scaler,
+                            output_scaler,
+                            train_dates_X,
+                            train_dates_y,
+                            test_dates_X,
+                            test_dates_y,
+                        ],
+                        f,
+                    )
+
             self.train_models(
                 train_X=train_X,  # type: ignore
                 train_y=train_y,  # type: ignore
@@ -145,6 +183,8 @@ def train_model(
                 out_feature=out_feature,
                 start=start,
                 end=end,
+                train_dates_y=train_dates_y,  # type: ignore
+                test_dates_y=test_dates_y,  # type: ignore
             )
 
     def train_models(
@@ -157,6 +197,8 @@ def train_models(
         out_feature: str,
         start: int,
         end: int,
+        train_dates_y: List[str],
+        test_dates_y: List[str],
     ):
         first_channels = train_X[0].shape[2]
         rest_channels = train_X[1].shape[2]
@@ -209,7 +251,6 @@ def train_models(
                         dirpath=model_path,
                     ),
                 ],
-                num_processes=1,
             )
 
             t_obj.fit(m, train_loader, val_loader)
@@ -225,6 +266,8 @@ def train_models(
                 out_feature=out_feature,
                 model_index=i,
                 epochs=epochs,
+                train_dates_y=train_dates_y,
+                test_dates_y=test_dates_y,
             )
 
     def export_to_onnx(
@@ -249,19 +292,24 @@ def export_to_onnx(
         )
 
     def get_dataloader(
-        self, gt: NDArray[Any], target: NDArray[Any], o_feature: str
+        self,
+        gt: NDArray[Any],
+        target: NDArray[Any],
+        o_feature: str,
+        dates_mapped: NDArray[Any],
     ) -> Tuple[DataLoader[Any], List[Tensor]]:
-        o_x = self.preprocess.dl_preprocess_data(pd.DataFrame(gt), o_feature)[0][:, :, 0].astype(
-            np.float32
-        )
+        dates_mapped = pd.to_datetime(dates_mapped, format="%Y-%m-%d %H:%M:%S").values
+        df = pd.DataFrame(list(zip(gt, dates_mapped)), columns=["data", "date"])
+        df.set_index("date", inplace=True)
+        o_x = self.preprocess.dl_preprocess_data(df, o_feature)[0][:, :, 0].astype(np.float32)
 
-        o_y = self.preprocess.dl_preprocess_data(pd.DataFrame(target), o_feature)[0][
-            :, :, 0
-        ].astype(np.float32)
+        df = pd.DataFrame(list(zip(target, dates_mapped)), columns=["data", "date"])
+        df.set_index("date", inplace=True)
+        o_y = self.preprocess.dl_preprocess_data(df, o_feature)[0][:, :, 0].astype(np.float32)
 
         o_inputs = [torch.from_numpy(x.astype(np.float32)) for x in (o_x, o_y)]
         o_dataset = TensorDataset(*o_inputs)
-        o_loader = DataLoader(o_dataset, batch_size=self.batch_size, shuffle=True)
+        o_loader = DataLoader(o_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True)
         return o_loader, o_inputs
 
     def post_model(
@@ -274,6 +322,8 @@ def post_model(
         out_feature: str,
         model_index: int,
         epochs: int,
+        train_dates_y: List[str],
+        test_dates_y: List[str],
     ):
         m.eval()
 
@@ -288,11 +338,17 @@ def xf(a: List[NDArray[Any]]) -> List[Tensor]:
             os.mkdir(post_model_path)
 
         train_dataloader, _ = self.get_dataloader(
-            gt=train_y[:, model_index, 0], target=train_yhat, o_feature=out_feature  # type: ignore
+            gt=train_y[:, model_index, 0],  # type: ignore
+            target=train_yhat,
+            o_feature=out_feature,
+            dates_mapped=train_dates_y[:, model_index],  # type: ignore
         )
 
-        val_dataloader, val_inputs = self.get_dataloader(
-            gt=test_y[:, model_index, 0], target=test_yhat, o_feature=out_feature  # type: ignore
+        val_dataloader, _ = self.get_dataloader(
+            gt=test_y[:, model_index, 0],  # type: ignore
+            target=test_yhat,
+            o_feature=out_feature,
+            dates_mapped=test_dates_y[:, model_index],  # type: ignore
         )
 
         p_m = DeepMCPostTrain(first_in_features=self.total_models)
@@ -308,9 +364,113 @@ def xf(a: List[NDArray[Any]]) -> List[Tensor]:
                     dirpath=post_model_path,
                 ),
             ],
-            num_processes=1,
         )
 
         t_obj.fit(p_m, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
 
         self.export_to_onnx(file_path=post_model_path, model=p_m.deepmc, inputs=torch.rand((1, 24)))
+
+    def preprocess_data(
+        self,
+        input_df: pd.DataFrame,
+        out_path: str,
+        start: int = 0,
+        end: int = -1,
+        epochs: int = 20,
+        reset_preprocess: bool = False,
+    ):
+        """
+        Preprocess and cache the training data for every output feature, without training.
+
+        For each feature the input is split and scaled and self.preprocess is rebuilt. When no
+        cached file exists at self.data_export_path, the wavelet-transformed arrays are written
+        there and to the "train_data_dates.pkl" companion, in the layout train_model loads.
+
+        out_path, start, end and epochs are currently unused; they are kept so existing calls
+        keep working. reset_preprocess=True deletes an existing cached data file first.
+        """
+        for out_feature in self.out_features:
+            if not os.path.exists(self.path_to_station % out_feature):
+                os.makedirs(self.path_to_station % out_feature, exist_ok=True)
+
+            input_order_df = input_df[self.train_features].copy()
+            out_feature_df = input_order_df[out_feature]
+            input_order_df.drop(columns=[out_feature], inplace=True)
+            input_order_df[out_feature] = out_feature_df
+
+            # data preprocessing
+            (
+                train_scaler,
+                output_scaler,
+                train_df,
+                test_df,
+            ) = utils.get_split_scaled_data(
+                data=input_order_df, out_feature=out_feature, split_ratio=0.92
+            )
+
+            data_path = self.data_export_path % (out_feature, self.relevant_text)
+            dates_path = self.data_export_path.replace(
+                "train_data.pkl", "train_data_dates.pkl"
+            ) % (out_feature, self.relevant_text)
+
+            if reset_preprocess and os.path.exists(data_path):
+                os.remove(data_path)
+
+            cached = os.path.exists(data_path)
+            if cached:
+                # Reuse the scalers stored with the cached arrays instead of the fresh ones.
+                with open(data_path, "rb") as f:
+                    _, _, _, _, train_scaler, output_scaler = pickle.load(f)
+
+            self.preprocess = Preprocess(
+                train_scaler=train_scaler,
+                output_scaler=output_scaler,
+                is_training=True,
+                is_validation=self.is_validation,
+                ts_lookahead=self.ts_lookahead,
+                ts_lookback=self.ts_lookback,
+                chunk_size=self.chunk_size,
+                wavelet=self.wavelet,
+                mode=self.mode,
+                level=self.level,
+                relevant=self.relevant,
+            )
+
+            if cached:
+                # Nothing to regenerate: the cached pickle is left untouched.
+                continue
+
+            # wavelet_transform_train yields eight values, as unpacked in train_model.
+            (
+                train_X,
+                train_y,
+                test_X,
+                test_y,
+                train_dates_X,
+                train_dates_y,
+                test_dates_X,
+                test_dates_y,
+            ) = self.preprocess.wavelet_transform_train(train_df, test_df, out_feature)
+
+            with open(data_path, "wb") as f:
+                pickle.dump(
+                    [train_X, train_y, test_X, test_y, train_scaler, output_scaler],
+                    f,
+                )
+
+            with open(dates_path, "wb") as f:
+                pickle.dump(
+                    [
+                        train_X,
+                        train_y,
+                        test_X,
+                        test_y,
+                        train_scaler,
+                        output_scaler,
+                        train_dates_X,
+                        train_dates_y,
+                        test_dates_X,
+                        test_dates_y,
+                    ],
+                    f,
+                )
diff --git a/notebooks/deepmc/notebook_lib/transformer_models_ts.py b/notebooks/deepmc/notebook_lib/transformer_models_ts.py
deleted file mode 100644
index ba55aaca..00000000
--- a/notebooks/deepmc/notebook_lib/transformer_models_ts.py
+++ /dev/null
@@ -1,367 +0,0 @@
-import numpy as np
-import tensorflow as tf
-
-
-def get_angles(pos, i, d_model):
-    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
-    return pos * angle_rates
-
-
-def positional_encoding(position, d_model):
-    angle_rads = get_angles(
-        np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model
-    )
-
-    # apply sin to even indices in the array; 2i
-    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
-
-    # apply cos to odd indices in the array; 2i+1
-    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
-
-    pos_encoding = angle_rads[np.newaxis, ...]
-
-    return tf.cast(pos_encoding, dtype=tf.float32)
-
-
-# create mask for padding, 0 --> 1 (mask)
-def create_padding_mask(seq):
-    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
-
-    # add extra dimensions to add the padding
-    # to the attention logits.
-    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
-
-
-def create_look_ahead_mask(size):
-    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
-    return mask  # (seq_len, seq_len)
-
-
-def scaled_dot_product_attention(q, k, v, mask):
-    """Calculate the attention weights.
-    q, k, v must have matching leading dimensions.
-    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
-    The mask has different shapes depending on its type(padding or look ahead)
-    but it must be broadcastable for addition.
-
-    Args:
-    q: query shape == (..., seq_len_q, depth)
-    k: key shape == (..., seq_len_k, depth)
-    v: value shape == (..., seq_len_v, depth_v)
-    mask: Float tensor with shape broadcastable
-          to (..., seq_len_q, seq_len_k). Defaults to None.
-
-    Returns:
-    output, attention_weights
-    """
-
-    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
-
-    # scale matmul_qk
-    dk = tf.cast(tf.shape(k)[-1], tf.float32)
-    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
-
-    # add the mask to the scaled tensor.
-    if mask is not None:
-        scaled_attention_logits += mask * -1e9
-
-    # softmax is normalized on the last axis (seq_len_k) so that the scores
-    # add up to 1.
-    attention_weights = tf.nn.softmax(
-        scaled_attention_logits, axis=-1
-    )  # (..., seq_len_q, seq_len_k)
-
-    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
-
-    return output, attention_weights
-
-
-def print_out(q, k, v):
-    temp_out, temp_attn = scaled_dot_product_attention(q, k, v, None)
-    print("Attention weights are:")
-    print(temp_attn)
-    print("Output is:")
-    print(temp_out)
-
-
-"""
-    - Q (query), K (key) and V (value) are split into multiple heads (num_heads)
-    - each tuple (q, k, v) are fed to scaled_dot_product_attention
-    - all attention outputs are concatenated
-"""
-
-
-class MultiHeadAttention(tf.keras.layers.Layer):
-    def __init__(self, d_model, num_heads):
-        super(MultiHeadAttention, self).__init__()
-        self.num_heads = num_heads
-        self.d_model = d_model
-
-        assert d_model % self.num_heads == 0
-
-        self.depth = d_model // self.num_heads
-
-        self.wq = tf.keras.layers.Dense(d_model)
-        self.wk = tf.keras.layers.Dense(d_model)
-        self.wv = tf.keras.layers.Dense(d_model)
-
-        self.dense = tf.keras.layers.Dense(d_model)
-
-    def split_heads(self, x, batch_size):
-        """Split the last dimension into (num_heads, depth).
-        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
-        """
-        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, v, k, q, mask):
-        batch_size = tf.shape(q)[0]
-
-        q = self.wq(q)  # (batch_size, seq_len, d_model)
-        k = self.wk(k)  # (batch_size, seq_len, d_model)
-        v = self.wv(v)  # (batch_size, seq_len, d_model)
-
-        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
-        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
-        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
-
-        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
-
-        scaled_attention = tf.transpose(
-            scaled_attention, perm=[0, 2, 1, 3]
-        )  # (batch_size, seq_len_q, num_heads, depth)
-
-        concat_attention = tf.reshape(
-            scaled_attention, (batch_size, -1, self.d_model)
-        )  # (batch_size, seq_len_q, d_model)
-
-        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
-
-        return output, attention_weights
-
-
-def point_wise_feed_forward_network(d_model, dff):
-    return tf.keras.Sequential(
-        [
-            tf.keras.layers.Dense(dff, activation="relu"),  # (batch_size, seq_len, dff)
-            tf.keras.layers.Dense(d_model),  # (batch_size, seq_len, d_model)
-        ]
-    )
-
-
-class EncoderLayer(tf.keras.layers.Layer):
-    def __init__(self, d_model, num_heads, dff, rate=0.1):
-        super(EncoderLayer, self).__init__()
-
-        self.mha = MultiHeadAttention(d_model, num_heads)
-        self.ffn = point_wise_feed_forward_network(d_model, dff)
-
-        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-
-        self.dropout1 = tf.keras.layers.Dropout(rate)
-        self.dropout2 = tf.keras.layers.Dropout(rate)
-
-    def call(self, x, training, mask):
-
-        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
-        attn_output = self.dropout1(attn_output, training=training)
-        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
-
-        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
-        ffn_output = self.dropout2(ffn_output, training=training)
-        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
-
-        return out2
-
-
-class DecoderLayer(tf.keras.layers.Layer):
-    def __init__(self, d_model, num_heads, dff, rate=0.1):
-        super(DecoderLayer, self).__init__()
-
-        self.mha1 = MultiHeadAttention(d_model, num_heads)
-        self.mha2 = MultiHeadAttention(d_model, num_heads)
-
-        self.ffn = point_wise_feed_forward_network(d_model, dff)
-
-        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-
-        self.dropout1 = tf.keras.layers.Dropout(rate)
-        self.dropout2 = tf.keras.layers.Dropout(rate)
-        self.dropout3 = tf.keras.layers.Dropout(rate)
-
-    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
-        # enc_output.shape == (batch_size, input_seq_len, d_model)
-
-        attn1, attn_weights_block1 = self.mha1(
-            x, x, x, look_ahead_mask
-        )  # (batch_size, target_seq_len, d_model)
-        attn1 = self.dropout1(attn1, training=training)
-        out1 = self.layernorm1(attn1 + x)
-
-        attn2, attn_weights_block2 = self.mha2(
-            enc_output, enc_output, out1, padding_mask
-        )  # (batch_size, target_seq_len, d_model)
-        attn2 = self.dropout2(attn2, training=training)
-        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
-
-        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
-        ffn_output = self.dropout3(ffn_output, training=training)
-        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
-
-        return out3, attn_weights_block1, attn_weights_block2
-
-
-class Encoder(tf.keras.layers.Layer):
-    def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, rate=0.1):
-        super(Encoder, self).__init__()
-
-        self.d_model = d_model
-        self.num_layers = num_layers
-
-        self.embedding = tf.keras.layers.Dense(d_model, activation="relu")
-        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
-
-        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
-
-        self.dropout = tf.keras.layers.Dropout(rate)
-
-    def call(self, x, training, mask):
-
-        seq_len = tf.shape(x)[1]
-
-        # print("Encoder:", x.shape)
-        # adding embedding and position encoding.
-        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
-        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self.pos_encoding[:, :seq_len, :]
-
-        x = self.dropout(x, training=training)
-
-        for i in range(self.num_layers):
-            x = self.enc_layers[i](x, training, mask)
-
-        return x  # (batch_size, input_seq_len, d_model)
-
-
-class Decoder(tf.keras.layers.Layer):
-    def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, rate=0.1):
-        super(Decoder, self).__init__()
-
-        self.d_model = d_model
-        self.num_layers = num_layers
-
-        self.embedding = tf.keras.layers.Dense(d_model, activation="relu")
-        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
-
-        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
-        self.dropout = tf.keras.layers.Dropout(rate)
-
-    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
-
-        seq_len = tf.shape(x)[1]
-        attention_weights = {}
-
-        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
-        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self.pos_encoding[:, :seq_len, :]
-
-        x = self.dropout(x, training=training)
-
-        for i in range(self.num_layers):
-            x, block1, block2 = self.dec_layers[i](
-                x, enc_output, training, look_ahead_mask, padding_mask
-            )
-            attention_weights["decoder_layer{}_block1".format(i + 1)] = block1
-            attention_weights["decoder_layer{}_block2".format(i + 1)] = block2
-
-        return x, attention_weights
-
-
-class Transformer(tf.keras.Model):
-    def __init__(
-        self, num_layers, d_model, num_heads, dff, target_vocab_size, pe_input, pe_target, rate=0.1
-    ):
-        super(Transformer, self).__init__()
-
-        self.encoder = Encoder(num_layers, d_model, num_heads, dff, pe_input, rate)
-
-        self.decoder = Decoder(num_layers, d_model, num_heads, dff, pe_target, rate)
-
-        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
-
-    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
-
-        enc_output = self.encoder(
-            inp, training, enc_padding_mask
-        )  # (batch_size, inp_seq_len, d_model)
-
-        # dec_output.shape == (batch_size, tar_seq_len, d_model)
-        dec_output, attention_weights = self.decoder(
-            tar, enc_output, training, look_ahead_mask, dec_padding_mask
-        )
-
-        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
-
-        return final_output, attention_weights
-
-
-class GLU(tf.keras.layers.Layer):
-    def __init__(self, input_channel, output_channel):
-        super(GLU, self).__init__()
-        self.linear_left = tf.keras.layers.Dense(output_channel)
-        self.linear_right = tf.keras.layers.Dense(output_channel)
-
-    def call(self, x):
-        return tf.math.multiply(
-            self.linear_left(x), tf.keras.activations.sigmoid(self.linear_right(x))
-        )
-
-
-class FFT(tf.keras.layers.Layer):
-    def __init__(self, time_step, order, output_channel):
-        super(FFT, self).__init__()
-        self.time_step = time_step
-        self.order = order
-        self.output_channel = output_channel
-        self.GLUs = []  # nn.ModuleList()
-        for i in range(3):
-            if i == 0:
-                self.GLUs.append(
-                    GLU(self.time_step * self.order, self.time_step * self.output_channel)
-                )
-                self.GLUs.append(
-                    GLU(self.time_step * self.order, self.time_step * self.output_channel)
-                )
-            elif i == 1:
-                self.GLUs.append(
-                    GLU(self.time_step * self.output_channel, self.time_step * self.output_channel)
-                )
-                self.GLUs.append(
-                    GLU(self.time_step * self.output_channel, self.time_step * self.output_channel)
-                )
-            else:
-                self.GLUs.append(
-                    GLU(self.time_step * self.output_channel, self.time_step * self.output_channel)
-                )
-                self.GLUs.append(
-                    GLU(self.time_step * self.output_channel, self.time_step * self.output_channel)
-                )
-
-    def call(self, x):
-        # x should be (b, seq_len, units)
-        x = tf.keras.layers.Permute((2, 1))(x)
-        ffted = tf.signal.fft(tf.cast(x, dtype=tf.complex64))  # (b, units, seq_len)
-        real = tf.math.real(ffted)  # [b, units, seq_len]
-        img = tf.math.imag(ffted)
-        for i in range(3):
-            real = self.GLUs[i * 2](real)
-            img = self.GLUs[2 * i + 1](img)
-
-        time_step_as_inner = tf.dtypes.complex(real, img)
-        iffted = tf.signal.ifft(time_step_as_inner)  # [b, k, node_cnt, 48]
-        iffted = tf.cast(iffted, dtype=tf.float32)
-        iffted = tf.keras.layers.Permute((2, 1))(iffted)
-        return iffted
diff --git a/notebooks/deepmc/notebook_lib/utils.py b/notebooks/deepmc/notebook_lib/utils.py
deleted file mode 100644
index eacec1aa..00000000
--- a/notebooks/deepmc/notebook_lib/utils.py
+++ /dev/null
@@ -1,104 +0,0 @@
-from datetime import datetime, timedelta
-from typing import Any, Dict, List
-
-import numpy as np
-import pandas as pd
-from numpy._typing import NDArray
-from pandas.tseries.offsets import DateOffset
-from sklearn.preprocessing import StandardScaler
-
-
-def get_csv_data(
-    path: str,
-    date_attribute: str = "date",
-    columns_rename: Dict[str, str] = {},
-    frequency: str = "60min",
-):
-    """
-    Read data from CSV file using Pandas python package.
-    """
-
-    data_df = pd.read_csv(path)
-    data_df[date_attribute] = pd.to_datetime(data_df[date_attribute])
-
-    if columns_rename:
-        data_df.rename(columns=columns_rename, inplace=True)
-
-    # apply index on date
-    data_df.reset_index(drop=True, inplace=True)
-    data_df.set_index(date_attribute, inplace=True)
-    data_df.sort_index(ascending=True, inplace=True)
-
-    # interpolate to derive missing data
-    data_df = data_df.interpolate(method="from_derivatives")
-    assert data_df is not None, "Interpolate deleted all data"
-    data_df = data_df.dropna()
-
-    # Group rows by frequency, requires date attribute indexed to execute this
-    data_df = data_df.fillna(method="ffill")
-    data_df = data_df.fillna(method="bfill")
-    data_df = data_df.groupby(pd.Grouper(freq=frequency)).mean()
-    data_df = data_df.fillna(method="ffill")
-    data_df = data_df.fillna(method="bfill")
-
-    return data_df
-
-
-def hour_round(t: datetime):
-    # Rounds to nearest hour by adding a timedelta hour if minute >= 30
-    return t.replace(second=0, microsecond=0, minute=0, hour=t.hour) + timedelta(
-        hours=t.minute // 30
-    )
-
-
-def get_split_scaled_data(data: pd.DataFrame, out_feature: str, split_ratio: float = 0.92):
-    split = int(split_ratio * data.shape[0])
-
-    train_data = data.iloc[:split]
-    test_data = data.iloc[split:]
-
-    output_scaler = StandardScaler()
-    output_scaler.fit_transform(np.expand_dims(data[out_feature].values, axis=1))  # type: ignore
-
-    train_scaler = StandardScaler()
-    train_scale_df = pd.DataFrame(
-        train_scaler.fit_transform(train_data), columns=train_data.columns, index=train_data.index
-    )
-    test_scale_df = pd.DataFrame(
-        train_scaler.transform(test_data), columns=test_data.columns, index=test_data.index
-    )
-
-    return train_scaler, output_scaler, train_scale_df, test_scale_df
-
-
-def shift_index(ds_df: pd.DataFrame, freq_minutes: int, num_indices: int, dateColumn: str = "date"):
-    ds_df[dateColumn] = ds_df.index.shift(-num_indices, freq=DateOffset(minutes=freq_minutes))
-    ds_df = ds_df.reset_index(drop=True)
-    ds_df = ds_df.set_index(dateColumn)
-    return ds_df
-
-
-def clean_relevant_data(
-    actual_df: pd.DataFrame,
-    forecast_df: pd.DataFrame,
-    out_variables: List[str],
-    freq_hours: int,
-    num_of_indices: int,
-):
-    base_data_df = actual_df.copy()
-    current_ws_df = forecast_df.add_suffix("Current")
-    base_data_df = base_data_df.join(current_ws_df)
-    shift_forecast_df = shift_index(forecast_df, freq_hours * 60, num_of_indices)
-    base_data_df = base_data_df.join(shift_forecast_df)
-
-    base_data_df = base_data_df[out_variables]
-    base_data_df = base_data_df.interpolate(method="from_derivatives")
-    assert base_data_df is not None, "Interpolate deleted all data"
-    base_data_df = base_data_df.dropna()
-    return base_data_df
-
-
-def smooth(y: NDArray[Any], box_pts: int):
-    box = np.ones(box_pts) / box_pts
-    y_smooth = np.convolve(y, box, mode="same")
-    return y_smooth
diff --git a/notebooks/deepmc_neighbors/deepmc_neighbors_env.yaml b/notebooks/deepmc_neighbors/deepmc_neighbors_env.yaml
new file mode 100644
index 00000000..834e34c9
--- /dev/null
+++ b/notebooks/deepmc_neighbors/deepmc_neighbors_env.yaml
@@ -0,0 +1,19 @@
+name: deepmc-pytorch-neighbors
+channels:
+  - pyg
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.9.*
+  - pip~=21.2.4
+  - pip:
+    - geopandas~=0.9.0
+    - einops~=0.6.0
+    - geopy~=2.4.1
+    - ipykernel~=6.17.1
+    - unfoldNd~=0.2.0
+    - pyWavelets~=1.3.0
+    - pydantic~=1.10.12
+    - matplotlib~=3.9.0
+    - ../../src/vibe_core
+    - ../../src/vibe_notebook
\ No newline at end of file
diff --git a/notebooks/deepmc_neighbors/gnn_forecast.ipynb b/notebooks/deepmc_neighbors/gnn_forecast.ipynb
new file mode 100644
index 00000000..75834edc
--- /dev/null
+++ b/notebooks/deepmc_neighbors/gnn_forecast.ipynb
@@ -0,0 +1,643 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Micro Climate Predictions with Nearby Weather Stations\n",
+    "\n",
+    "This notebook helps infer weather forecasts for stations that have no data or limited data by utilizing data from neighboring stations. It demonstrates how to configure the inputs and train a model using data from neighboring weather stations.\n",
+    "\n",
+    "This is an extension of the deepmc notebook [notebooks/deepmc/mc_forecast.ipynb](https://github.com/microsoft/farmvibes-ai/blob/main/notebooks/deepmc/mc_forecast.ipynb).\n",
+    "\n",
+    "Before running this notebook, let's build a micromamba environment. If you do not have micromamba installed, please follow the instructions from the [micromamba installation guide](https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html).\n",
+    "\n",
+    "```bash\n",
+    "$ micromamba env create -f ./deepmc_neighbors_env.yaml\n",
+    "$ micromamba activate deepmc-pytorch-neighbors\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Install Packages**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install torch==1.12.1 --index-url https://download.pytorch.org/whl/cpu\n",
+    "! pip install torch-scatter==2.1.0 torch-sparse==0.6.15 torch-geometric==2.3.0 -f https://data.pyg.org/whl/torch-1.12.1%2Bcpu.html\n",
+    "! pip install torch-geometric-temporal~=0.54.0 onnxruntime~=1.15.0 pytorch-lightning~=1.8.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Notebook overview\n",
+    "In this notebook, we describe the steps to generate forecasts for weather variables at a specific station with no or limited data. We employ [Graph Neural Networks (GNNs)](https://pytorch-geometric.readthedocs.io/) for cross-learning from nearby weather stations by capturing spatial relationships. \n",
+    "\n",
+    "To illustrate this approach, we focus on three locations in Washington state, U.S.A., utilizing data accessible through [AGWeatherNet](https://weather.wsu.edu/). An example is shown in the figure below. For instance, assuming that the Warden SW station has missing data, we look to neighboring stations (such as Royal Slope and Ringold) that provide relevant data. We consider the weather variables temperature, humidity, and wind_speed.\n",
+    "\n",
+    "\n",
+    "\n",
+    "Selecting appropriate neighboring stations is crucial for accurate predictions. When choosing neighboring weather stations, consider the following factors:\n",
+    "\n",
+    "- Elevation Similarity: In the current model, the neighboring stations should be at a similar elevation to the target station. This ensures that altitude-related effects are consistent. Alternatively, one can build an edge weight model that includes the altitude differential to account for the topography (this notebook does not cover that). \n",
+    "\n",
+    "- Spatial Proximity: The distance between neighboring stations should be small. Proximity often implies similar local weather patterns. For instance, in this example, we chose stations less than 25 km apart. In our experiments, we noticed significant errors with distances greater than 25 km.\n",
+    "\n",
+    "**Graph Representation of Weather Stations for GNNs**\n",
+    "\n",
+    "Each weather station corresponds to a node in our graph. To capture the relationships between stations, we connect stations based on the distance between them. This graph does not change with time during inference. If a new station is available which can be helpful to increase accuracy, then the model can be dynamically updated by recomputing & retraining the GNN.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following steps are required for model training and inference.\n",
+    "\n",
+    "**Step 1: Download AgWeatherNet data**\n",
+    "- Download historical weather data for the stations Royal Slope and Ringold from [AGWeatherNet]( https://weather.wsu.edu/) for the time range of interest (minimum 2 years of data).\n",
+    "- Clean downloaded historical data for considered variables temperature, humidity and wind_speed. \n",
+    "\n",
+    "Note: these two steps are not included in the notebook. See [sample data](sample_data.csv) for an example. \n",
+    "\n",
+    "**Step 2: Download forecast data**\n",
+    "- Download HRRR data for the stations Warden SW, Royal Slope and Ringold using the herbie_forecast workflow in FarmVibes.AI for the time range of interest (minimum 2 years of data).\n",
+    "- Clean downloaded HRRR data for considered variables temperature, humidity and wind_speed.\n",
+    "\n",
+    "**Step 3: Train DeepMC models**\n",
+    "- For stations Royal Slope and Ringold, train the DeepMC model using the notebook [notebooks/deepmc/mc_forecast.ipynb]( https://github.com/microsoft/farmvibes-ai/blob/main/notebooks/deepmc/mc_forecast.ipynb). You will need to train separately for each station.\n",
+    "- The DeepMC inference results are weather forecasts for the next 24 hours for the stations Royal Slope and Ringold.\n",
+    "\n",
+    "**Step 4: Preparation for GNN model training**\n",
+    "- Create embeddings: Concatenate the cleaned HRRR weather forecast data of station Warden SW and the DeepMC inference results of stations Royal Slope and Ringold.\n",
+    "- Create train and test splits from the embeddings.\n",
+    "- Train GNN model.\n",
+    "\n",
+    "**Step 5: Inference**\n",
+    "\n",
+    "Run the inference to infer weather forecasts for the Warden SW station.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Notebook Setup\n",
+    "\n",
+    "Let's start by importing the required packages and defining some constants.\n",
+    "\n",
+    "### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "from datetime import datetime\n",
+    "\n",
+    "from notebook_lib.post_deepmc_inference import download_forecast_data\n",
+    "from notebook_lib.train import MC_Neighbors\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Constants\n",
+    "- ROOT_DIR - Root directory of DeepMC output.\n",
+    "- WEATHER_TYPE - temperature, humidity, or wind_speed.\n",
+    "- INFERENCE_STATION - Station having missing weather data.\n",
+    "- MODEL_TYPE - relevant or not-relevant"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ROOT_DIR = \"\"\n",
+    "WEATHER_TYPE = \"temperature\"\n",
+    "INFERENCE_STATION = \"Warden_SW\"\n",
+    "MODEL_TYPE = \"relevant\"\n",
+    "ROOT_PATH = os.path.join(ROOT_DIR, WEATHER_TYPE)\n",
+    "\n",
+    "# Forecast data\n",
+    "infer_forecast_data_path = f\"{ROOT_PATH}/{INFERENCE_STATION}/{MODEL_TYPE}/forecast.csv\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1: Download stations data.  \n",
+    "Here, we are taking the stations from [AGWeatherNet](https://weather.wsu.edu/). \n",
+    "\n",
+    "We are assuming that the station Warden_SW does not have the weather station data. We consider the stations Royal Slope and Ringold as neighboring weather stations having similar weather patterns, hence historical data download is required for these two stations. See [sample data](sample_data.csv) for an example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Neighboring stations\n",
+    "# Coordinates are in (longitude, latitude)\n",
+    "neighbor_stations = [\n",
+    "    {\n",
+    "        \"name\": \"Warden_SW\",\n",
+    "        \"column_name\": \"temperature_forecast\",\n",
+    "        \"coordinates\": (-119.12, 46.93),\n",
+    "    },\n",
+    "    {\n",
+    "        \"name\": \"royal_slope\",\n",
+    "        \"column_name\": \"temperature\",\n",
+    "        \"coordinates\": (-119.32, 46.95),\n",
+    "    },\n",
+    "    {\n",
+    "        \"name\": \"ringold\",\n",
+    "        \"column_name\": \"temperature\",\n",
+    "        \"coordinates\": (-119.18, 46.48),\n",
+    "    },\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Download Forecast data\n",
+    "\n",
+    "For the weather station Warden SW, download weather forecast observations by submitting a request to the worker running in the background. The workflow execution uses the parameters below while processing requests; these can be overridden using the parameter argument.\n",
+    "\n",
+    "- fxx: [1, 25, 1] # start, stop, step\n",
+    "- search_text: \"TMP:2 m\"\n",
+    "- interval: 60 # in minutes\n",
+    "- weather_type: \"temperature\"\n",
+    "- multi_threads: 25"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start_date = datetime(year=2021, month=7, day=30)\n",
+    "end_date = datetime(year=2023, month=8, day=2)\n",
+    "forecast_data = download_forecast_data([neighbor_stations[0]], start_date, end_date)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "forecast_data[\"Warden_SW\"].to_csv(infer_forecast_data_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: Train DeepMC models\n",
+    "\n",
+    "Complete the DeepMC model training using the notebook [notebooks/deepmc/mc_forecast.ipynb](https://github.com/microsoft/farmvibes-ai/blob/main/notebooks/deepmc/mc_forecast.ipynb) for weather stations Royal Slope and Ringold.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 4: Train [Graph Neural Network (GNN)](https://pytorch-geometric.readthedocs.io/) model\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 4.1 Create embeddings\n",
+    "\n",
+    "The get_embeddings module does the following: \n",
+    "1. Runs inference using the trained DeepMC model to obtain temperature forecasts for the Royal Slope and Ringold weather stations.\n",
+    "2. Pre-processes the inference results to create a lookback by transforming them into a 2D matrix.\n",
+    "3. Pre-processes the HRRR weather forecast to create a lookback by transforming it into a 2D matrix.\n",
+    "4. Creates embeddings by concatenating the pre-processed results. The embeddings are sorted by timestamp and station name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj_neighbors = MC_Neighbors(root_dir=ROOT_PATH, learning_rate=0.0025, use_edge_weights=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_embeddings, test_embeddings = obj_neighbors.get_embeddings(\n",
+    "    INFERENCE_STATION,\n",
+    "    neighbor_stations,\n",
+    "    24,\n",
+    "    infer_forecast_data_path,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 4.2 Model training\n",
+    "\n",
+    "The GNN training script does the following:\n",
+    "\n",
+    "1. Creates a Dataset that reads the input embeddings, creates a node for each timestamp, and creates edges connecting weather stations.\n",
+    "2. Creates a BatchSampler to split the data into batches for the training and testing datasets.\n",
+    "3. Initiates the model training using the PyTorch Lightning package."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj_neighbors.run_train(\n",
+    "    train_embeddings=train_embeddings,\n",
+    "    test_embeddings=test_embeddings,\n",
+    "    neighbor_stations=neighbor_stations,\n",
+    "    infer_station=INFERENCE_STATION,\n",
+    "    epochs=20,\n",
+    "    batch_size=24 * len(neighbor_stations),\n",
+    "    forecast_hours=24,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run Inference to validate the trained model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pred_df = obj_neighbors.run_inference(\n",
+    "    embeddings=test_embeddings.copy(),\n",
+    "    neighbors_station=neighbor_stations,\n",
+    "    infer_station=INFERENCE_STATION,\n",
+    "    batch_size=len(neighbor_stations),\n",
+    "    forecast_hours=24,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- historical_data_path: path to the historical weather data downloaded and cleaned in Step 1.\n",
+    "- hrrr_data_path: path to the HRRR weather data downloaded and cleaned in Step 2."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "historical_data_path = \"\"\n",
+    "hrrr_data_path = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "obj_neighbors.view_plot(pred_df, historical_data_path, hrrr_data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GNN temperature\n", + "RMSE: 3.98\n", + "MAE: 3.19\n", + "MAE%: 4.22%\n", + "\n", + "Hrrr temperature\n", + "RMSE: 4.64\n", + "MAE: 3.8\n", + "MAE%: 4.91%\n" + ] + } + ], + "source": [ + "obj_neighbors.view_performance(pred_df, historical_data_path, hrrr_data_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Inference\n", + "For weather stations used in GNN model training, we will perform the steps below to get inference results.\n", + "\n", + "**5.1 Download data**\n", + "- Download historical weather data for the stations Royal Slope and Ringold from [AGWeatherNet]( https://weather.wsu.edu/) for the time range interested in.\n", + "\n", + "Note: To perform the inference for 24 hours with 60 minutes interval, the minimum data required for 528 hours, similarly If the data frequency is 15 minutes, the minimum number of data points required is 528*4 = 2112. These are the minimum number of data points need to be provided as input during the inference.\n", + "\n", + "**5.2 Preprocessing**\n", + "- For each weather station, historical and HRRR data are concatenated by timestamp.\n", + "- Data processing is done using Wavelet Transformation techniques. \n", + "- For each weather station, using the trained DeepMC model, we run the inference to find weather forecasts.\n", + "- Embeddings are created by combining HRRR data and the predicted weather forecasts.\n", + "\n", + "**5.3 Run GNN model inference**\n", + "\n", + "Finally, we plot the results and calculate KPIs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.1 Download data\n", + "\n", + "Download AgWeatherNet data and clean it. 
See [sample data](sample_data.csv)." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# weather dataset filtered and model training limited to train features.\n", + "HISTORICAL_MODEL_TRAIN_FEATURES = [\"humidity\", \"wind_speed\", \"temperature\"]\n", + "\n", + "# Historical data aligned using INDEX variable\n", + "INDEX = \"date\"\n", + "\n", + "# weather dataset filtered and model training limited to train features.\n", + "FORECAST_MODEL_TRAIN_FEATURES = [\"humidity_forecast\", \"wind_speed_forecast\", \"temperature_forecast\"]\n", + "\n", + "# Models trained to predict out features\n", + "OUT_FEATURES = [\"temperature\"] # ['wind_speed' , 'temperature']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Get actual observations data for a station, '%s' is a place holder for station name.\n", + "file_path = f\"/%s/prediction.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# time range interested in\n", + "start_date = datetime(year=2022, month=7, day=1, hour=0, minute=0, second=0)\n", + "end_date = datetime(year=2022, month=8, day=15, hour=0, minute=0, second=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Execution status for temperature: done\n", + "Execution status for humidity: done\n", + "Execution status for u-component: done\n", + "Execution status for v-component: done\n" + ] + } + ], + "source": [ + "forecast_data = download_forecast_data(neighbor_stations, start_date, end_date)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2 Preprocessing\n", + "\n", + "Steps to derive embeddings:\n", + "\n", + "- We perform wavelet transformation on selected weather variables (historical and forecast data). 
\n", + "- The preprocessed output is used as input to run the inference using the DeepMC trained model. The inference results are weather forecasts for neighboring stations.\n", + "- The DeepMC inference results are concatenated with HRRR forecast data to create embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "obj_neighbors = MC_Neighbors(root_dir=ROOT_PATH, learning_rate=0.0025, use_edge_weights=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = obj_neighbors.get_embeddings_inference(\n", + " INFERENCE_STATION,\n", + " neighbor_stations,\n", + " 24,\n", + " infer_forecast_data_path,\n", + " OUT_FEATURES,\n", + " file_path,\n", + " forecast_data,\n", + " start_date,\n", + " end_date,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.3 Run inference\n", + "\n", + "The inference results are weather forecast for stations that are missing station data." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "pred_df = obj_neighbors.run_inference(\n", + " embeddings=embeddings.copy(),\n", + " neighbors_station=neighbor_stations,\n", + " infer_station=INFERENCE_STATION,\n", + " batch_size=len(neighbor_stations),\n", + " forecast_hours=24,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.4 Plot results and calculate KPIs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- historical_data_path: it's a path to historical weather data downloaded and cleaned in Step 5.1.\n", + "- hrrr_data_path: it's a path to hrr weather data downloaded and cleaned in Step 5.1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "historical_data_path = \"\"\n", + "hrrr_data_path = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "obj_neighbors.view_plot(pred_df, historical_data_path, hrrr_data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GNN temperature\n", + "RMSE: 3.05\n", + "MAE: 2.57\n", + "MAE%: 3.48%\n", + "\n", + "Hrrr temperature\n", + "RMSE: 3.72\n", + "MAE: 3.02\n", + "MAE%: 4.05%\n" + ] + } + ], + "source": [ + "obj_neighbors.view_performance(pred_df, historical_data_path, hrrr_data_path)" + ] + } + ], + "metadata": { + "description": "It helps to find weather forecasts for sensors that have no data by utilizing data of neighboring stations", + "disk_space": "", + "kernelspec": { + "display_name": "dev-vibes3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + }, + "name": "Micro Climate Predictions using Neighbor stations", + "running_time": "", + "tags": [ + "Weather", + "Model Training" + ] + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/deepmc_neighbors/graph.svg b/notebooks/deepmc_neighbors/graph.svg new file mode 100755 index 00000000..53ae5c23 --- /dev/null +++ b/notebooks/deepmc_neighbors/graph.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/notebooks/deepmc_neighbors/notebook_lib/base_dataset.py b/notebooks/deepmc_neighbors/notebook_lib/base_dataset.py new file mode 100644 index 00000000..82488fd7 --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/base_dataset.py @@ -0,0 +1,318 @@ +from math import cos, sin +from typing import Any, Dict, List, Union + +import geopy.distance +import numpy as np +import pandas as pd +import torch +import torch.utils +import torch.utils.data 
def load_node_labels(self, data: pd.DataFrame):
    """Move the ``labels`` column of *data* into ``self.node_labels``.

    When *data* has no ``labels`` column it is returned untouched and
    ``self.node_labels`` is not set.  Otherwise the labels are flattened,
    truncated to a whole number of ``self.node_num``-sized groups, stored as a
    float32 tensor of shape ``[groups, node_num, 1]`` and the column is
    dropped from *data* in place.

    Args:
        data: Frame whose rows are grouped ``self.node_num`` at a time.

    Returns:
        The same *data* object (without the ``labels`` column, if it had one).
    """
    if "labels" in data.columns:
        # Number of complete groups of `node_num` rows available in the frame.
        group_count = int(len(data.index.get_level_values(0)) / self.node_num)

        flat_labels = data["labels"].to_numpy().reshape(-1)
        # Discard any trailing rows that do not fill a complete group.
        usable_labels = flat_labels[: group_count * self.node_num * 1]

        label_cube = usable_labels.reshape(group_count, self.node_num, 1)
        self.node_labels = torch.from_numpy(label_cube.astype("float32"))

        data.drop(columns=["labels"], inplace=True)

    return data
self.timestamps = data.index.get_level_values(0).unique() + self.infer_station_index = next( + (i for i, a in enumerate(self.node_names) if a == self.infer_station), None + ) + self.node_feas = list(data.columns) + self.node_fea_dim = len(self.node_feas) + node_vals = data.values.reshape(-1)[ + : int(len(data.index.get_level_values(0)) / self.node_num) + * self.node_num + * self.node_fea_dim + ] + + self.node_data = torch.from_numpy( + node_vals.reshape( + int(len(data.index.get_level_values(0)) / self.node_num), + self.node_num, + self.node_fea_dim, + ).astype("float32") + ) + + self.timestamps = self.timestamps[: self.node_data.shape[0]] + + def get_from_to_nodes(self, neighbor_stations: Dict[str, Any]): + from_node = [] + to_node = [] + for s in neighbor_stations["stations"]: + for c in self.neighbor_stations["stations"]: + if s != c and s != self.infer_station: + from_node.append(s) + to_node.append(c) + return from_node, to_node + + def get_edges(self, neighbor_stations: Dict[str, Any]): + from_node, to_node = self.get_from_to_nodes(neighbor_stations) + + coords = neighbor_stations["long_lat"] + edges = zip(from_node, to_node) + distances = [] + turbine_dir_x = [] + turbine_dir_y = [] + + for edge in edges: + coord_1 = coords[edge[0]][::-1] + coord_2 = coords[edge[1]][::-1] + distances.append(geopy.distance.geodesic(coord_1, coord_2).km) + x1, y1 = coord_1 + x2, y2 = coord_2 + turbine_dir_x.append(cos(x1) * sin(y1 - y2)) + turbine_dir_y.append(cos(x2) * sin(x1) - sin(x2) * cos(x1) * cos(y1 - y2)) + + data = { + "from_node": from_node, + "to_node": to_node, + "distance": distances, + "dir_x": turbine_dir_x, + "dir_y": turbine_dir_y, + } + return data + + def load_edges(self): + data = self.get_edges(self.neighbor_stations) + data = pd.DataFrame(data) + data["to_node"] = data["to_node"] + data["from_node"] = data["from_node"] + data["edge"] = data.apply(lambda x: "{}->{}".format(x["from_node"], x["to_node"]), axis=1) + data.loc[:, "distance"] = 1 / data.loc[:, 
"distance"] + data.drop(columns=["from_node", "to_node"], inplace=True) + edge_names = sorted(data["edge"].unique()) + node2id = dict(zip(self.node_names, range(len(self.node_names)))) + edge_index = [ + [node2id[src_node], node2id[tgt_node]] + for src_node, tgt_node in [edge.split("->") for edge in edge_names] + ] + + edge_df = data[["distance", "edge"]].set_index(["edge"]) + self.edge_names = edge_names + self.edge_feas = list(edge_df.columns) + self.edge_index = torch.LongTensor(edge_index) + self.edge_num = len(self.edge_names) + + self.edge_fea_dim = len(self.edge_feas) + self.edge_data = torch.from_numpy( + edge_df.values.reshape( + self.edge_num, + self.edge_fea_dim, + ).astype("float32") + ) + + def node_feature_selection(self, df_node: pd.DataFrame): + df_node = df_node.sort_values(["timestamp", "forecast_step", "station"]) + scaled_input_array = self.scaler_input.transform( + df_node.to_numpy()[:, 0 : self.forecast_hours] + ) + df_node.iloc[:, 0 : self.forecast_hours] = scaled_input_array # type: ignore + + if self.label_column_index is not None: + scaled_label = self.scaler_label.transform( + np.expand_dims(df_node.to_numpy()[:, self.label_column_index], axis=-1) + ) + df_node.iloc[:, self.label_column_index] = scaled_label # type: ignore + return df_node + + +class BatchSampler(IterableDataset): # type: ignore + def __init__( + self, + dataset: GNNDataset, + batch_size: int, + lookahead_horizon: int, + lookback_horizon: int, + device: Union[str, torch.device], + random: bool = True, + noise_parameters: Dict[str, Any] = {}, + use_edge_weights: bool = False, + ): + self.dataset = dataset + self.batch_size = batch_size + self.device_count = dataset.device_count + self.random = random + self.lookahead_horizon = lookahead_horizon + self.lookback_horizon = lookback_horizon + self.device = device + self.noise_parameters = noise_parameters + self.use_edge_weights = use_edge_weights + self.stations_count = dataset.stations_count + + def 
def get_batch_edge_data(self, cur_batch_size: int, num_devices: int):
    """Replicate the dataset edge features for every sample of a batch.

    Sets ``self.edge_data`` to a tensor of shape
    ``[cur_batch_size, edge_fea_dim, edge_num]`` in which every sample holds
    an identical copy of ``self.dataset.edge_data``.

    Unlike edge *indices*, edge feature values must not be shifted by a
    per-sample node offset; the earlier implementation added that offset to
    the feature values, which altered the weights of every sample after the
    first one.

    Args:
        cur_batch_size: Number of samples in the current batch.
        num_devices: Kept for interface compatibility.  All replicas are
            identical, so no per-device split is needed (and batch sizes that
            are not a multiple of the device count no longer fail).
    """
    edge_num = self.dataset.edge_num
    edge_fea_dim = self.dataset.edge_fea_dim

    # Dataset layout is [edge_num, edge_fea_dim]; the batch layout puts the
    # feature axis first, so transpose rather than reinterpret the buffer.
    per_sample = self.dataset.edge_data.reshape(edge_num, edge_fea_dim).t()

    # [1, edge_fea_dim, edge_num] -> [cur_batch_size, edge_fea_dim, edge_num]
    self.edge_data = per_sample.unsqueeze(0).repeat(cur_batch_size, 1, 1)
def get_output(
    self,
    node_lookback: Tensor,
    node_lookback_labels: Union[Tensor, None],
    forecast_timestamps: List[str],
):
    """Assemble one batch as a positional list.

    Side effects: ``self.edge_index`` is rewritten as a ``[2, -1]`` view with
    the batch axis folded into the edge axis, and, when
    ``self.use_edge_weights`` is set, ``self.edge_data`` is flattened.

    Args:
        node_lookback: 4-D node feature tensor for the batch.
        node_lookback_labels: Node labels, or ``None`` when there are none.
        forecast_timestamps: Timestamps of the samples in the batch.

    Returns:
        ``[node_data, edge_index, edge_data, forecast_timestamps]`` followed
        by ``node_labels`` when labels were supplied.
    """
    if self.use_edge_weights:
        as_column = self.edge_data.reshape(-1, 1)
        self.edge_data = torch.squeeze(as_column)

    # Bring the (source, target) axis to the front before merging batch/edges.
    endpoints_first = self.edge_index.permute(1, 0, 2).contiguous()
    self.edge_index = endpoints_first.view(2, -1)

    # node_lookahead is not implemented yet; it will be added once available.
    fields = [
        node_lookback[:, :, :, :],
        self.edge_index,
        self.edge_data,
        forecast_timestamps,
    ]
    if node_lookback_labels is not None:
        fields.append(node_lookback_labels)

    return fields
def inference_deepmc_post(
    model_path: str,
    post_data_x: List[NDArray[Any]],
):
    """Run the exported post-processing ONNX model of every forecast step.

    For step ``k`` the model at ``<model_path>/model_<k>/post/export.onnx`` is
    loaded, fed ``post_data_x[k]`` (cast to float32) on each of its inputs,
    and its first output is stored in slice ``[:, :, k]`` of the result.

    Args:
        model_path: Directory that contains the ``model_<k>`` sub-directories.
        post_data_x: One array per forecast step.

    Returns:
        Array of shape ``[post_data_x[0].shape[0], n, n]`` with
        ``n = len(post_data_x)``.
    """
    step_count = len(post_data_x)
    mix_data_yhat = np.empty([post_data_x[0].shape[0], step_count, step_count])

    for pred_idx, step_yhat in enumerate(post_data_x):
        onnx_file = os.path.join(model_path, f"model_{pred_idx}", "post", "export.onnx")
        post_session = onnxruntime.InferenceSession(onnx_file, None)

        # Every declared model input receives the same array for this step.
        feed = {}
        for model_input in post_session.get_inputs():
            feed[model_input.name] = step_yhat.astype(np.float32)

        outputs = post_session.run(None, input_feed=feed)
        mix_data_yhat[:, :, pred_idx] = outputs[0]

    return mix_data_yhat
def get_batch(batch: Union[Tensor, List[Tensor], TensorDataset], use_edge_weights: bool):
    """Split a batch container into its named parts.

    Args:
        batch: Positional batch; a ``TensorDataset`` is first expanded with
            ``batch[:]``.  Position 0 is node data, 1 the edge index, 2 the
            edge data and, when the batch has exactly five entries, 4 the
            node labels.
        use_edge_weights: Whether the edge data at position 2 is returned.

    Returns:
        ``(node_data, edge_index, edge_data, node_labels)``.  ``edge_data``
        is an empty tensor when ``use_edge_weights`` is false, and
        ``node_labels`` is an empty tensor when the batch has no fifth entry
        (labels are only supplied for training, not for inference).
    """
    items = batch[:] if isinstance(batch, TensorDataset) else batch

    has_labels = len(items) == 5
    node_labels = items[4] if has_labels else torch.tensor([])
    edge_data = items[2] if use_edge_weights else torch.tensor([])

    return items[0], items[1], edge_data, node_labels
def weights_init(self, m: Union[Conv1d, Linear]):
    """Re-initialise a ``Conv1d`` or ``Linear`` layer in place.

    Weights receive Xavier-uniform values and an existing bias is zeroed.
    Any other kind of module is left untouched.

    Args:
        m: Module to initialise.
    """
    if not isinstance(m, (Conv1d, Linear)):
        return

    nn.init.xavier_uniform_(m.weight.data)

    bias = m.bias
    if bias is None:
        return
    nn.init.zeros_(bias.data)
def get_hidden_embedding(
    self,
    horizon: int,
    x: Tensor,
    edge_weights: Union[Tensor, None],
) -> Tuple[Tensor, Union[Tensor, None]]:
    """Encode the lookback steps of *x* with the TGCN encoder cell.

    For each of the first *horizon* positions along axis 2 of *x*, the
    features listed in ``self.inputs["lookback_indices"]`` are selected,
    flattened to ``[batch_size * num_nodes, -1]`` and passed through
    ``self.tgcn_cell_encoder``, ReLU and ``self.dropout_encoder1``.

    Args:
        horizon: Number of lookback steps to process; must be at least 1.
        x: Node tensor indexed as ``x[:, :, step, feature]``.
        edge_weights: Optional edge weights forwarded to the encoder cell.

    Returns:
        The embedding produced for the last processed step and the unchanged
        *edge_weights*.

    Raises:
        ValueError: If *horizon* is smaller than 1 (there would be no
            embedding to return).
    """
    if horizon < 1:
        raise ValueError(f"horizon must be at least 1, got {horizon}")

    # Loop-invariant: build the feature selector once, on the input's device.
    indices_lookback = torch.tensor(self.inputs["lookback_indices"]).to(x.device)

    # NOTE(review): the embedding of one step is not passed to the next call,
    # so only the last step determines the returned value.  TGCN presumably
    # accepts a previous hidden state - confirm whether it should be chained.
    for step in range(horizon):
        step_features = torch.index_select(x[:, :, step, :], 2, indices_lookback)
        step_features = step_features.reshape(self.batch_size * self.num_nodes, -1)
        h = self.tgcn_cell_encoder(step_features, self.edge_index, edge_weights)
        h = F.relu(h)
        h = self.dropout_encoder1(h)
    return h, edge_weights
def training_step(self, train_batch: Union[Tensor, List[Tensor]], _): + _, _, _, node_labels = self.gnn.get_batch(train_batch, self.gnn.use_edge_weights) + y = node_labels + y_hat, _, _ = self.gnn(train_batch) + loss = self.loss(y_hat, y.reshape(y_hat.shape)) + self.log("train_loss/total", loss, on_epoch=True, prog_bar=True) + return loss + + def validation_step(self, validation_batch: Union[Tensor, List[Tensor]], _): + _, _, _, node_labels = self.gnn.get_batch(validation_batch, self.gnn.use_edge_weights) + y = node_labels + y_hat, _, _ = self.gnn(validation_batch) + loss = self.loss(y_hat, y.reshape(y_hat.shape)) + self.log("val_loss/total", loss, on_epoch=True, prog_bar=True) + return loss diff --git a/notebooks/deepmc_neighbors/notebook_lib/data_utils.py b/notebooks/deepmc_neighbors/notebook_lib/data_utils.py new file mode 100644 index 00000000..85ce7231 --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/data_utils.py @@ -0,0 +1,241 @@ +import os +import pickle +from typing import Any, Dict, List, Tuple, Union + +import numpy as np +import pandas as pd +import torch +from numpy.typing import NDArray +from sklearn.preprocessing import StandardScaler +from torch import Tensor +from torch.utils.data import DataLoader, TensorDataset + +from vibe_notebook.deepmc.utils import transform_to_array + +from .base_dataset import BatchSampler, GNNDataset +from .base_modules import BatchTGCNTrain + + +def build_scaler(train_embeddings: pd.DataFrame, forecast_hours: int) -> StandardScaler: + train_data_scaler = StandardScaler() + train_data_scaler.fit(train_embeddings.to_numpy()[:, :forecast_hours]) + return train_data_scaler + + +def build_scaler_label( + train_embeddings: pd.DataFrame, labels_column: str +) -> Tuple[StandardScaler, int]: + index = -1 + for i, column in enumerate(train_embeddings.columns): + if column == labels_column: + index = i + + if index == -1: + raise ValueError(f"Labels column '{labels_column}' not found") + + train_label_scaler = 
StandardScaler() + train_label_scaler.fit(np.expand_dims(train_embeddings.to_numpy()[:, index], axis=-1)) + return train_label_scaler, index + + +def get_batch_sample( + train_dataset: GNNDataset, + test_dataset: GNNDataset, + batch_size: int, + lookahead_horizon: int, + lookback_horizon: int, + device: torch.device, + use_edge_weights: bool, +) -> Tuple[BatchSampler, BatchSampler]: + train_sampler = BatchSampler( + dataset=train_dataset, + batch_size=batch_size, + lookahead_horizon=lookahead_horizon, + lookback_horizon=lookback_horizon, + device=device, + random=False, + use_edge_weights=use_edge_weights, + ) + + test_sampler = BatchSampler( + dataset=test_dataset, + batch_size=batch_size, + lookahead_horizon=lookahead_horizon, + lookback_horizon=lookback_horizon, + device=device, + random=False, + use_edge_weights=use_edge_weights, + ) + + return (train_sampler, test_sampler) + + +def train_test_dataset( + train_data: pd.DataFrame, + test_data: pd.DataFrame, + step: int, + neighbors_station: Dict[str, Any], + scaler_data: StandardScaler, + scaler_label: StandardScaler, + infer_station: str, + labels_column_index: int, +) -> Tuple[GNNDataset, GNNDataset]: + train_dataset = GNNDataset( + train_data, + forecast_step=step, + scaler_input=scaler_data, + scaler_label=scaler_label, + neighbor_station=neighbors_station, + forecast_hours=24, + infer_station=infer_station, + label_column_index=labels_column_index, + ) + + test_dataset = GNNDataset( + test_data, + forecast_step=step, + scaler_input=scaler_data, + scaler_label=scaler_label, + neighbor_station=neighbors_station, + forecast_hours=24, + infer_station=infer_station, + label_column_index=labels_column_index, + ) + + return (train_dataset, test_dataset) + + +def problem_params( + dataset: GNNDataset, + batch_size: int, + lookback_horizon: int, + lookahead_horizon: int, + use_edge_weights: bool, + use_dropout: bool, + hidden_dim: int, + forecast_hours: int, +) -> Dict[str, Any]: + problem_params = { + 
"lookback_horizon": lookback_horizon, + "lookahead_horizon": lookahead_horizon, + "node_num": dataset.node_num, + "node_in_fea_dim": dataset.node_fea_dim, + "node_out_fea_dim": 1, + "edge_in_fea_dim": dataset.edge_fea_dim, + "edge_out_fea_dim": 1, + "edge_num": dataset.edge_num, + "use_edge_weights": use_edge_weights, + "day_em_dim": 1, + "hour_em_dim": 1, + "period": 5, # for attention model + "batch_size": batch_size, + "use_dropout": use_dropout, + "hidden_dim": hidden_dim, + "device_count": torch.cuda.device_count(), + "lookback_indices": list(range(forecast_hours)), + } + + return problem_params + + +def export_to_onnx( + file_path: str, + model: BatchTGCNTrain, + inputs: DataLoader, # type: ignore + use_edge_weights: bool, + edge_num: int, + number_of_stations: int, +): + data = next(iter(inputs)) + node_data, edge_index, edge_data, _ = get_batch(data, use_edge_weights) + data = { + "node_data": node_data[:number_of_stations], + "edge_index": edge_index[:, : (edge_num * number_of_stations)], + "edge_data": edge_data[: (edge_num * number_of_stations)], + } + keys = list(data.keys()) + batch_axes = {keys[i]: {0: "batch_size"} for i in range(len(keys))} + onnx_output_path = os.path.join(file_path, "model_output.onnx") + if os.path.exists(onnx_output_path): + os.remove(onnx_output_path) + + # Export the model + torch.onnx.export( + model, + list(data.values()), # type: ignore + onnx_output_path, + input_names=list(batch_axes.keys()), + dynamic_axes=batch_axes, + opset_version=16, + ) + + +def write_to_file(output_file: str, data: List[Any]): + with open(output_file, "wb") as f: + pickle.dump(data, f) + + +def get_file(file_path: str) -> List[Any]: + if os.path.exists(file_path): + with open(file_path, "rb") as f: + return pickle.load(f) + else: + raise Exception(f"File {file_path} not found") + + +def get_batch(batch: Union[Tensor, List[Tensor], TensorDataset], use_edge_weights: bool): + if type(batch) == TensorDataset: + batch = batch[:] + node_data = batch[0] + 
edge_index = batch[1] + # considered for training + # skipped during inference + if len(batch) == 5: + node_labels = batch[4] + else: + node_labels = torch.tensor([]) + + if use_edge_weights: + edge_data = batch[2] + else: + edge_data = torch.tensor([]) + return node_data, edge_index, edge_data, node_labels + + +def smooth(y: List[float], box_pts: int): + box = np.ones(box_pts) / box_pts + y_smooth = np.convolve(y, box, mode="same") + return y_smooth + + +def get_split_data(split_data: NDArray[Any], timestamps: NDArray[Any], split_at_index: int): + split_by_index = [] + for i in range(split_at_index): + data_at_index = split_data[i::split_at_index][:, i] + timestamp_at_index = timestamps[i::split_at_index] + split_by_index.append( + pd.DataFrame(zip(timestamp_at_index, data_at_index), columns=["timestamp", "label"]) + ) + + split_data_df = pd.concat(split_by_index, axis=0, ignore_index=True) + split_data_df["timestamp"] = pd.to_datetime(split_data_df["timestamp"]) + split_data_df = split_data_df.sort_values(by="timestamp") + + return np.array(split_data_df["label"].values) + + +def preprocess_transform( + mix_data_yhat: NDArray[Any], + inference_hours: int, + dates_list: NDArray[Any], +): + init_start = 0 + data_list = [] + end = mix_data_yhat.shape[0] + for i in range(init_start, end, inference_hours): + for j in range(inference_hours): + data_list.append(mix_data_yhat[i, 0, j]) + + mix_data_yhat = transform_to_array(np.array(data_list))[: mix_data_yhat.shape[0]] + dates_list = dates_list[: mix_data_yhat.shape[0]] + return mix_data_yhat, dates_list diff --git a/notebooks/deepmc_neighbors/notebook_lib/embeddings.py b/notebooks/deepmc_neighbors/notebook_lib/embeddings.py new file mode 100644 index 00000000..012f8a4a --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/embeddings.py @@ -0,0 +1,235 @@ +import os +from datetime import datetime +from typing import Any, Dict, List + +import pandas as pd + +from .data_utils import get_file + + +def 
construct_neighbor_stations(stations: List[Dict[str, Any]]): + neighbors = {"stations": [], "coordinates": {}} + for station in stations: + neighbors["stations"].append(station["name"]) + neighbors["coordinates"][station["name"]] = station["coordinates"] + + return neighbors + + +def get_deepmc_post_results(root_path: str, stations: List[Dict[str, Any]], model_type: str): + predict_out = {} + for station in stations: + deepmc_post_path = os.path.join( + root_path, station["name"], model_type, "embeddings", "post_processed_results.pkl" + ) + ( + intermediate_test, + intermediate_train, + _, + _, + train_labels_station, + test_labels_station, + out_train_dates, + out_test_dates, + ) = get_file(deepmc_post_path) + predict_out[station["name"]] = ( + intermediate_train, + intermediate_test, + train_labels_station, + test_labels_station, + out_train_dates, + out_test_dates, + ) + + return predict_out + + +def get_date(stations: Dict[str, Any], data_index: int = -2, date_type: int = 0): + """Retrieves the start date and end date by comparing data of all stations. + :param stations: Dictionary with station name as key and values + with collection of station information used to generate embeddings. + + :param data_index: It defines position of data in array. + will use -2 for train, -1 for test, 1 for inference. + + :param date_type: 0 for start_date, -1 for end_date. + + return: date. 
+ """ + station_name = next(iter(stations)) + station_values = stations[station_name] + date = datetime.strptime(station_values[data_index][date_type], "%Y-%m-%d %H:%M:%S") + for station_values in stations.values(): + try: + s_date = datetime.strptime(station_values[data_index][date_type], "%Y-%m-%d %H:%M:%S") + # for start date + if date_type == 0 and date < s_date: + date = s_date + # for end date + if date_type == -1 and date > s_date: + date = s_date + except Exception as e: + print(e) + return date + + +def create_embeddings( + stations: List[Dict[str, Any]], + inference_hours: int, + root_path: str, + model_type: str, +): + neighbor_stations = construct_neighbor_stations(stations) + predict_out = get_deepmc_post_results(root_path, stations, model_type) + + # get start date + train_start_date = get_date(predict_out, data_index=-2, date_type=0) + test_start_date = get_date(predict_out, data_index=-1, date_type=0) + + # get end date + train_end_date = get_date(predict_out, data_index=-2, date_type=-1) + test_end_date = get_date(predict_out, data_index=-1, date_type=-1) + + test_start_date = datetime.strptime( + test_start_date.strftime("%Y-%m-%d") + " " + train_start_date.strftime("%H:%M:%S"), + "%Y-%m-%d %H:%M:%S", + ) + + df_train_embeddings = process_embeddings( + predict_out=predict_out, + inference_hours=inference_hours, + neighbor_stations=neighbor_stations, + start_date=train_start_date, + end_date=train_end_date, + data_index=0, + label_index=2, + timestamp_index=4, + ) + + df_test_embeddings = process_embeddings( + predict_out=predict_out, + inference_hours=inference_hours, + neighbor_stations=neighbor_stations, + start_date=test_start_date, + end_date=test_end_date, + data_index=1, + label_index=3, + timestamp_index=5, + ) + + return df_train_embeddings, df_test_embeddings + + +def create_embeddings_inference( + stations: List[Dict[str, Any]], + inference_hours: int, + deepmc_post_results: Dict[str, Any], +): + neighbor_stations = 
construct_neighbor_stations(stations) + inference_start_date = get_date(deepmc_post_results, data_index=1, date_type=0) + inference_end_date = get_date(deepmc_post_results, data_index=1, date_type=-1) + + df_embeddings = get_inference_embeddings( + predict_out=deepmc_post_results, + inference_hours=inference_hours, + neighbor_stations=neighbor_stations, + start_date=inference_start_date, + end_date=inference_end_date, + ) + + return df_embeddings + + +def get_inference_embeddings( + predict_out: Dict[str, Any], + inference_hours: int, + neighbor_stations: Dict[str, Any], + start_date: datetime, + end_date: datetime, +): + embeddings = [] + for station in neighbor_stations["stations"]: + df = pd.DataFrame( + predict_out[station][0].reshape( + predict_out[station][0].shape[0], predict_out[station][0].shape[2] + ), + columns=list(range(inference_hours)), + ) + timestamps = predict_out[station][1] + + df["station"] = station + df["timestamp"] = timestamps + df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S") + + mask = (df["timestamp"] >= start_date) & (df["timestamp"] <= end_date) + df = df.loc[mask] + + df.reset_index(drop=True, inplace=True) + df["forecast_step"] = df.index + embeddings.append(df) + + df_embeddings = pd.concat(embeddings, axis=0) + df_embeddings.sort_values(by=["forecast_step", "station"], inplace=True) + return df_embeddings + + +def process_embeddings( + predict_out: Dict[str, Any], + inference_hours: int, + neighbor_stations: Dict[str, Any], + start_date: datetime, + end_date: datetime, + data_index: int, + label_index: int, + timestamp_index: int, +): + """ + Process embeddings for train or test data. + + :param predict_out: Dictionary with station name as key and values. It's output of deepmc post processing. + :param inference_hours: Number of hours to predict. + :param neighbor_stations: Dictionary with stations and coordinates. + :param start_date: Start date for embeddings. 
+ :param end_date: End date for embeddings. + :param data_index: Index of train or test data in predict_out. The pickle file + generated by deepmc follows this index train=0, test=1 + :param label_index: Index of train or test labels in predict_out. The pickle file + generated by deepmc follows this index train=2, test=3 + :param timestamp_index: Index of train or test timestamps in predict_out. The pickle file + generated by deepmc follows this index train=4, test=5 + """ + embeddings = [] + for station in neighbor_stations["stations"]: + df = pd.DataFrame( + predict_out[station][data_index].reshape( + predict_out[station][data_index].shape[0], predict_out[station][data_index].shape[2] + ), + columns=list(range(inference_hours)), + ) + + labels = predict_out[station][label_index] + timestamps = predict_out[station][timestamp_index] + + df["station"] = station + if len(timestamps) < len(labels): + labels = labels[: len(timestamps)] + + df["labels"] = labels + + if len(timestamps) > len(labels): + timestamps = timestamps[: len(labels)] + df["timestamp"] = timestamps + + df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S") + + mask = (df["timestamp"] >= start_date) & (df["timestamp"] <= end_date) + df = df.loc[mask] + + df.reset_index(drop=True, inplace=True) + df["forecast_step"] = df.index + + embeddings.append(df) + + df_embeddings = pd.concat(embeddings, axis=0) + df_embeddings.sort_values(by=["forecast_step", "station"], inplace=True) + return df_embeddings diff --git a/notebooks/deepmc_neighbors/notebook_lib/post_deepmc.py b/notebooks/deepmc_neighbors/notebook_lib/post_deepmc.py new file mode 100644 index 00000000..77191ab3 --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/post_deepmc.py @@ -0,0 +1,271 @@ +import os +import pickle +from typing import Any, Dict, List + +import numpy as np +import pandas as pd +from notebook_lib.base_deepmc import inference_deepmc, inference_deepmc_post +from notebook_lib.data_utils import 
get_file, preprocess_transform +from numpy.typing import NDArray +from sklearn.preprocessing import StandardScaler + +from vibe_notebook.deepmc.utils import get_csv_data, transform_to_array_3D + + +def write_embeddings_input( + embeddings_input_path: str, + data_scaler: StandardScaler, + mix_yhat: NDArray[Any], + mix_train_yhat: NDArray[Any], + mix_yc: NDArray[Any], + mix_train_yc: NDArray[Any], + train_y: NDArray[Any], + test_y: NDArray[Any], + train_dates_list: NDArray[Any], + test_dates_list: NDArray[Any], +): + if os.path.exists(embeddings_input_path): + os.remove(embeddings_input_path) + + p_path_dir = os.path.dirname(embeddings_input_path) + if not os.path.exists(p_path_dir): + os.makedirs(p_path_dir) + + # Inverse transform outputs, save results + with open( + embeddings_input_path, + "wb", + ) as f: + mix_yhat = np.expand_dims(np.array(data_scaler.inverse_transform(mix_yhat[:, :])), axis=1) + mix_yc = np.expand_dims(np.array(data_scaler.inverse_transform(mix_yc[:, 0, :])), axis=1) + mix_train_yhat = np.expand_dims( + np.array(data_scaler.inverse_transform(mix_train_yhat[:, :])), axis=1 + ) + mix_train_yc = np.expand_dims( + np.array(data_scaler.inverse_transform(mix_train_yc[:, 0, :])), axis=1 + ) + train_dates_list = train_dates_list[:, 0] + test_dates_list = test_dates_list[:, 0] + train_labels = np.array(data_scaler.inverse_transform(np.rollaxis(train_y, 2, 1)[:, 0, :])) + test_labels = np.array(data_scaler.inverse_transform(np.rollaxis(test_y, 2, 1)[:, 0, :])) + train_labels = train_labels[:, 0] + test_labels = test_labels[:, 0] + pickle.dump( + [ + mix_yhat, + mix_train_yhat, + mix_yc, + mix_train_yc, + train_labels, + test_labels, + train_dates_list, + test_dates_list, + ], + f, + ) + + return mix_yhat, mix_train_yhat, mix_yc, mix_train_yc, train_labels, test_labels + + +def get_date_range( + stations: List[Dict[str, Any]], infer_station_name: str, root_path: str, model_type: str +): + for station in stations: + if station["name"] != 
infer_station_name: + model_path = os.path.join(root_path, station["name"], model_type) + train_data_path = os.path.join(model_path, "train_data_dates.pkl") + ( + _, + _, + _, + _, + _, + _, + _, + train_dates_list, + _, + test_dates_list, + ) = get_file(train_data_path) + + return (train_dates_list, test_dates_list) + raise Exception("No station found to get date range") + + +def get_station_object(stations: List[Dict[str, Any]], infer_station_name: str): + station, column_name = None, None + for stations_dict in stations: + if stations_dict["name"] == infer_station_name: + station = stations_dict["name"] + column_name = stations_dict["column_name"] + return station, column_name + + raise Exception(f"No station found with name {infer_station_name}") + + +def dump_forecast_output( + train_df: pd.DataFrame, + test_df: pd.DataFrame, + model_path: str, + column_name: str, + train_dates_list: List[str], + test_dates_list: List[str], + inference_hours: int, +): + train_data = np.array(train_df[column_name].values) + test_data = np.array(test_df[column_name].values) + mix_train_yhat = transform_to_array_3D(train_data[:-inference_hours], inference_hours) + mix_train_y = transform_to_array_3D(train_data[inference_hours:], inference_hours) + mix_test_yhat = transform_to_array_3D(test_data[:-inference_hours], inference_hours) + mix_test_y = transform_to_array_3D(test_data[inference_hours:], inference_hours) + out_dir = os.path.join(model_path, "embeddings") + if not os.path.exists(out_dir): + os.makedirs(out_dir) + + out_path = os.path.join(out_dir, "post_processed_results.pkl") + + # Inverse transform outputs, save results + with open(out_path, "wb") as f: + train_labels = mix_train_y.squeeze() + test_labels = mix_test_y.squeeze() + train_labels = train_labels[:, 0] + test_labels = test_labels[:, 0] + + pickle.dump( + [ + mix_test_yhat, + mix_train_yhat, + mix_test_y, + mix_train_y, + train_labels, + test_labels, + train_dates_list, + test_dates_list, + ], + f, + ) + + +def 
embeddings_preprocess_forecast( + stations: List[Dict[str, Any]], + infer_station_name: str, + root_path: str, + input_data_path: str, + forecast_interval: int, + model_type: str, + column_name: str, +): + model_path = os.path.join(root_path, infer_station_name, model_type) + forecast_df = get_csv_data(input_data_path) + train_dates_list, test_dates_list = get_date_range( + stations, infer_station_name, root_path, model_type + ) + train_df = forecast_df[forecast_df.index.isin(train_dates_list[:, 0])] + test_df = forecast_df[forecast_df.index.isin(test_dates_list[:, 0])] + + train_dates_list = ( + train_df[forecast_interval:].index.strftime("%Y-%m-%d %H:%M:%S").tolist() # type: ignore + ) + test_dates_list = ( + test_df[forecast_interval:].index.strftime("%Y-%m-%d %H:%M:%S").tolist() # type: ignore + ) + + dump_forecast_output( + train_df, + test_df, + model_path, + column_name, + train_dates_list, + test_dates_list, + forecast_interval, + ) + + +def embeddings_preprocess_deepmc( + model_path: str, + inference_hours: int, +): + train_data_path = os.path.join(model_path, "train_data_dates.pkl") + ( + train_X, + train_y, + test_X, + test_y, + _, + output_scaler1, + _, + train_dates_list, + _, + test_dates_list, + ) = get_file(train_data_path) + + list_train_X = inference_deepmc(model_path, train_X, inference_hours) + list_test_X = inference_deepmc(model_path, test_X, inference_hours) + + # Train data deepmc inference Post-Processing + mix_train_yc = preprocess_post_deepmc_gt(list_train_X, train_y, inference_hours) + mix_train_yhat = inference_deepmc_post(model_path, list_train_X) + + # Test data deepmc inference Post-Processing + mix_yc = preprocess_post_deepmc_gt(list_test_X, test_y, inference_hours) + mix_yhat = inference_deepmc_post(model_path, list_test_X) + + mix_train_yhat, train_dates_list = preprocess_transform( + mix_train_yhat, inference_hours, train_dates_list + ) + mix_yhat, test_dates_list = preprocess_transform(mix_yhat, inference_hours, test_dates_list) 
+ embeddings_input_path = os.path.join(model_path, "embeddings", "post_processed_results.pkl") + + # Inverse transform outputs, save results + write_embeddings_input( + embeddings_input_path, + output_scaler1, + mix_yhat, + mix_train_yhat, + mix_yc, + mix_train_yc, + train_y, + test_y, + train_dates_list, + test_dates_list, + ) + + +def preprocess_post_deepmc_gt( + post_data_x: List[NDArray[Any]], data_y: NDArray[Any], inference_hours: int +): + data_y = data_y[: data_y.shape[0] - inference_hours] + mix_data_gt = np.empty([data_y.shape[0], data_y.shape[1], len(post_data_x)]) + + idx = 0 + for _, _ in enumerate(post_data_x): + mix_data_gt[:, :, idx] = mix_data_gt[:, idx, :] + idx = idx + 1 + + return mix_data_gt + + +def initialize_embeddings_preprocessing( + infer_station_name: str, + stations: List[Dict[str, Any]], + root_path: str, + infer_forecast_data_path: str, + infer_interval: int, + model_type: str, +): + for station in stations: + model_path = os.path.join(root_path, station["name"], model_type) + if station["name"] == infer_station_name: + embeddings_preprocess_forecast( + stations, + infer_station_name, + root_path, + infer_forecast_data_path, + infer_interval, + model_type, + station["column_name"], + ) + else: + embeddings_preprocess_deepmc( + model_path, + inference_hours=24, + ) diff --git a/notebooks/deepmc_neighbors/notebook_lib/post_deepmc_inference.py b/notebooks/deepmc_neighbors/notebook_lib/post_deepmc_inference.py new file mode 100644 index 00000000..eb6065ef --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/post_deepmc_inference.py @@ -0,0 +1,268 @@ +import os +from datetime import datetime +from typing import Any, Dict, List, Tuple + +import numpy as np +import pandas as pd +from notebook_lib.base_deepmc import inference_deepmc, inference_deepmc_post +from notebook_lib.data_utils import preprocess_transform +from numpy.typing import NDArray +from shapely import geometry +from sklearn.preprocessing import StandardScaler + +from 
vibe_notebook.deepmc import prediction, utils +from vibe_notebook.deepmc.forecast import Forecast +from vibe_notebook.deepmc.utils import get_csv_data, transform_to_array_3D + +HRRR_PARAMETERS = [ + {"weather_type": "temperature", "search_text": "TMP:2 m"}, + {"weather_type": "humidity", "search_text": "RH:2 m"}, + {"weather_type": "u-component", "search_text": "UGRD:10 m"}, + {"weather_type": "v-component", "search_text": "VGRD:10 m"}, +] + + +def get_date_range( + stations: List[Dict[str, Any]], + infer_station_name: str, + deepmc_inference_results: Dict[str, Any], +): + for station in stations: + if station["name"] != infer_station_name: + (_, dates_list, _, _) = deepmc_inference_results[station["name"]] + dates_list = np.squeeze(np.array(dates_list)[:, 0]) + dates_list = dates_list[:, 0] + return dates_list + + raise Exception("No station found to get date range") + + +def get_station_object(stations: List[Dict[str, Any]], infer_station_name: str): + station, column_name = None, None + for stations_dict in stations: + if stations_dict["name"] == infer_station_name: + station = stations_dict["name"] + column_name = stations_dict["column_name"] + return station, column_name + + if station is None: + raise Exception(f"No station found with name {infer_station_name}") + + +def embeddings_preprocess_forecast( + stations: List[Dict[str, Any]], + infer_station_name: str, + input_data_path: str, + forecast_interval: int, + deepmc_inference_results: Dict[str, Any], + column_name: str, +): + forecast_df = get_csv_data(input_data_path) + dates_list = get_date_range(stations, infer_station_name, deepmc_inference_results) + data_df = forecast_df[forecast_df.index.isin(dates_list)] + + dates_list = ( + data_df[forecast_interval:].index.strftime("%Y-%m-%d %H:%M:%S").tolist() # type: ignore + ) + + data_forecast = np.array(data_df[column_name].values) + data_forecast = transform_to_array_3D(data_forecast[:], forecast_interval) + + return data_forecast, dates_list + + +def 
embeddings_preprocess_deepmc( + model_path: str, + inference_hours: int, + deepmc_inference_results: Tuple[NDArray[Any], NDArray[Any], StandardScaler, StandardScaler], +): + (data_x, dates_list, _, output_scaler) = deepmc_inference_results + + deepmc_out = inference_deepmc(model_path, data_x, inference_hours) + + # Train Post-Processing Scaling Models + mix_yhat = inference_deepmc_post(model_path, deepmc_out) + mix_yhat, dates_list = preprocess_transform(mix_yhat, inference_hours, dates_list) + dates_list = np.squeeze(np.array(dates_list)[:, 0]) + dates_list = dates_list[:, 0] + dates_list = pd.to_datetime(dates_list).strftime("%Y-%m-%d %H:%M:%S") + mix_yhat = np.expand_dims(np.array(output_scaler.inverse_transform(mix_yhat[:, :])), axis=1) + return mix_yhat, dates_list + + +def inference_embeddings_preprocessing( + infer_station_name: str, + stations: List[Dict[str, Any]], + root_path: str, + infer_forecast_data_path: str, + infer_interval: int, + model_type: str, + deepmc_inference_results: Dict[str, Any], +): + process_out = {} + for station in stations: + model_path = os.path.join(root_path, station["name"], model_type) + if station["name"] == infer_station_name: + process_out[station["name"]] = embeddings_preprocess_forecast( + stations, + infer_station_name, + infer_forecast_data_path, + infer_interval, + deepmc_inference_results, + station["column_name"], + ) + else: + process_out[station["name"]] = embeddings_preprocess_deepmc( + model_path, + infer_interval, + deepmc_inference_results[station["name"]], + ) + return process_out + + +def download_forecast_data( + stations: List[Dict[str, Any]], + start_date: datetime, + end_date: datetime, +): + parameters = HRRR_PARAMETERS + hrrr_data_workflow = "data_ingestion/weather/herbie_forecast" + time_range = (start_date, end_date) + forecast_dataset = {} + + for station in stations: + # AGWeatherNet station + station_name = station["name"] + station_location = station["coordinates"] + station_geometry = 
geometry.Point(station_location) + + forecast_ = Forecast( + workflow_name=hrrr_data_workflow, + geometry=station_geometry, + time_range=time_range, + parameters=parameters, + ) + run_list = forecast_.submit_download_request() + + p_forecast_dataset = forecast_.get_downloaded_data(run_list=run_list, offset_hours=-8) + p_forecast_dataset = utils.convert_forecast_data(p_forecast_dataset) + forecast_dataset[station_name] = p_forecast_dataset + return forecast_dataset + + +def get_historical_data( + stations: List[Dict[str, Any]], + historical_data_path: str, + historical_dataset_features: List[str], + inference_station: str, +): + historical_datasets = {} + for station in stations: + if station["name"] != inference_station: + p = historical_data_path % station["name"] + historical_df = utils.get_csv_data(path=p, interpolate=False, fill_na=False) + historical_df = historical_df[historical_dataset_features] + + historical_datasets[station["name"]] = historical_df + + return historical_datasets + + +def concat_historical_forecast( + stations: List[Dict[str, Any]], + historical_data_path: str, + hrrr_datasets: Dict[str, pd.DataFrame], + start_date: datetime, + end_date: datetime, + inference_station: str, + historical_dataset_features: List[str] = ["humidity", "wind_speed", "temperature"], + forecast_dataset_features: List[str] = [ + "humidity_forecast", + "wind_speed_forecast", + "temperature_forecast", + ], + frequency_hour: int = 1, + number_of_hours: int = 24, + weather_inference_type: str = "temperature", +): + historical_datasets = get_historical_data( + stations, historical_data_path, historical_dataset_features, inference_station + ) + + dataset_variables = historical_dataset_features.copy() + dataset_variables.extend(forecast_dataset_features) + dataset_variables.sort() + + out_dataset = {} + for station, historical_df in historical_datasets.items(): + forecast_df = hrrr_datasets[station] + + input_df = utils.clean_relevant_data_using_hrrr( + 
actual_df=historical_df.copy(), + forecast_df=forecast_df.copy(), + out_variables=dataset_variables, + freq_hours=frequency_hour, + num_of_indices=number_of_hours, + start_date=start_date, + end_date=end_date, + ) + + input_df = input_df[dataset_variables] + input_df = input_df[input_df.columns] + out_feature_df = input_df[weather_inference_type] + input_df.drop(columns=[weather_inference_type], inplace=True) + input_df[weather_inference_type] = out_feature_df + out_dataset[station] = input_df + + return out_dataset + + +def run_deepmc_inference( + root_path: str, + model_type: str, + out_features: List[str], + stations: List[Dict[str, Any]], + historical_data_path: str, + hrrr_datasets: Dict[str, pd.DataFrame], + start_date: datetime, + end_date: datetime, + inference_station: str, + historical_dataset_features: List[str] = ["humidity", "wind_speed", "temperature"], + forecast_dataset_features: List[str] = [ + "humidity_forecast", + "wind_speed_forecast", + "temperature_forecast", + ], + frequency_hour: int = 1, + number_of_hours: int = 24, + weather_inference_type: str = "temperature", +): + historical_clean_dataset = concat_historical_forecast( + stations, + historical_data_path, + hrrr_datasets, + start_date, + end_date, + inference_station, + historical_dataset_features, + forecast_dataset_features, + frequency_hour, + number_of_hours, + weather_inference_type, + ) + + inference_output = {} + for station, clean_dataset in historical_clean_dataset.items(): + train_data_export_path = os.path.join(root_path, station, model_type, "train_data.pkl") + + weather_forecast = prediction.InferenceWeather( + root_path=root_path, + data_export_path=train_data_export_path, + station_name=station, + predicts=out_features, + relevant=True, + ) + + inference_output[station] = weather_forecast.deepmc_preprocess(clean_dataset, "temperature") + + return inference_output diff --git a/notebooks/deepmc_neighbors/notebook_lib/schema.py 
b/notebooks/deepmc_neighbors/notebook_lib/schema.py new file mode 100644 index 00000000..a9868007 --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/schema.py @@ -0,0 +1,23 @@ +from typing import List + +from pydantic import BaseModel + + +class BatchTGCNInputs(BaseModel): + lookback_horizon: int + lookahead_horizon: int + node_num: int + node_in_fea_dim: int + node_out_fea_dim: int + edge_in_fea_dim: int + edge_out_fea_dim: int + edge_num: int + use_edge_weights: bool + day_em_dim: int + hour_em_dim: int + period: int + batch_size: int + use_dropout: bool + hidden_dim: int + device_count: int + lookback_indices: List[int] diff --git a/notebooks/deepmc_neighbors/notebook_lib/train.py b/notebooks/deepmc_neighbors/notebook_lib/train.py new file mode 100644 index 00000000..84b84c9a --- /dev/null +++ b/notebooks/deepmc_neighbors/notebook_lib/train.py @@ -0,0 +1,516 @@ +import os +import shutil +import warnings +from datetime import datetime +from typing import Any, Dict, List, Union + +import numpy as np +import onnxruntime +import pandas as pd +import pytorch_lightning as pl +import torch +from matplotlib import pyplot as plt +from notebook_lib.embeddings import create_embeddings, create_embeddings_inference +from notebook_lib.post_deepmc import initialize_embeddings_preprocessing +from notebook_lib.post_deepmc_inference import ( + inference_embeddings_preprocessing, + run_deepmc_inference, +) +from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint +from sklearn.preprocessing import StandardScaler +from torch.utils.data import DataLoader + +from vibe_notebook.deepmc.utils import calculate_KPI, get_csv_data + +from .base_dataset import BatchSampler, GNNDataset +from .base_modules import BatchTGCNInputs, BatchTGCNTrain +from .data_utils import ( + build_scaler, + build_scaler_label, + export_to_onnx, + get_batch, + get_batch_sample, + get_file, + get_split_data, + problem_params, + smooth, + train_test_dataset, + write_to_file, +) + + 
class MC_Neighbors:
    """
    Train, export and run the neighbor-station graph model.

    The class turns station embeddings into datasets and samplers, trains one
    ``BatchTGCNTrain`` model per forecast step with PyTorch Lightning, exports each
    model to ONNX and runs inference on the exported files with ONNX Runtime.
    """

    def __init__(
        self,
        root_dir: str,
        hidden_dim: int = 528,
        lookahead_horizon: int = 1,
        lookback_horizon: int = 1,
        learning_rate: float = 0.001,
        use_dropout: bool = False,
        use_edge_weights: bool = False,
        device_type: str = "cpu",  # cuda, cpu
        labels_column: str = "labels",
        weather_type: str = "temperature",
        model_type: str = "relevant",
    ):
        """
        Initialize the MC_Neighbors.

        :param root_dir: Path to trained model and preprocessed files.
        :param hidden_dim: Hidden dimension forwarded to the model configuration.
        :param lookahead_horizon: Number of hours to lookahead.
        :param lookback_horizon: Number of hours to lookback.
        :param learning_rate: The learning rate of the model.
        :param use_dropout: True or False to use dropout layer for model training.
        :param use_edge_weights: True or False. If True consider spatial distance
            between stations for model training.
        :param device_type: The device type of the model. ``"cuda"`` is only honored
            when CUDA is available; anything else falls back to ``"cpu"``.
        :param labels_column: The labels column of the dataset.
        :param weather_type: Purpose of trained model, e.g. temperature or wind_speed.
        :param model_type: relevant or not-relevant.
        """
        self.weather_type = weather_type
        self.root_dir = root_dir
        self.lookahead_horizon = lookahead_horizon
        self.lookback_horizon = lookback_horizon
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate
        self.use_dropout = use_dropout
        self.use_edge_weights = use_edge_weights
        self.labels_column = labels_column
        self.device = torch.device(
            device_type if device_type == "cuda" and torch.cuda.is_available() else "cpu"
        )
        self.model_type = model_type

    def gnn_output_dir(self, infer_station: str) -> str:
        """
        Return the folder that holds the GNN artifacts of ``infer_station``.

        :param infer_station: Name of the station the model is trained/run for.

        :return: ``<root_dir>/<station>/<model_type>/gnn_models/<edge-weight variant>``.
        """
        edge_weights = "edge_weights" if self.use_edge_weights else "no_edge_weights"
        return os.path.join(
            self.root_dir,
            infer_station,
            self.model_type,
            "gnn_models",
            edge_weights,
        )

    def gnn_preprocess_file(self, infer_station: str) -> str:
        """
        Return the path of the file that stores the scalers written by ``run_train``.

        :param infer_station: Name of the station the model is trained/run for.
        """
        output_dir = self.gnn_output_dir(infer_station)
        return os.path.join(output_dir, "pre_process_data_export.json")

    def run_train(
        self,
        train_embeddings: pd.DataFrame,
        test_embeddings: pd.DataFrame,
        neighbor_stations: List[Dict[str, Any]],
        infer_station: str,
        epochs: int,
        batch_size: int,
        forecast_hours: int,
    ) -> None:
        """
        Build the scalers, persist them and train one model per forecast hour.

        :param train_embeddings: Embeddings used to fit the scalers and train.
        :param test_embeddings: Embeddings used for validation.
        :param neighbor_stations: Stations, each with ``name`` and ``coordinates`` keys.
        :param infer_station: Name of the station the model is trained for.
        :param epochs: Maximum number of training epochs per model.
        :param batch_size: Batch size used by the samplers.
        :param forecast_hours: Number of forecast steps (one model is trained per step).
        """
        self.output_dir = self.gnn_output_dir(infer_station)
        stations = self.get_neighbor_stations(neighbor_stations)
        scaler_data = build_scaler(train_embeddings.copy(), forecast_hours)
        scaler_label, labels_column_index = build_scaler_label(
            train_embeddings.copy(), self.labels_column
        )
        data_export_path = self.gnn_preprocess_file(infer_station)
        # NOTE(review): an existing export is never refreshed, so scalers from an
        # earlier run stay on disk while the models below are trained with the new
        # ones. Confirm whether this is intended before changing it.
        if not os.path.exists(data_export_path):
            os.makedirs(os.path.dirname(data_export_path), exist_ok=True)
            write_to_file(data_export_path, data=[scaler_data, scaler_label, labels_column_index])

        self.initialize_train(
            train_embeddings,
            test_embeddings,
            stations,
            infer_station,
            epochs,
            batch_size,
            forecast_hours,
            scaler_data,
            scaler_label,
            labels_column_index,
        )

    def initialize_train(
        self,
        train_embeddings: pd.DataFrame,
        test_embeddings: pd.DataFrame,
        neighbors_station: Dict[str, Any],
        infer_station: str,
        epochs: int,
        batch_size: int,
        forecast_hours: int,
        scaler_data: StandardScaler,
        scaler_label: StandardScaler,
        labels_column_index: int,
    ):
        """
        Create datasets, samplers and a fresh model for every forecast step and train it.

        :param train_embeddings: Embeddings used for training.
        :param test_embeddings: Embeddings used for validation.
        :param neighbors_station: Output of ``get_neighbor_stations``.
        :param infer_station: Name of the station the model is trained for.
        :param epochs: Maximum number of training epochs per model.
        :param batch_size: Batch size used by the samplers.
        :param forecast_hours: Number of forecast steps.
        :param scaler_data: Scaler applied to the input data.
        :param scaler_label: Scaler applied to the labels.
        :param labels_column_index: Index of the labels column.
        """
        for step in range(forecast_hours):
            train_dataset, test_dataset = train_test_dataset(
                train_data=train_embeddings,
                test_data=test_embeddings,
                step=step,
                neighbors_station=neighbors_station,
                scaler_data=scaler_data,
                scaler_label=scaler_label,
                infer_station=infer_station,
                labels_column_index=labels_column_index,
            )

            train_sampler, test_sampler = get_batch_sample(
                train_dataset=train_dataset,
                test_dataset=test_dataset,
                batch_size=batch_size,
                lookahead_horizon=self.lookahead_horizon,
                lookback_horizon=self.lookback_horizon,
                device=self.device,
                use_edge_weights=self.use_edge_weights,
            )

            inputs = BatchTGCNInputs(
                **problem_params(
                    train_dataset,
                    batch_size,
                    self.lookback_horizon,
                    self.lookahead_horizon,
                    self.use_edge_weights,
                    self.use_dropout,
                    self.hidden_dim,
                    forecast_hours,
                )
            )
            model = BatchTGCNTrain(inputs, self.learning_rate)
            model.to(self.device)
            self.train_model(model, epochs, train_sampler, test_sampler, step)

    def train_model(
        self,
        model: BatchTGCNTrain,
        epochs: int,
        train_sampler: BatchSampler,
        test_sampler: BatchSampler,
        forecast_step: int,
    ):
        """
        Fit ``model`` and export it to ONNX under ``<output_dir>/model_<forecast_step>``.

        Relies on ``self.output_dir``, which ``run_train`` sets before calling this.

        :param model: Model to train.
        :param epochs: Maximum number of training epochs.
        :param train_sampler: Sampler yielding training batches.
        :param test_sampler: Sampler yielding validation batches.
        :param forecast_step: Forecast step this model is trained for.
        """
        model_path = "{}/model_{}".format(self.output_dir, forecast_step)

        # Start from an empty model folder. ``ignore_errors`` covers a missing folder
        # and ``makedirs`` creates the parent output directory as needed.
        shutil.rmtree(model_path, ignore_errors=True)
        os.makedirs(model_path, exist_ok=True)

        # batch_size is set to None to avoid batch size in dataloader
        # batch_size is set when creating the sampler
        train_loader = DataLoader(train_sampler, batch_size=None, collate_fn=lambda x: x)
        val_loader = DataLoader(test_sampler, batch_size=None, collate_fn=lambda x: x)

        t_obj = pl.Trainer(
            logger=True,
            max_epochs=epochs,
            callbacks=[
                LearningRateMonitor(),
                ModelCheckpoint(
                    monitor="val_loss/total",
                    save_last=True,
                    dirpath=model_path,
                ),
            ],
        )
        t_obj.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
        export_to_onnx(
            model_path,
            model,
            train_loader,
            self.use_edge_weights,
            train_sampler.dataset.edge_num,
            train_sampler.dataset.stations_count,
        )

    def run_inference(
        self,
        embeddings: pd.DataFrame,
        neighbors_station: List[Dict[str, Any]],
        infer_station: str,
        batch_size: int,
        forecast_hours: int,
    ):
        """
        Run the exported per-step ONNX models and return the predictions.

        Batches smaller than ``batch_size`` are skipped with a warning.

        :param embeddings: Embeddings to run inference on.
        :param neighbors_station: Stations, each with ``name`` and ``coordinates`` keys.
        :param infer_station: Name of the station to predict for.
        :param batch_size: Batch size used by the sampler.
        :param forecast_hours: Number of forecast steps (one ONNX model per step).

        :return: DataFrame with a ``<weather_type>`` column and a ``timestamp`` column.

        :raises ValueError: If no complete batch was available.
        """
        self.output_dir = self.gnn_output_dir(infer_station)
        stations = self.get_neighbor_stations(neighbors_station)
        scaler_data, scaler_label, _ = get_file(self.gnn_preprocess_file(infer_station))

        # Keyed by the loader's batch index so that a skipped batch can never shift
        # the slot that later steps write into.
        pred_by_batch: Dict[int, Any] = {}
        dataset = None
        for step in range(forecast_hours):
            dataset, sampler = self.get_infer_inputs(
                embeddings,
                stations,
                infer_station,
                batch_size,
                forecast_hours,
                step,
                None,
                scaler_data,
                scaler_label,
            )
            loader = DataLoader(sampler, batch_size=None, collate_fn=lambda x: x)
            # One model per step: load it once instead of once per batch.
            onnx_file_path = "{}/model_{}/model_output.onnx".format(self.output_dir, step)
            session = onnxruntime.InferenceSession(onnx_file_path, None)
            for index, data in enumerate(loader):
                if data[0].shape[0] != batch_size:
                    warnings.warn(
                        f"Data at step {step} batch index {index} is less than batch size. "
                        "It will be skipped from running inference."
                    )
                    continue
                if step == 0:
                    pred_by_batch[index] = np.zeros((batch_size, forecast_hours))
                elif index not in pred_by_batch:
                    # The batch was incomplete at step 0, so it has no result slot.
                    continue
                step_output = self._run_session(session, data)
                pred_by_batch[index][:, step] = step_output[
                    :, dataset.infer_station_index
                ].squeeze()

        if dataset is None or not pred_by_batch:
            raise ValueError("No complete batch was available to run inference.")

        pred_data = np.concatenate([pred_by_batch[i] for i in sorted(pred_by_batch)], axis=0)
        pred_data = scaler_data.inverse_transform(pred_data)
        timestamps = dataset.timestamps[: pred_data.shape[0]]
        pred_data = get_split_data(pred_data, timestamps, forecast_hours)  # type: ignore
        pred_data_df = pd.DataFrame(
            zip(pred_data, timestamps), columns=[self.weather_type, "timestamp"]
        )
        return pred_data_df

    def get_historical_data(self, data_path: str):
        """
        Load historical observations and expose the date as a ``timestamp`` column.

        :param data_path: Path of the CSV file read by ``get_csv_data``.
        """
        historical_data_df = get_csv_data(data_path)
        historical_data_df.reset_index(inplace=True)
        historical_data_df.rename(columns={"date": "timestamp"}, inplace=True)
        return historical_data_df

    def get_hrrr_data(
        self,
        data_path: str,
    ):
        """
        Load forecast data from CSV and rename its ``date`` column to ``timestamp``.

        :param data_path: Path of the CSV file; its ``date`` column is parsed as dates.
        """
        df_node = pd.read_csv(data_path, parse_dates=["date"])
        df_node.rename(columns={"date": "timestamp"}, inplace=True)
        return df_node

    def get_infer_inputs(
        self,
        embeddings: pd.DataFrame,
        neighbors_station: Dict[str, Any],
        infer_station: str,
        batch_size: int,
        forecast_hours: int,
        step: int,
        labels_column_index: Union[int, None],
        scaler_data: StandardScaler,
        scaler_label: StandardScaler,
    ):
        """
        Build the dataset and the non-random sampler used for inference at ``step``.

        :param embeddings: Embeddings to run inference on.
        :param neighbors_station: Output of ``get_neighbor_stations``.
        :param infer_station: Name of the station to predict for.
        :param batch_size: Batch size used by the sampler.
        :param forecast_hours: Number of forecast steps.
        :param step: Forecast step the inputs are built for.
        :param labels_column_index: Index of the labels column, or None.
        :param scaler_data: Scaler applied to the input data.
        :param scaler_label: Scaler applied to the labels.

        :return: Tuple ``(dataset, sampler)``.
        """
        dataset = GNNDataset(
            embeddings,
            forecast_step=step,
            scaler_input=scaler_data,
            scaler_label=scaler_label,
            neighbor_station=neighbors_station,
            forecast_hours=forecast_hours,
            infer_station=infer_station,
            label_column_index=labels_column_index,
        )

        sampler = BatchSampler(
            dataset=dataset,
            batch_size=batch_size,
            lookahead_horizon=self.lookahead_horizon,
            lookback_horizon=self.lookback_horizon,
            device=self.device,
            random=False,
            use_edge_weights=self.use_edge_weights,
        )

        return dataset, sampler

    def _run_session(self, session: Any, data: Any):
        """Feed one batch to an already loaded ONNX Runtime session; return its first output."""
        node_data, edge_index, edge_data, _ = get_batch(data, self.use_edge_weights)

        # ``Tensor.numpy()`` only works for CPU tensors, so move the data to the host
        # first; this is a no-op when the tensors already live on the CPU.
        candidates = {
            "node_data": node_data.detach().cpu().numpy(),
            "edge_index": edge_index.detach().cpu().numpy(),
            "edge_data": edge_data.detach().cpu().numpy(),
        }

        # Only pass the inputs that the exported graph actually declares.
        inputs = {model_input.name: candidates[model_input.name] for model_input in session.get_inputs()}
        return session.run(None, input_feed=inputs)[0]

    def inference(self, onnx_file_path: str, data: torch.Tensor):
        """
        Load the ONNX model at ``onnx_file_path`` and run it on one batch.

        :param onnx_file_path: Path of the exported ONNX model.
        :param data: Batch produced by the sampler.

        :return: First output of the ONNX session.
        """
        session = onnxruntime.InferenceSession(onnx_file_path, None)
        return self._run_session(session, data)

    def get_embeddings(
        self,
        inference_station: str,
        neighbor_stations: List[Dict[str, Any]],
        inference_hours: int,
        infer_forecast_data_path: str,
    ):
        """
        Run the embeddings preprocessing and create the train and test embeddings.

        :param inference_station: Name of the station to predict for.
        :param neighbor_stations: Stations, each with ``name`` and ``coordinates`` keys.
        :param inference_hours: Number of hours to run inference for.
        :param infer_forecast_data_path: Path of the forecast data of the station.

        :return: Tuple ``(df_train_embeddings, df_test_embeddings)``.
        """
        initialize_embeddings_preprocessing(
            infer_station_name=inference_station,
            stations=neighbor_stations,
            root_path=self.root_dir,
            infer_forecast_data_path=infer_forecast_data_path,
            infer_interval=inference_hours,
            model_type=self.model_type,
        )

        df_train_embeddings, df_test_embeddings = create_embeddings(
            stations=neighbor_stations,
            inference_hours=inference_hours,
            root_path=self.root_dir,
            model_type=self.model_type,
        )

        return df_train_embeddings, df_test_embeddings

    def get_neighbor_stations(
        self,
        neighbor_stations: List[Dict[str, Any]],
    ):
        """
        Collect station names and coordinates into a single dictionary.

        :param neighbor_stations: Stations, each with ``name`` and ``coordinates`` keys.

        :return: ``{"stations": [names...], "long_lat": {name: coordinates}}``.
        """
        return {
            "stations": [station["name"] for station in neighbor_stations],
            "long_lat": {station["name"]: station["coordinates"] for station in neighbor_stations},
        }

    def filter_data(
        self,
        df_inference: pd.DataFrame,
        df_historical: pd.DataFrame,
        df_forecast: pd.DataFrame,
    ):
        """
        Restrict all three frames to the time span of ``df_inference``.

        Each returned frame only has the ``timestamp`` and ``<weather_type>`` columns.
        The forecast column ``<weather_type>_forecast`` is renamed to ``<weather_type>``.
        The input frames are not modified.

        :param df_inference: Inference results; defines the time span.
        :param df_historical: Historical observations.
        :param df_forecast: Forecast data with a ``<weather_type>_forecast`` column.

        :return: Tuple ``(df_inference, df_historical, df_forecast)``.
        """
        start_date = df_inference["timestamp"].min()
        end_date = df_inference["timestamp"].max()
        columns = ["timestamp", self.weather_type]

        df_historical = df_historical[df_historical.timestamp.between(start_date, end_date)]
        df_historical = df_historical[columns]

        df_inference = df_inference[df_inference.timestamp.between(start_date, end_date)]
        df_inference = df_inference[columns]

        df_forecast = df_forecast[df_forecast.timestamp.between(start_date, end_date)]
        # Derive the forecast column from the configured weather type (it used to be
        # hard-coded to temperature) and rename on a new frame rather than in place
        # on a filtered slice.
        df_forecast = df_forecast.rename(
            columns={f"{self.weather_type}_forecast": self.weather_type}
        )
        df_forecast = df_forecast[columns]

        return df_inference, df_historical, df_forecast

    def view_plot(
        self,
        df_inference: pd.DataFrame,
        historical_data_path: str,
        hrrr_data_path: str,
    ):
        """
        Plot the predictions against the ground truth and the HRRR forecast.

        :param df_inference: Inference results with ``timestamp`` and weather columns.
        :param historical_data_path: Path of the historical observations CSV.
        :param hrrr_data_path: Path of the HRRR forecast CSV.
        """
        df_historical = self.get_historical_data(historical_data_path)
        df_forecast = self.get_hrrr_data(hrrr_data_path)

        df_inference, df_historical, df_forecast = self.filter_data(
            df_inference, df_historical, df_forecast
        )

        timestamps = df_inference["timestamp"]
        y_hat = list(df_inference[self.weather_type].values)
        y = list(df_historical[self.weather_type].values)
        hrrr_data_y = list(df_forecast[self.weather_type].values)

        plt.figure(figsize=(18, 6))
        plt.plot(timestamps, smooth(y_hat, 2), label="Predict")
        plt.plot(timestamps, y, label="Ground Truth")
        plt.plot(timestamps, hrrr_data_y, label="HRRR", linestyle="--")
        plt.title("Comparison Ground Truth Vs Inference Results Vs HRRR")
        plt.legend()

    def view_performance(
        self,
        df_inference: pd.DataFrame,
        historical_data_path: str,
        hrrr_data_path: str,
    ):
        """
        Print the KPIs of the GNN predictions and of the HRRR forecast.

        :param df_inference: Inference results with ``timestamp`` and weather columns.
        :param historical_data_path: Path of the historical observations CSV.
        :param hrrr_data_path: Path of the HRRR forecast CSV.
        """
        df_historical = self.get_historical_data(historical_data_path)
        df_forecast = self.get_hrrr_data(hrrr_data_path)

        df_inference, df_historical, df_forecast = self.filter_data(
            df_inference, df_historical, df_forecast
        )

        y_hat = list(df_inference[self.weather_type].values)
        y = np.array(df_historical[self.weather_type].values)
        hrrr_data_y = list(df_forecast[self.weather_type].values)

        print("GNN ", self.weather_type)
        calculate_KPI(smooth(y_hat, 1), y)
        print("")
        print("Hrrr", self.weather_type)
        calculate_KPI(smooth(hrrr_data_y, 1), y)

    def get_embeddings_inference(
        self,
        inference_station: str,
        neighbor_stations: List[Dict[str, Any]],
        inference_hours: int,
        infer_forecast_data_path: str,
        out_features: List[str],
        historical_data_path: str,
        hrrr_datasets: Dict[str, pd.DataFrame],
        start_date: datetime,
        end_date: datetime,
        historical_dataset_featues: Union[List[str], None] = None,
        forecast_dataset_features: Union[List[str], None] = None,
        frequency_hour: int = 1,
        number_of_hours: int = 24,
        weather_inference_type: str = "temperature",
    ):
        """
        Run the DeepMC inference and turn its results into embeddings.

        :param inference_station: Name of the station to predict for.
        :param neighbor_stations: Stations, each with ``name`` and ``coordinates`` keys.
        :param inference_hours: Number of hours to run inference for.
        :param infer_forecast_data_path: Path of the forecast data of the station.
        :param out_features: Features forwarded to ``run_deepmc_inference``.
        :param historical_data_path: Path of the historical observations.
        :param hrrr_datasets: Forecast frames forwarded to ``run_deepmc_inference``.
        :param start_date: Start of the inference period.
        :param end_date: End of the inference period.
        :param historical_dataset_featues: Historical column names. Defaults to
            ``["humidity", "wind_speed", "temperature"]``. The misspelled name is
            kept so existing keyword callers keep working.
        :param forecast_dataset_features: Forecast column names. Defaults to
            ``["humidity_forecast", "wind_speed_forecast", "temperature_forecast"]``.
        :param frequency_hour: Data frequency in hours.
        :param number_of_hours: Number of hours forwarded to ``run_deepmc_inference``.
        :param weather_inference_type: Column to predict, e.g. ``"temperature"``.

        :return: Embeddings created by ``create_embeddings_inference``.
        """
        # Build the default lists per call so they are never shared between calls.
        if historical_dataset_featues is None:
            historical_dataset_featues = ["humidity", "wind_speed", "temperature"]
        if forecast_dataset_features is None:
            forecast_dataset_features = [
                "humidity_forecast",
                "wind_speed_forecast",
                "temperature_forecast",
            ]

        deepmc_results = run_deepmc_inference(
            self.root_dir,
            self.model_type,
            out_features,
            neighbor_stations,
            historical_data_path,
            hrrr_datasets,
            start_date,
            end_date,
            inference_station,
            historical_dataset_featues,
            forecast_dataset_features,
            frequency_hour,
            number_of_hours,
            weather_inference_type,
        )

        deepmc_post_results = inference_embeddings_preprocessing(
            infer_station_name=inference_station,
            stations=neighbor_stations,
            root_path=self.root_dir,
            infer_forecast_data_path=infer_forecast_data_path,
            infer_interval=inference_hours,
            model_type=self.model_type,
            deepmc_inference_results=deepmc_results,
        )

        df_embeddings = create_embeddings_inference(
            stations=neighbor_stations,
            inference_hours=inference_hours,
            deepmc_post_results=deepmc_post_results,
        )

        return df_embeddings
+2021-07-25 02:30:00,74.5,41.6,8.0 +2021-07-25 02:45:00,74.4,41.9,7.3 +2021-07-25 03:00:00,74.6,41.5,6.0 +2021-07-25 03:15:00,73.8,43.1,5.0 +2021-07-25 03:30:00,73.4,43.7,5.9 +2021-07-25 03:45:00,73.4,42.8,6.3 +2021-07-25 04:00:00,73.5,42.2,4.9 +2021-07-25 04:15:00,72.4,44.5,5.2 +2021-07-25 04:30:00,73.9,40.8,7.1 +2021-07-25 04:45:00,74.1,40.7,7.2 +2021-07-25 05:00:00,73.9,41.6,7.2 +2021-07-25 05:15:00,73.6,42.5,7.2 +2021-07-25 05:30:00,73.7,42.6,6.9 +2021-07-25 05:45:00,74.0,42.3,6.9 +2021-07-25 06:00:00,74.2,42.5,6.3 +2021-07-25 06:15:00,74.4,42.5,7.1 +2021-07-25 06:30:00,75.1,41.1,6.9 +2021-07-25 06:45:00,76.1,39.9,5.7 +2021-07-25 07:00:00,76.9,40.2,4.6 +2021-07-25 07:15:00,76.7,44.9,3.9 +2021-07-25 07:30:00,76.8,46.7,3.1 +2021-07-25 07:45:00,77.0,45.6,3.5 +2021-07-25 08:00:00,77.7,44.3,3.4 +2021-07-25 08:15:00,78.5,44.1,3.9 +2021-07-25 08:30:00,79.0,44.0,3.9 +2021-07-25 08:45:00,79.9,42.2,3.4 +2021-07-25 09:00:00,81.0,43.7,3.8 +2021-07-25 09:15:00,81.5,44.2,5.0 +2021-07-25 09:30:00,81.8,42.4,6.3 +2021-07-25 09:45:00,82.5,42.7,6.9 +2021-07-25 10:00:00,82.8,40.9,7.3 +2021-07-25 10:15:00,83.2,38.5,7.0 +2021-07-25 10:30:00,83.7,36.1,5.8 +2021-07-25 10:45:00,84.4,35.2,5.2 +2021-07-25 11:00:00,86.0,31.6,4.8 +2021-07-25 11:15:00,86.5,29.0,5.2 +2021-07-25 11:30:00,87.6,26.1,6.2 +2021-07-25 11:45:00,87.9,26.2,6.5 +2021-07-25 12:00:00,88.0,25.9,6.0 +2021-07-25 12:15:00,88.5,27.0,5.7 +2021-07-25 12:30:00,89.2,25.6,5.0 +2021-07-25 12:45:00,89.7,24.0,4.6 +2021-07-25 13:00:00,90.4,23.0,4.7 +2021-07-25 13:15:00,91.6,21.7,5.2 +2021-07-25 13:30:00,91.5,20.5,5.8 +2021-07-25 13:45:00,91.7,21.1,5.5 +2021-07-25 14:00:00,93.4,20.4,4.9 +2021-07-25 14:15:00,94.3,18.2,4.4 +2021-07-25 14:30:00,93.4,18.4,4.7 +2021-07-25 14:45:00,94.4,17.4,4.0 +2021-07-25 15:00:00,94.6,17.8,4.3 +2021-07-25 15:15:00,93.9,19.1,5.4 +2021-07-25 15:30:00,93.4,18.9,6.5 +2021-07-25 15:45:00,93.6,18.1,5.2 +2021-07-25 16:00:00,93.8,18.3,4.7 +2021-07-25 16:15:00,93.9,17.9,4.5 +2021-07-25 16:30:00,94.0,16.4,4.7 
class Forecast:
    """
    Download weather forecast data through a FarmVibes.AI workflow and clean it.

    One workflow run is submitted per entry of ``parameters``; the resulting CSV
    assets are read, reshaped to hourly rows and returned as DataFrames.
    """

    def __init__(
        self,
        workflow_name: str,
        geometry: Point,
        time_range: Tuple[datetime, datetime],
        parameters: List[Dict[str, str]],
        date_column: str = "date",
    ):
        """
        Initialize the Forecast helper.

        :param workflow_name: Name of the workflow to run.
        :param geometry: Location the forecast is requested for.
        :param time_range: Start and end of the requested period.
        :param parameters: One dictionary per run; each needs a ``weather_type`` key.
        :param date_column: Name of the date column in the downloaded data.
        """
        self.client: FarmvibesAiClient = get_default_vibe_client()
        self.workflow_name = workflow_name
        self.geometry = geometry
        self.parameters = parameters
        self.time_range = time_range
        self.date_column = date_column

    def submit_download_request(self):
        """
        Submit request to worker to download forecast data.

        :return: List of ``{"id": ..., "weather_type": ...}`` dictionaries, one per run.
        """
        run_metadata_list = []
        runs = []
        for parameter in self.parameters:
            run_name = f"forecast_{parameter['weather_type']}"
            run = self.client.run(
                workflow=self.workflow_name,
                name=run_name,
                geometry=self.geometry,
                time_range=self.time_range,
                parameters=parameter,
            )

            run_metadata_list.append(
                {
                    "id": run.id,
                    "weather_type": parameter["weather_type"],
                }
            )
            runs.append(run)

        self.client.monitor(runs, 5)

        return run_metadata_list

    def get_run_status(self, run_list: List[Dict[str, str]]):
        """
        Print the status of every run and return the run descriptions.

        :param run_list: Run metadata as returned by ``submit_download_request``.

        :return: List with the description of every run.

        :raises Exception: As soon as a run whose status is not ``"done"`` is found.
        """
        clear_output(wait=True)
        out = []
        for run_item in run_list:
            o = self.client.describe_run(run_item["id"])
            print(f"Execution status for {run_item['weather_type']}: {o.details.status}")

            if o.details.status != "done":
                raise Exception(
                    f"Execution status for {run_item['weather_type']}: {o.details.status}"
                )
            out.append(o)

        return out

    def get_all_assets(self, details: RunConfigUser):
        """
        Read every CSV asset of a run and return the cleaned forecast data.

        :param details: Description of a finished run.

        :return: Output of ``clean_forecast_data`` for the concatenated assets.
        """
        output = details.output["weather_forecast"]
        asset_files = [
            value["href"]
            for record in cast(List[Dict[str, Any]], output)
            for value in record["assets"].values()
        ]
        df_assets = [pd.read_csv(f, index_col=False) for f in asset_files]
        df_out = pd.concat(df_assets)
        df_out = self.clean_forecast_data(forecast_df=df_out, run_details=details)
        return df_out

    def get_downloaded_data(self, run_list: List[Dict[str, str]], offset_hours: int = 0):
        """
        Check the download status. If status is done, fetch the downloaded data.

        :param run_list: Run metadata as returned by ``submit_download_request``.
        :param offset_hours: Number of hours added to the index of every frame.

        :return: DataFrame with the non-empty forecasts of all runs joined column-wise.
        """
        forecast_dataset = pd.DataFrame()
        out = self.get_run_status(run_list)
        for detail in out:
            df = self.get_all_assets(detail)

            # Offset from UTC to specified timezone
            df.index = df.index + pd.offsets.Hour(offset_hours)

            if not df.empty:
                forecast_dataset = pd.concat([forecast_dataset, df], axis=1)

        return forecast_dataset

    def clean_forecast_data(
        self,
        forecast_df: pd.DataFrame,
        run_details: RunConfig,
    ):
        """
        Reshape the downloaded data into one row per hour.

        Every input row carries a date plus its value columns; the values of a row are
        paired with the hours 0..23 and exploded into hourly rows. The result is
        aligned to an hourly range over the run period, interpolated, and rows that
        are still missing are dropped. ``forecast_df`` is not modified.

        :param forecast_df: Downloaded data with a ``date_column`` column.
        :param run_details: Run description providing the period and ``weather_type``.

        :return: DataFrame indexed by date with a ``<weather_type>_forecast`` column.
        """
        dates = forecast_df[self.date_column]
        assert isinstance(run_details.user_input, SpatioTemporalJson)
        start_date: datetime = run_details.user_input.start_date
        end_date: datetime = run_details.user_input.end_date

        # derive forecast data (on a copy, so the caller's frame stays untouched)
        values_df = forecast_df.drop(columns=[self.date_column])
        per_row_values = pd.DataFrame([values_df.values.tolist()]).T

        df_date = pd.DataFrame(
            data=pd.date_range(start_date, end_date + timedelta(days=1), freq="h"),
            columns=[self.date_column],
        )

        # derive hours
        hours = [f"{i}:00:00" for i in range(24)]
        list_hours = [hours for _ in range(values_df.shape[0])]

        assert run_details.parameters is not None, "Parameters are not defined"
        weather_type = run_details.parameters["weather_type"]

        # transform forecast data with date and time
        df = pd.DataFrame(
            data={
                self.date_column: dates.values,
                "time": list_hours,
                weather_type: per_row_values[0],
            }
        )
        df = df.explode(column=["time", weather_type])
        df[self.date_column] = df[self.date_column].astype(str) + " " + df["time"]
        df[self.date_column] = pd.to_datetime(df[self.date_column].values)

        df.drop(columns=["time"], inplace=True)
        df = pd.merge(df_date, df, how="left", left_on=self.date_column, right_on=self.date_column)

        df.set_index(self.date_column, inplace=True)
        df.sort_index(ascending=True, inplace=True)
        df[weather_type] = df[weather_type].values.astype(np.float32)

        # rename columns with suffix forecast
        df.rename(columns={weather_type: f"{weather_type}_forecast"}, inplace=True)

        # interpolate to derive missing data
        df = df.interpolate(method="from_derivatives")
        assert df is not None, "Interpolation deleted all data"
        df = df.dropna()
        return df
def deepmc_preprocess(self, df_in: pd.DataFrame, predict: str):
    """Prepare wavelet-transformed inputs for ``df_in`` with the exported scalers.

    Loads the pickle at ``self.data_export_path``, takes the train and output
    scalers from positions 4 and 5 of the stored sequence, and runs
    ``Preprocess.wavelet_transform_predict`` in non-training mode with this
    instance's lookahead/lookback/chunk/wavelet settings.

    Args:
        df_in: Input data passed through to the wavelet transform.
        predict: Name of the variable to transform.

    Returns:
        ``(test_x, test_x_dates, train_scaler, output_scaler)``.
    """
    with open(self.data_export_path, "rb") as export_file:
        exported = pickle.load(export_file)
    train_scaler, output_scaler = exported[4:6]

    transformer = Preprocess(
        train_scaler=train_scaler,
        output_scaler=output_scaler,
        is_training=False,
        ts_lookahead=self.ts_lookahead,
        ts_lookback=self.ts_lookback,
        chunk_size=self.chunk_size,
        wavelet=self.wavelet,
        mode=self.mode,
        level=self.level,
        relevant=self.relevant,
    )

    # The third element (target dates) is not needed by callers of this method.
    test_x, test_x_dates, _unused_y_dates = transformer.wavelet_transform_predict(
        df_in=df_in, predict=predict
    )

    return test_x, test_x_dates, train_scaler, output_scaler
def wavelet_transform_train(
    self, train_df: pd.DataFrame, test_df: pd.DataFrame, out_feature: str
) -> Tuple[NDArray[Any], ...]:
    """Build stacked wavelet inputs, targets and dates for the train and test splits.

    Both frames are passed to ``self.prepare_wavelet_data``, which returns one
    entry per window ("chunk"). The chunks of each split are then concatenated
    along axis 0, each chunk exactly once and in order.

    The previous implementation seeded the training accumulators with chunk 0
    and then looped from index 0 again, so the first training window appeared
    twice in ``train_X``, ``train_y`` and both date arrays, while the test
    split looped from index 1. Both splits now go through the same merge.

    Args:
        train_df: Training data handed to ``prepare_wavelet_data``.
        test_df: Test data handed to ``prepare_wavelet_data``.
        out_feature: Name of the feature to predict.

    Returns:
        ``(train_X, train_y, test_X, test_y, train_dates_X, train_dates_y,
        test_dates_X, test_dates_y)``. ``train_X`` / ``test_X`` are lists with
        one stacked array per entry of a chunk's input list; the remaining
        items are single stacked arrays.

    Raises:
        IndexError: If ``prepare_wavelet_data`` yields no chunks for a split.
    """

    def merge_chunks(chunks_x, chunks_y, chunks_x_dates, chunks_y_dates):
        # chunks_x[i] is a list of arrays (one per component); stitch every
        # component across all chunks with a single concatenate instead of
        # repeatedly re-allocating through np.append.
        merged_x = [
            np.concatenate([chunk[component] for chunk in chunks_x], axis=0)
            for component in range(len(chunks_x[0]))
        ]
        merged_y = np.concatenate(chunks_y, axis=0)
        # Only the first date array of every chunk is kept for the inputs.
        merged_x_dates = np.concatenate([dates[0] for dates in chunks_x_dates], axis=0)
        merged_y_dates = np.concatenate(chunks_y_dates, axis=0)
        return merged_x, merged_y, merged_x_dates, merged_y_dates

    train_X, train_y, train_dates_X, train_dates_y = merge_chunks(
        *self.prepare_wavelet_data(train_df, out_feature=out_feature)
    )
    test_X, test_y, test_dates_X, test_dates_y = merge_chunks(
        *self.prepare_wavelet_data(test_df, out_feature=out_feature)
    )

    return (
        train_X,
        train_y,
        test_X,
        test_y,
        train_dates_X,
        train_dates_y,
        test_dates_X,
        test_dates_y,
    )
t_dates_y.append(data_uy_dates) - return t_data_x, t_data_y + return t_data_x, t_data_y, t_dates_x, t_dates_y def dl_preprocess_data( self, @@ -115,7 +149,7 @@ def dl_preprocess_data( predict: str, per_split: float = 0.8, training: bool = False, - ) -> Tuple[NDArray, Optional[NDArray], Optional[NDArray], Optional[NDArray]]: # type: ignore + ) -> Tuple[NDArray, Optional[NDArray], Optional[NDArray], Optional[NDArray], Optional[NDArray]]: # type: ignore """ merge chunk of data as single entity Args: @@ -140,7 +174,7 @@ def dl_preprocess_data( label_data = label_df.values # label_data = label_df.values - X, y = list(), list() + X, y, dates = list(), list(), list() in_start = 0 # step over the entire history one time step at a time @@ -153,30 +187,37 @@ def dl_preprocess_data( if out_end <= len(data): X.append(data[in_start:in_end, :]) y.append(label_data[in_end:out_end, :]) + dates.append(df.index[in_end:out_end].strftime("%Y-%m-%d %H:%M:%S").values) # move along one time step in_start += 1 X = np.array(X) y = np.array(y) + dates = np.array(dates) if self.is_validation is True: n_train_split = ceil(len(data) * per_split) train_X, train_y = X[:n_train_split, :, :], y[:n_train_split, :, :] test_X, test_y = X[n_train_split:, :], y[n_train_split:, :] - return train_X, train_y, test_X, test_y + return train_X, train_y, test_X, test_y, dates else: - return X, y, None, None + return X, y, None, None, dates else: - X = list() + X, dates = list(), list() in_start = 0 for _ in range(len(data) - n_in + 1): in_end = in_start + n_in if in_end <= len(data): X.append(data[in_start:in_end, :]) + # shift dates by lookahead to match it with the y + dates.append( + [t + timedelta(hours=self.ts_lookback) for t in df.index[in_start:in_end]] + ) in_start += 1 X = np.array(X) - return X, None, None, None + dates = np.array(dates) + return X, None, None, None, dates def convert_df_wavelet_input(self, data_df: pd.DataFrame, predict: str): if self.relevant: @@ -188,59 +229,66 @@ def 
convert_df_wavelet_input_not_relevant(self, data_df: pd.DataFrame, predict: level = self.level rd = list() N = data_df.shape[0] - test_X = list() + test_X, test_X_dates, test_y_dates, test_y = list(), list(), list(), list() if self.is_training: - test_y = self.dl_preprocess_data( + (_, test_y, _, _, test_y_dates) = self.dl_preprocess_data( data_df.iloc[-self.ts_lookback - self.ts_lookahead :], predict=predict, training=self.is_training, - )[1] + ) assert test_y is not None test_y = test_y[[-1], :, :] + dates = test_y_dates[[-1], :] data_df = data_df.iloc[: -self.ts_lookahead] - else: - test_y = [] wp5 = pywt.wavedec(data=data_df[predict], wavelet=self.wavelet, mode=self.mode, level=level) N = data_df.shape[0] for i in range(1, level + 1): rd.append(pywt.waverec(wp5[:-i] + [None] * i, wavelet=self.wavelet, mode=self.mode)[:N]) - t_test_X = self.dl_preprocess_data(data_df.iloc[-self.ts_lookback :], predict=predict)[0] + (t_test_X, _, _, _, t_test_X_dates) = self.dl_preprocess_data( + data_df.iloc[-self.ts_lookback :], predict=predict + ) test_X.append(t_test_X[[-1], :, :]) + test_X_dates.append(t_test_X_dates[[-1], :]) wpt_df = data_df[[]].copy() for i in range(0, level): wpt_df[predict] = rd[i][:] - t_test_X = self.dl_preprocess_data(wpt_df.iloc[-self.ts_lookback :], predict=predict)[0] + (t_test_X, _, _, _, t_test_X_dates) = self.dl_preprocess_data( + wpt_df.iloc[-self.ts_lookback :], predict=predict + ) test_X.append(t_test_X[[-1], :, :]) + test_X_dates.append(t_test_X_dates) - return test_X, test_y + return test_X, test_y, test_X_dates, test_y_dates def convert_df_wavelet_input_relevant(self, data_df: pd.DataFrame, predict: str): rd = list() test_X = list() + test_X, test_X_dates, test_y_dates, test_y = list(), list(), list(), list() if self.is_training: - test_y = self.dl_preprocess_data( + (_, test_y, _, _, test_y_dates) = self.dl_preprocess_data( data_df.iloc[-self.ts_lookback - self.ts_lookahead :], predict=predict, training=self.is_training, - )[1] + ) 
"""Data loading, cleaning and reshaping helpers for the DeepMC notebooks."""

from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from pandas.tseries.offsets import DateOffset


def get_csv_data(
    path: str,
    date_attribute: str = "date",
    columns_rename: Optional[Dict[str, str]] = None,
    frequency: str = "60min",
    interpolate: bool = True,
    fill_na: bool = True,
):
    """
    Read data from CSV file using Pandas python package.

    The ``date_attribute`` column is parsed as datetimes, optional column
    renames are applied, and the frame is indexed and sorted by that column.
    Rows are then averaged into ``frequency``-sized bins.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts).
        date_attribute: Name of the timestamp column; it becomes the index.
        columns_rename: Optional ``{old: new}`` column mapping. ``None`` or an
            empty mapping leaves the names untouched.
        frequency: Bin size passed to ``pandas.Grouper``.
        interpolate: When true, interpolate gaps (``from_derivatives``) and
            drop rows that are still incomplete before binning.
        fill_na: When true, forward/backward fill both before and after
            binning so that empty bins receive neighbouring values.

    Returns:
        The binned DataFrame indexed by ``date_attribute``.
    """
    data_df = pd.read_csv(path)
    data_df[date_attribute] = pd.to_datetime(data_df[date_attribute])

    if columns_rename:
        data_df = data_df.rename(columns=columns_rename)

    # apply index on date
    data_df = data_df.set_index(date_attribute).sort_index(ascending=True)

    if interpolate:
        # interpolate to derive missing data
        data_df = data_df.interpolate(method="from_derivatives")
        assert data_df is not None, "Interpolate deleted all data"
        data_df = data_df.dropna()

    if fill_na:
        data_df = data_df.ffill().bfill()

    # Group rows by frequency, requires date attribute indexed to execute this
    data_df = data_df.groupby(pd.Grouper(freq=frequency)).mean()

    if fill_na:
        # Binning can create empty (NaN) bins; fill those as well.
        data_df = data_df.ffill().bfill()

    return data_df


def hour_round(t: datetime):
    """Round ``t`` to the nearest hour; 30 minutes or more rounds up."""
    return t.replace(minute=0, second=0, microsecond=0) + timedelta(hours=t.minute // 30)


def get_split_scaled_data(data: pd.DataFrame, out_feature: str, split_ratio: float = 0.92):
    """Split ``data`` chronologically (by row position) and standard-scale it.

    The first ``split_ratio`` share of rows is the training split, the rest is
    the test split. The feature scaler is fitted on the training split only;
    the output scaler is fitted on the ``out_feature`` column of the whole
    frame.

    Returns:
        ``(train_scaler, output_scaler, train_scale_df, test_scale_df)``.
    """
    # Imported here so the pandas/numpy helpers of this module stay importable
    # in environments without scikit-learn.
    from sklearn.preprocessing import StandardScaler

    split = int(split_ratio * data.shape[0])

    train_data = data.iloc[:split]
    test_data = data.iloc[split:]

    output_scaler = StandardScaler()
    # Only the fitted state is needed; the transformed values were discarded.
    output_scaler.fit(np.expand_dims(data[out_feature].values, axis=1))  # type: ignore

    train_scaler = StandardScaler()
    train_scale_df = pd.DataFrame(
        train_scaler.fit_transform(train_data),
        columns=train_data.columns,
        index=train_data.index,
    )
    test_scale_df = pd.DataFrame(
        train_scaler.transform(test_data),
        columns=test_data.columns,
        index=test_data.index,
    )

    return train_scaler, output_scaler, train_scale_df, test_scale_df


def shift_index(ds_df: pd.DataFrame, freq_minutes: int, num_indices: int, dateColumn: str = "date"):
    """Return a copy of ``ds_df`` whose index is moved back in time.

    Every index value is shifted by ``-num_indices * freq_minutes`` minutes and
    the resulting index is named ``dateColumn``. A pre-existing column called
    ``dateColumn`` is dropped from the result. The input frame itself is left
    unchanged (the previous implementation added a column to it).
    """
    shifted_index = ds_df.index.shift(-num_indices, freq=DateOffset(minutes=freq_minutes))
    shifted_df = ds_df.drop(columns=[dateColumn], errors="ignore")
    shifted_df.index = shifted_index.rename(dateColumn)
    return shifted_df


def _join_forecast_and_clean(
    actual_df: pd.DataFrame,
    forecast_df: pd.DataFrame,
    out_variables: List[str],
    freq_hours: int,
    num_of_indices: int,
):
    """Join observations with current and time-shifted forecasts, then clean.

    Forecast columns are joined twice: once as-is with a ``"Current"`` suffix
    and once with their index shifted back by ``num_of_indices`` steps of
    ``freq_hours`` hours. Only ``out_variables`` are kept; gaps are
    interpolated and rows that are still incomplete are dropped.
    """
    base_data_df = actual_df.join(forecast_df.add_suffix("Current"))
    base_data_df = base_data_df.join(shift_index(forecast_df, freq_hours * 60, num_of_indices))

    base_data_df = base_data_df[out_variables]
    base_data_df = base_data_df.interpolate(method="from_derivatives")
    assert base_data_df is not None, "Interpolate deleted all data"
    return base_data_df.dropna()


def clean_relevant_data(
    actual_df: pd.DataFrame,
    forecast_df: pd.DataFrame,
    out_variables: List[str],
    freq_hours: int,
    num_of_indices: int,
):
    """Combine observations and forecasts into one cleaned frame.

    Args:
        actual_df: Observations indexed by timestamp.
        forecast_df: Forecasts indexed by timestamp. Not modified.
        out_variables: Columns to keep in the result.
        freq_hours: Step size, in hours, used when shifting the forecast.
        num_of_indices: Number of steps the forecast index is shifted back.

    Returns:
        DataFrame restricted to ``out_variables`` with gaps interpolated and
        incomplete rows removed.
    """
    return _join_forecast_and_clean(
        actual_df, forecast_df, out_variables, freq_hours, num_of_indices
    )


def smooth(y: List[float], box_pts: int):
    """Smooth ``y`` with a moving average of width ``box_pts``.

    Uses ``numpy.convolve`` in ``"same"`` mode, so the output has the length
    of ``y`` and the edges are averaged against implicit zeros.
    """
    box = np.ones(box_pts) / box_pts
    return np.convolve(y, box, mode="same")


def clean_relevant_data_using_hrrr(
    actual_df: pd.DataFrame,
    forecast_df: pd.DataFrame,
    out_variables: List[str],
    freq_hours: int,
    num_of_indices: int,
    start_date: datetime,
    end_date: datetime,
):
    """Like ``clean_relevant_data`` but limited to a date range and gap-filled.

    Both frames are restricted to ``[start_date, end_date]`` (inclusive).
    Missing observations in a column ``<name>`` are then filled from the
    forecast column ``<name>_forecast`` at the same timestamp, when such a
    column exists. Neither input frame is modified.

    Returns:
        DataFrame restricted to ``out_variables`` with gaps interpolated and
        incomplete rows removed.
    """
    forecast_df = forecast_df.loc[
        (forecast_df.index >= start_date) & (forecast_df.index <= end_date)
    ]
    # Explicit copy: the column assignments below must not write into a slice
    # of the caller's frame.
    actual_df = actual_df.loc[
        (actual_df.index >= start_date) & (actual_df.index <= end_date)
    ].copy()

    for col in actual_df.columns:
        forecast_col = col + "_forecast"
        if forecast_col in forecast_df.columns:
            # fillna aligns on the index, so only missing observations are
            # replaced by the forecast for the same timestamp.
            actual_df[col] = actual_df[col].fillna(forecast_df[forecast_col])

    return _join_forecast_and_clean(
        actual_df, forecast_df, out_variables, freq_hours, num_of_indices
    )


def calculate_KPI(y: NDArray[Any], yhat: NDArray[Any]):
    """Print RMSE, MAE and MAE% of the predictions ``yhat`` against ``y``."""
    # Imported here so the pandas/numpy helpers of this module stay importable
    # in environments without scikit-learn.
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    mae = float(mean_absolute_error(y, yhat))
    # Root of the per-output MSE, averaged over outputs. This is what
    # `squared=False` computed, without relying on that removed keyword.
    rmse = float(np.sqrt(mean_squared_error(y, yhat, multioutput="raw_values")).mean())
    print(f"RMSE: {round(rmse, 2)}")
    print(f"MAE: {round(mae, 2)}")
    print(f"MAE%: {round(100*sum(abs(y-yhat))/sum(y),2)}%")


def convert_forecast_data(data: pd.DataFrame):
    """Convert forecast columns to the units used by the observations.

    ``temperature_forecast`` is converted from Kelvin to Fahrenheit and a
    ``wind_speed_forecast`` column is derived from the u/v wind components,
    which are then removed. ``data`` is modified in place and also returned.
    """
    # Temperature
    # convert kelvin to celsius
    # convert celsius to Fahrenheit
    data["temperature_forecast"] = ((data["temperature_forecast"] - 273.15) * 9 / 5) + 32

    # wind_speed
    # multiplying with 2.23 to convert wind speed from m/sec to mph
    data["wind_speed_forecast"] = (
        np.sqrt(np.square(data["u-component_forecast"]) + np.square(data["v-component_forecast"]))
        * 2.23
    )
    data.drop(columns=["u-component_forecast", "v-component_forecast"], inplace=True)
    return data


def transform_to_array_3D(data: NDArray[Any], inference_hours: int = 24) -> NDArray[Any]:
    """Window ``data`` like ``transform_to_array`` and add a singleton middle axis.

    For one-dimensional ``data`` the result has shape
    ``(n_windows, 1, inference_hours)``.
    """
    X = transform_to_array(data, inference_hours)
    X = X.reshape(X.shape[0], 1, X.shape[1])
    return X


def transform_to_array(data: NDArray[Any], inference_hours: int = 24) -> NDArray[Any]:
    """Stack sliding windows of ``inference_hours`` consecutive entries.

    Windows start at positions ``0 .. len(data) - inference_hours - 1``; the
    last possible full window is deliberately left out, matching the previous
    behaviour. When ``data`` is too short to produce a window, an empty array
    of shape ``(0, inference_hours, ...)`` is returned so that callers can
    still rely on the number of dimensions.
    """
    data = np.array(data)
    n_windows = len(data) - inference_hours
    if n_windows <= 0:
        return np.empty((0, inference_hours) + data.shape[1:], dtype=data.dtype)

    return np.array([data[start : start + inference_hours] for start in range(n_windows)])