Merge pull request #536 from opendsm/feature/daily_model_weights

Add weights to daily model
opendsm · Feb 6, 2025 · 09fbcc2 · 09fbcc2
2 parents 1daea97 + 18052ff
commit 09fbcc2
Show file tree

Hide file tree

Showing 33 changed files with 1,716 additions and 1,363 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@ Development
 -----------
 
 * Add GHI sufficiency check requiring 90% coverage for each month
+* Add weights propagation from data class to daily model via "weights" column
+* Convert daily model settings from attrs to pydantic
+* Refactor daily model initial guess optimization to use consolidated optimize function
 
 4.1.0
 -----

diff --git a/eemeter/common/base_settings.py b/eemeter/common/base_settings.py
@@ -26,15 +26,32 @@
 
 
 class BaseSettings(pydantic.BaseModel):
-    model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
-
-    """Make all property keys case insensitive"""
-
-    # @pydantic.model_validator(mode="before")
-    # def __uppercase_property_keys__(cls, values: Any) -> Any:
-    #     def __upper__(value: Any) -> Any:
-    #         if isinstance(value, dict):
-    #             return {k.upper() if isinstance(k, str) else k: __upper__(v) for k, v in value.items()}
-    #         return value
-
-    #     return __upper__(values)
+    model_config = pydantic.ConfigDict(
+        frozen = True,
+        arbitrary_types_allowed=True,
+        str_to_lower = True,
+        str_strip_whitespace = True,
+    )
+
+    """Make all property keys lowercase and strip whitespace"""
+    @pydantic.model_validator(mode="before")
+    def __lowercase_property_keys__(cls, values: Any) -> Any:
+        def __lower__(value: Any) -> Any:
+            if isinstance(value, dict):
+                return {k.lower().strip() if isinstance(k, str) else k: __lower__(v) for k, v in value.items()}
+            return value
+
+        return __lower__(values)
+
+    """Make all property values lowercase and strip whitespace before validation"""
+    @pydantic.field_validator("*", mode="before")
+    def lowercase_values(cls, v):
+        if isinstance(v, str):
+            return v.lower().strip()
+        return v
+
+
+# add developer field to pydantic Field
+def CustomField(developer=False, *args, **kwargs):
+    field = pydantic.Field(json_schema_extra={"developer": developer}, *args, **kwargs)
+    return field
diff --git a/eemeter/eemeter/common/features.py b/eemeter/eemeter/common/features.py
@@ -540,6 +540,9 @@ def compute_temperature_features(
     if not keep_partial_nan_rows:
         df = overwrite_partial_rows_with_nan(df)
 
+    if df.dropna(how='all').empty:
+        raise ValueError("All rows are NaN.")
+
     # nan last row
     df = df.iloc[:-1].reindex(df.index)
     return df

diff --git a/eemeter/eemeter/models/billing/__init__.py b/eemeter/eemeter/models/billing/__init__.py
@@ -20,9 +20,11 @@
 
 from .data import BillingBaselineData, BillingReportingData
 from .model import BillingModel
+from .weighted_model import BillingWeightedModel
 
 __all__ = (
     "BillingBaselineData",
     "BillingReportingData",
     "BillingModel",
+    "BillingWeightedModel",
 )
diff --git a/eemeter/eemeter/models/billing/data.py b/eemeter/eemeter/models/billing/data.py
@@ -279,6 +279,61 @@ def _compute_temperature_features(
         features = temperature_features.drop(columns=["temperature_mean"])
         return temp, features
 
+    # TODO: DELETE THIS after making real billing data class
+    @property
+    def billing_df(self) -> pd.DataFrame | None:
+        """Get the corrected input data stored in the class. The actual dataframe is immutable, this returns a copy."""
+
+        df = self._df.copy()
+
+        # find indices where observed changes from prior
+        observed_change = df["observed"].diff()
+        observed_change = observed_change[observed_change != 0].index
+        obs_change_idx = df.index.get_indexer(observed_change)
+        obs_change_idx = np.append(obs_change_idx, len(df))
+        obs_change_idx = np.delete(obs_change_idx, np.where(np.diff(obs_change_idx) < 15)[0])
+
+        if obs_change_idx[0] != 0:
+            obs_change_idx = np.insert(obs_change_idx, 0, 0)
+
+        # create vector where value increases at each observed change
+        group = []
+        for i in range(1, len(obs_change_idx)):
+            idx_range = obs_change_idx[i] - obs_change_idx[i-1]
+
+            group.extend([i] * idx_range)
+
+        df["group"] = group
+
+        # get median delta
+
+        # get first datetime, average temperature, mean of observed for each group and make new df
+        df_temp = df.reset_index()
+        df_temp = df_temp.rename(columns={"index": "datetime"})
+
+        df_grouped = df_temp.groupby("group").agg({
+            "datetime": "first",
+            "season": "first",
+            "weekday_weekend": "first",
+            "temperature": "mean",
+            "observed": "mean",
+        }).set_index("datetime")
+
+        # create days column for number of days between current and previous index
+        df_grouped["days"] = df_grouped.index.to_series().diff().dt.days
+
+        df_grouped = df_grouped.dropna()
+
+        # create weights from days column
+        df_grouped["weights"] = df_grouped["days"] / df_grouped["days"].sum()
+
+        df_grouped = df_grouped.drop(columns=["days"])
+
+        if self._df is None:
+            return None
+        else:
+            return df_grouped.copy()
+
 
 class BillingBaselineData(_BillingData):
     """

diff --git a/eemeter/eemeter/models/billing/model.py b/eemeter/eemeter/models/billing/model.py
@@ -56,49 +56,19 @@ class BillingModel(DailyModel):
         model (sklearn.pipeline.Pipeline): The final fitted model.
         id (str): The index of the meter data.
     """
+    _baseline_data_type = BillingBaselineData
+    _reporting_data_type = BillingReportingData
+    _data_df_name = "df"
 
-    def __init__(self, settings=None):
-        super().__init__(model="legacy", settings=settings)
+    def __init__(self, settings=None, verbose: bool = False,):
+        super().__init__(model="legacy", settings=settings, verbose=verbose)
 
     def fit(
-        self, baseline_data: BillingBaselineData, ignore_disqualification: bool = False
+        self, 
+        baseline_data: BillingBaselineData, 
+        ignore_disqualification: bool = False
     ) -> BillingModel:
-        """Fit the model using baseline data.
-
-        Args:
-            baseline_data: BillingBaselineData object.
-            ignore_disqualification: Whether to ignore disqualification errors / warnings.
-
-        Returns:
-            The fitted model.
-
-        Raises:
-            TypeError: If baseline_data is not a BillingBaselineData object.
-            DataSufficiencyError: If the model can't be fit on disqualified baseline data.
-        """
-        # TODO there's a fair bit of duplicated code between this and daily fit(), refactor
-        if not isinstance(baseline_data, BillingBaselineData):
-            raise TypeError("baseline_data must be a BillingBaselineData object")
-        baseline_data.log_warnings()
-        if baseline_data.disqualification and not ignore_disqualification:
-            for warning in baseline_data.disqualification + baseline_data.warnings:
-                print(warning.json())
-            raise DataSufficiencyError("Can't fit model on disqualified baseline data")
-        self.baseline_timezone = baseline_data.tz
-        self.warnings = baseline_data.warnings
-        self.disqualification = baseline_data.disqualification
-        self._fit(baseline_data.df)
-        if self.error["CVRMSE"] > self.settings.cvrmse_threshold:
-            cvrmse_warning = EEMeterWarning(
-                qualified_name="eemeter.model_fit_metrics.cvrmse",
-                description=(
-                    f"Fit model has CVRMSE > {self.settings.cvrmse_threshold}"
-                ),
-                data={"CVRMSE": self.error["CVRMSE"]},
-            )
-            cvrmse_warning.warn()
-            self.disqualification.append(cvrmse_warning)
-        return self
+        return super().fit(baseline_data, ignore_disqualification=ignore_disqualification)
 
     def predict(
         self,
@@ -135,7 +105,8 @@ def predict(
                 "reporting_data must be a BillingBaselineData or BillingReportingData object"
             )
 
-        df_res = self._predict(reporting_data.df)
+        df = getattr(reporting_data, self._data_df_name)
+        df_res = self._predict(df)
 
         if aggregation is None:
             agg = None
@@ -182,7 +153,7 @@ def predict(
 
     def plot(
         self,
-        df_eval,
+        data,
         aggregation: str | None = None,
     ):
         """Plot a model fit with baseline or reporting data. Requires matplotlib to use.
@@ -198,7 +169,7 @@ def plot(
 
         # TODO: pass more kwargs to plotting function
 
-        plot(self, self.predict(df_eval, aggregation=aggregation))
+        plot(self, self.predict(data, aggregation=aggregation))
 
     def to_dict(self) -> dict:
         """Returns a dictionary of model parameters.

diff --git a/eemeter/eemeter/models/billing/settings.py b/eemeter/eemeter/models/billing/settings.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+
+   Copyright 2014-2024 OpenEEmeter contributors
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+"""
+from __future__ import annotations
+
+from eemeter.common.base_settings import CustomField
+
+from eemeter.eemeter.models.daily.utilities.settings import DailyLegacySettings
+
+
+
+class BillingSettings(DailyLegacySettings):
+    segment_minimum_count: int = CustomField(
+        default=3,
+        ge=3,
+        developer=True,
+        description="Minimum number of data points for HDD/CDD",
+    )