Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize IndexSet.data DB storage #122

Merged
merged 12 commits into from
Nov 22, 2024
16 changes: 8 additions & 8 deletions ixmp4/core/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@ def name(self) -> str:
return self._model.name

@property
def elements(self) -> list[float | int | str]:
return self._model.elements
def data(self) -> list[float | int | str]:
return self._model.data

def add(self, elements: float | int | list[float | int | str] | str) -> None:
"""Adds elements to an existing IndexSet."""
self.backend.optimization.indexsets.add_elements(
indexset_id=self._model.id, elements=elements
def add(self, data: float | int | list[float | int | str] | str) -> None:
"""Adds data to an existing IndexSet."""
self.backend.optimization.indexsets.add_data(
indexset_id=self._model.id, data=data
)
self._model.elements = self.backend.optimization.indexsets.get(
self._model.data = self.backend.optimization.indexsets.get(
run_id=self._model.run__id, name=self._model.name
).elements
).data

@property
def run_id(self) -> int:
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def add_data(self, equation_id: int, data: dict[str, Any] | pd.DataFrame) -> Non
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Equation's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Equation.data`, existing values will be
overwritten.
Expand Down
16 changes: 8 additions & 8 deletions ixmp4/data/abstract/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class IndexSet(base.BaseModel, Protocol):
"""The id of the :class:`ixmp4.data.abstract.Run` for which this IndexSet is
defined. """

elements: types.JsonList
"""Unique list of str or int."""
data: types.OptimizationDataList
"""Unique list of str, int, or float."""

created_at: types.DateTime
"Creation date/time. TODO"
Expand Down Expand Up @@ -120,24 +120,24 @@ def tabulate(self, *, name: str | None = None, **kwargs) -> pd.DataFrame:
A data frame with the columns:
- id
- name
- elements
- data
- run__id
- created_at
- created_by
"""
...

def add_elements(
self, indexset_id: int, elements: float | int | List[float | int | str] | str
def add_data(
self, indexset_id: int, data: float | int | List[float | int | str] | str
) -> None:
"""Adds elements to an existing IndexSet.
"""Adds data to an existing IndexSet.

Parameters
----------
indexset_id : int
The id of the target IndexSet.
elements : float | int | List[float | int | str] | str
The elements to be added to the IndexSet.
data : float | int | List[float | int | str] | str
The data to be added to the IndexSet.

Returns
-------
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/parameter.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def add_data(self, parameter_id: int, data: dict[str, Any] | pd.DataFrame) -> No
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Parameter's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Parameter.data`, existing values will be
overwritten.
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def add_data(self, table_id: int, data: dict[str, Any] | pd.DataFrame) -> None:
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Table's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Table.data`, existing values will be
overwritten.
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def add_data(self, variable_id: int, data: dict[str, Any] | pd.DataFrame) -> Non
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Variable's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Variable.data`, existing values will be
overwritten.
Expand Down
8 changes: 4 additions & 4 deletions ixmp4/data/api/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class IndexSet(base.BaseModel):

id: int
name: str
elements: (
data: (
StrictFloat
| StrictInt
| StrictStr
Expand Down Expand Up @@ -67,13 +67,13 @@ def list(self, **kwargs) -> list[IndexSet]:
def tabulate(self, **kwargs) -> pd.DataFrame:
return super()._tabulate(json=kwargs)

def add_data(
    self,
    indexset_id: int,
    data: StrictFloat
    | StrictInt
    | List[StrictFloat | StrictInt | StrictStr]
    | StrictStr,
) -> None:
    """Send a PATCH request adding `data` to the IndexSet with the given id."""
    json_body = {"indexset_id": indexset_id, "data": data}
    self._request("PATCH", self.prefix + str(indexset_id) + "/", json=json_body)
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/equation/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
dtype=pd.Series(indexset.data).dtype.name,
equation_id=equation_id,
unique=True,
**kwargs,
Expand Down
52 changes: 37 additions & 15 deletions ixmp4/data/db/optimization/indexset/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from typing import ClassVar

from sqlalchemy.orm import validates
from typing import ClassVar, Literal

from ixmp4 import db
from ixmp4.core.exceptions import OptimizationDataValidationError
Expand All @@ -10,26 +8,50 @@
from .. import base


# TODO Feels like there ought to be this kind of functionality already
def cast_data_as_type(
data: "IndexSetData", type: Literal["float", "int", "str"] | None
) -> float | int | str:
if type == "str":
return data.value
elif type == "int":
return int(data.value)
elif type == "float":
return float(data.value)
else: # type is None
return 0

Check warning on line 22 in ixmp4/data/db/optimization/indexset/model.py

View check run for this annotation

Codecov / codecov/patch

ixmp4/data/db/optimization/indexset/model.py#L22

Added line #L22 was not covered by tests


class IndexSet(base.BaseModel):
NotFound: ClassVar = abstract.IndexSet.NotFound
NotUnique: ClassVar = abstract.IndexSet.NotUnique
DataInvalid: ClassVar = OptimizationDataValidationError
DeletionPrevented: ClassVar = abstract.IndexSet.DeletionPrevented

elements: types.JsonList = db.Column(db.JsonType, nullable=False, default=[])
data_type: types.OptimizationDataType

_data: types.Mapped[list["IndexSetData"]] = db.relationship(
back_populates="indexset"
)

@validates("elements")
def validate_elements(self, key, value: list[float | int | str]):
unique = set()
for element in value:
if element in unique:
raise self.DataInvalid(
f"{element} already defined for IndexSet {self.name}!"
)
else:
unique.add(element)
return value
@db.hybrid_property
def data(self) -> list[float | int | str]:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I discussed how to do this best with an SQLAlchemy maintainer here. Since we are not going to use .data in SQL queries, we should be better served with a normal Python property.

return [cast_data_as_type(data, self.data_type) for data in self._data]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also looks quite slow, but is probably unavoidable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe @danielhuppmann can clarify, but I thought we want the indexset.data attribute to accurately portray the data type. So I guess we can either cast types here or in the API and core layers.
But the function itself I was also wondering about: there should be some efficient built-in that does this kind of casting, my intuition tells me. Do you know of a better way? Would it be faster to collect all self._data.values in a numpy array or so and cast that appropriately?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to this guide, the O() of using loops and using numpy is both O(n), but locally, the tests/data test runs 0.1 seconds faster when using numpy as it is used now, so hope that's better :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well of course they are both O(n), yes — that doesn't really say much about the actual cost though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I always thought it did, assuming the n can't be made smaller or so. Looking forward to your workshop on this :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, a quick one liner would be: Big O notation only describes the relationship between runtime (or memory use if we are talking space complexity) and some characteristic "n" of the input data. To get a real runtime estimation one would have to insert scaling factors into the notation which would have to come from actual measurements...


# NOTE For the core layer (setting and retrieving) to work, the property needs a
# setter method
@data.inplace.setter
def _data_setter(self, value: list[float | int | str]) -> None:
return None

run__id: types.RunId

__table_args__ = (db.UniqueConstraint("name", "run__id"),)


class IndexSetData(base.RootBaseModel):
    """A single stored value belonging to an IndexSet.

    Values are persisted as strings (see `value`); the parent IndexSet's
    `data_type` records the original type so values can be cast back on read.
    """

    # Back-reference to the owning IndexSet (paired with IndexSet._data).
    indexset: types.Mapped["IndexSet"] = db.relationship(back_populates="_data")
    indexset__id: types.IndexSetId
    # Stored as a string regardless of the original Python type.
    value: types.String = db.Column(db.String, nullable=False)

    # Each value may appear at most once per indexset.
    __table_args__ = (db.UniqueConstraint("indexset__id", "value"),)
35 changes: 25 additions & 10 deletions ixmp4/data/db/optimization/indexset/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .. import base
from .docs import IndexSetDocsRepository
from .model import IndexSet
from .model import IndexSet, IndexSetData


class IndexSetRepository(
Expand Down Expand Up @@ -61,21 +61,36 @@ def list(self, *args, **kwargs) -> list[IndexSet]:

@guard("view")
def tabulate(self, *args, **kwargs) -> pd.DataFrame:
    """Tabulate IndexSets, exposing their data instead of the internal data_type.

    The `data_type` column is dropped and a `data` column holding each
    IndexSet's (type-cast) values is inserted as the first column.
    """
    table = super().tabulate(*args, **kwargs).drop(labels="data_type", axis=1)
    data_per_indexset = [self.get_by_id(id=each_id).data for each_id in table.id]
    table.insert(loc=0, column="data", value=data_per_indexset)
    return table

glatterf42 marked this conversation as resolved.
Show resolved Hide resolved
@guard("edit")
def add_data(
    self,
    indexset_id: int,
    data: float | int | List[float | int | str] | str,
) -> None:
    """Add data to an existing IndexSet.

    Parameters
    ----------
    indexset_id : int
        The id of the target IndexSet.
    data : float | int | List[float | int | str] | str
        The data to be added to the IndexSet.

    Raises
    ------
    IndexSet.DataInvalid
        If any value is already present in the IndexSet (the unique
        constraint on (indexset__id, value) is violated).
    """
    indexset = self.get_by_id(id=indexset_id)
    if not isinstance(data, list):
        data = [data]
    if not data:
        # Nothing to add; also avoids an IndexError on data[0] below.
        return
    # TODO If adding rows one by one is too expensive, look into executemany pattern
    for value in data:
        self.session.add(
            IndexSetData(indexset=indexset, indexset__id=indexset_id, value=value)
        )

    try:
        self.session.flush()
    except db.IntegrityError as e:
        # Duplicate value for this indexset: surface as a validation error.
        self.session.rollback()
        raise indexset.DataInvalid from e

    # NOTE(review): assumes all entries share one type; data_type records the
    # type of the first entry only — TODO confirm homogeneous input is enforced.
    indexset.data_type = type(data[0]).__name__

    self.session.add(indexset)
    self.session.commit()
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/parameter/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
dtype=pd.Series(indexset.data).dtype.name,
parameter_id=parameter_id,
unique=True,
**kwargs,
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/table/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
dtype=pd.Series(indexset.data).dtype.name,
table_id=table_id,
unique=True,
**kwargs,
Expand Down
4 changes: 2 additions & 2 deletions ixmp4/data/db/optimization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ def collect_indexsets_to_check(
columns: list["Column"],
) -> dict[str, Any]:
"""Creates a {key:value} dict from linked Column.names and their
IndexSet.elements."""
IndexSet.data."""
collection: dict[str, Any] = {}
for column in columns:
collection[column.name] = column.indexset.elements
collection[column.name] = column.indexset.data
return collection


Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/variable/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also this looks quite expensive for what it does, no idea how to avoid it right now though...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all items like parameter, variable, etc, I'm thinking that tables similar to IndexSetData might relieve us of the whole Column table. Especially if we end up translating parameter.data etc to IndexSetData.ids, since the main purpose of the columns is currently to store which indexset the data columns belong to and which type it has. All of this can probably be taken care of in a better way, eliminating these function calls :)
However, this will only happen in a later PR, I'm afraid.

dtype=pd.Series(indexset.data).dtype.name,
variable_id=variable_id,
unique=True,
**kwargs,
Expand Down
6 changes: 4 additions & 2 deletions ixmp4/data/types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime
from typing import Any
from typing import Any, Literal

from sqlalchemy.orm import Mapped as Mapped

Expand All @@ -8,9 +8,11 @@
Boolean = Mapped[bool]
DateTime = Mapped[datetime]
Float = Mapped[float]
IndexSetId = Mapped[db.IndexSetIdType]
Integer = Mapped[int]
JsonList = Mapped[list[float | int | str]]
OptimizationDataList = Mapped[list[float | int | str]]
JsonDict = Mapped[dict[str, Any]]
OptimizationDataType = Mapped[Literal["float", "int", "str"] | None]
String = Mapped[str]
Name = Mapped[db.NameType]
UniqueName = Mapped[db.UniqueNameType]
Expand Down
14 changes: 7 additions & 7 deletions ixmp4/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
update,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.exc import MultipleResultsFound
from sqlalchemy.exc import IntegrityError, MultipleResultsFound
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import (
Relationship,
Session,
Expand All @@ -65,17 +66,16 @@
from . import utils

Column = mapped_column
IndexSetIdType = Annotated[
int,
Column(Integer, ForeignKey("optimization_indexset.id"), nullable=False, index=True),
]
JsonType = JSON()
JsonType = JsonType.with_variant(JSONB(), "postgresql")
NameType = Annotated[str, Column(String(255), nullable=False, unique=False)]
RunIdType = Annotated[
int,
Column(
Integer,
ForeignKey("run.id"),
nullable=False,
index=True,
),
Column(Integer, ForeignKey("run.id"), nullable=False, index=True),
]
UniqueNameType = Annotated[str, Column(String(255), nullable=False, unique=True)]
UsernameType = Annotated[str, Column(String(255), nullable=True)]
12 changes: 6 additions & 6 deletions ixmp4/server/rest/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ class IndexSetInput(BaseModel):
name: str


class DataInput(BaseModel):
    """Request body for PATCHing an IndexSet: a single value or a list of values."""

    data: (
        StrictFloat | StrictInt | StrictStr | list[StrictFloat | StrictInt | StrictStr]
    )

Expand Down Expand Up @@ -57,11 +57,11 @@ def create(

@autodoc
@router.patch("/{indexset_id}/")
def add_data(
    indexset_id: int,
    data: DataInput,
    backend: Backend = Depends(deps.get_backend),
):
    """Add the posted data to the IndexSet identified by `indexset_id`."""
    backend.optimization.indexsets.add_data(
        indexset_id=indexset_id, **data.model_dump()
    )
Loading
Loading