Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize IndexSet.data DB storage #122

Merged
merged 12 commits into from
Nov 22, 2024
28 changes: 15 additions & 13 deletions ixmp4/core/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@ def name(self) -> str:
return self._model.name

@property
def elements(self) -> list[float | int | str]:
return self._model.elements
def data(self) -> list[float | int | str]:
return self._model.data

def add(self, elements: float | int | list[float | int | str] | str) -> None:
"""Adds elements to an existing IndexSet."""
self.backend.optimization.indexsets.add_elements(
indexset_id=self._model.id, elements=elements
def add(self, data: float | int | list[float | int | str] | str) -> None:
"""Adds data to an existing IndexSet."""
self.backend.optimization.indexsets.add_data(
indexset_id=self._model.id, data=data
)
self._model.elements = self.backend.optimization.indexsets.get(
self._model.data = self.backend.optimization.indexsets.get(
run_id=self._model.run__id, name=self._model.name
).elements
).data

@property
def run_id(self) -> int:
Expand All @@ -48,21 +48,21 @@ def created_by(self) -> str | None:
return self._model.created_by

@property
def docs(self):
def docs(self) -> str | None:
try:
return self.backend.optimization.indexsets.docs.get(self.id).description
except DocsModel.NotFound:
return None

@docs.setter
def docs(self, description):
def docs(self, description: str | None) -> None:
if description is None:
self.backend.optimization.indexsets.docs.delete(self.id)
else:
self.backend.optimization.indexsets.docs.set(self.id, description)

@docs.deleter
def docs(self):
def docs(self) -> None:
try:
self.backend.optimization.indexsets.docs.delete(self.id)
# TODO: silently failing
Expand Down Expand Up @@ -105,7 +105,9 @@ def list(self, name: str | None = None) -> list[IndexSet]:
for i in indexsets
]

def tabulate(self, name: str | None = None) -> pd.DataFrame:
def tabulate(
self, name: str | None = None, include_data: bool = False
) -> pd.DataFrame:
return self.backend.optimization.indexsets.tabulate(
run_id=self._run.id, name=name
run_id=self._run.id, name=name, include_data=include_data
)
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def add_data(self, equation_id: int, data: dict[str, Any] | pd.DataFrame) -> Non
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Equation's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Equation.data`, existing values will be
overwritten.
Expand Down
25 changes: 15 additions & 10 deletions ixmp4/data/abstract/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class IndexSet(base.BaseModel, Protocol):
"""The id of the :class:`ixmp4.data.abstract.Run` for which this IndexSet is
defined. """

elements: types.JsonList
"""Unique list of str or int."""
data: types.OptimizationDataList
"""Unique list of str, int, or float."""

created_at: types.DateTime
"Creation date/time. TODO"
Expand Down Expand Up @@ -102,13 +102,18 @@ def list(self, *, name: str | None = None, **kwargs) -> list[IndexSet]:
"""
...

def tabulate(self, *, name: str | None = None, **kwargs) -> pd.DataFrame:
def tabulate(
self, *, name: str | None = None, include_data: bool = False, **kwargs
) -> pd.DataFrame:
r"""Tabulate IndexSets by specified criteria.

Parameters
----------
name : str
name : str, optional
The name of an IndexSet. If supplied only one result will be returned.
include_data : bool, optional
        Whether to load all IndexSet data, which slows down loading. Defaults to
        `False`.
# TODO: Update kwargs
\*\*kwargs: any
More filter parameters as specified in
Expand All @@ -120,24 +125,24 @@ def tabulate(self, *, name: str | None = None, **kwargs) -> pd.DataFrame:
A data frame with the columns:
- id
- name
- elements
- data
- run__id
- created_at
- created_by
"""
...

def add_elements(
self, indexset_id: int, elements: float | int | List[float | int | str] | str
def add_data(
self, indexset_id: int, data: float | int | List[float | int | str] | str
) -> None:
"""Adds elements to an existing IndexSet.
"""Adds data to an existing IndexSet.

Parameters
----------
indexset_id : int
The id of the target IndexSet.
elements : float | int | List[float | int | str] | str
The elements to be added to the IndexSet.
data : float | int | List[float | int | str] | str
The data to be added to the IndexSet.

Returns
-------
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/parameter.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def add_data(self, parameter_id: int, data: dict[str, Any] | pd.DataFrame) -> No
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Parameter's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Parameter.data`, existing values will be
overwritten.
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def add_data(self, table_id: int, data: dict[str, Any] | pd.DataFrame) -> None:
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Table's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Table.data`, existing values will be
overwritten.
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/abstract/optimization/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def add_data(self, variable_id: int, data: dict[str, Any] | pd.DataFrame) -> Non
The data will be validated with the linked constrained
:class:`ixmp4.data.abstract.optimization.IndexSet`s. For that, `data.keys()`
must correspond to the names of the Variable's columns. Each column can only
contain values that are in the linked `IndexSet.elements`. Each row of entries
contain values that are in the linked `IndexSet.data`. Each row of entries
must be unique. No values can be missing, `None`, or `NaN`. If `data.keys()`
contains names already present in `Variable.data`, existing values will be
overwritten.
Expand Down
22 changes: 6 additions & 16 deletions ixmp4/data/api/optimization/indexset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import ClassVar, List

import pandas as pd
from pydantic import StrictFloat, StrictInt, StrictStr

from ixmp4.data import abstract

Expand All @@ -17,13 +16,7 @@ class IndexSet(base.BaseModel):

id: int
name: str
elements: (
StrictFloat
| StrictInt
| StrictStr
| list[StrictFloat | StrictInt | StrictStr]
| None
)
data: float | int | str | list[int | float | str] | None
run__id: int

created_at: datetime | None
Expand Down Expand Up @@ -64,16 +57,13 @@ def enumerate(self, **kwargs) -> list[IndexSet] | pd.DataFrame:
def list(self, **kwargs) -> list[IndexSet]:
return super()._list(json=kwargs)

def tabulate(self, **kwargs) -> pd.DataFrame:
return super()._tabulate(json=kwargs)
def tabulate(self, include_data: bool = False, **kwargs) -> pd.DataFrame:
return super()._tabulate(json=kwargs, params={"include_data": include_data})

def add_elements(
def add_data(
self,
indexset_id: int,
elements: StrictFloat
| StrictInt
| List[StrictFloat | StrictInt | StrictStr]
| StrictStr,
data: float | int | str | List[float | int | str],
) -> None:
kwargs = {"indexset_id": indexset_id, "elements": elements}
kwargs = {"indexset_id": indexset_id, "data": data}
self._request("PATCH", self.prefix + str(indexset_id) + "/", json=kwargs)
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/equation/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
dtype=pd.Series(indexset.data).dtype.name,
equation_id=equation_id,
unique=True,
**kwargs,
Expand Down
40 changes: 27 additions & 13 deletions ixmp4/data/db/optimization/indexset/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import ClassVar

from sqlalchemy.orm import validates
import numpy as np

from ixmp4 import db
from ixmp4.core.exceptions import OptimizationDataValidationError
Expand All @@ -16,20 +16,34 @@ class IndexSet(base.BaseModel):
DataInvalid: ClassVar = OptimizationDataValidationError
DeletionPrevented: ClassVar = abstract.IndexSet.DeletionPrevented

elements: types.JsonList = db.Column(db.JsonType, nullable=False, default=[])
_data_type: types.OptimizationDataType

@validates("elements")
def validate_elements(self, key, value: list[float | int | str]):
unique = set()
for element in value:
if element in unique:
raise self.DataInvalid(
f"{element} already defined for IndexSet {self.name}!"
)
else:
unique.add(element)
return value
_data: types.Mapped[list["IndexSetData"]] = db.relationship(
back_populates="indexset"
)

@property
def data(self) -> list[float | int | str]:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I discussed how best to do this with an SQLAlchemy maintainer here. Since we are not going to use .data in SQL queries, we are better served by a normal Python property.

return (
[]
if self._data_type is None
else np.array([d.value for d in self._data], dtype=self._data_type).tolist()
)

@data.setter
def data(self, value: list[float | int | str]) -> None:
return None

run__id: types.RunId

__table_args__ = (db.UniqueConstraint("name", "run__id"),)


class IndexSetData(base.RootBaseModel):
table_prefix = "optimization_"

indexset: types.Mapped["IndexSet"] = db.relationship(back_populates="_data")
indexset__id: types.IndexSetId
value: types.String = db.Column(db.String, nullable=False)

__table_args__ = (db.UniqueConstraint("indexset__id", "value"),)
45 changes: 34 additions & 11 deletions ixmp4/data/db/optimization/indexset/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .. import base
from .docs import IndexSetDocsRepository
from .model import IndexSet
from .model import IndexSet, IndexSetData


class IndexSetRepository(
Expand Down Expand Up @@ -60,22 +60,45 @@ def list(self, *args, **kwargs) -> list[IndexSet]:
return super().list(*args, **kwargs)

@guard("view")
def tabulate(self, *args, **kwargs) -> pd.DataFrame:
return super().tabulate(*args, **kwargs)
def tabulate(self, *args, include_data: bool = False, **kwargs) -> pd.DataFrame:
if not include_data:
return (
super()
.tabulate(*args, **kwargs)
.rename(columns={"_data_type": "data_type"})
)
else:
result = super().tabulate(*args, **kwargs).drop(labels="_data_type", axis=1)
result.insert(
loc=0,
column="data",
value=[indexset.data for indexset in self.list(**kwargs)],
)
return result

@guard("edit")
def add_elements(
def add_data(
self,
indexset_id: int,
elements: float | int | List[float | int | str] | str,
data: float | int | List[float | int | str] | str,
) -> None:
indexset = self.get_by_id(id=indexset_id)
if not isinstance(elements, list):
elements = [elements]
if indexset.elements is None:
indexset.elements = elements
else:
indexset.elements = indexset.elements + elements
if not isinstance(data, list):
data = [data]

bulk_insert_enabled_data: list[dict[str, str]] = [
{"value": str(d)} for d in data
]
try:
self.session.execute(
db.insert(IndexSetData).values(indexset__id=indexset_id),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using ORM-enabled bulk inserting should be faster than creating individual objects (as I did before).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes this got very fast in sqlalchemy 2!

bulk_insert_enabled_data,
)
except db.IntegrityError as e:
self.session.rollback()
raise indexset.DataInvalid from e

indexset._data_type = type(data[0]).__name__

self.session.add(indexset)
self.session.commit()
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/parameter/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
dtype=pd.Series(indexset.data).dtype.name,
parameter_id=parameter_id,
unique=True,
**kwargs,
Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/table/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
dtype=pd.Series(indexset.data).dtype.name,
table_id=table_id,
unique=True,
**kwargs,
Expand Down
4 changes: 2 additions & 2 deletions ixmp4/data/db/optimization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ def collect_indexsets_to_check(
columns: list["Column"],
) -> dict[str, Any]:
"""Creates a {key:value} dict from linked Column.names and their
IndexSet.elements."""
IndexSet.data."""
collection: dict[str, Any] = {}
for column in columns:
collection[column.name] = column.indexset.elements
collection[column.name] = column.indexset.data
return collection


Expand Down
2 changes: 1 addition & 1 deletion ixmp4/data/db/optimization/variable/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _add_column(
self.columns.create(
name=column_name,
constrained_to_indexset=indexset.id,
dtype=pd.Series(indexset.elements).dtype.name,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ALso this looks quite expensive for what it does, no idea how to avoid it right now though...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all items like parameter, variable, etc, I'm thinking that tables similar to IndexSetData might relieve us of the whole Column table. Especially if we end up translating parameter.data etc to IndexSetData.ids, since the main purpose of the columns is currently to store which indexset the data columns belong to and which type it has. All of this can probably be taken care of in a better way, eliminating these function calls :)
However, this will only happen in a later PR, I'm afraid.

dtype=pd.Series(indexset.data).dtype.name,
variable_id=variable_id,
unique=True,
**kwargs,
Expand Down
6 changes: 4 additions & 2 deletions ixmp4/data/types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime
from typing import Any
from typing import Any, Literal

from sqlalchemy.orm import Mapped as Mapped

Expand All @@ -8,9 +8,11 @@
Boolean = Mapped[bool]
DateTime = Mapped[datetime]
Float = Mapped[float]
IndexSetId = Mapped[db.IndexSetIdType]
Integer = Mapped[int]
JsonList = Mapped[list[float | int | str]]
OptimizationDataList = Mapped[list[float | int | str]]
JsonDict = Mapped[dict[str, Any]]
OptimizationDataType = Mapped[Literal["float", "int", "str"] | None]
String = Mapped[str]
Name = Mapped[db.NameType]
UniqueName = Mapped[db.UniqueNameType]
Expand Down
Loading
Loading