Skip to content

Commit

Permalink
[Artifacts] Add Project-BestIteration-Updated Index for List Artifact…
Browse files Browse the repository at this point in the history
…s Query (mlrun#7230)
  • Loading branch information
quaark authored Feb 4, 2025
1 parent 22740bf commit de71511
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 2 deletions.
Empty file added db/mlrun.db
Empty file.
111 changes: 109 additions & 2 deletions server/py/framework/db/sqldb/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import collections
import functools
import hashlib
import inspect
import pathlib
import re
import typing
Expand All @@ -42,7 +43,7 @@
text,
)
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.inspection import inspect
from sqlalchemy.inspection import inspect as sqlalchemy_inspect
from sqlalchemy.orm import Session, aliased
from sqlalchemy.orm.attributes import flag_modified

Expand Down Expand Up @@ -1517,6 +1518,35 @@ def _find_artifacts(
ArtifactV2.Tag.name,
)

# If the query matches the default UI list artifacts request, we bypass the DB optimizer and use the index
# `idx_project_bi_updated` because we know it provides optimal results for this specific query.
if self._is_default_list_artifacts_query(
project,
ids,
tag,
labels,
since,
until,
name,
kind,
category,
iter,
uid,
producer_id,
producer_uri,
best_iteration,
most_recent,
attach_tags,
offset,
limit,
with_entities,
partition_by,
rows_per_partition,
partition_sort_by,
partition_order,
):
query = query.with_hint(ArtifactV2, "USE INDEX idx_project_bi_updated")

if project:
query = query.filter(ArtifactV2.project == project)
if ids and ids != "*":
Expand Down Expand Up @@ -1615,6 +1645,81 @@ def _find_artifacts(

return results

def _is_default_list_artifacts_query(
self,
project: str,
ids: typing.Optional[typing.Union[list[str], str]] = None,
tag: typing.Optional[str] = None,
labels: typing.Optional[typing.Union[list[str], str]] = None,
since: typing.Optional[datetime] = None,
until: typing.Optional[datetime] = None,
name: typing.Optional[str] = None,
kind: mlrun.common.schemas.ArtifactCategories = None,
category: mlrun.common.schemas.ArtifactCategories = None,
iter: typing.Optional[int] = None,
uid: typing.Optional[str] = None,
producer_id: typing.Optional[str] = None,
producer_uri: typing.Optional[str] = None,
best_iteration: bool = False,
most_recent: bool = False,
attach_tags: bool = False,
offset: typing.Optional[int] = None,
limit: typing.Optional[int] = None,
with_entities: typing.Optional[list[Any]] = None,
partition_by: typing.Optional[
mlrun.common.schemas.ArtifactPartitionByField
] = None,
rows_per_partition: typing.Optional[int] = 1,
partition_sort_by: typing.Optional[
mlrun.common.schemas.SortField
] = mlrun.common.schemas.SortField.updated,
partition_order: typing.Optional[
mlrun.common.schemas.OrderType
] = mlrun.common.schemas.OrderType.desc,
) -> bool:
parameters = inspect.signature(self._find_artifacts).parameters
default_list_params = {
name: parameter.default for name, parameter in parameters.items()
}
default_list_params.update(
{
"limit": 1001,
"best_iteration": True,
"tag": "latest",
}
)

# The project and category parameters are ignored since they are variable in the default query.
# The offset parameter varies with pagination, whereas the limit remains constant, so we only validate
# the limit and the offset is also ignored here.
current_params = {
"ids": ids,
"tag": tag,
"labels": labels,
"since": since,
"until": until,
"name": name,
"kind": kind,
"iter": iter,
"uid": uid,
"producer_id": producer_id,
"producer_uri": producer_uri,
"best_iteration": best_iteration,
"most_recent": most_recent,
"attach_tags": attach_tags,
"limit": limit,
"with_entities": with_entities,
"partition_by": partition_by,
"rows_per_partition": rows_per_partition,
"partition_sort_by": partition_sort_by,
"partition_order": partition_order,
}

# Check if all current parameters match their default values
return all(
default_list_params[key] == value for key, value in current_params.items()
)

def _find_artifacts_for_producer_id(
self,
session: Session,
Expand Down Expand Up @@ -4961,7 +5066,9 @@ def _query(self, session, cls, **kw):
return session.query(cls).filter_by(**kw)

def _get_count(self, session, cls):
return session.query(func.count(inspect(cls).primary_key[0])).scalar()
return session.query(
func.count(sqlalchemy_inspect(cls).primary_key[0])
).scalar()

def _get_class_instance_by_uid(self, session, cls, name, project, uid):
query = (
Expand Down
8 changes: 8 additions & 0 deletions server/py/framework/db/sqldb/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,19 +230,27 @@ class ArtifactV2(Base, mlrun.utils.db.BaseModel):
__tablename__ = "artifacts_v2"
__table_args__ = (
UniqueConstraint("uid", "project", "key", name="_artifacts_v2_uc"),
# Used when enriching workflow status with run artifacts. See https://iguazio.atlassian.net/browse/ML-6770
Index(
"idx_artifacts_producer_id_best_iteration_and_project",
"project",
"producer_id",
"best_iteration",
),
# Used to speed up querying artifact tags which is frequently done by UI with project and category.
# See https://iguazio.atlassian.net/browse/ML-7266
Index(
"idx_project_kind",
"project",
"kind",
),
Index("idx_artifacts_name_uid_project", "key", "uid", "project"),
# Used for calculating the project counters more efficiently.
# See https://iguazio.atlassian.net/browse/ML-8556
Index("idx_project_kind_key", "project", "kind", "key"),
# Used explicitly in list_artifacts, as most of the queries request best_iteration, and all always sort by
# updated. See https://iguazio.atlassian.net/browse/ML-9189
Index("idx_project_bi_updated", "project", "best_iteration", "updated"),
)

Label = make_label(__tablename__)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2024 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""artifact project_bi_updated index
Revision ID: 2abdb7a8f1e6
Revises: 2a37ea63671d
Create Date: 2025-02-04 11:57:06.040713
"""

from alembic import op

# revision identifiers, used by Alembic.
revision = "2abdb7a8f1e6"
down_revision = "2a37ea63671d"
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_index(
"idx_project_bi_updated",
"artifacts_v2",
["project", "best_iteration", "updated"],
unique=False,
)
# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index("idx_project_bi_updated", table_name="artifacts_v2")
# ### end Alembic commands ###
59 changes: 59 additions & 0 deletions server/py/services/api/tests/unit/db/test_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2465,6 +2465,65 @@ def test_list_artifacts_for_producer_id(self):
assert artifacts[0]["metadata"]["uid"] == artifact_2["metadata"]["uid"]
assert artifacts[0]["metadata"]["tag"] == "latest"

@pytest.mark.parametrize(
"kwargs, expected",
[
pytest.param(
{
"limit": 1001,
"best_iteration": True,
"tag": "latest",
},
True,
id="default_query",
),
pytest.param(
{
"best_iteration": True,
"tag": "latest",
},
False,
id="no_pagination",
),
pytest.param(
{
"limit": 1001,
"best_iteration": False,
"tag": "latest",
},
False,
id="best_iteration_false",
),
pytest.param(
{
"limit": 1001,
"best_iteration": True,
"tag": "any_tag",
},
False,
id="tag_not_latest",
),
pytest.param(
{
"limit": 1001,
"best_iteration": True,
"tag": "latest",
"name": "any_name",
},
False,
id="additional_params",
),
],
)
def test_is_default_list_artifacts_query(self, kwargs: dict, expected: bool):
ignored_params = {
"project": "any_project",
"category": "any_category",
"offset": 5, # any offset
}
kwargs.update(ignored_params)
assert self._db._is_default_list_artifacts_query(**kwargs) == expected

def _generate_artifact_with_iterations(
self, key, tree, num_iters, best_iter, kind, project=""
):
Expand Down

0 comments on commit de71511

Please sign in to comment.