[Artifacts] Add Project-BestIteration-Updated Index for List Artifact…

…s Query (mlrun#7230)
davesh0812 · Feb 4, 2025 · de71511 · de71511
1 parent 22740bf
commit de71511
Show file tree

Hide file tree

Showing 5 changed files with 223 additions and 2 deletions.
diff --git a/db/mlrun.db b/db/mlrun.db
diff --git a/server/py/framework/db/sqldb/db.py b/server/py/framework/db/sqldb/db.py
@@ -16,6 +16,7 @@
 import collections
 import functools
 import hashlib
+import inspect
 import pathlib
 import re
 import typing
@@ -42,7 +43,7 @@
     text,
 )
 from sqlalchemy.exc import SQLAlchemyError
-from sqlalchemy.inspection import inspect
+from sqlalchemy.inspection import inspect as sqlalchemy_inspect
 from sqlalchemy.orm import Session, aliased
 from sqlalchemy.orm.attributes import flag_modified
 
@@ -1517,6 +1518,35 @@ def _find_artifacts(
             ArtifactV2.Tag.name,
         )
 
+        # If the query matches the default UI list artifacts request, we bypass the DB optimizer and use the index
+        # `idx_project_bi_updated` because we know it provides optimal results for this specific query.
+        if self._is_default_list_artifacts_query(
+            project,
+            ids,
+            tag,
+            labels,
+            since,
+            until,
+            name,
+            kind,
+            category,
+            iter,
+            uid,
+            producer_id,
+            producer_uri,
+            best_iteration,
+            most_recent,
+            attach_tags,
+            offset,
+            limit,
+            with_entities,
+            partition_by,
+            rows_per_partition,
+            partition_sort_by,
+            partition_order,
+        ):
+            query = query.with_hint(ArtifactV2, "USE INDEX idx_project_bi_updated")
+
         if project:
             query = query.filter(ArtifactV2.project == project)
         if ids and ids != "*":
@@ -1615,6 +1645,81 @@ def _find_artifacts(
 
         return results
 
+    def _is_default_list_artifacts_query(
+        self,
+        project: str,
+        ids: typing.Optional[typing.Union[list[str], str]] = None,
+        tag: typing.Optional[str] = None,
+        labels: typing.Optional[typing.Union[list[str], str]] = None,
+        since: typing.Optional[datetime] = None,
+        until: typing.Optional[datetime] = None,
+        name: typing.Optional[str] = None,
+        kind: mlrun.common.schemas.ArtifactCategories = None,
+        category: mlrun.common.schemas.ArtifactCategories = None,
+        iter: typing.Optional[int] = None,
+        uid: typing.Optional[str] = None,
+        producer_id: typing.Optional[str] = None,
+        producer_uri: typing.Optional[str] = None,
+        best_iteration: bool = False,
+        most_recent: bool = False,
+        attach_tags: bool = False,
+        offset: typing.Optional[int] = None,
+        limit: typing.Optional[int] = None,
+        with_entities: typing.Optional[list[Any]] = None,
+        partition_by: typing.Optional[
+            mlrun.common.schemas.ArtifactPartitionByField
+        ] = None,
+        rows_per_partition: typing.Optional[int] = 1,
+        partition_sort_by: typing.Optional[
+            mlrun.common.schemas.SortField
+        ] = mlrun.common.schemas.SortField.updated,
+        partition_order: typing.Optional[
+            mlrun.common.schemas.OrderType
+        ] = mlrun.common.schemas.OrderType.desc,
+    ) -> bool:
+        parameters = inspect.signature(self._find_artifacts).parameters
+        default_list_params = {
+            name: parameter.default for name, parameter in parameters.items()
+        }
+        default_list_params.update(
+            {
+                "limit": 1001,
+                "best_iteration": True,
+                "tag": "latest",
+            }
+        )
+
+        # The project and category parameters are ignored since they are variable in the default query.
+        # The offset parameter varies with pagination, whereas the limit remains constant, so we only validate
+        # the limit and the offset is also ignored here.
+        current_params = {
+            "ids": ids,
+            "tag": tag,
+            "labels": labels,
+            "since": since,
+            "until": until,
+            "name": name,
+            "kind": kind,
+            "iter": iter,
+            "uid": uid,
+            "producer_id": producer_id,
+            "producer_uri": producer_uri,
+            "best_iteration": best_iteration,
+            "most_recent": most_recent,
+            "attach_tags": attach_tags,
+            "limit": limit,
+            "with_entities": with_entities,
+            "partition_by": partition_by,
+            "rows_per_partition": rows_per_partition,
+            "partition_sort_by": partition_sort_by,
+            "partition_order": partition_order,
+        }
+
+        # Check if all current parameters match their default values
+        return all(
+            default_list_params[key] == value for key, value in current_params.items()
+        )
+
     def _find_artifacts_for_producer_id(
         self,
         session: Session,
@@ -4961,7 +5066,9 @@ def _query(self, session, cls, **kw):
         return session.query(cls).filter_by(**kw)
 
     def _get_count(self, session, cls):
-        return session.query(func.count(inspect(cls).primary_key[0])).scalar()
+        return session.query(
+            func.count(sqlalchemy_inspect(cls).primary_key[0])
+        ).scalar()
 
     def _get_class_instance_by_uid(self, session, cls, name, project, uid):
         query = (

diff --git a/server/py/framework/db/sqldb/models.py b/server/py/framework/db/sqldb/models.py
@@ -230,19 +230,27 @@ class ArtifactV2(Base, mlrun.utils.db.BaseModel):
         __tablename__ = "artifacts_v2"
         __table_args__ = (
             UniqueConstraint("uid", "project", "key", name="_artifacts_v2_uc"),
+            # Used when enriching workflow status with run artifacts. See https://iguazio.atlassian.net/browse/ML-6770
             Index(
                 "idx_artifacts_producer_id_best_iteration_and_project",
                 "project",
                 "producer_id",
                 "best_iteration",
             ),
+            # Used to speed up querying artifact tags which is frequently done by UI with project and category.
+            # See https://iguazio.atlassian.net/browse/ML-7266
             Index(
                 "idx_project_kind",
                 "project",
                 "kind",
             ),
             Index("idx_artifacts_name_uid_project", "key", "uid", "project"),
+            # Used for calculating the project counters more efficiently.
+            # See https://iguazio.atlassian.net/browse/ML-8556
             Index("idx_project_kind_key", "project", "kind", "key"),
+            # Used explicitly in list_artifacts, as most of the queries request best_iteration, and all always sort by
+            # updated. See https://iguazio.atlassian.net/browse/ML-9189
+            Index("idx_project_bi_updated", "project", "best_iteration", "updated"),
         )
 
         Label = make_label(__tablename__)

diff --git a/server/py/services/api/migrations/versions/2abdb7a8f1e6_artifact_project_bi_updated_index.py b/server/py/services/api/migrations/versions/2abdb7a8f1e6_artifact_project_bi_updated_index.py
@@ -0,0 +1,47 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""artifact project_bi_updated index
+
+Revision ID: 2abdb7a8f1e6
+Revises: 2a37ea63671d
+Create Date: 2025-02-04 11:57:06.040713
+
+"""
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "2abdb7a8f1e6"
+down_revision = "2a37ea63671d"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_index(
+        "idx_project_bi_updated",
+        "artifacts_v2",
+        ["project", "best_iteration", "updated"],
+        unique=False,
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index("idx_project_bi_updated", table_name="artifacts_v2")
+    # ### end Alembic commands ###
diff --git a/server/py/services/api/tests/unit/db/test_artifacts.py b/server/py/services/api/tests/unit/db/test_artifacts.py
@@ -2465,6 +2465,65 @@ def test_list_artifacts_for_producer_id(self):
         assert artifacts[0]["metadata"]["uid"] == artifact_2["metadata"]["uid"]
         assert artifacts[0]["metadata"]["tag"] == "latest"
 
+    @pytest.mark.parametrize(
+        "kwargs, expected",
+        [
+            pytest.param(
+                {
+                    "limit": 1001,
+                    "best_iteration": True,
+                    "tag": "latest",
+                },
+                True,
+                id="default_query",
+            ),
+            pytest.param(
+                {
+                    "best_iteration": True,
+                    "tag": "latest",
+                },
+                False,
+                id="no_pagination",
+            ),
+            pytest.param(
+                {
+                    "limit": 1001,
+                    "best_iteration": False,
+                    "tag": "latest",
+                },
+                False,
+                id="best_iteration_false",
+            ),
+            pytest.param(
+                {
+                    "limit": 1001,
+                    "best_iteration": True,
+                    "tag": "any_tag",
+                },
+                False,
+                id="tag_not_latest",
+            ),
+            pytest.param(
+                {
+                    "limit": 1001,
+                    "best_iteration": True,
+                    "tag": "latest",
+                    "name": "any_name",
+                },
+                False,
+                id="additional_params",
+            ),
+        ],
+    )
+    def test_is_default_list_artifacts_query(self, kwargs: dict, expected: bool):
+        ignored_params = {
+            "project": "any_project",
+            "category": "any_category",
+            "offset": 5,  # any offset
+        }
+        kwargs.update(ignored_params)
+        assert self._db._is_default_list_artifacts_query(**kwargs) == expected
+
     def _generate_artifact_with_iterations(
         self, key, tree, num_iters, best_iter, kind, project=""
     ):