CSCFAIRMETA-1416: Optimize dataset listing performance
* Use prefetch_related when listing datasets to avoid querying related objects one-by-one
* Don't make a separate query for dataset_version_set records if there is only 1
* Use pickle instead of copy.deepcopy in track_fields for faster copying of research_dataset
* Add research_dataset_fields query param to allow returning only needed fields (see the usage sketch below)
tahme committed Mar 31, 2022
1 parent 8ab0cc4 commit d3c4338
Showing 7 changed files with 137 additions and 32 deletions.
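
As a usage sketch of the new research_dataset_fields parameter (the host, dataset id, and the requests dependency are illustrative, not part of this commit):

# Usage sketch: request only selected research_dataset fields.
import requests

response = requests.get(
    "https://metax.example.org/rest/datasets/123",
    params={"research_dataset_fields": "title,description,preferred_identifier"},
)
# The response's research_dataset now contains only the requested keys.
print(sorted(response.json()["research_dataset"]))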
@@ -254,6 +254,29 @@ def _check_end_user_allowed_catalogs(self, dc_identifier):
}
)

def _filter_research_dataset_fields(self, res):
"""
If research_dataset_fields query parameter is supplied, return only
requested fields from research_dataset.
"""
if (
"research_dataset" in res
and "view" in self.context
and "research_dataset_fields" in self.context["view"].request.query_params
):
research_dataset_fields = set(
self.context["view"]
.request.query_params.get("research_dataset_fields", "")
.split(",")
)
research_dataset = {
key: value
for (key, value) in res["research_dataset"].items()
if key in research_dataset_fields
}
return {**res, "research_dataset": research_dataset}
return res
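
A standalone sketch of the filtering step above (record content invented): keys not named in research_dataset_fields are simply dropped.

# Invented record: only the requested keys survive the comprehension.
res = {
    "identifier": "abc123",
    "research_dataset": {"title": {"en": "T"}, "description": {"en": "D"}, "files": []},
}
requested = set("title,description".split(","))
filtered = {key: value for key, value in res["research_dataset"].items() if key in requested}
assert filtered == {"title": {"en": "T"}, "description": {"en": "D"}}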

def to_representation(self, instance):
res = super(CatalogRecordSerializer, self).to_representation(instance)

@@ -281,7 +304,14 @@ def to_representation(self, instance):
res["alternate_record_set"] = [ar.identifier for ar in alternate_records]

if "dataset_version_set" in res:
res["dataset_version_set"] = instance.dataset_version_set.get_listing()
# avoid querying records when there are no other datasets in dataset_version_set
if (
hasattr(instance, "dataset_version_set__records__count")
and instance.dataset_version_set__records__count == 1
):
res["dataset_version_set"] = [instance.version_dict]
else:
res["dataset_version_set"] = instance.dataset_version_set.get_listing()

if "next_dataset_version" in res:
if instance.next_dataset_version.state == CatalogRecord.STATE_PUBLISHED:
@@ -322,6 +352,7 @@ def to_representation(self, instance):
if "request" in self.context and "file_details" in self.context["request"].query_params:
CRS.populate_file_details(res, self.context["request"])

res = self._filter_research_dataset_fields(res)
res = self._check_and_strip_sensitive_fields(instance, res)

return res
21 changes: 21 additions & 0 deletions src/metax_api/api/rest/base/views/dataset_view.py
@@ -10,6 +10,7 @@

from django.conf import settings
from django.db.models import Count
from django.http import Http404

from rest_framework import status
from rest_framework.decorators import action
@@ -74,6 +75,26 @@ def get_queryset(self):
data_catalog__catalog_json__identifier__in=settings.LEGACY_CATALOGS
)

if self.request.META["REQUEST_METHOD"] == "GET":
# Optimize dataset listing by prefetching related objects.
# Annotate results with number of records in dataset_version_set
            # to allow the serializer to skip querying other versions when there
# is only one.
return (
super()
.get_queryset()
.prefetch_related(
"data_catalog",
"dataset_version_set",
"preservation_dataset_version",
"preservation_dataset_origin_version",
"next_draft",
"draft_of",
"editor_permissions",
)
.annotate(Count("dataset_version_set__records"))
)

return super().get_queryset()
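
A sketch of why the serializer can rely on hasattr earlier in this diff (the model import path is assumed): Django exposes an alias-less Count() annotation under the auto-generated name "<lookup>__count", and prefetch_related() batches related-object lookups into one extra query per relation instead of one per row.

# Sketch, assuming the CatalogRecord model import path.
from django.db.models import Count

from metax_api.models import CatalogRecord

qs = (
    CatalogRecord.objects.prefetch_related("data_catalog")  # one batched query, not N
    .annotate(Count("dataset_version_set__records"))
)
record = qs.first()  # assuming at least one record exists
print(record.dataset_version_set__records__count)  # auto-named annotation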

def retrieve(self, request, *args, **kwargs):
43 changes: 25 additions & 18 deletions src/metax_api/models/catalog_record.py
@@ -158,25 +158,17 @@ def get_listing(self):
self.records(manager="objects_unfiltered")
.filter(state=CatalogRecord.STATE_PUBLISHED)
.order_by("-date_created")
.only(
"id",
"identifier",
"research_dataset",
"dataset_version_set_id",
"date_created",
"date_removed",
"removed",
)
)

versions = [
{
"identifier": r.identifier,
"preferred_identifier": r.preferred_identifier,
"removed": r.removed,
"date_created": r.date_created.astimezone().isoformat(),
"date_removed": r.date_removed.astimezone().isoformat() if r.date_removed else None,
}
for r in records
]

        # don't show the date_removed field at all if the value is None (record has not been removed)
versions = [
{key: value for (key, value) in i.items() if value is not None} for i in versions
]

return versions
return [r.version_dict for r in records]

def print_records(self): # pragma: no cover
for r in self.records.all():
@@ -1295,6 +1287,21 @@ def identifiers_dict(self):
except:
return {}

@property
def version_dict(self):
try:
val = {
"identifier": self.identifier,
"preferred_identifier": self.research_dataset["preferred_identifier"],
"date_created": self.date_created.astimezone().isoformat(),
"removed": self.removed,
}
            if self.removed and self.date_removed:
                val["date_removed"] = self.date_removed.astimezone().isoformat()
return val
except:
return {}
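
For illustration, version_dict yields entries like the following (values invented); date_removed appears only for removed records.

# Invented example of a version_dict payload for a non-removed record:
example_version = {
    "identifier": "cr955e904",
    "preferred_identifier": "urn:nbn:fi:att:example-dataset",
    "date_created": "2022-03-31T12:00:00+03:00",
    "removed": False,
}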

@property
def preferred_identifier(self):
try:
21 changes: 15 additions & 6 deletions src/metax_api/models/common.py
@@ -5,7 +5,7 @@
# :author: CSC - IT Center for Science Ltd., Espoo Finland <[email protected]>
# :license: MIT

from copy import deepcopy
import pickle

from dateutil import parser
from django.core.exceptions import FieldError
@@ -132,6 +132,16 @@ def modified_since(self, timestamp):

return timestamp < self.date_modified

def _deepcopy_field(self, field_value):
"""
Deep copy field value.
Pickle can be an order of magnitude faster than copy.deepcopy for
deeply nested fields like CatalogRecord.research_dataset.
"""
return pickle.loads(pickle.dumps(field_value))
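
A micro-benchmark sketch of that claim (payload shape and sizes invented; absolute numbers vary by machine):

# Compare copy.deepcopy with a pickle round-trip on a nested, JSON-like dict.
import pickle
import timeit
from copy import deepcopy

payload = {"files": [{"id": i, "title": {"en": f"file {i}"}} for i in range(500)]}

t_copy = timeit.timeit(lambda: deepcopy(payload), number=100)
t_pickle = timeit.timeit(lambda: pickle.loads(pickle.dumps(payload)), number=100)
print(f"deepcopy {t_copy:.3f}s vs pickle {t_pickle:.3f}s")
# Caveat: the pickle round-trip only works for picklable values, which holds
# for JSON fields such as research_dataset.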


def track_fields(self, *fields):
"""
Save initial values from object fields when object is created (= retrieved from db),
@@ -141,17 +151,16 @@ def track_fields(self, *fields):
field_name is a dict (a JSON field). For now only one level of nesting is supported.
If a need arises, can be made mega generic.
"""
for field_name in fields:

self._tracked_fields.append(field_name)
self._tracked_fields.extend(fields)

for field_name in fields:
if "." in field_name:
self._track_json_field(field_name)
else:
if self._field_is_loaded(field_name):
requested_field = getattr(self, field_name)
if isinstance(requested_field, dict):
self._initial_data[field_name] = deepcopy(requested_field)
self._initial_data[field_name] = self._deepcopy_field(requested_field)
else:
self._initial_data[field_name] = requested_field

@@ -173,7 +182,7 @@ def _track_json_field(self, field_name):
self._initial_data[field_name] = {}

if isinstance(json_field_value, dict):
self._initial_data[field_name][json_field_name] = deepcopy(json_field_value)
self._initial_data[field_name][json_field_name] = self._deepcopy_field(json_field_value)
else:
self._initial_data[field_name][json_field_name] = json_field_value

11 changes: 10 additions & 1 deletion src/metax_api/swagger/v1/swagger.yaml
@@ -1007,6 +1007,7 @@ paths:
required: false
type: string
- $ref: "#/parameters/fields"
- $ref: "#/parameters/research_dataset_fields"
- $ref: "#/parameters/include_legacy"
responses:
"200":
@@ -1261,6 +1262,7 @@ paths:
required: false
type: boolean
- $ref: "#/parameters/fields"
- $ref: "#/parameters/research_dataset_fields"
- $ref: "#/parameters/include_legacy"
responses:
'200':
@@ -2181,7 +2183,14 @@ parameters:
fields:
name: fields
in: query
description: Comma separated list of fields that is returned. Note that nested fields are not supported.
description: Comma separated list of fields that are returned. Note that nested fields are not supported.
required: false
type: string

research_dataset_fields:
name: research_dataset_fields
in: query
description: Comma separated list of fields in research_dataset that are returned.
required: false
type: string

11 changes: 10 additions & 1 deletion src/metax_api/swagger/v2/swagger.yaml
@@ -1013,6 +1013,7 @@ paths:
type: string
- $ref: "#/parameters/include_user_metadata"
- $ref: "#/parameters/fields"
- $ref: "#/parameters/research_dataset_fields"
- $ref: "#/parameters/include_legacy"
responses:
"200":
@@ -1282,6 +1283,7 @@ paths:
type: boolean
- $ref: "#/parameters/include_user_metadata"
- $ref: "#/parameters/fields"
- $ref: "#/parameters/research_dataset_fields"
- $ref: "#/parameters/include_legacy"
responses:
'200':
@@ -2482,7 +2484,14 @@ parameters:
fields:
name: fields
in: query
description: Comma separated list of fields that is returned. Note that nested fields are not supported.
description: Comma separated list of fields that are returned. Note that nested fields are not supported.
required: false
type: string

research_dataset_fields:
name: research_dataset_fields
in: query
description: Comma separated list of fields in research_dataset that are returned.
required: false
type: string

29 changes: 24 additions & 5 deletions src/metax_api/tests/api/rest/base/views/datasets/read.py
@@ -611,7 +611,11 @@ def test_agents_and_actors_with_ids(self):
)
cr.research_dataset["creator"] = []
cr.research_dataset["creator"].append(
{"@type": "Organization", "name": {"en": "Unique Organization"}, "identifier": "http://uri.suomi.fi/codelist/fairdata/organization/code/1234567"}
{
"@type": "Organization",
"name": {"en": "Unique Organization"},
"identifier": "http://uri.suomi.fi/codelist/fairdata/organization/code/1234567",
}
)
cr.force_save()

@@ -892,7 +896,10 @@ def test_filter_by_projects_for_end_user(self):
self._mock_token_validation_succeeds()
self._use_http_authorization(
method="bearer",
token={"group_names": ["IDA01:project_x", "IDA01:no_datasets_here"], "CSCUserName": "testi"}
token={
"group_names": ["IDA01:project_x", "IDA01:no_datasets_here"],
"CSCUserName": "testi",
},
)

response = self.client.get("/rest/datasets?projects=project_x&pagination=false")
@@ -967,17 +974,30 @@ def test_filter_by_legacy(self):

def test_filter_by_editor_permissions_user_ok(self):
cr = CatalogRecord.objects.get(pk=1)
cr.editor_permissions.users.update(user_id='test_user_x')
cr.editor_permissions.users.update(user_id="test_user_x")
        response = self.client.get("/rest/datasets?editor_permissions_user=test_user_x")
self.assertEqual(response.data["count"], 1)

def test_filter_by_editor_permissions_user_removed(self):
cr = CatalogRecord.objects.get(pk=1)
cr.editor_permissions.users.update(user_id='test_user_x')
cr.editor_permissions.users.update(user_id="test_user_x")
cr.editor_permissions.users.first().delete()
        response = self.client.get("/rest/datasets?editor_permissions_user=test_user_x")
self.assertEqual(response.data["count"], 0)

def test_research_dataset_fields(self):
cr = CatalogRecord.objects.get(pk=1)
expected_fields = {
"title": cr.research_dataset["title"],
"description": cr.research_dataset["description"],
"preferred_identifier": cr.research_dataset["preferred_identifier"],
}
response = self.client.get(
f"/rest/datasets/1?research_dataset_fields=title,description,preferred_identifier"
)
returned_fields = response.data["research_dataset"]
self.assertEqual(returned_fields, expected_fields)


class CatalogRecordApiReadXMLTransformationTests(CatalogRecordApiReadCommon):

@@ -1082,7 +1102,6 @@ def test_read_dataset_format_dummy_datacite_doi(self):
def _check_dataset_xml_format_response(self, response, element_name):
self.assertEqual(response.status_code, status.HTTP_200_OK)


self.assertEqual("<?xml version" in response.data[:20], True, response.data)
self.assertEqual(element_name in response.data[:60], True, response.data)
