From 88a3617dc64797154aae1d3dfcbf71032ca4ea52 Mon Sep 17 00:00:00 2001 From: Syed Muhammad Dawoud Sheraz Ali <40599381+DawoudSheraz@users.noreply.github.com> Date: Wed, 27 Dec 2023 13:22:19 +0500 Subject: [PATCH] perf: fix N+1 and some duplicate query issues across CatalogSerializer (#4206) * perf: fix N+1 and some duplicate query issues across CourseSerializer * chore: address review comments, run edx_lint to ignore broad-except --- course_discovery/apps/api/serializers.py | 27 +++++----- .../tests/test_views/test_affiliate_window.py | 4 +- .../api/v1/tests/test_views/test_catalogs.py | 2 +- .../api/v1/tests/test_views/test_courses.py | 16 +++--- .../data_loaders/csv_loader.py | 6 +-- .../tests/test_migrate_course_slugs.py | 2 + .../apps/course_metadata/models.py | 46 +++++++++++----- .../apps/course_metadata/tests/test_models.py | 8 ++- .../apps/course_metadata/tests/test_utils.py | 53 +++++++------------ .../apps/course_metadata/utils.py | 10 ++++ pylintrc | 7 +-- 11 files changed, 102 insertions(+), 79 deletions(-) diff --git a/course_discovery/apps/api/serializers.py b/course_discovery/apps/api/serializers.py index bfc085dc65..b1291addd1 100644 --- a/course_discovery/apps/api/serializers.py +++ b/course_discovery/apps/api/serializers.py @@ -1275,7 +1275,7 @@ class Meta(AbstractLocationRestrictionSerializer.Meta): class CourseSerializer(TaggitSerializer, MinimalCourseSerializer): """Serializer for the ``Course`` model.""" level_type = SlugRelatedTranslatableField(required=False, allow_null=True, slug_field='name_t', - queryset=LevelType.objects.all()) + queryset=LevelTypeSerializer.prefetch_queryset(LevelType.objects.all())) subjects = SlugRelatedFieldWithReadSerializer(slug_field='slug', required=False, many=True, queryset=SubjectSerializer.prefetch_queryset(), read_serializer=SubjectSerializer()) @@ -1307,20 +1307,14 @@ class CourseSerializer(TaggitSerializer, MinimalCourseSerializer): geolocation = GeoLocationSerializer(required=False, allow_null=True) location_restriction = CourseLocationRestrictionSerializer(required=False) in_year_value = ProductValueSerializer(required=False) - product_source = serializers.SlugRelatedField(required=False, slug_field='slug', queryset=Source.objects.all()) + product_source = SlugRelatedFieldWithReadSerializer( + required=False, slug_field='slug', queryset=Source.objects.all(), + read_serializer=SourceSerializer() + ) watchers = serializers.ListField( child=serializers.EmailField(), allow_empty=True, allow_null=True, required=False ) - def to_representation(self, instance): - """ - Conversion of the source slug to the source serializer data - """ - representation = super().to_representation(instance) - if representation.get('product_source'): - representation['product_source'] = SourceSerializer(Source.objects.get(slug=representation['product_source'])).data # pylint: disable=line-too-long - return representation - def get_organization_logo_override_url(self, obj): logo_image_override = getattr(obj, 'organization_logo_override', None) if logo_image_override: @@ -1667,14 +1661,19 @@ def prefetch_queryset(cls, partner, queryset=None, course_runs=None): filtered CourseRun queryset. """ queryset = queryset if queryset is not None else Course.objects.filter(partner=partner) - - return queryset.select_related('level_type', 'video', 'partner').prefetch_related( + return queryset.select_related( + 'type', '_official_version', 'level_type', 'video', 'partner', 'additional_metadata', + 'location_restriction', 'in_year_value', 'product_source', 'canonical_course_run' + ).prefetch_related( 'expected_learning_items', + 'collaborators', 'prerequisites', - 'subjects', + 'subjects__translations', 'url_slug_history', + 'topics', 'editors', 'url_redirects', + 'level_type__translations', cls.prefetch_course_runs(CourseRunSerializer, course_runs), Prefetch('authoring_organizations', queryset=OrganizationSerializer.prefetch_queryset(partner)), Prefetch('sponsoring_organizations', queryset=OrganizationSerializer.prefetch_queryset(partner)), diff --git a/course_discovery/apps/api/v1/tests/test_views/test_affiliate_window.py b/course_discovery/apps/api/v1/tests/test_views/test_affiliate_window.py index 75b8277820..e6012fba67 100644 --- a/course_discovery/apps/api/v1/tests/test_views/test_affiliate_window.py +++ b/course_discovery/apps/api/v1/tests/test_views/test_affiliate_window.py @@ -275,8 +275,8 @@ def test_dtd_with_valid_data(self): assert response.status_code == 200 filename = abspath(join(dirname(dirname(__file__)), 'affiliate_window_product_feed.1.4.dtd')) - dtd = etree.DTD(open(filename)) # lint-amnesty, pylint: disable=consider-using-with,c-extension-no-member - root = etree.XML(response.content) # pylint: disable=c-extension-no-member + dtd = etree.DTD(open(filename)) # lint-amnesty, pylint: disable=consider-using-with + root = etree.XML(response.content) assert dtd.validate(root) def test_permissions(self): diff --git a/course_discovery/apps/api/v1/tests/test_views/test_catalogs.py b/course_discovery/apps/api/v1/tests/test_views/test_catalogs.py index d21994bf51..c294e1384c 100644 --- a/course_discovery/apps/api/v1/tests/test_views/test_catalogs.py +++ b/course_discovery/apps/api/v1/tests/test_views/test_catalogs.py @@ -317,7 +317,7 @@ def test_courses(self, state): # to be included. filtered_course_run = CourseRunFactory(course=course) - with self.assertNumQueries(48, threshold=3): + with self.assertNumQueries(38, threshold=3): response = self.client.get(url) assert response.status_code == 200 diff --git a/course_discovery/apps/api/v1/tests/test_views/test_courses.py b/course_discovery/apps/api/v1/tests/test_views/test_courses.py index 2fa1573edd..6d090a32c1 100644 --- a/course_discovery/apps/api/v1/tests/test_views/test_courses.py +++ b/course_discovery/apps/api/v1/tests/test_views/test_courses.py @@ -103,7 +103,7 @@ def test_get(self): """ Verify the endpoint returns the details for a single course. """ url = reverse('api:v1:course-detail', kwargs={'key': self.course.key}) - with self.assertNumQueries(48, threshold=3): + with self.assertNumQueries(43, threshold=3): response = self.client.get(url) assert response.status_code == 200 assert response.data == self.serialize_course(self.course) @@ -112,7 +112,7 @@ def test_get_uuid(self): """ Verify the endpoint returns the details for a single course with UUID. """ url = reverse('api:v1:course-detail', kwargs={'key': self.course.uuid}) - with self.assertNumQueries(47): + with self.assertNumQueries(44): response = self.client.get(url) assert response.status_code == 200 assert response.data == self.serialize_course(self.course) @@ -121,7 +121,7 @@ def test_get_exclude_deleted_programs(self): """ Verify the endpoint returns no deleted associated programs """ ProgramFactory(courses=[self.course], status=ProgramStatus.Deleted) url = reverse('api:v1:course-detail', kwargs={'key': self.course.key}) - with self.assertNumQueries(47): + with self.assertNumQueries(43): response = self.client.get(url) assert response.status_code == 200 assert response.data.get('programs') == [] @@ -134,7 +134,7 @@ def test_get_include_deleted_programs(self): ProgramFactory(courses=[self.course], status=ProgramStatus.Deleted) url = reverse('api:v1:course-detail', kwargs={'key': self.course.key}) url += '?include_deleted_programs=1' - with self.assertNumQueries(50): + with self.assertNumQueries(47): response = self.client.get(url) assert response.status_code == 200 assert response.data == self.serialize_course(self.course, extra_context={'include_deleted_programs': True}) @@ -282,7 +282,7 @@ def test_list(self): """ Verify the endpoint returns a list of all courses. """ url = reverse('api:v1:course-list') - with self.assertNumQueries(35, threshold=3): + with self.assertNumQueries(28, threshold=3): response = self.client.get(url) assert response.status_code == 200 self.assertListEqual( @@ -300,7 +300,7 @@ def test_list_query(self): # Known to be flaky prior to the addition of tearDown() # and logout() code which is the same number of additional queries - with self.assertNumQueries(52, threshold=3): + with self.assertNumQueries(40, threshold=3): response = self.client.get(url) self.assertListEqual(response.data['results'], self.serialize_course(courses, many=True)) @@ -310,7 +310,7 @@ def test_list_key_filter(self): keys = ','.join([course.key for course in courses]) url = '{root}?{params}'.format(root=reverse('api:v1:course-list'), params=urlencode({'keys': keys})) - with self.assertNumQueries(51, threshold=3): + with self.assertNumQueries(39, threshold=3): response = self.client.get(url) self.assertListEqual(response.data['results'], self.serialize_course(courses, many=True)) @@ -322,7 +322,7 @@ def test_list_uuid_filter(self): # Increasing threshold because Course skill fetch SQL queries are executed twice # on CI. Listing returns skill details and skill names as two separate fields. # TODO: Figure out why the cache behavior is not working as expected on CI. - with self.assertNumQueries(51, threshold=5): + with self.assertNumQueries(40, threshold=5): response = self.client.get(url) self.assertListEqual(response.data['results'], self.serialize_course(courses, many=True)) diff --git a/course_discovery/apps/course_metadata/data_loaders/csv_loader.py b/course_discovery/apps/course_metadata/data_loaders/csv_loader.py index 0a3514d5ca..203a7963e9 100644 --- a/course_discovery/apps/course_metadata/data_loaders/csv_loader.py +++ b/course_discovery/apps/course_metadata/data_loaders/csv_loader.py @@ -176,14 +176,14 @@ def ingest(self): # pylint: disable=too-many-statements course_run, is_course_run_created = self._get_or_create_course_run( row, course, course_type, course_run_type.uuid ) - except Exception as exc: + except Exception as exc: # pylint: disable=broad-except logger.exception(exc) continue else: logger.info("Course key {} could not be found in database, creating the course.".format(course_key)) # lint-amnesty, pylint: disable=logging-format-interpolation try: _ = self._create_course(row, course_type, course_run_type.uuid) - except Exception as exc: + except Exception as exc: # pylint: disable=broad-except exception_message = exc if hasattr(exc, 'response'): exception_message = exc.response.content.decode('utf-8') @@ -316,7 +316,7 @@ def _get_or_create_course_run(self, data, course, course_type, course_run_type_u last_run = course_runs.last() _ = self._create_course_run(data, course, course_type, course_run_type_uuid, last_run.key) is_course_run_created = True - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: exception_message = exc if hasattr(exc, 'response'): exception_message = exc.response.content.decode('utf-8') diff --git a/course_discovery/apps/course_metadata/management/commands/tests/test_migrate_course_slugs.py b/course_discovery/apps/course_metadata/management/commands/tests/test_migrate_course_slugs.py index a735bb39f9..30137701e9 100644 --- a/course_discovery/apps/course_metadata/management/commands/tests/test_migrate_course_slugs.py +++ b/course_discovery/apps/course_metadata/management/commands/tests/test_migrate_course_slugs.py @@ -1,6 +1,7 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.core.management import call_command from django.test import TestCase +from edx_django_utils.cache import RequestCache from edx_toggles.toggles.testutils import override_waffle_switch from slugify import slugify from testfixtures import LogCapture @@ -50,6 +51,7 @@ def setUp(self): product_source=self.product_source, partner=partner, title='test_course' ) self.course3_non_draft.url_slug_history.add(course_url_slug) + RequestCache("active_url_cache").clear() self.csv_header = 'course_uuids\n' self.csv_file_content = self.csv_header + str(self.course1.uuid) + '\n' + str(self.course2.uuid) self.csv_file = SimpleUploadedFile( diff --git a/course_discovery/apps/course_metadata/models.py b/course_discovery/apps/course_metadata/models.py index 0bc68349b6..8dbc7973d7 100644 --- a/course_discovery/apps/course_metadata/models.py +++ b/course_discovery/apps/course_metadata/models.py @@ -23,6 +23,7 @@ from django_elasticsearch_dsl.registries import registry from django_extensions.db.fields import AutoSlugField from django_extensions.db.models import TimeStampedModel +from edx_django_utils.cache import RequestCache, get_cache_key from elasticsearch.exceptions import RequestError from elasticsearch_dsl.query import Q as ESDSLQ from localflavor.us.us_states import CONTIGUOUS_STATES @@ -58,8 +59,9 @@ IS_SUBDIRECTORY_SLUG_FORMAT_FOR_EXEC_ED_ENABLED ) from course_discovery.apps.course_metadata.utils import ( - UploadToFieldNamePath, clean_query, custom_render_variations, get_slug_for_course, is_ocm_course, - push_to_ecommerce_for_course_run, push_tracks_to_lms_for_course_run, set_official_state, subtract_deadline_delta + UploadToFieldNamePath, clean_query, clear_slug_request_cache_for_course, custom_render_variations, + get_slug_for_course, is_ocm_course, push_to_ecommerce_for_course_run, push_tracks_to_lms_for_course_run, + set_official_state, subtract_deadline_delta ) from course_discovery.apps.ietf_language_tags.models import LanguageTag from course_discovery.apps.publisher.utils import VALID_CHARS_IN_COURSE_NUM_AND_ORG_KEY @@ -1587,11 +1589,22 @@ def active_url_slug(self): """ Official rows just return whatever slug is active, draft rows will first look for an associated active slug and, if they fail to find one, take the slug associated with the official course that has is_active_on_draft: True.""" - active_url = self.url_slug_history.filter(is_active=True).first() + active_url_cache = RequestCache("active_url_cache") + cache_key = get_cache_key(course_uuid=self.uuid, draft=self.draft) + cached_active_url = active_url_cache.get_cached_response(cache_key) + if cached_active_url.is_found: + return cached_active_url.value + + active_url = CourseUrlSlug.objects.filter(course=self, is_active=True).first() + if not active_url and self.draft and self.official_version: # current draft url slug has already been published at least once, so get it from the official course - active_url = self.official_version.url_slug_history.filter(is_active_on_draft=True).first() - return getattr(active_url, 'url_slug', None) + active_url = CourseUrlSlug.objects.filter(course=self.official_version, is_active_on_draft=True).first() + + url_slug = getattr(active_url, 'url_slug', None) + if url_slug: + active_url_cache.set(cache_key, url_slug) + return url_slug def get_course_url_path(self): """ @@ -1684,15 +1697,19 @@ def course_run_statuses(self): Returns all unique course run status values inside this course. Note that it skips hidden courses - this list is typically used for presentational purposes. + The code performs the filtering on Python level instead of ORM/SQL because filtering on course_runs + invalidates the prefetch on API level. """ - statuses = [] + statuses = set() now = datetime.datetime.now(pytz.UTC) - runs = self.course_runs.exclude(hidden=True) - if runs.filter(status=CourseRunStatus.Unpublished, end__lt=now).exists(): - statuses = ['archived'] - runs = runs.exclude(status=CourseRunStatus.Unpublished, end__lt=now) - statuses += list(runs.values_list('status', flat=True).distinct().order_by('status')) - return statuses + for course_run in self.course_runs.all(): + if course_run.hidden: + continue + if course_run.end and course_run.end < now and course_run.status == CourseRunStatus.Unpublished: + statuses.add('archived') + else: + statuses.add(course_run.status) + return sorted(list(statuses)) def unpublish_inactive_runs(self, published_runs=None): """ @@ -1767,8 +1784,11 @@ def set_active_url_slug(self, slug): logger.info('The current slug is {}; The slug to be set is {}; Current course is a draft: {}' # lint-amnesty, pylint: disable=logging-format-interpolation .format(self.url_slug, slug, self.draft)) + # There are too many branches in this method, and it is ok to clear cache than to serve stale data. + clear_slug_request_cache_for_course(self.uuid) + if slug: - # case 0: if slug is already in use with another, rasie an IntegrityError + # case 0: if slug is already in use with another, raise an IntegrityError excluded_course = self.official_version if self.draft else self.draft_version slug_already_in_use = CourseUrlSlug.objects.filter(url_slug__iexact=slug.lower()).exclude( course__in=[self, excluded_course]).exists() diff --git a/course_discovery/apps/course_metadata/tests/test_models.py b/course_discovery/apps/course_metadata/tests/test_models.py index 993bd723c0..91d5650a76 100644 --- a/course_discovery/apps/course_metadata/tests/test_models.py +++ b/course_discovery/apps/course_metadata/tests/test_models.py @@ -18,6 +18,7 @@ from django.core.management import call_command from django.db import IntegrityError, transaction from django.test import TestCase, override_settings +from edx_django_utils.cache import RequestCache from edx_toggles.toggles.testutils import override_waffle_switch from freezegun import freeze_time from slugify import slugify @@ -562,6 +563,9 @@ def test_set_active_url_slug__draft_with_official_version(self): non_draft_course = CourseFactory(draft_version=draft_course, title=draft_course.title, key=draft_course.key) draft_course.url_slug_history.all().delete() non_draft_course.url_slug_history.all().delete() + # Need to clear cache explicitly as marketing_url creation, that uses active_url_slug, sets the + # cache with factory data + RequestCache("active_url_cache").clear() draft_previous_data_modified_timestamp = draft_course.data_modified_timestamp non_draft_previous_data_modified_timestamp = non_draft_course.data_modified_timestamp with LogCapture(LOGGER_PATH) as logger: @@ -590,7 +594,9 @@ def test_set_active_url_slug__draft_with_official_version_matching_slug(self): non_draft_course = CourseFactory(draft_version=draft_course, title=draft_course.title, key=draft_course.key) draft_course.url_slug_history.all().delete() non_draft_course.url_slug_history.all().delete() - + # Need to clear cache explicitly as marketing_url creation, that uses active_url_slug, sets the + # cache with factory data + RequestCache("active_url_cache").clear() CourseUrlSlugFactory(course=draft_course, is_active=True, is_active_on_draft=True, url_slug='test-course') CourseUrlSlugFactory(course=non_draft_course, is_active=True, is_active_on_draft=False, url_slug='slug1') non_draft_slug_obj = CourseUrlSlugFactory( diff --git a/course_discovery/apps/course_metadata/tests/test_utils.py b/course_discovery/apps/course_metadata/tests/test_utils.py index 91bb155204..74b43299fd 100644 --- a/course_discovery/apps/course_metadata/tests/test_utils.py +++ b/course_discovery/apps/course_metadata/tests/test_utils.py @@ -12,6 +12,7 @@ from django.conf import settings from django.core.exceptions import ValidationError from django.test import TestCase +from edx_django_utils.cache import RequestCache from edx_toggles.toggles.testutils import override_waffle_switch from slugify import slugify @@ -1287,49 +1288,33 @@ def test_get_slug_for_course__with_no_url_slug(self): organization = OrganizationFactory(name='test-organization') course.authoring_organizations.add(organization) CourseUrlSlug.objects.filter(course=course).delete() + RequestCache("active_url_cache").clear() slug, error = utils.get_slug_for_course(course) assert error is None assert slug == f"learn/{subject.slug}/{organization.name}-{course.title}" def test_get_slug_for_course__with_existing_url_slug(self): + """ + Verify that get slug utility factors in courses with same org, title, and subject in subdirectory + slug generation by prefixing the slugs with increasing integers. + """ partner = PartnerFactory() - course1 = CourseFactory(title='test-title') subject = SubjectFactory(name='business') - course1.subjects.add(subject) organization = OrganizationFactory(name='test-organization') - course1.authoring_organizations.add(organization) - course1.partner = partner - course1.save() - CourseUrlSlug.objects.filter(course=course1).delete() - slug, error = utils.get_slug_for_course(course1) - course1.set_active_url_slug(slug) - - # duplicate a new course with same title, subject and organization - course2 = CourseFactory(title='test-title') - subject = SubjectFactory(name='business') - course2.subjects.add(subject) - organization = OrganizationFactory(name='test-organization') - course2.authoring_organizations.add(organization) - course2.partner = partner - course2.save() - CourseUrlSlug.objects.filter(course=course2).delete() - slug, error = utils.get_slug_for_course(course2) - assert error is None - assert slug == f"learn/{subject.slug}/{organization.name}-{course2.title}-2" + for course_count in range(3): + course = CourseFactory(title='test-title') + course.subjects.add(subject) + course.authoring_organizations.add(organization) + course.partner = partner + course.save() + CourseUrlSlug.objects.filter(course=course).delete() + RequestCache("active_url_cache").clear() + slug, error = utils.get_slug_for_course(course) - course2.set_active_url_slug(slug) - # duplicate a new course with same title, subject and organization - course3 = CourseFactory(title='test-title') - subject = SubjectFactory(name='business') - course3.subjects.add(subject) - organization = OrganizationFactory(name='test-organization') - course3.authoring_organizations.add(organization) - course3.partner = partner - course3.save() - CourseUrlSlug.objects.filter(course=course3).delete() - slug, error = utils.get_slug_for_course(course3) - assert error is None - assert slug == f"learn/{subject.slug}/{organization.name}-{course3.title}-3" + assert error is None + slug_end_prefix = f"-{course_count+1}" if course_count else "" + assert slug == f"learn/{subject.slug}/{organization.name}-{course.title}{slug_end_prefix}" + course.set_active_url_slug(slug) def test_get_slug_for_exec_ed_course__with_existing_url_slug(self): ee_type_2u = CourseTypeFactory(slug=CourseType.EXECUTIVE_EDUCATION_2U) diff --git a/course_discovery/apps/course_metadata/utils.py b/course_discovery/apps/course_metadata/utils.py index 9684e2ed8c..82b22176ba 100644 --- a/course_discovery/apps/course_metadata/utils.py +++ b/course_discovery/apps/course_metadata/utils.py @@ -19,6 +19,7 @@ from django.utils.functional import cached_property from django.utils.translation import gettext as _ from dynamic_filenames import FilePattern +from edx_django_utils.cache import RequestCache, get_cache_key from getsmarter_api_clients.geag import GetSmarterEnterpriseApiClient from slugify import slugify from stdimage.models import StdImageFieldFile @@ -1220,3 +1221,12 @@ def transform_dict_keys(data): transformed_dict[updated_key] = value return transformed_dict + + +def clear_slug_request_cache_for_course(course_uuid): + """ + Clear request cache for both draft and non-draft entries to ensure data consistency. + """ + active_url_cache = RequestCache("active_url_cache") + active_url_cache.delete(get_cache_key(course_uuid=course_uuid, draft=True)) + active_url_cache.delete(get_cache_key(course_uuid=course_uuid, draft=False)) diff --git a/pylintrc b/pylintrc index cd2feb3cf6..85f3dc665a 100644 --- a/pylintrc +++ b/pylintrc @@ -64,7 +64,7 @@ # SERIOUSLY. # # ------------------------------ -# Generated by edx-lint version: 5.3.0 +# Generated by edx-lint version: 5.3.6 # ------------------------------ [MASTER] ignore = ,migrations, settings, wsgi.py @@ -259,6 +259,7 @@ enable = useless-suppression, disable = bad-indentation, + broad-exception-raised, consider-using-f-string, duplicate-code, file-ignored, @@ -380,6 +381,6 @@ ext-import-graph = int-import-graph = [EXCEPTIONS] -overgeneral-exceptions = Exception +overgeneral-exceptions = builtins.Exception -# 8a8966b1f11ff0d7d10e4220af6c96d5f23e1b8c +# 27dc71699a03ebc866b520f139a3199397ec887d