From e9e4a3d0419597033b8f663c90ad069601d1d8a7 Mon Sep 17 00:00:00 2001 From: Muhammad Umar Khan <42294172+mumarkhan999@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:34:17 +0500 Subject: [PATCH] feat!: upgrade pymongo (#35179) --- Makefile | 2 + requirements/common_constraints.txt | 2 +- requirements/constraints.txt | 12 ++-- requirements/edx/base.txt | 10 ++- requirements/edx/development.txt | 8 ++- requirements/edx/doc.txt | 10 ++- requirements/edx/paver.txt | 4 +- requirements/edx/testing.txt | 11 +-- .../structures_pruning/requirements/base.txt | 4 +- .../requirements/testing.txt | 6 +- xmodule/contentstore/mongo.py | 65 +++++++++++++---- xmodule/modulestore/mongo/base.py | 72 ++++++++++++------- .../split_mongo/mongo_connection.py | 52 +++++++++++--- .../tests/test_mixed_modulestore.py | 41 ++++++++++- xmodule/mongo_utils.py | 33 +++++---- 15 files changed, 248 insertions(+), 84 deletions(-) diff --git a/Makefile b/Makefile index c70de65fb454..15bab5df67a9 100644 --- a/Makefile +++ b/Makefile @@ -137,6 +137,8 @@ compile-requirements: pre-requirements $(COMMON_CONSTRAINTS_TXT) ## Re-compile * mv requirements/common_constraints.tmp requirements/common_constraints.txt sed 's/Django<4.0//g' requirements/common_constraints.txt > requirements/common_constraints.tmp mv requirements/common_constraints.tmp requirements/common_constraints.txt + sed 's/event-tracking<2.4.1//g' requirements/common_constraints.txt > requirements/common_constraints.tmp + mv requirements/common_constraints.tmp requirements/common_constraints.txt pip-compile -v --allow-unsafe ${COMPILE_OPTS} -o requirements/pip.txt requirements/pip.in pip install -r requirements/pip.txt diff --git a/requirements/common_constraints.txt b/requirements/common_constraints.txt index ef8bc86061b7..9405a605c520 100644 --- a/requirements/common_constraints.txt +++ b/requirements/common_constraints.txt @@ -40,4 +40,4 @@ importlib-metadata<7 # We will pin event-tracking to do not break existing installations # This can be unpinned once https://github.com/openedx/edx-platform/issues/34586 # has been resolved and edx-platform is running with pymongo>=4.4.0 -event-tracking<2.4.1 + diff --git a/requirements/constraints.txt b/requirements/constraints.txt index 22b982b9ddab..b32989514d5d 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -32,9 +32,13 @@ django-oauth-toolkit==1.7.1 # incremental upgrade django-simple-history==3.4.0 -# constrained in opaque_keys. migration guide here: https://pymongo.readthedocs.io/en/4.0/migrate-to-pymongo4.html -# Major upgrade will be done in separate ticket. -pymongo<4.0.0 +# Adding pin to avoid any major upgrade +pymongo<4.4.1 + +# To override the constraint of edx-lint +# This can be removed once https://github.com/openedx/edx-platform/issues/34586 is resolved +# and the upstream constraint in edx-lint has been removed. +event-tracking==3.0.0 # greater version has breaking changes and requires some migration steps. django-webpack-loader==0.7.0 @@ -125,4 +129,4 @@ numpy<2.0.0 # django-storages==1.14.4 breaks course imports # Two lines were added in 1.14.4 that make file_exists_in_storage function always return False, # as the default value of AWS_S3_FILE_OVERWRITE is True -django-storages<1.14.4 \ No newline at end of file +django-storages<1.14.4 diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index 4b4356bd8a70..a7d2d457cc22 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -376,6 +376,10 @@ djangorestframework==3.14.0 # super-csv djangorestframework-xml==2.0.0 # via edx-enterprise +dnspython==2.6.1 + # via + # -r requirements/edx/paver.txt + # pymongo done-xblock==2.3.0 # via -r requirements/edx/bundled.in drf-jwt==1.19.2 @@ -536,9 +540,9 @@ enmerkar==0.7.1 # via enmerkar-underscore enmerkar-underscore==2.3.0 # via -r requirements/edx/kernel.in -event-tracking==2.4.0 +event-tracking==3.0.0 # via - # -c requirements/edx/../common_constraints.txt + # -c requirements/edx/../constraints.txt # -r requirements/edx/kernel.in # edx-completion # edx-proctoring @@ -865,7 +869,7 @@ pylti1p3==2.0.0 # via -r requirements/edx/kernel.in pymemcache==4.0.0 # via -r requirements/edx/paver.txt -pymongo==3.13.0 +pymongo==4.4.0 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/kernel.in diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt index f2e8ec77cf75..397923ac1e3b 100644 --- a/requirements/edx/development.txt +++ b/requirements/edx/development.txt @@ -614,8 +614,10 @@ djangorestframework-xml==2.0.0 # edx-enterprise dnspython==2.6.1 # via + # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt # email-validator + # pymongo docutils==0.21.2 # via # -r requirements/edx/doc.txt @@ -853,9 +855,9 @@ enmerkar-underscore==2.3.0 # via # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt -event-tracking==2.4.0 +event-tracking==3.0.0 # via - # -c requirements/edx/../common_constraints.txt + # -c requirements/edx/../constraints.txt # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt # edx-completion @@ -1535,7 +1537,7 @@ pymemcache==4.0.0 # via # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt -pymongo==3.13.0 +pymongo==4.4.0 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/doc.txt diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt index 20b3198c8048..182ff8eb8684 100644 --- a/requirements/edx/doc.txt +++ b/requirements/edx/doc.txt @@ -440,6 +440,10 @@ djangorestframework-xml==2.0.0 # via # -r requirements/edx/base.txt # edx-enterprise +dnspython==2.6.1 + # via + # -r requirements/edx/base.txt + # pymongo docutils==0.21.2 # via # pydata-sphinx-theme @@ -614,9 +618,9 @@ enmerkar==0.7.1 # enmerkar-underscore enmerkar-underscore==2.3.0 # via -r requirements/edx/base.txt -event-tracking==2.4.0 +event-tracking==3.0.0 # via - # -c requirements/edx/../common_constraints.txt + # -c requirements/edx/../constraints.txt # -r requirements/edx/base.txt # edx-completion # edx-proctoring @@ -1026,7 +1030,7 @@ pylti1p3==2.0.0 # via -r requirements/edx/base.txt pymemcache==4.0.0 # via -r requirements/edx/base.txt -pymongo==3.13.0 +pymongo==4.4.0 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/base.txt diff --git a/requirements/edx/paver.txt b/requirements/edx/paver.txt index bf7337f4147a..0b82d71c91e6 100644 --- a/requirements/edx/paver.txt +++ b/requirements/edx/paver.txt @@ -10,6 +10,8 @@ charset-normalizer==2.0.12 # via # -c requirements/edx/../constraints.txt # requests +dnspython==2.6.1 + # via pymongo edx-opaque-keys==2.10.0 # via -r requirements/edx/paver.in idna==3.7 @@ -36,7 +38,7 @@ psutil==6.0.0 # via -r requirements/edx/paver.in pymemcache==4.0.0 # via -r requirements/edx/paver.in -pymongo==3.13.0 +pymongo==4.4.0 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/paver.in diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt index e443719e8b1f..bf5c2817d519 100644 --- a/requirements/edx/testing.txt +++ b/requirements/edx/testing.txt @@ -476,7 +476,10 @@ djangorestframework-xml==2.0.0 # -r requirements/edx/base.txt # edx-enterprise dnspython==2.6.1 - # via email-validator + # via + # -r requirements/edx/base.txt + # email-validator + # pymongo done-xblock==2.3.0 # via -r requirements/edx/base.txt drf-jwt==1.19.2 @@ -650,9 +653,9 @@ enmerkar==0.7.1 # enmerkar-underscore enmerkar-underscore==2.3.0 # via -r requirements/edx/base.txt -event-tracking==2.4.0 +event-tracking==3.0.0 # via - # -c requirements/edx/../common_constraints.txt + # -c requirements/edx/../constraints.txt # -r requirements/edx/base.txt # edx-completion # edx-proctoring @@ -1141,7 +1144,7 @@ pylti1p3==2.0.0 # via -r requirements/edx/base.txt pymemcache==4.0.0 # via -r requirements/edx/base.txt -pymongo==3.13.0 +pymongo==4.4.0 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/base.txt diff --git a/scripts/structures_pruning/requirements/base.txt b/scripts/structures_pruning/requirements/base.txt index 87aa858e9f8e..828a81a8d4ed 100644 --- a/scripts/structures_pruning/requirements/base.txt +++ b/scripts/structures_pruning/requirements/base.txt @@ -11,11 +11,13 @@ click==8.1.6 # click-log click-log==0.4.0 # via -r scripts/structures_pruning/requirements/base.in +dnspython==2.6.1 + # via pymongo edx-opaque-keys==2.10.0 # via -r scripts/structures_pruning/requirements/base.in pbr==6.0.0 # via stevedore -pymongo==3.13.0 +pymongo==4.4.0 # via # -c scripts/structures_pruning/requirements/../../../requirements/constraints.txt # -r scripts/structures_pruning/requirements/base.in diff --git a/scripts/structures_pruning/requirements/testing.txt b/scripts/structures_pruning/requirements/testing.txt index 2590ca8ca52b..d74b204fad5c 100644 --- a/scripts/structures_pruning/requirements/testing.txt +++ b/scripts/structures_pruning/requirements/testing.txt @@ -12,6 +12,10 @@ click-log==0.4.0 # via -r scripts/structures_pruning/requirements/base.txt ddt==1.7.2 # via -r scripts/structures_pruning/requirements/testing.in +dnspython==2.6.1 + # via + # -r scripts/structures_pruning/requirements/base.txt + # pymongo edx-opaque-keys==2.10.0 # via -r scripts/structures_pruning/requirements/base.txt iniconfig==2.0.0 @@ -24,7 +28,7 @@ pbr==6.0.0 # stevedore pluggy==1.5.0 # via pytest -pymongo==3.13.0 +pymongo==4.4.0 # via # -r scripts/structures_pruning/requirements/base.txt # edx-opaque-keys diff --git a/xmodule/contentstore/mongo.py b/xmodule/contentstore/mongo.py index 66d9474cde7c..e44f03cede05 100644 --- a/xmodule/contentstore/mongo.py +++ b/xmodule/contentstore/mongo.py @@ -3,6 +3,7 @@ """ +import hashlib import json import os @@ -40,16 +41,29 @@ def __init__( # GridFS will throw an exception if the Database is wrapped in a MongoProxy. So don't wrap it. # The appropriate methods below are marked as autoretry_read - those methods will handle # the AutoReconnect errors. - proxy = False - mongo_db = connect_to_mongodb( - db, host, - port=port, tz_aware=tz_aware, user=user, password=password, proxy=proxy, **kwargs - ) + self.connection_params = { + 'db': db, + 'host': host, + 'port': port, + 'tz_aware': tz_aware, + 'user': user, + 'password': password, + 'proxy': False, + **kwargs + } + self.bucket = bucket + self.do_connection() + + def do_connection(self): + """ + Connects to mongodb. + """ + mongo_db = connect_to_mongodb(**self.connection_params) - self.fs = gridfs.GridFS(mongo_db, bucket) # pylint: disable=invalid-name + self.fs = gridfs.GridFS(mongo_db, self.bucket) # pylint: disable=invalid-name - self.fs_files = mongo_db[bucket + ".files"] # the underlying collection GridFS uses - self.chunks = mongo_db[bucket + ".chunks"] + self.fs_files = mongo_db[self.bucket + ".files"] # the underlying collection GridFS uses + self.chunks = mongo_db[self.bucket + ".chunks"] def close_connections(self): """ @@ -57,6 +71,25 @@ def close_connections(self): """ self.fs_files.database.client.close() + def ensure_connection(self): + """ + Ensure that mongodb connection is open. + """ + if self.check_connection(): + return + self.do_connection() + + def check_connection(self): + """ + Check if mongodb connection is open or not. + """ + connection = self.fs_files.database.client + try: + connection.admin.command('ping') + return True + except pymongo.errors.InvalidOperation: + return False + def _drop_database(self, database=True, collections=True, connections=True): """ A destructive operation to drop the underlying database and close all connections. @@ -69,8 +102,8 @@ def _drop_database(self, database=True, collections=True, connections=True): If connections is True, then close the connection to the database as well. """ + self.ensure_connection() connection = self.fs_files.database.client - if database: connection.drop_database(self.fs_files.database.name) elif collections: @@ -103,16 +136,22 @@ def save(self, content): # but many more objects have this in python3 and shouldn't be using the chunking logic. For string and # byte streams we write them directly to gridfs and convert them to byetarrys if necessary. if hasattr(content.data, '__iter__') and not isinstance(content.data, (bytes, (str,))): + custom_md5 = hashlib.md5() for chunk in content.data: fp.write(chunk) + custom_md5.update(chunk) + fp.custom_md5 = custom_md5.hexdigest() else: # Ideally we could just ensure that we don't get strings in here and only byte streams # but being confident of that wolud be a lot more work than we have time for so we just # handle both cases here. if isinstance(content.data, str): - fp.write(content.data.encode('utf-8')) + encoded_data = content.data.encode('utf-8') + fp.write(encoded_data) + fp.custom_md5 = hashlib.md5(encoded_data).hexdigest() else: fp.write(content.data) + fp.custom_md5 = hashlib.md5(content.data).hexdigest() return content @@ -142,12 +181,13 @@ def find(self, location, throw_on_not_found=True, as_stream=False): # lint-amne 'thumbnail', thumbnail_location[4] ) + return StaticContentStream( location, fp.displayname, fp.content_type, fp, last_modified_at=fp.uploadDate, thumbnail_location=thumbnail_location, import_path=getattr(fp, 'import_path', None), length=fp.length, locked=getattr(fp, 'locked', False), - content_digest=getattr(fp, 'md5', None), + content_digest=getattr(fp, 'custom_md5', None), ) else: with self.fs.get(content_id) as fp: @@ -161,12 +201,13 @@ def find(self, location, throw_on_not_found=True, as_stream=False): # lint-amne 'thumbnail', thumbnail_location[4] ) + return StaticContent( location, fp.displayname, fp.content_type, fp.read(), last_modified_at=fp.uploadDate, thumbnail_location=thumbnail_location, import_path=getattr(fp, 'import_path', None), length=fp.length, locked=getattr(fp, 'locked', False), - content_digest=getattr(fp, 'md5', None), + content_digest=getattr(fp, 'custom_md5', None), ) except NoFile: if throw_on_not_found: # lint-amnesty, pylint: disable=no-else-raise diff --git a/xmodule/modulestore/mongo/base.py b/xmodule/modulestore/mongo/base.py index c5cc935861d2..16a8c134c1d6 100644 --- a/xmodule/modulestore/mongo/base.py +++ b/xmodule/modulestore/mongo/base.py @@ -473,30 +473,9 @@ def __init__(self, contentstore, doc_store_config, fs_root, render_template, super().__init__(contentstore=contentstore, **kwargs) - def do_connection( - db, collection, host, port=27017, tz_aware=True, user=None, password=None, asset_collection=None, **kwargs - ): - """ - Create & open the connection, authenticate, and provide pointers to the collection - """ - # Set a write concern of 1, which makes writes complete successfully to the primary - # only before returning. Also makes pymongo report write errors. - kwargs['w'] = 1 - - self.database = connect_to_mongodb( - db, host, - port=port, tz_aware=tz_aware, user=user, password=password, - retry_wait_time=retry_wait_time, **kwargs - ) - - self.collection = self.database[collection] - - # Collection which stores asset metadata. - if asset_collection is None: - asset_collection = self.DEFAULT_ASSET_COLLECTION_NAME - self.asset_collection = self.database[asset_collection] - - do_connection(**doc_store_config) + self.doc_store_config = doc_store_config + self.retry_wait_time = retry_wait_time + self.do_connection(**self.doc_store_config) if default_class is not None: module_path, _, class_name = default_class.rpartition('.') @@ -523,6 +502,48 @@ def do_connection( self._course_run_cache = {} self.signal_handler = signal_handler + def check_connection(self): + """ + Check if mongodb connection is open or not. + """ + try: + # The ismaster command is cheap and does not require auth. + self.database.client.admin.command('ismaster') + return True + except pymongo.errors.InvalidOperation: + return False + + def ensure_connection(self): + """ + Ensure that mongodb connection is open. + """ + if self.check_connection(): + return + self.do_connection(**self.doc_store_config) + + def do_connection( + self, db, collection, host, port=27017, tz_aware=True, user=None, password=None, asset_collection=None, **kwargs + ): + """ + Create & open the connection, authenticate, and provide pointers to the collection + """ + # Set a write concern of 1, which makes writes complete successfully to the primary + # only before returning. Also makes pymongo report write errors. + kwargs['w'] = 1 + + self.database = connect_to_mongodb( + db, host, + port=port, tz_aware=tz_aware, user=user, password=password, + retry_wait_time=self.retry_wait_time, **kwargs + ) + + self.collection = self.database[collection] + + # Collection which stores asset metadata. + if asset_collection is None: + asset_collection = self.DEFAULT_ASSET_COLLECTION_NAME + self.asset_collection = self.database[asset_collection] + def close_connections(self): """ Closes any open connections to the underlying database @@ -541,6 +562,7 @@ def _drop_database(self, database=True, collections=True, connections=True): If connections is True, then close the connection to the database as well. """ + self.ensure_connection() # drop the assets super()._drop_database(database, collections, connections) @@ -872,6 +894,8 @@ def has_course(self, course_key, ignore_case=False, **kwargs): # lint-amnesty, course_query[key] = re.compile(r"(?i)^{}$".format(course_query[key])) else: course_query = {'_id': location.to_deprecated_son()} + + self.ensure_connection() course = self.collection.find_one(course_query, projection={'_id': True}) if course: return CourseKey.from_string('/'.join([ diff --git a/xmodule/modulestore/split_mongo/mongo_connection.py b/xmodule/modulestore/split_mongo/mongo_connection.py index bfb20fe0f5d5..9c00b0c22fc8 100644 --- a/xmodule/modulestore/split_mongo/mongo_connection.py +++ b/xmodule/modulestore/split_mongo/mongo_connection.py @@ -279,20 +279,30 @@ def __init__( #make sure the course index cache is fresh. RequestCache(namespace="course_index_cache").clear() - self.database = connect_to_mongodb( - db, host, - port=port, tz_aware=tz_aware, user=user, password=password, - retry_wait_time=retry_wait_time, **kwargs - ) - - self.course_index = self.database[collection + '.active_versions'] - self.structures = self.database[collection + '.structures'] - self.definitions = self.database[collection + '.definitions'] + self.collection = collection + self.connection_params = { + 'db': db, + 'host': host, + 'port': port, + 'tz_aware': tz_aware, + 'user': user, + 'password': password, + 'retry_wait_time': retry_wait_time, + **kwargs + } + + self.do_connection() # Is the MySQL subclass in use, passing through some reads/writes to us? If so this will be True. # If this MongoPersistenceBackend is being used directly (only MongoDB is involved), this is False. self.with_mysql_subclass = with_mysql_subclass + def do_connection(self): + self.database = connect_to_mongodb(**self.connection_params) + self.course_index = self.database[self.collection + '.active_versions'] + self.structures = self.database[self.collection + '.structures'] + self.definitions = self.database[self.collection + '.definitions'] + def heartbeat(self): """ Check that the db is reachable. @@ -304,6 +314,24 @@ def heartbeat(self): except pymongo.errors.ConnectionFailure: raise HeartbeatFailure(f"Can't connect to {self.database.name}", 'mongo') # lint-amnesty, pylint: disable=raise-missing-from + def check_connection(self): + """ + Check if mongodb connection is open or not. + """ + try: + self.database.client.admin.command("ping") + return True + except pymongo.errors.InvalidOperation: + return False + + def ensure_connection(self): + """ + Ensure that mongodb connection is open. + """ + if self.check_connection(): + return + self.do_connection() + def get_structure(self, key, course_context=None): """ Get the structure from the persistence mechanism whose id is the given key. @@ -502,6 +530,7 @@ def delete_course_index(self, course_key): """ Delete the course_index from the persistence mechanism whose id is the given course_index """ + self.ensure_connection() with TIMER.timer("delete_course_index", course_key): query = { key_attr: getattr(course_key, key_attr) @@ -561,7 +590,8 @@ def close_connections(self): Closes any open connections to the underlying databases """ RequestCache(namespace="course_index_cache").clear() - self.database.client.close() + if self.check_connection(): + self.database.client.close() def _drop_database(self, database=True, collections=True, connections=True): """ @@ -576,6 +606,8 @@ def _drop_database(self, database=True, collections=True, connections=True): If connections is True, then close the connection to the database as well. """ RequestCache(namespace="course_index_cache").clear() + + self.ensure_connection() connection = self.database.client if database: diff --git a/xmodule/modulestore/tests/test_mixed_modulestore.py b/xmodule/modulestore/tests/test_mixed_modulestore.py index 91292cd88d71..8adbfcb911a4 100644 --- a/xmodule/modulestore/tests/test_mixed_modulestore.py +++ b/xmodule/modulestore/tests/test_mixed_modulestore.py @@ -156,8 +156,8 @@ def setUp(self): tz_aware=True, ) self.connection.drop_database(self.DB) - self.addCleanup(self.connection.drop_database, self.DB) - self.addCleanup(self.connection.close) + self.addCleanup(self._drop_database) + self.addCleanup(self._close_connection) # define attrs which get set in initdb to quell pylint self.writable_chapter_location = self.store = self.fake_location = None @@ -165,6 +165,43 @@ def setUp(self): self.user_id = ModuleStoreEnum.UserID.test + def _check_connection(self): + """ + Check mongodb connection is open or not. + """ + try: + self.connection.admin.command('ping') + return True + except pymongo.errors.InvalidOperation: + return False + + def _ensure_connection(self): + """ + Make sure that mongodb connection is open. + """ + if not self._check_connection(): + self.connection = pymongo.MongoClient( + host=self.HOST, + port=self.PORT, + tz_aware=True, + ) + + def _drop_database(self): + """ + Drop mongodb database. + """ + self._ensure_connection() + self.connection.drop_database(self.DB) + + def _close_connection(self): + """ + Close mongodb connection. + """ + try: + self.connection.close() + except pymongo.errors.InvalidOperation: + pass + def _create_course(self, course_key, asides=None): """ Create a course w/ one item in the persistence store using the given course & item location. diff --git a/xmodule/mongo_utils.py b/xmodule/mongo_utils.py index 5daeff034e99..b86abd28b466 100644 --- a/xmodule/mongo_utils.py +++ b/xmodule/mongo_utils.py @@ -51,27 +51,30 @@ def connect_to_mongodb( if read_preference is not None: kwargs['read_preference'] = read_preference - mongo_conn = pymongo.database.Database( - pymongo.MongoClient( - host=host, - port=port, - tz_aware=tz_aware, - document_class=dict, - **kwargs - ), - db - ) + if 'replicaSet' in kwargs and kwargs['replicaSet'] == '': + kwargs['replicaSet'] = None + + connection_params = { + 'host': host, + 'port': port, + 'tz_aware': tz_aware, + 'document_class': dict, + **kwargs, + } + + if user is not None and password is not None and not db.startswith('test_'): + connection_params.update({'username': user, 'password': password, 'authSource': db}) + + mongo_conn = pymongo.MongoClient(**connection_params) if proxy: mongo_conn = MongoProxy( - mongo_conn, + mongo_conn[db], wait_time=retry_wait_time ) - # If credentials were provided, authenticate the user. - if user is not None and password is not None: - mongo_conn.authenticate(user, password, source=auth_source) + return mongo_conn - return mongo_conn + return mongo_conn[db] def create_collection_index(