From 75ed4f5e05af81891a4a47f51e13de21586947e0 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Fri, 27 Dec 2024 10:19:16 +0100 Subject: [PATCH 01/10] =?UTF-8?q?=E2=9C=A8(backend)=20add=20util=20to=20ex?= =?UTF-8?q?tract=20text=20from=20Ydoc=20content?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documents content is stored in the Ydoc format. We need a util to extract it as xml/text. --- CHANGELOG.md | 1 + Dockerfile | 7 ++++ src/backend/core/factories.py | 18 ++++++++- src/backend/core/tests/test_utils.py | 37 +++++++++++++++++++ .../tests/test_utils_base64_yjs_to_text.py | 29 +++++++++++++++ src/backend/core/utils.py | 25 +++++++++++++ src/backend/pyproject.toml | 4 +- 7 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 src/backend/core/tests/test_utils.py create mode 100644 src/backend/core/tests/test_utils_base64_yjs_to_text.py create mode 100644 src/backend/core/utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e7f3713b..ccb845c44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to ## Added +- ⚗️(backend) add util to extract text from base64 yjs document - ✨(backend) add soft delete and restore API endpoints to documents #516 - ✨(backend) allow organizing documents in a tree structure #516 - ✨(backend) add "excerpt" field to document list serializer #516 diff --git a/Dockerfile b/Dockerfile index 60f464ee0..23f26b706 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,13 @@ FROM base AS back-builder WORKDIR /builder +# Install Rust and Cargo using Alpine's package manager +RUN apk add --no-cache \ + build-base \ + libffi-dev \ + rust \ + cargo + # Copy required python dependencies COPY ./src/backend /builder diff --git a/src/backend/core/factories.py b/src/backend/core/factories.py index 3f2c085fb..d0a641d8e 100644 --- a/src/backend/core/factories.py +++ b/src/backend/core/factories.py @@ -13,6 +13,22 @@ fake = Faker() +YDOC_HELLO_WORLD_BASE64 = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" +) + class UserFactory(factory.django.DjangoModelFactory): """A factory to random users for testing purposes.""" @@ -75,7 +91,7 @@ class Meta: title = factory.Sequence(lambda n: f"document{n}") excerpt = factory.Sequence(lambda n: f"excerpt{n}") - content = factory.Sequence(lambda n: f"content{n}") + content = YDOC_HELLO_WORLD_BASE64 creator = factory.SubFactory(UserFactory) deleted_at = None link_reach = factory.fuzzy.FuzzyChoice( diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py new file mode 100644 index 000000000..4fa33e1ef --- /dev/null +++ b/src/backend/core/tests/test_utils.py @@ -0,0 +1,37 @@ +"""Test util base64_yjs_to_text.""" + +from core import utils + +# This base64 string is an example of what is saved in the database. +# This base64 is generated from the blocknote editor, it contains +# the text \n# *Hello* \n- w**or**ld +TEST_BASE64_STRING = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" +) + + +def test_utils_base64_yjs_to_text(): + """Test extract text from saved yjs document""" + assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world" + + +def test_utils_base64_yjs_to_xml(): + """Test extract xml from saved yjs document""" + content = utils.base64_yjs_to_xml(TEST_BASE64_STRING) + assert ( + 'Hello' in content + or 'Hello' in content + ) + assert 'world' in content diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py new file mode 100644 index 000000000..376bb85d3 --- /dev/null +++ b/src/backend/core/tests/test_utils_base64_yjs_to_text.py @@ -0,0 +1,29 @@ +"""Test util base64_yjs_to_text.""" + +from core.utils import base64_yjs_to_text + + +def test_base64_yjs_to_text(): + """ + Test extract_text_from_saved_yjs_document + This base64 string is an example of what is saved in the database. + This base64 is generated from the blocknote editor, it contains + the text \n# *Hello* \n- w**or**ld + """ + base64_string = ( + "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh" + "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI" + "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y" + "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm" + "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y" + "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt" + "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE" + "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck" + "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH" + "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv" + "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA" + "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J" + "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA" + ) + + assert base64_yjs_to_text(base64_string) == "Hello world" diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py new file mode 100644 index 000000000..bd2e0170c --- /dev/null +++ b/src/backend/core/utils.py @@ -0,0 +1,25 @@ +"""Utils for the core app.""" + +import base64 + +import y_py as Y +from bs4 import BeautifulSoup + + +def base64_yjs_to_xml(base64_string): + """Extract xml from base64 yjs document.""" + + decoded_bytes = base64.b64decode(base64_string) + uint8_array = bytearray(decoded_bytes) + + doc = Y.YDoc() # pylint: disable=E1101 + Y.apply_update(doc, uint8_array) # pylint: disable=E1101 + return str(doc.get_xml_element("document-store")) + + +def base64_yjs_to_text(base64_string): + """Extract text from base64 yjs document.""" + + blocknote_structure = base64_yjs_to_xml(base64_string) + soup = BeautifulSoup(blocknote_structure, "html.parser") + return soup.get_text(separator=" ").strip() diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index 987208219..491e2f037 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -26,6 +26,7 @@ readme = "README.md" requires-python = ">=3.12" dependencies = [ "boto3==1.36.7", + "beautifulsoup4==4.12.3", "Brotli==1.1.0", "celery[redis]==5.4.0", "django-configurations==2.5.1", @@ -47,6 +48,7 @@ dependencies = [ "gunicorn==23.0.0", "jsonschema==4.23.0", "markdown==3.7", + "mozilla-django-oidc==4.0.1", "nested-multipart-parser==1.5.0", "openai==1.60.2", "psycopg[binary]==3.2.4", @@ -56,7 +58,7 @@ dependencies = [ "sentry-sdk==2.20.0", "url-normalize==1.4.3", "whitenoise==6.8.2", - "mozilla-django-oidc==4.0.1", + "y-py==0.6.2", ] [project.urls] From 7d496cde024c73aba7a639bf57f6eadd8c013648 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Fri, 27 Dec 2024 17:19:16 +0100 Subject: [PATCH 02/10] =?UTF-8?q?=E2=99=BB=EF=B8=8F(backend)=20refactor=20?= =?UTF-8?q?media=5Fauth=20and=20collaboration=5Fauth=20for=20flexibility?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These 2 actions had factorized code but a few iterations lead to spaghetti code where factorized code includes "if" clauses. Refactor abstractions so that code factorization really works. --- src/backend/core/api/viewsets.py | 109 +++++++++++++++---------------- 1 file changed, 52 insertions(+), 57 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index e9616affc..7d585aad3 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -935,10 +935,10 @@ def attachment_upload(self, request, *args, **kwargs): status=drf.status.HTTP_201_CREATED, ) - def _authorize_subrequest(self, request, pattern): + def _auth_get_original_url(self, request): """ - Shared method to authorize access based on the original URL of an Nginx subrequest - and user permissions. Returns a dictionary of URL parameters if authorized. + Extracts and parses the original URL from the "HTTP_X_ORIGINAL_URL" header. + Raises PermissionDenied if the header is missing. The original url is passed by nginx in the "HTTP_X_ORIGINAL_URL" header. See corresponding ingress configuration in Helm chart and read about the @@ -949,14 +949,6 @@ def _authorize_subrequest(self, request, pattern): to let this request go through (by returning a 200 code) or if we block it (by returning a 403 error). Note that we return 403 errors without any further details for security reasons. - - Parameters: - - pattern: The regex pattern to extract identifiers from the URL. - - Returns: - - A dictionary of URL parameters if the request is authorized. - Raises: - - PermissionDenied if authorization fails. """ # Extract the original URL from the request header original_url = request.META.get("HTTP_X_ORIGINAL_URL") @@ -964,52 +956,32 @@ def _authorize_subrequest(self, request, pattern): logger.debug("Missing HTTP_X_ORIGINAL_URL header in subrequest") raise drf.exceptions.PermissionDenied() - parsed_url = urlparse(original_url) - match = pattern.search(parsed_url.path) - - # If the path does not match the pattern, try to extract the parameters from the query - if not match: - match = pattern.search(parsed_url.query) - - if not match: - logger.debug( - "Subrequest URL '%s' did not match pattern '%s'", - parsed_url.path, - pattern, - ) - raise drf.exceptions.PermissionDenied() + logger.debug("Original url: '%s'", original_url) + return urlparse(original_url) + def _auth_get_url_params(self, pattern, fragment): + """ + Extracts URL parameters from the given fragment using the specified regex pattern. + Raises PermissionDenied if parameters cannot be extracted. + """ + match = pattern.search(fragment) try: - url_params = match.groupdict() + return match.groupdict() except (ValueError, AttributeError) as exc: logger.debug("Failed to extract parameters from subrequest URL: %s", exc) raise drf.exceptions.PermissionDenied() from exc - pk = url_params.get("pk") - if not pk: - logger.debug("Document ID (pk) not found in URL parameters: %s", url_params) - raise drf.exceptions.PermissionDenied() - - # Fetch the document and check if the user has access + def _auth_get_document(self, pk): + """ + Retrieves the document corresponding to the given primary key (pk). + Raises PermissionDenied if the document is not found. + """ try: - document = models.Document.objects.get(pk=pk) + return models.Document.objects.get(pk=pk) except models.Document.DoesNotExist as exc: logger.debug("Document with ID '%s' does not exist", pk) raise drf.exceptions.PermissionDenied() from exc - user_abilities = document.get_abilities(request.user) - - if not user_abilities.get(self.action, False): - logger.debug( - "User '%s' lacks permission for document '%s'", request.user, pk - ) - raise drf.exceptions.PermissionDenied() - - logger.debug( - "Subrequest authorization successful. Extracted parameters: %s", url_params - ) - return url_params, user_abilities, request.user.id - @drf.decorators.action(detail=False, methods=["get"], url_path="media-auth") def media_auth(self, request, *args, **kwargs): """ @@ -1021,13 +993,22 @@ def media_auth(self, request, *args, **kwargs): annotation. The request will then be proxied to the object storage backend who will respond with the file after checking the signature included in headers. """ - url_params, _, _ = self._authorize_subrequest( - request, MEDIA_STORAGE_URL_PATTERN - ) - pk, key = url_params.values() + url = self._auth_get_original_url(request) + url_params = self._auth_get_url_params(MEDIA_STORAGE_URL_PATTERN, url.path) + document = self._auth_get_document(url_params["pk"]) + + if not document.get_abilities(request.user).get(self.action, False): + logger.debug( + "User '%s' lacks permission for document '%s'", + request.user, + document.pk, + ) + raise drf.exceptions.PermissionDenied() # Generate S3 authorization headers using the extracted URL parameters - request = utils.generate_s3_authorization_headers(f"{pk:s}/{key:s}") + request = utils.generate_s3_authorization_headers( + f"{url_params['pk']:s}/{url_params['key']:s}" + ) return drf.response.Response("authorized", headers=request.headers, status=200) @@ -1037,18 +1018,32 @@ def collaboration_auth(self, request, *args, **kwargs): This view is used by an Nginx subrequest to control access to a document's collaboration server. """ - _, user_abilities, user_id = self._authorize_subrequest( - request, COLLABORATION_WS_URL_PATTERN - ) - can_edit = user_abilities["partial_update"] + url = self._auth_get_original_url(request) + url_params = self._auth_get_url_params(COLLABORATION_WS_URL_PATTERN, url.query) + document = self._auth_get_document(url_params["pk"]) + + abilities = document.get_abilities(request.user) + if not abilities.get(self.action, False): + logger.debug( + "User '%s' lacks permission for document '%s'", + request.user, + document.pk, + ) + raise drf.exceptions.PermissionDenied() + + if not settings.COLLABORATION_SERVER_SECRET: + logger.debug("Collaboration server secret is not defined") + raise drf.exceptions.PermissionDenied() # Add the collaboration server secret token to the headers headers = { "Authorization": settings.COLLABORATION_SERVER_SECRET, - "X-Can-Edit": str(can_edit), - "X-User-Id": str(user_id), + "X-Can-Edit": str(abilities["partial_update"]), } + if request.user.is_authenticated: + headers["X-User-Id"] = str(request.user.id) + return drf.response.Response("authorized", headers=headers, status=200) @drf.decorators.action( From 852f3866be8ca76b524df1789f02a5c5f70bd5c1 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Mon, 27 Jan 2025 23:04:29 +0100 Subject: [PATCH 03/10] =?UTF-8?q?fixup!=20=E2=99=BB=EF=B8=8F(backend)=20re?= =?UTF-8?q?factor=20media=5Fauth=20and=20collaboration=5Fauth=20for=20flex?= =?UTF-8?q?ibility?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/core/api/viewsets.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 7d585aad3..fac132190 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -993,8 +993,10 @@ def media_auth(self, request, *args, **kwargs): annotation. The request will then be proxied to the object storage backend who will respond with the file after checking the signature included in headers. """ - url = self._auth_get_original_url(request) - url_params = self._auth_get_url_params(MEDIA_STORAGE_URL_PATTERN, url.path) + parsed_url = self._auth_get_original_url(request) + url_params = self._auth_get_url_params( + enums.MEDIA_STORAGE_URL_PATTERN, parsed_url.path + ) document = self._auth_get_document(url_params["pk"]) if not document.get_abilities(request.user).get(self.action, False): @@ -1018,8 +1020,10 @@ def collaboration_auth(self, request, *args, **kwargs): This view is used by an Nginx subrequest to control access to a document's collaboration server. """ - url = self._auth_get_original_url(request) - url_params = self._auth_get_url_params(COLLABORATION_WS_URL_PATTERN, url.query) + parsed_url = self._auth_get_original_url(request) + url_params = self._auth_get_url_params( + enums.COLLABORATION_WS_URL_PATTERN, parsed_url.query + ) document = self._auth_get_document(url_params["pk"]) abilities = document.get_abilities(request.user) From 778f9e228f739d9fc7512c071d82120ab8800812 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Fri, 27 Dec 2024 19:02:09 +0100 Subject: [PATCH 04/10] =?UTF-8?q?=E2=9C=85(backend)=20add=20missing=20test?= =?UTF-8?q?s=20for=20collaboration=20auth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests were forgotten. While writing the tests, I fixed a few edge cases like the possibility to connect to the collaboration server for an anonymous user. --- .../test_api_documents_collaboration_auth.py | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 src/backend/core/tests/documents/test_api_documents_collaboration_auth.py diff --git a/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py b/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py new file mode 100644 index 000000000..201e2d01d --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py @@ -0,0 +1,166 @@ +""" +Test collaboration websocket access API endpoint for users in impress's core app. +""" + +from django.test import override_settings + +import pytest +from rest_framework.test import APIClient + +from core import factories, models +from core.tests.conftest import TEAM, USER, VIA + +pytestmark = pytest.mark.django_db + + +def test_api_documents_collaboration_auth_original_url_not_matching(): + """ + Trying to authenticate on the collaboration server with an invalid + original url should return a 403. + """ + document = factories.DocumentFactory(link_reach="public") + + response = APIClient().get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?invalid={document.pk}", + ) + + assert response.status_code == 403 + assert "Authorization" not in response + assert "X-Can-Edit" not in response + assert "X-User-Id" not in response + + +def test_api_documents_collaboration_auth_secret_not_defined(): + """ + Trying to authenticate on the collaboration server when the secret is not defined + should return a 403. + """ + document = factories.DocumentFactory(link_reach="public") + + response = APIClient().get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?room={document.pk}", + ) + + assert response.status_code == 403 + assert "Authorization" not in response + assert "X-Can-Edit" not in response + assert "X-User-Id" not in response + + +@override_settings(COLLABORATION_SERVER_SECRET="123") +@pytest.mark.parametrize("reach", ["authenticated", "restricted"]) +def test_api_documents_collaboration_auth_anonymous_authenticated_or_restricted(reach): + """ + Anonymous users should not be allowed to connect to the collaboration server for a document + with link reach set to authenticated or restricted. + """ + document = factories.DocumentFactory(link_reach=reach) + + response = APIClient().get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?room={document.pk}", + ) + + assert response.status_code == 403 + assert "Authorization" not in response + assert "X-Can-Edit" not in response + assert "X-User-Id" not in response + + +@override_settings(COLLABORATION_SERVER_SECRET="123") +def test_api_documents_collaboration_auth_anonymous_public(): + """ + Anonymous users should be able to connect to the collaboration server for a public document. + """ + document = factories.DocumentFactory(link_reach="public") + + response = APIClient().get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?room={document.pk}", + ) + + assert response.status_code == 200 + assert response["Authorization"] == "123" + assert response["X-Can-Edit"] == str(document.link_role == "editor") + assert "X-User-Id" not in response + + +@override_settings(COLLABORATION_SERVER_SECRET="123") +@pytest.mark.parametrize("reach", ["public", "authenticated"]) +def test_api_documents_collaboration_auth_authenticated_public_or_authenticated(reach): + """ + Authenticated users who are not related to a document should be able to connect to the + collaboration server if this document's link reach is set to public or authenticated. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory(link_reach=reach) + + response = client.get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?room={document.pk}", + ) + + assert response.status_code == 200 + assert response["Authorization"] == "123" + assert response["X-Can-Edit"] == str(document.link_role == "editor") + assert response["X-User-Id"] == str(user.id) + + +@override_settings(COLLABORATION_SERVER_SECRET="123") +def test_api_documents_collaboration_auth_authenticated_restricted(): + """ + Authenticated users who are not related to a document should not be allowed to connect to the + collaboration server if this document's link reach is set to restricted. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory(link_reach="restricted") + + response = client.get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?room={document.pk}", + ) + + assert response.status_code == 403 + assert "Authorization" not in response + assert "X-Can-Edit" not in response + assert "X-User-Id" not in response + + +@override_settings(COLLABORATION_SERVER_SECRET="123") +@pytest.mark.parametrize("role", models.RoleChoices.values) +@pytest.mark.parametrize("via", VIA) +def test_api_documents_collaboration_auth_related(via, role, mock_user_teams): + """ + Users who have a specific access to a document, whatever the role, should be able to + connect to the collaboration server for this document. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory(link_reach="restricted") + if via == USER: + factories.UserDocumentAccessFactory(document=document, user=user, role=role) + elif via == TEAM: + mock_user_teams.return_value = ["lasuite", "unknown"] + factories.TeamDocumentAccessFactory( + document=document, team="lasuite", role=role + ) + + response = client.get( + "/api/v1.0/documents/collaboration-auth/", + HTTP_X_ORIGINAL_URL=f"http://localhost/ws/?room={document.pk}", + ) + + assert response.status_code == 200 + assert response["Authorization"] == "123" + assert response["X-Can-Edit"] == str(role != "reader") + assert response["X-User-Id"] == str(user.id) From 397242cb3f69ae7c2b8e7b0e32618da21651d281 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Fri, 17 Jan 2025 18:59:24 +0100 Subject: [PATCH 05/10] =?UTF-8?q?=E2=9C=85(backend)=20add=20missing=20test?= =?UTF-8?q?=20on=20media-auth=20and=20collaboration-auth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These methods were involved in a bug that was fixed without first evidencing the error in a test: https://github.com/suitenumerique/docs/pull/556 Fixes https://github.com/suitenumerique/docs/issues/567 --- src/backend/core/api/viewsets.py | 13 +-------- src/backend/core/enums.py | 16 +++++++++- .../test_api_documents_collaboration_auth.py | 17 +++++++++++ .../test_api_documents_media_auth.py | 29 ++++++++++++++----- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index fac132190..4abc723ca 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -2,7 +2,6 @@ # pylint: disable=too-many-lines import logging -import re import uuid from urllib.parse import urlparse @@ -34,16 +33,6 @@ logger = logging.getLogger(__name__) -ATTACHMENTS_FOLDER = "attachments" -UUID_REGEX = ( - r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}" -) -FILE_EXT_REGEX = r"\.[a-zA-Z]{3,4}" -MEDIA_STORAGE_URL_PATTERN = re.compile( - f"{settings.MEDIA_URL:s}(?P{UUID_REGEX:s})/" - f"(?P{ATTACHMENTS_FOLDER:s}/{UUID_REGEX:s}{FILE_EXT_REGEX:s})$" -) -COLLABORATION_WS_URL_PATTERN = re.compile(rf"(?:^|&)room=(?P{UUID_REGEX})(?:&|$)") # pylint: disable=too-many-ancestors @@ -915,7 +904,7 @@ def attachment_upload(self, request, *args, **kwargs): # Generate a generic yet unique filename to store the image in object storage file_id = uuid.uuid4() extension = serializer.validated_data["expected_extension"] - key = f"{document.key_base}/{ATTACHMENTS_FOLDER:s}/{file_id!s}.{extension:s}" + key = f"{document.key_base}/{enums.ATTACHMENTS_FOLDER:s}/{file_id!s}.{extension:s}" # Prepare metadata for storage extra_args = { diff --git a/src/backend/core/enums.py b/src/backend/core/enums.py index 592a84a44..8cd50bafd 100644 --- a/src/backend/core/enums.py +++ b/src/backend/core/enums.py @@ -2,10 +2,24 @@ Core application enums declaration """ -from django.conf import global_settings +import re + +from django.conf import global_settings, settings from django.db import models from django.utils.translation import gettext_lazy as _ +ATTACHMENTS_FOLDER = "attachments" +UUID_REGEX = ( + r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}" +) +FILE_EXT_REGEX = r"\.[a-zA-Z]{3,4}" +MEDIA_STORAGE_URL_PATTERN = re.compile( + f"{settings.MEDIA_URL:s}(?P{UUID_REGEX:s})/" + f"(?P{ATTACHMENTS_FOLDER:s}/{UUID_REGEX:s}{FILE_EXT_REGEX:s})$" +) +COLLABORATION_WS_URL_PATTERN = re.compile(rf"(?:^|&)room=(?P{UUID_REGEX})(?:&|$)") + + # In Django's code base, `LANGUAGES` is set by default with all supported languages. # We can use it for the choice of languages which should not be limited to the few languages # active in the app. diff --git a/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py b/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py index 201e2d01d..6649490ea 100644 --- a/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py +++ b/src/backend/core/tests/documents/test_api_documents_collaboration_auth.py @@ -2,6 +2,8 @@ Test collaboration websocket access API endpoint for users in impress's core app. """ +from uuid import uuid4 + from django.test import override_settings import pytest @@ -13,6 +15,21 @@ pytestmark = pytest.mark.django_db +def test_api_documents_collaboration_auth_unkown_document(): + """ + Trying to connect to the collaboration server on a document ID that does not exist + should not have the side effect to create it (no regression test). + """ + original_url = f"http://localhost/collaboration/ws/?room={uuid4()!s}" + + response = APIClient().get( + "/api/v1.0/documents/collaboration-auth/", HTTP_X_ORIGINAL_URL=original_url + ) + + assert response.status_code == 403 + assert models.Document.objects.exists() is False + + def test_api_documents_collaboration_auth_original_url_not_matching(): """ Trying to authenticate on the collaboration server with an invalid diff --git a/src/backend/core/tests/documents/test_api_documents_media_auth.py b/src/backend/core/tests/documents/test_api_documents_media_auth.py index 28fd370c0..f053b4e91 100644 --- a/src/backend/core/tests/documents/test_api_documents_media_auth.py +++ b/src/backend/core/tests/documents/test_api_documents_media_auth.py @@ -2,9 +2,9 @@ Test file uploads API endpoint for users in impress's core app. """ -import uuid from io import BytesIO from urllib.parse import urlparse +from uuid import uuid4 from django.conf import settings from django.core.files.storage import default_storage @@ -14,17 +14,32 @@ import requests from rest_framework.test import APIClient -from core import factories +from core import factories, models from core.tests.conftest import TEAM, USER, VIA pytestmark = pytest.mark.django_db +def test_api_documents_media_auth_unkown_document(): + """ + Trying to download a media related to a document ID that does not exist + should not have the side effect to create it (no regression test). + """ + original_url = f"http://localhost/media/{uuid4()!s}/attachments/{uuid4()!s}.jpg" + + response = APIClient().get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url + ) + + assert response.status_code == 403 + assert models.Document.objects.exists() is False + + def test_api_documents_media_auth_anonymous_public(): """Anonymous users should be able to retrieve attachments linked to a public document""" document = factories.DocumentFactory(link_reach="public") - filename = f"{uuid.uuid4()!s}.jpg" + filename = f"{uuid4()!s}.jpg" key = f"{document.pk!s}/attachments/{filename:s}" default_storage.connection.meta.client.put_object( @@ -72,7 +87,7 @@ def test_api_documents_media_auth_anonymous_authenticated_or_restricted(reach): """ document = factories.DocumentFactory(link_reach=reach) - filename = f"{uuid.uuid4()!s}.jpg" + filename = f"{uuid4()!s}.jpg" media_url = f"http://localhost/media/{document.pk!s}/attachments/{filename:s}" response = APIClient().get( @@ -95,7 +110,7 @@ def test_api_documents_media_auth_authenticated_public_or_authenticated(reach): client = APIClient() client.force_login(user) - filename = f"{uuid.uuid4()!s}.jpg" + filename = f"{uuid4()!s}.jpg" key = f"{document.pk!s}/attachments/{filename:s}" default_storage.connection.meta.client.put_object( @@ -146,7 +161,7 @@ def test_api_documents_media_auth_authenticated_restricted(): client = APIClient() client.force_login(user) - filename = f"{uuid.uuid4()!s}.jpg" + filename = f"{uuid4()!s}.jpg" media_url = f"http://localhost/media/{document.pk!s}/attachments/{filename:s}" response = client.get( @@ -174,7 +189,7 @@ def test_api_documents_media_auth_related(via, mock_user_teams): mock_user_teams.return_value = ["lasuite", "unknown"] factories.TeamDocumentAccessFactory(document=document, team="lasuite") - filename = f"{uuid.uuid4()!s}.jpg" + filename = f"{uuid4()!s}.jpg" key = f"{document.pk!s}/attachments/{filename:s}" default_storage.connection.meta.client.put_object( From 69015617f2f8c9a6d583b28ef316fed1050c1ddf Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Mon, 20 Jan 2025 10:23:18 +0100 Subject: [PATCH 06/10] =?UTF-8?q?=E2=9C=A8(backend)=20add=20duplicate=20ac?= =?UTF-8?q?tion=20to=20the=20document=20API=20endpoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We took this opportunity to refactor the way access is controlled on media attachments. We now add the media key to a list on the document instance each time a media is uploaded to a document. This list is passed along when a document is duplicated, allowing us to grant access to readers on the new document, even if they don't have or lost access to the original document. We also propose an option to reproduce the same access rights on the duplicate document as what was in place on the original document. This can be requested by passing the "with_accesses=true" option in the query string. The tricky point is that we need to extract attachment keys from the existing documents and set them on the new "attachments" field that is now used to track access rights on media files. --- CHANGELOG.md | 1 + src/backend/core/admin.py | 4 + src/backend/core/api/serializers.py | 21 ++ src/backend/core/api/viewsets.py | 100 ++++++++- src/backend/core/enums.py | 15 ++ ...d_field_attachments_and_duplicated_from.py | 55 +++++ src/backend/core/models.py | 67 ++++++ .../test_api_documents_attachment_upload.py | 37 +++- .../documents/test_api_documents_duplicate.py | 204 ++++++++++++++++++ .../test_api_documents_media_auth.py | 118 +++++++--- .../documents/test_api_documents_retrieve.py | 5 + .../documents/test_api_documents_trashbin.py | 1 + .../core/tests/test_models_documents.py | 8 + .../tests/test_utils_base64_yjs_to_text.py | 43 +++- src/backend/core/utils.py | 9 + 15 files changed, 648 insertions(+), 40 deletions(-) create mode 100644 src/backend/core/migrations/0018_remove_is_public_add_field_attachments_and_duplicated_from.py create mode 100644 src/backend/core/tests/documents/test_api_documents_duplicate.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ccb845c44..dcb3845c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to ## Added +- ✨(backend) add duplicate action to the document API endpoint - ⚗️(backend) add util to extract text from base64 yjs document - ✨(backend) add soft delete and restore API endpoints to documents #516 - ✨(backend) allow organizing documents in a tree structure #516 diff --git a/src/backend/core/admin.py b/src/backend/core/admin.py index 080b492ab..e096b020c 100644 --- a/src/backend/core/admin.py +++ b/src/backend/core/admin.py @@ -151,6 +151,8 @@ class DocumentAdmin(TreeAdmin): "path", "depth", "numchild", + "duplicated_from", + "attachments", ) }, ), @@ -166,8 +168,10 @@ class DocumentAdmin(TreeAdmin): "updated_at", ) readonly_fields = ( + "attachments", "creator", "depth", + "duplicated_from", "id", "numchild", "path", diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index c16ecd4d9..270f8478a 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -373,6 +373,27 @@ class Meta: ] +class DocumentDuplicationSerializer(serializers.Serializer): + """ + Serializer for duplicating a document. + Allows specifying whether to keep access permissions. + """ + + with_accesses = serializers.BooleanField(default=False) + + def create(self, validated_data): + """ + This serializer is not intended to create objects. + """ + raise NotImplementedError("This serializer does not support creation.") + + def update(self, instance, validated_data): + """ + This serializer is not intended to update objects. + """ + raise NotImplementedError("This serializer does not support updating.") + + # Suppress the warning about not implementing `create` and `update` methods # since we don't use a model and only rely on the serializer for validation # pylint: disable=abstract-method diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 4abc723ca..89f5ce385 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -16,6 +16,8 @@ from django.db.models.expressions import RawSQL from django.db.models.functions import Left, Length from django.http import Http404 +from django.utils.text import capfirst +from django.utils.translation import gettext_lazy as _ import rest_framework as drf from botocore.exceptions import ClientError @@ -27,6 +29,7 @@ from core import authentication, enums, models from core.services.ai_services import AIService from core.services.collaboration_services import CollaborationService +from core.utils import extract_attachments from . import permissions, serializers, utils from .filters import DocumentFilter @@ -754,6 +757,79 @@ def children(self, request, *args, **kwargs): queryset = self.annotate_user_roles(queryset) return self.get_response_for_queryset(queryset) + @drf.decorators.action( + detail=True, + methods=["post"], + permission_classes=[permissions.IsAuthenticated, permissions.AccessPermission], + url_path="duplicate", + ) + @transaction.atomic + def duplicate(self, request, *args, **kwargs): + """ + Duplicate a document and store the links to attached files in the duplicated + document to allow cross-access. + + Optionally duplicates accesses if `with_accesses` is set to true + in the payload. + """ + serializer = serializers.DocumentDuplicationSerializer(data=request.GET) + serializer.is_valid(raise_exception=True) + with_accesses = serializer.validated_data["with_accesses"] + + # Get document while checking permissions + document = self.get_object() + base64_yjs_content = document.content + + # Duplicate the document instance + link_kwargs = ( + {"link_reach": document.link_reach, "link_role": document.link_role} + if with_accesses + else {} + ) + extracted_attachments = set(extract_attachments(document.content)) + attachments = list(extracted_attachments & set(document.attachments)) + duplicated_document = document.add_sibling( + "right", + title=capfirst(_("copy of {title}").format(title=document.title)), + content=base64_yjs_content, + attachments=attachments, + duplicated_from=document, + creator=request.user, + **link_kwargs, + ) + + # Always add the logged-in user as OWNER + accesses_to_create = [ + models.DocumentAccess( + document=duplicated_document, + user=request.user, + role=models.RoleChoices.OWNER, + ) + ] + + # If accesses should be duplicated, add other users' accesses as per original document + if with_accesses: + original_accesses = models.DocumentAccess.objects.filter( + document=document + ).exclude(user=request.user) + + accesses_to_create.extend( + models.DocumentAccess( + document=duplicated_document, + user_id=access.user_id, + team=access.team, + role=access.role, + ) + for access in original_accesses + ) + + # Bulk create all the duplicated accesses + models.DocumentAccess.objects.bulk_create(accesses_to_create) + + return drf_response.Response( + {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED + ) + @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): """ @@ -919,6 +995,10 @@ def attachment_upload(self, request, *args, **kwargs): file, default_storage.bucket_name, key, ExtraArgs=extra_args ) + # Make the attachment readable by document readers + document.attachments.append(key) + document.save() + return drf.response.Response( {"file": f"{settings.MEDIA_URL:s}{key:s}"}, status=drf.status.HTTP_201_CREATED, @@ -986,20 +1066,20 @@ def media_auth(self, request, *args, **kwargs): url_params = self._auth_get_url_params( enums.MEDIA_STORAGE_URL_PATTERN, parsed_url.path ) - document = self._auth_get_document(url_params["pk"]) - if not document.get_abilities(request.user).get(self.action, False): - logger.debug( - "User '%s' lacks permission for document '%s'", - request.user, - document.pk, - ) + user = request.user + key = f"{url_params['pk']:s}/{url_params['attachment']:s}" + + if ( + not self.queryset.readable(user) + .filter(attachments__contains=[key]) + .exists() + ): + logger.debug("User '%s' lacks permission for image", user) raise drf.exceptions.PermissionDenied() # Generate S3 authorization headers using the extracted URL parameters - request = utils.generate_s3_authorization_headers( - f"{url_params['pk']:s}/{url_params['key']:s}" - ) + request = utils.generate_s3_authorization_headers(key) return drf.response.Response("authorized", headers=request.headers, status=200) diff --git a/src/backend/core/enums.py b/src/backend/core/enums.py index 8cd50bafd..1cda96a0e 100644 --- a/src/backend/core/enums.py +++ b/src/backend/core/enums.py @@ -36,3 +36,18 @@ class MoveNodePositionChoices(models.TextChoices): LAST_SIBLING = "last-sibling", _("Last sibling") LEFT = "left", _("Left") RIGHT = "right", _("Right") + + +ATTACHMENTS_FOLDER = "attachments" +UUID_REGEX = ( + r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}" +) +FILE_EXT_REGEX = r"\.[a-zA-Z]{3,4}" +MEDIA_STORAGE_URL_PATTERN = re.compile( + f"{settings.MEDIA_URL:s}(?P{UUID_REGEX:s})/" + f"(?P{ATTACHMENTS_FOLDER:s}/{UUID_REGEX:s}{FILE_EXT_REGEX:s})$" +) +MEDIA_STORAGE_URL_EXTRACT = re.compile( + f"{settings.MEDIA_URL:s}({UUID_REGEX}/{ATTACHMENTS_FOLDER}/{UUID_REGEX}{FILE_EXT_REGEX})" +) +COLLABORATION_WS_URL_PATTERN = re.compile(rf"(?:^|&)room=(?P{UUID_REGEX})(?:&|$)") diff --git a/src/backend/core/migrations/0018_remove_is_public_add_field_attachments_and_duplicated_from.py b/src/backend/core/migrations/0018_remove_is_public_add_field_attachments_and_duplicated_from.py new file mode 100644 index 000000000..df93c704b --- /dev/null +++ b/src/backend/core/migrations/0018_remove_is_public_add_field_attachments_and_duplicated_from.py @@ -0,0 +1,55 @@ +# Generated by Django 5.1.4 on 2025-01-18 11:53 +import re + +import core.models +import django.contrib.postgres.fields +import django.db.models.deletion +from django.db import migrations, models + +from core.utils import extract_attachments + + +def populate_attachments_on_all_documents(apps, schema_editor): + """Populate "attachments" field on all existing documents in the database.""" + Document = apps.get_model("core", "Document") + + for document in Document.objects.all(): + document.attachments = extract_attachments(document.content) + document.save(update_fields=['attachments']) + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0017_add_fields_for_soft_delete'), + ] + + operations = [ + # v2.0.0 was released so we can now remove BC field "is_public" + migrations.RemoveField( + model_name='document', + name='is_public', + ), + migrations.AlterModelManagers( + name='user', + managers=[ + ('objects', core.models.UserManager()), + ], + ), + migrations.AddField( + model_name='document', + name='attachments', + field=django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=255), blank=True, default=list, editable=False, null=True, size=None), + ), + migrations.AddField( + model_name='document', + name='duplicated_from', + field=models.ForeignKey(blank=True, editable=False, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='duplicates', to='core.document'), + ), + migrations.AlterField( + model_name='user', + name='language', + field=models.CharField(choices="(('en-us', 'English'), ('fr-fr', 'French'), ('de-de', 'German'))", default='en-us', help_text='The language in which the user wants to see the interface.', max_length=10, verbose_name='language'), + ), + migrations.RunPython(populate_attachments_on_all_documents), + ] diff --git a/src/backend/core/models.py b/src/backend/core/models.py index 4aa4de87d..dcf09d1f2 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -12,6 +12,7 @@ from django.conf import settings from django.contrib.auth import models as auth_models from django.contrib.auth.base_user import AbstractBaseUser +from django.contrib.postgres.fields import ArrayField from django.contrib.sites.models import Site from django.core import mail, validators from django.core.cache import cache @@ -367,6 +368,54 @@ def _get_abilities(self, resource, user): } +class DocumentQuerySet(models.QuerySet): + """ + Custom queryset for the Document model, providing additional methods + to filter documents based on user permissions. + """ + + def readable(self, user): + """ + Filters the queryset to return documents that the given user has + permission to read. + + :param user: The user for whom readable documents are to be fetched. + :return: A queryset of documents readable by the user. + """ + if user.is_authenticated: + return self.filter( + models.Q(accesses__user=user) + | models.Q(accesses__team__in=user.teams) + | ~models.Q(link_reach=LinkReachChoices.RESTRICTED) + ) + + return self.filter(models.Q(link_reach=LinkReachChoices.PUBLIC)) + + +class DocumentManager(models.Manager): + """ + Custom manager for the Document model, enabling the use of the custom + queryset methods directly from the model manager. + """ + + def get_queryset(self): + """ + Overrides the default get_queryset method to return a custom queryset. + + :return: An instance of DocumentQuerySet. + """ + return DocumentQuerySet(self.model, using=self._db) + + def readable(self, user): + """ + Filters documents based on user permissions using the custom queryset. + + :param user: The user for whom readable documents are to be fetched. + :return: A queryset of documents readable by the user. + """ + return self.get_queryset().readable(user) + + class Document(MP_Node, BaseModel): """Pad document carrying the content.""" @@ -389,6 +438,21 @@ class Document(MP_Node, BaseModel): ) deleted_at = models.DateTimeField(null=True, blank=True) ancestors_deleted_at = models.DateTimeField(null=True, blank=True) + duplicated_from = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + related_name="duplicates", + editable=False, + blank=True, + null=True, + ) + attachments = ArrayField( + models.CharField(max_length=255), + default=list, + editable=False, + blank=True, + null=True, + ) _content = None @@ -399,6 +463,8 @@ class Document(MP_Node, BaseModel): path = models.CharField(max_length=7 * 36, unique=True, db_collation="C") + objects = DocumentManager() + class Meta: db_table = "impress_document" ordering = ("path",) @@ -657,6 +723,7 @@ def get_abilities(self, user): "children_create": can_update and user.is_authenticated, "collaboration_auth": can_get, "destroy": is_owner, + "duplicate": can_get, "favorite": can_get and user.is_authenticated, "link_configuration": is_owner_or_admin, "invite_owner": is_owner, diff --git a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py index 4a6564d6a..61026383a 100644 --- a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py +++ b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py @@ -67,10 +67,12 @@ def test_api_documents_attachment_upload_anonymous_success(): file_path = response.json()["file"] match = pattern.search(file_path) file_id = match.group(1) - # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.png"] + # Now, check the metadata of the uploaded file key = file_path.replace("/media", "") file_head = default_storage.connection.meta.client.head_object( @@ -111,6 +113,9 @@ def test_api_documents_attachment_upload_authenticated_forbidden(reach, role): "detail": "You do not have permission to perform this action." } + document.refresh_from_db() + assert document.attachments == [] + @pytest.mark.parametrize( "reach, role", @@ -121,8 +126,8 @@ def test_api_documents_attachment_upload_authenticated_forbidden(reach, role): ) def test_api_documents_attachment_upload_authenticated_success(reach, role): """ - Autenticated who are not related to a document should be able to upload a file - if the link reach and role permit it. + Autenticated users who are not related to a document should be able to upload + a file when the link reach and role permit it. """ user = factories.UserFactory() @@ -144,6 +149,9 @@ def test_api_documents_attachment_upload_authenticated_success(reach, role): # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.png"] + @pytest.mark.parametrize("via", VIA) def test_api_documents_attachment_upload_reader(via, mock_user_teams): @@ -174,6 +182,9 @@ def test_api_documents_attachment_upload_reader(via, mock_user_teams): "detail": "You do not have permission to perform this action." } + document.refresh_from_db() + assert document.attachments == [] + @pytest.mark.parametrize("role", ["editor", "administrator", "owner"]) @pytest.mark.parametrize("via", VIA) @@ -210,6 +221,9 @@ def test_api_documents_attachment_upload_success(via, role, mock_user_teams): # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.png"] + # Now, check the metadata of the uploaded file key = file_path.replace("/media", "") file_head = default_storage.connection.meta.client.head_object( @@ -234,6 +248,9 @@ def test_api_documents_attachment_upload_invalid(client): assert response.status_code == 400 assert response.json() == {"file": ["No file was submitted."]} + document.refresh_from_db() + assert document.attachments == [] + def test_api_documents_attachment_upload_size_limit_exceeded(settings): """The uploaded file should not exceeed the maximum size in settings.""" @@ -256,6 +273,9 @@ def test_api_documents_attachment_upload_size_limit_exceeded(settings): assert response.status_code == 400 assert response.json() == {"file": ["File size exceeds the maximum limit of 1 MB."]} + document.refresh_from_db() + assert document.attachments == [] + @pytest.mark.parametrize( "name,content,extension,content_type", @@ -294,6 +314,11 @@ def test_api_documents_attachment_upload_fix_extension( # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [ + f"{document.id!s}/attachments/{file_id!s}.{extension:s}" + ] + # Now, check the metadata of the uploaded file key = file_path.replace("/media", "") file_head = default_storage.connection.meta.client.head_object( @@ -318,6 +343,9 @@ def test_api_documents_attachment_upload_empty_file(): assert response.status_code == 400 assert response.json() == {"file": ["The submitted file is empty."]} + document.refresh_from_db() + assert document.attachments == [] + def test_api_documents_attachment_upload_unsafe(): """A file with an unsafe mime type should be tagged as such.""" @@ -343,6 +371,9 @@ def test_api_documents_attachment_upload_unsafe(): # Validate that file_id is a valid UUID uuid.UUID(file_id) + document.refresh_from_db() + assert document.attachments == [f"{document.id!s}/attachments/{file_id!s}.exe"] + # Now, check the metadata of the uploaded file key = file_path.replace("/media", "") file_head = default_storage.connection.meta.client.head_object( diff --git a/src/backend/core/tests/documents/test_api_documents_duplicate.py b/src/backend/core/tests/documents/test_api_documents_duplicate.py new file mode 100644 index 000000000..89f248815 --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_duplicate.py @@ -0,0 +1,204 @@ +""" +Test file uploads API endpoint for users in impress's core app. +""" + +import base64 +import uuid +from io import BytesIO +from urllib.parse import urlparse + +from django.conf import settings +from django.core.files.storage import default_storage +from django.utils import timezone + +import pytest +import requests +import y_py +from rest_framework.test import APIClient + +from core import factories, models + +pytestmark = pytest.mark.django_db + +PIXEL = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00" + b"\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\xf8\xff\xff?\x00\x05\xfe\x02\xfe" + b"\xa7V\xbd\xfa\x00\x00\x00\x00IEND\xaeB`\x82" +) + + +def get_image_refs(document_id): + """Generate an image key for testing.""" + image_key = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png" + default_storage.connection.meta.client.put_object( + Bucket=default_storage.bucket_name, + Key=image_key, + Body=BytesIO(PIXEL), + ContentType="image/png", + ) + return image_key, f"http://localhost/media/{image_key:s}" + + +def test_api_documents_duplicate_forbidden(): + """A user who doesn't have read access to a document should not be allowed to duplicate it.""" + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory( + link_reach="restricted", + users=[factories.UserFactory()], + title="my document", + ) + + response = client.post(f"/api/v1.0/documents/{document.id!s}/duplicate/") + + assert response.status_code == 403 + assert models.Document.objects.count() == 1 + + +def test_api_documents_duplicate_anonymous(): + """Anonymous users should not be able to duplicate documents even with read access.""" + + document = factories.DocumentFactory(link_reach="public") + + response = APIClient().post(f"/api/v1.0/documents/{document.id!s}/duplicate/") + + assert response.status_code == 401 + assert models.Document.objects.count() == 1 + + +@pytest.mark.parametrize("index", range(3)) +def test_api_documents_duplicate_success(index): + """ + Anonymous users should be able to retrieve attachments linked to a public document. + Accesses should not be duplicated if the user does not request it specifically. + Attachments that are not in the content should not be passed for access in the + duplicated document's "attachments" list. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document_ids = [uuid.uuid4() for _ in range(3)] + image_refs = [get_image_refs(doc_id) for doc_id in document_ids] + + # Create document content with the first image only + ydoc = y_py.YDoc() # pylint: disable=no-member + with ydoc.begin_transaction() as txn: + xml_fragment = ydoc.get_xml_element("document-store") + xml_fragment.push_xml_element(txn, "image").set_attribute( + txn, "src", image_refs[0][1] + ) + update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + base64_content = base64.b64encode(update).decode("utf-8") + + # Create documents + document = factories.DocumentFactory( + id=document_ids[index], + content=base64_content, + link_reach="restricted", + users=[user, factories.UserFactory()], + title="document with an image", + attachments=[key for key, _ in image_refs], + ) + factories.DocumentFactory(id=document_ids[(index + 1) % 3]) + # Don't create document for third ID to check that it doesn't impact access to attachments + + # Duplicate the document via the API endpoint + response = client.post(f"/api/v1.0/documents/{document.id}/duplicate/") + + assert response.status_code == 201 + + duplicated_document = models.Document.objects.get(id=response.json()["id"]) + assert duplicated_document.title == "Copy of document with an image" + assert duplicated_document.content == document.content + assert duplicated_document.creator == user + assert duplicated_document.link_reach == "restricted" + assert duplicated_document.link_role == "reader" + assert duplicated_document.duplicated_from == document + assert duplicated_document.attachments == [ + image_refs[0][0] + ] # Only the first image key + assert duplicated_document.get_parent() == document.get_parent() + assert duplicated_document.path == document.get_next_sibling().path + + # Check that accesses were not duplicated. + # The user who did the duplicate is forced as owner + assert duplicated_document.accesses.count() == 1 + access = duplicated_document.accesses.first() + assert access.user == user + assert access.role == "owner" + + # Ensure access persists after the owner loses access to the original document + models.DocumentAccess.objects.filter(document=document).delete() + response = client.get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=image_refs[0][1] + ) + + assert response.status_code == 200 + + authorization = response["Authorization"] + assert "AWS4-HMAC-SHA256 Credential=" in authorization + assert ( + "SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=" + in authorization + ) + assert response["X-Amz-Date"] == timezone.now().strftime("%Y%m%dT%H%M%SZ") + + s3_url = urlparse(settings.AWS_S3_ENDPOINT_URL) + response = requests.get( + f"{settings.AWS_S3_ENDPOINT_URL:s}/impress-media-storage/{image_refs[0][0]:s}", + headers={ + "authorization": authorization, + "x-amz-date": response["x-amz-date"], + "x-amz-content-sha256": response["x-amz-content-sha256"], + "Host": f"{s3_url.hostname:s}:{s3_url.port:d}", + }, + timeout=1, + ) + assert response.content == PIXEL + + # Ensure the other images are not accessible + for _, url in image_refs[1:]: + response = client.get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=url + ) + assert response.status_code == 403 + + +def test_api_documents_duplicate_with_accesses(): + """Accesses should be duplicated if the user requests it specifically.""" + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory( + users=[user], + title="document with accesses", + ) + user_access = factories.UserDocumentAccessFactory(document=document) + team_access = factories.TeamDocumentAccessFactory(document=document) + + # Duplicate the document via the API endpoint requesting to duplicate accesses + response = client.post( + f"/api/v1.0/documents/{document.id!s}/duplicate/?with_accesses=true" + ) + + assert response.status_code == 201 + + duplicated_document = models.Document.objects.get(id=response.json()["id"]) + assert duplicated_document.title == "Copy of document with accesses" + assert duplicated_document.content == document.content + assert duplicated_document.link_reach == document.link_reach + assert duplicated_document.link_role == document.link_role + assert duplicated_document.creator == user + assert duplicated_document.duplicated_from == document + assert duplicated_document.attachments == [] + + # Check that accesses were duplicated and the user who did the duplicate is forced as owner + duplicated_accesses = duplicated_document.accesses + assert duplicated_accesses.count() == 3 + assert duplicated_accesses.get(user=user).role == "owner" + assert duplicated_accesses.get(user=user_access.user).role == user_access.role + assert duplicated_accesses.get(team=team_access.team).role == team_access.role diff --git a/src/backend/core/tests/documents/test_api_documents_media_auth.py b/src/backend/core/tests/documents/test_api_documents_media_auth.py index f053b4e91..aa7a0aaf7 100644 --- a/src/backend/core/tests/documents/test_api_documents_media_auth.py +++ b/src/backend/core/tests/documents/test_api_documents_media_auth.py @@ -37,11 +37,9 @@ def test_api_documents_media_auth_unkown_document(): def test_api_documents_media_auth_anonymous_public(): """Anonymous users should be able to retrieve attachments linked to a public document""" - document = factories.DocumentFactory(link_reach="public") - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - key = f"{document.pk!s}/attachments/{filename:s}" - + key = f"{document_id!s}/attachments/{filename:s}" default_storage.connection.meta.client.put_object( Bucket=default_storage.bucket_name, Key=key, @@ -49,6 +47,8 @@ def test_api_documents_media_auth_anonymous_public(): ContentType="text/plain", ) + factories.DocumentFactory(id=document_id, link_reach="public", attachments=[key]) + original_url = f"http://localhost/media/{key:s}" response = APIClient().get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url @@ -85,10 +85,11 @@ def test_api_documents_media_auth_anonymous_authenticated_or_restricted(reach): Anonymous users should not be allowed to retrieve attachments linked to a document with link reach set to authenticated or restricted. """ - document = factories.DocumentFactory(link_reach=reach) - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - media_url = f"http://localhost/media/{document.pk!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{document_id!s}/attachments/{filename:s}" + + factories.DocumentFactory(id=document_id, link_reach=reach) response = APIClient().get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url @@ -98,20 +99,78 @@ def test_api_documents_media_auth_anonymous_authenticated_or_restricted(reach): assert "Authorization" not in response +def test_api_documents_media_auth_anonymous_attachments(): + """ + Declaring a media key as original attachment on a document to which + a user has access should give them access to the attachment file + regarless of their access rights on the original document. + """ + document_id = uuid4() + filename = f"{uuid4()!s}.jpg" + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" + + default_storage.connection.meta.client.put_object( + Bucket=default_storage.bucket_name, + Key=key, + Body=BytesIO(b"my prose"), + ContentType="text/plain", + ) + + factories.DocumentFactory(id=document_id, link_reach="restricted") + + response = APIClient().get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url + ) + assert response.status_code == 403 + + # Let's now add a document to which the anonymous user has access and + # pointing to the attachment + factories.DocumentFactory(link_reach="public", attachments=[key]) + + response = APIClient().get( + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url + ) + + assert response.status_code == 200 + + authorization = response["Authorization"] + assert "AWS4-HMAC-SHA256 Credential=" in authorization + assert ( + "SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=" + in authorization + ) + assert response["X-Amz-Date"] == timezone.now().strftime("%Y%m%dT%H%M%SZ") + + s3_url = urlparse(settings.AWS_S3_ENDPOINT_URL) + file_url = f"{settings.AWS_S3_ENDPOINT_URL:s}/impress-media-storage/{key:s}" + response = requests.get( + file_url, + headers={ + "authorization": authorization, + "x-amz-date": response["x-amz-date"], + "x-amz-content-sha256": response["x-amz-content-sha256"], + "Host": f"{s3_url.hostname:s}:{s3_url.port:d}", + }, + timeout=1, + ) + assert response.content.decode("utf-8") == "my prose" + + @pytest.mark.parametrize("reach", ["public", "authenticated"]) def test_api_documents_media_auth_authenticated_public_or_authenticated(reach): """ Authenticated users who are not related to a document should be able to retrieve attachments related to a document with public or authenticated link reach. """ - document = factories.DocumentFactory(link_reach=reach) - user = factories.UserFactory() client = APIClient() client.force_login(user) + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - key = f"{document.pk!s}/attachments/{filename:s}" + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" default_storage.connection.meta.client.put_object( Bucket=default_storage.bucket_name, @@ -120,9 +179,10 @@ def test_api_documents_media_auth_authenticated_public_or_authenticated(reach): ContentType="text/plain", ) - original_url = f"http://localhost/media/{key:s}" + factories.DocumentFactory(id=document_id, link_reach=reach, attachments=[key]) + response = client.get( - "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url ) assert response.status_code == 200 @@ -155,14 +215,18 @@ def test_api_documents_media_auth_authenticated_restricted(): Authenticated users who are not related to a document should not be allowed to retrieve attachments linked to a document that is restricted. """ - document = factories.DocumentFactory(link_reach="restricted") - user = factories.UserFactory(with_owned_document=True) client = APIClient() client.force_login(user) + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - media_url = f"http://localhost/media/{document.pk!s}/attachments/{filename:s}" + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" + + factories.DocumentFactory( + id=document_id, link_reach="restricted", attachments=[key] + ) response = client.get( "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url @@ -182,16 +246,10 @@ def test_api_documents_media_auth_related(via, mock_user_teams): client = APIClient() client.force_login(user) - document = factories.DocumentFactory() - if via == USER: - factories.UserDocumentAccessFactory(document=document, user=user) - elif via == TEAM: - mock_user_teams.return_value = ["lasuite", "unknown"] - factories.TeamDocumentAccessFactory(document=document, team="lasuite") - + document_id = uuid4() filename = f"{uuid4()!s}.jpg" - key = f"{document.pk!s}/attachments/{filename:s}" - + key = f"{document_id!s}/attachments/{filename:s}" + media_url = f"http://localhost/media/{key:s}" default_storage.connection.meta.client.put_object( Bucket=default_storage.bucket_name, Key=key, @@ -199,9 +257,17 @@ def test_api_documents_media_auth_related(via, mock_user_teams): ContentType="text/plain", ) - original_url = f"http://localhost/media/{key:s}" + document = factories.DocumentFactory( + id=document_id, link_reach="restricted", attachments=[key] + ) + if via == USER: + factories.UserDocumentAccessFactory(document=document, user=user) + elif via == TEAM: + mock_user_teams.return_value = ["lasuite", "unknown"] + factories.TeamDocumentAccessFactory(document=document, team="lasuite") + response = client.get( - "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=original_url + "/api/v1.0/documents/media-auth/", HTTP_X_ORIGINAL_URL=media_url ) assert response.status_code == 200 diff --git a/src/backend/core/tests/documents/test_api_documents_retrieve.py b/src/backend/core/tests/documents/test_api_documents_retrieve.py index 215164df1..51346d659 100644 --- a/src/backend/core/tests/documents/test_api_documents_retrieve.py +++ b/src/backend/core/tests/documents/test_api_documents_retrieve.py @@ -35,6 +35,7 @@ def test_api_documents_retrieve_anonymous_public_standalone(): "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, # Anonymous user can't favorite a document even with read access "favorite": False, "invite_owner": False, @@ -91,6 +92,7 @@ def test_api_documents_retrieve_anonymous_public_parent(): "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, # Anonymous user can't favorite a document even with read access "favorite": False, "invite_owner": False, @@ -181,6 +183,7 @@ def test_api_documents_retrieve_authenticated_unrelated_public_or_authenticated( "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -244,6 +247,7 @@ def test_api_documents_retrieve_authenticated_public_or_authenticated_parent(rea "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -416,6 +420,7 @@ def test_api_documents_retrieve_authenticated_related_parent(): "children_list": True, "collaboration_auth": True, "destroy": access.role == "owner", + "duplicate": True, "favorite": True, "invite_owner": access.role == "owner", "link_configuration": access.role in ["administrator", "owner"], diff --git a/src/backend/core/tests/documents/test_api_documents_trashbin.py b/src/backend/core/tests/documents/test_api_documents_trashbin.py index b1a8b3b56..14f95afcf 100644 --- a/src/backend/core/tests/documents/test_api_documents_trashbin.py +++ b/src/backend/core/tests/documents/test_api_documents_trashbin.py @@ -79,6 +79,7 @@ def test_api_documents_trashbin_format(): "children_list": True, "collaboration_auth": True, "destroy": True, + "duplicate": True, "favorite": True, "invite_owner": True, "link_configuration": True, diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index 028740cb9..de0dd1165 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -154,6 +154,7 @@ def test_models_documents_get_abilities_forbidden( "children_list": False, "collaboration_auth": False, "destroy": False, + "duplicate": False, "favorite": False, "invite_owner": False, "media_auth": False, @@ -202,6 +203,7 @@ def test_models_documents_get_abilities_reader( "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": is_authenticated, "invite_owner": False, "link_configuration": False, @@ -250,6 +252,7 @@ def test_models_documents_get_abilities_editor( "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": is_authenticated, "invite_owner": False, "link_configuration": False, @@ -285,6 +288,7 @@ def test_models_documents_get_abilities_owner(django_assert_num_queries): "children_list": True, "collaboration_auth": True, "destroy": True, + "duplicate": True, "favorite": True, "invite_owner": True, "link_configuration": True, @@ -320,6 +324,7 @@ def test_models_documents_get_abilities_administrator(django_assert_num_queries) "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": True, @@ -354,6 +359,7 @@ def test_models_documents_get_abilities_editor_user(django_assert_num_queries): "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -391,6 +397,7 @@ def test_models_documents_get_abilities_reader_user(django_assert_num_queries): "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, @@ -431,6 +438,7 @@ def test_models_documents_get_abilities_preset_role(django_assert_num_queries): "children_list": True, "collaboration_auth": True, "destroy": False, + "duplicate": True, "favorite": True, "invite_owner": False, "link_configuration": False, diff --git a/src/backend/core/tests/test_utils_base64_yjs_to_text.py b/src/backend/core/tests/test_utils_base64_yjs_to_text.py index 376bb85d3..2ffa168db 100644 --- a/src/backend/core/tests/test_utils_base64_yjs_to_text.py +++ b/src/backend/core/tests/test_utils_base64_yjs_to_text.py @@ -1,9 +1,15 @@ """Test util base64_yjs_to_text.""" +import base64 +import uuid + +import y_py + +from core import utils from core.utils import base64_yjs_to_text -def test_base64_yjs_to_text(): +def test_utils_base64_yjs_to_text(): """ Test extract_text_from_saved_yjs_document This base64 string is an example of what is saved in the database. @@ -27,3 +33,38 @@ def test_base64_yjs_to_text(): ) assert base64_yjs_to_text(base64_string) == "Hello world" + + +def test_utils_extract_attachments(): + """ + All attachment keys in the document content should be extracted. + """ + document_id = uuid.uuid4() + image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png" + image_url1 = f"http://localhost/media/{image_key1:s}" + + image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" + image_url2 = f"http://localhost/{image_key2:s}" + + image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" + image_url3 = f"http://localhost/media/{image_key3:s}" + + ydoc = y_py.YDoc() # pylint: disable=no-member + with ydoc.begin_transaction() as txn: + xml_fragment = ydoc.get_xml_element("document-store") + + xml_image = xml_fragment.push_xml_element(txn, "image") + xml_image.set_attribute(txn, "src", image_url1) + + xml_image = xml_fragment.push_xml_element(txn, "image") + xml_image.set_attribute(txn, "src", image_url2) + + xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph") + xml_text = xml_paragraph.push_xml_text(txn) + xml_text.push(txn, image_url3) + + update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + base64_string = base64.b64encode(update).decode("utf-8") + + # image_url3 is missing the "/media/" part and shouldn't get extracted + assert utils.extract_attachments(base64_string) == [image_key1, image_key3] diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index bd2e0170c..0374c0af9 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -1,10 +1,13 @@ """Utils for the core app.""" import base64 +import re import y_py as Y from bs4 import BeautifulSoup +from core import enums + def base64_yjs_to_xml(base64_string): """Extract xml from base64 yjs document.""" @@ -23,3 +26,9 @@ def base64_yjs_to_text(base64_string): blocknote_structure = base64_yjs_to_xml(base64_string) soup = BeautifulSoup(blocknote_structure, "html.parser") return soup.get_text(separator=" ").strip() + + +def extract_attachments(content): + """Helper method to extract media paths from a document's content.""" + xml_content = base64_yjs_to_xml(content) + return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, xml_content) From c74928759166d71f9f29fddbcfcafc6a057b4dac Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Mon, 27 Jan 2025 23:03:53 +0100 Subject: [PATCH 07/10] =?UTF-8?q?fixup!=20=E2=9C=A8(backend)=20add=20dupli?= =?UTF-8?q?cate=20action=20to=20the=20document=20API=20endpoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/core/api/viewsets.py | 4 +++- .../core/tests/documents/test_api_documents_media_auth.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 89f5ce385..aa2a13d23 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -772,7 +772,9 @@ def duplicate(self, request, *args, **kwargs): Optionally duplicates accesses if `with_accesses` is set to true in the payload. """ - serializer = serializers.DocumentDuplicationSerializer(data=request.GET) + serializer = serializers.DocumentDuplicationSerializer( + data=request.query_params + ) serializer.is_valid(raise_exception=True) with_accesses = serializer.validated_data["with_accesses"] diff --git a/src/backend/core/tests/documents/test_api_documents_media_auth.py b/src/backend/core/tests/documents/test_api_documents_media_auth.py index aa7a0aaf7..fdcd9f072 100644 --- a/src/backend/core/tests/documents/test_api_documents_media_auth.py +++ b/src/backend/core/tests/documents/test_api_documents_media_auth.py @@ -103,7 +103,7 @@ def test_api_documents_media_auth_anonymous_attachments(): """ Declaring a media key as original attachment on a document to which a user has access should give them access to the attachment file - regarless of their access rights on the original document. + regardless of their access rights on the original document. """ document_id = uuid4() filename = f"{uuid4()!s}.jpg" From 13b1523de3e1a5f66e1668ec3218e9e7c9fab8ad Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Tue, 21 Jan 2025 23:56:50 +0100 Subject: [PATCH 08/10] =?UTF-8?q?=E2=9C=A8(backend)=20extract=20attachment?= =?UTF-8?q?=20keys=20from=20updated=20content=20for=20access?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can't prevent document editors from copy/pasting content to from one document to another. The problem is that copying content, will copy the urls pointing to attachments but if we don't do anything, the reader of the document to which the content is being pasted, may not be allowed to access the attachment files from the original document. Using the work from the previous commit, we can grant access to the readers of the target document by extracting the attachment keys from the content and adding themto the target document's "attachments" field. Before doing this, we check that the current user can indeed access the attachment files extracted from the content and that they are allowed to edit the current document. --- src/backend/core/api/serializers.py | 32 +++- .../test_api_documents_media_auth.py | 2 +- ...pi_documents_update_extract_attachments.py | 153 ++++++++++++++++++ src/backend/core/tests/test_utils.py | 40 +++++ src/backend/core/utils.py | 3 + 5 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index 270f8478a..5db53d51c 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -10,7 +10,7 @@ import magic from rest_framework import exceptions, serializers -from core import enums, models +from core import enums, models, utils from core.services.ai_services import AI_ACTIONS from core.services.converter_services import ( ConversionError, @@ -260,6 +260,36 @@ def validate_id(self, value): return value + def save(self, **kwargs): + """ + Process the content field to extract attachment keys and update the document's + "attachments" field for access control. + """ + content = self.validated_data.get("content", "") + extracted_attachments = set(utils.extract_attachments(content)) + + existing_attachments = ( + set(self.instance.attachments or []) if self.instance else set() + ) + new_attachments = extracted_attachments - existing_attachments + + if new_attachments: + user = self.context["request"].user + readable_documents = models.Document.objects.readable(user).filter( + Q(attachments__overlap=list(new_attachments)) + ) + + readable_attachments = set() + for document in readable_documents: + readable_attachments.update(set(document.attachments) & new_attachments) + + # Update attachments with readable keys + self.validated_data["attachments"] = list( + existing_attachments | readable_attachments + ) + + return super().save(**kwargs) + class ServerCreateDocumentSerializer(serializers.Serializer): """ diff --git a/src/backend/core/tests/documents/test_api_documents_media_auth.py b/src/backend/core/tests/documents/test_api_documents_media_auth.py index fdcd9f072..894b0e504 100644 --- a/src/backend/core/tests/documents/test_api_documents_media_auth.py +++ b/src/backend/core/tests/documents/test_api_documents_media_auth.py @@ -1,5 +1,5 @@ """ -Test file uploads API endpoint for users in impress's core app. +Test media-auth authorization API endpoint in docs core app. """ from io import BytesIO diff --git a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py new file mode 100644 index 000000000..b32cb9c6d --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py @@ -0,0 +1,153 @@ +""" +Test extract-attachments on document update in docs core app. +""" + +import base64 +from uuid import uuid4 + +import pytest +import y_py +from rest_framework.test import APIClient + +from core import factories + +pytestmark = pytest.mark.django_db + + +def get_ydoc_with_mages(image_keys): + """Return a ydoc from text for testing purposes.""" + ydoc = y_py.YDoc() # pylint: disable=no-member + with ydoc.begin_transaction() as txn: + xml_fragment = ydoc.get_xml_element("document-store") + for key in image_keys: + xml_image = xml_fragment.push_xml_element(txn, "image") + xml_image.set_attribute(txn, "src", f"http://localhost/media/{key:s}") + + update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + return base64.b64encode(update).decode("utf-8") + + +def test_api_documents_update_new_attachment_keys_anonymous(django_assert_num_queries): + """ + When an anonymous user updates a document, the attachment keys extracted from the + updated content should be added to the list of "attachments" ot the document if these + attachments are already readable by anonymous users. + """ + image_keys = [f"{uuid4()!s}/attachments/{uuid4()!s}.png" for _ in range(4)] + document = factories.DocumentFactory( + content=get_ydoc_with_mages(image_keys[:1]), + attachments=[image_keys[0]], + link_reach="public", + link_role="editor", + ) + + factories.DocumentFactory(attachments=[image_keys[1]], link_reach="public") + factories.DocumentFactory(attachments=[image_keys[2]], link_reach="authenticated") + factories.DocumentFactory(attachments=[image_keys[3]], link_reach="restricted") + expected_keys = {image_keys[i] for i in [0, 1]} + + with django_assert_num_queries(4): + response = APIClient().put( + f"/api/v1.0/documents/{document.id!s}/", + {"content": get_ydoc_with_mages(image_keys)}, + format="json", + ) + assert response.status_code == 200 + + document.refresh_from_db() + assert set(document.attachments) == expected_keys + + # Check that the db query to check attachments readability for extracted + # keys is not done if the content changes but no new keys are found + with django_assert_num_queries(3): + response = APIClient().put( + f"/api/v1.0/documents/{document.id!s}/", + {"content": get_ydoc_with_mages(image_keys[:2])}, + format="json", + ) + assert response.status_code == 200 + + document.refresh_from_db() + assert len(document.attachments) == 2 + assert set(document.attachments) == expected_keys + + +def test_api_documents_update_new_attachment_keys_authenticated( + django_assert_num_queries, +): + """ + When an authenticated user updates a document, the attachment keys extracted from the + updated content should be added to the list of "attachments" ot the document if these + attachments are already readable by the editing user. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + image_keys = [f"{uuid4()!s}/attachments/{uuid4()!s}.png" for _ in range(5)] + document = factories.DocumentFactory( + content=get_ydoc_with_mages(image_keys[:1]), + attachments=[image_keys[0]], + users=[(user, "editor")], + ) + + factories.DocumentFactory(attachments=[image_keys[1]], link_reach="public") + factories.DocumentFactory(attachments=[image_keys[2]], link_reach="authenticated") + factories.DocumentFactory(attachments=[image_keys[3]], link_reach="restricted") + factories.DocumentFactory(attachments=[image_keys[4]], users=[user]) + expected_keys = {image_keys[i] for i in [0, 1, 2, 4]} + + with django_assert_num_queries(5): + response = client.put( + f"/api/v1.0/documents/{document.id!s}/", + {"content": get_ydoc_with_mages(image_keys)}, + format="json", + ) + assert response.status_code == 200 + + document.refresh_from_db() + assert set(document.attachments) == expected_keys + + # Check that the db query to check attachments readability for extracted + # keys is not done if the content changes but no new keys are found + with django_assert_num_queries(4): + response = client.put( + f"/api/v1.0/documents/{document.id!s}/", + {"content": get_ydoc_with_mages(image_keys[:2])}, + format="json", + ) + assert response.status_code == 200 + + document.refresh_from_db() + assert len(document.attachments) == 4 + assert set(document.attachments) == expected_keys + + +def test_api_documents_update_new_attachment_keys_duplicate(): + """ + Duplicate keys in the content should not result in duplicates in the document's attachments. + """ + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + image_key1 = f"{uuid4()!s}/attachments/{uuid4()!s}.png" + image_key2 = f"{uuid4()!s}/attachments/{uuid4()!s}.png" + document = factories.DocumentFactory( + content=get_ydoc_with_mages([image_key1]), + attachments=[image_key1], + users=[(user, "editor")], + ) + + factories.DocumentFactory(attachments=[image_key2], users=[user]) + + response = client.put( + f"/api/v1.0/documents/{document.id!s}/", + {"content": get_ydoc_with_mages([image_key1, image_key2, image_key2])}, + format="json", + ) + assert response.status_code == 200 + + document.refresh_from_db() + assert len(document.attachments) == 2 + assert set(document.attachments) == {image_key1, image_key2} diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 4fa33e1ef..3fea93ede 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -1,5 +1,10 @@ """Test util base64_yjs_to_text.""" +import base64 +import uuid + +import y_py + from core import utils # This base64 string is an example of what is saved in the database. @@ -35,3 +40,38 @@ def test_utils_base64_yjs_to_xml(): or 'Hello' in content ) assert 'world' in content + + +def test_utils_extract_attachments(): + """ + All attachment keys in the document content should be extracted. + """ + document_id = uuid.uuid4() + image_key1 = f"{document_id!s}/attachments/{uuid.uuid4()!s}.png" + image_url1 = f"http://localhost/media/{image_key1:s}" + + image_key2 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" + image_url2 = f"http://localhost/{image_key2:s}" + + image_key3 = f"{uuid.uuid4()!s}/attachments/{uuid.uuid4()!s}.png" + image_url3 = f"http://localhost/media/{image_key3:s}" + + ydoc = y_py.YDoc() # pylint: disable=no-member + with ydoc.begin_transaction() as txn: + xml_fragment = ydoc.get_xml_element("document-store") + + xml_image = xml_fragment.push_xml_element(txn, "image") + xml_image.set_attribute(txn, "src", image_url1) + + xml_image = xml_fragment.push_xml_element(txn, "image") + xml_image.set_attribute(txn, "src", image_url2) + + xml_paragraph = xml_fragment.push_xml_element(txn, "paragraph") + xml_text = xml_paragraph.push_xml_text(txn) + xml_text.push(txn, image_url3) + + update = y_py.encode_state_as_update(ydoc) # pylint: disable=no-member + base64_string = base64.b64encode(update).decode("utf-8") + + # image_key2 is missing the "/media/" part and shouldn't get extracted + assert utils.extract_attachments(base64_string) == [image_key1, image_key3] diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index 0374c0af9..960a8c8d7 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -30,5 +30,8 @@ def base64_yjs_to_text(base64_string): def extract_attachments(content): """Helper method to extract media paths from a document's content.""" + if not content: + return [] + xml_content = base64_yjs_to_xml(content) return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, xml_content) From 5411210fe5a11300dcad77f9c6c1bf27c6b40ef1 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Thu, 30 Jan 2025 19:37:46 +0100 Subject: [PATCH 09/10] =?UTF-8?q?fixup!=20=E2=9C=A8(backend)=20extract=20a?= =?UTF-8?q?ttachment=20keys=20from=20updated=20content=20for=20access?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_api_documents_update_extract_attachments.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py index b32cb9c6d..6a0ea54be 100644 --- a/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py +++ b/src/backend/core/tests/documents/test_api_documents_update_extract_attachments.py @@ -46,7 +46,7 @@ def test_api_documents_update_new_attachment_keys_anonymous(django_assert_num_qu factories.DocumentFactory(attachments=[image_keys[3]], link_reach="restricted") expected_keys = {image_keys[i] for i in [0, 1]} - with django_assert_num_queries(4): + with django_assert_num_queries(7): response = APIClient().put( f"/api/v1.0/documents/{document.id!s}/", {"content": get_ydoc_with_mages(image_keys)}, @@ -59,7 +59,7 @@ def test_api_documents_update_new_attachment_keys_anonymous(django_assert_num_qu # Check that the db query to check attachments readability for extracted # keys is not done if the content changes but no new keys are found - with django_assert_num_queries(3): + with django_assert_num_queries(5): response = APIClient().put( f"/api/v1.0/documents/{document.id!s}/", {"content": get_ydoc_with_mages(image_keys[:2])}, @@ -97,7 +97,7 @@ def test_api_documents_update_new_attachment_keys_authenticated( factories.DocumentFactory(attachments=[image_keys[4]], users=[user]) expected_keys = {image_keys[i] for i in [0, 1, 2, 4]} - with django_assert_num_queries(5): + with django_assert_num_queries(8): response = client.put( f"/api/v1.0/documents/{document.id!s}/", {"content": get_ydoc_with_mages(image_keys)}, @@ -110,7 +110,7 @@ def test_api_documents_update_new_attachment_keys_authenticated( # Check that the db query to check attachments readability for extracted # keys is not done if the content changes but no new keys are found - with django_assert_num_queries(4): + with django_assert_num_queries(6): response = client.put( f"/api/v1.0/documents/{document.id!s}/", {"content": get_ydoc_with_mages(image_keys[:2])}, From bddfd207c01634edf160fc4934150e177a6d71fe Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Mon, 27 Jan 2025 23:27:23 +0100 Subject: [PATCH 10/10] =?UTF-8?q?=E2=99=BB=EF=B8=8F(backend)=20remove=20la?= =?UTF-8?q?zy=20from=20languages=20field=20on=20User=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea behind wrapping choices in `lazy` function was to allow overriding the list of languages in tests with `override_settings`. This was causin makemigrations to keep on including the field in migrations when it is not needed. Since we finally don't override the LANGUAGES setting in tests, we can remove it to fix the problem. --- src/backend/core/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/core/models.py b/src/backend/core/models.py index dcf09d1f2..abe889ef1 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -23,7 +23,7 @@ from django.db.models.functions import Left, Length from django.template.loader import render_to_string from django.utils import timezone -from django.utils.functional import cached_property, lazy +from django.utils.functional import cached_property from django.utils.translation import get_language, override from django.utils.translation import gettext_lazy as _ @@ -194,7 +194,7 @@ class User(AbstractBaseUser, BaseModel, auth_models.PermissionsMixin): language = models.CharField( max_length=10, - choices=lazy(lambda: settings.LANGUAGES, tuple)(), + choices=settings.LANGUAGES, default=settings.LANGUAGE_CODE, verbose_name=_("language"), help_text=_("The language in which the user wants to see the interface."),