Skip to content

Commit

Permalink
fix: protect video transcript handler from scraper (#34017)
Browse files Browse the repository at this point in the history
Web scrapers do annoying stuff like visit urls they shouldn't know about and cause xblock handlers to break.
I tested this by:
Making sure video transcripts worked as normal while logged in
making sure that I got no 500s in my logs while attempting to view it logged out.
  • Loading branch information
connorhaugh authored Jan 8, 2024
1 parent 83d104f commit efaef45
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 14 deletions.
46 changes: 32 additions & 14 deletions lms/djangoapps/courseware/tests/test_video_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
import freezegun
from django.core.files.base import ContentFile
from django.utils.timezone import now
from django.test import RequestFactory
from edxval import api
from xblock.django.request import DjangoWebobRequest
from webob import Request, Response

from common.djangoapps.student.tests.factories import UserFactory
from common.test.utils import normalize_repr
from openedx.core.djangoapps.contentserver.caching import del_cached_content
from xmodule.contentstore.content import StaticContent # lint-amnesty, pylint: disable=wrong-import-order
Expand All @@ -36,6 +39,7 @@
from .helpers import BaseTestXmodule
from .test_video_xml import SOURCE_XML

TEST_USERNAME = 'test-user'
TRANSCRIPT = {"start": [10], "end": [100], "text": ["Hi, welcome to Edx."]}
BUMPER_TRANSCRIPT = {"start": [1], "end": [10], "text": ["A bumper"]}
SRT_content = textwrap.dedent("""
Expand Down Expand Up @@ -574,6 +578,16 @@ def test_download_fallback_transcript(self, mock_get_video_transcript_data):
assert response.headers[attribute] == value


def _create_djangowebobrequest_object_for_url(url: str) -> DjangoWebobRequest:
"""
create a DjangoWebobRequest (The type of requests used by xblocks)
with a relevant user,
"""
django_request = RequestFactory().get(url)
django_request.user = UserFactory.create(username=TEST_USERNAME)
return DjangoWebobRequest(django_request)


@ddt.ddt
class TestTranscriptTranslationGetDispatch(TestVideo): # lint-amnesty, pylint: disable=test-inherits-tests
"""
Expand Down Expand Up @@ -620,7 +634,7 @@ def setUp(self):
)
@ddt.unpack
def test_translation_fails(self, url, dispatch, status_code):
request = Request.blank(url)
request = _create_djangowebobrequest_object_for_url(url)
response = self.block.transcript(request=request, dispatch=dispatch)
assert response.status == status_code

Expand All @@ -635,7 +649,7 @@ def test_translaton_en_youtube_success(self, url, dispatch, attach):
subs_id = _get_subs_id(good_sjson.name)

attach(self.block, subs_id)
request = Request.blank(url.format(subs_id))
request = _create_djangowebobrequest_object_for_url(url.format(subs_id))
response = self.block.transcript(request=request, dispatch=dispatch)
self.assertDictEqual(json.loads(response.body.decode('utf-8')), subs)

Expand All @@ -655,12 +669,12 @@ def test_translation_non_en_youtube_success(self):
self.block.youtube_id_1_0 = subs_id
self.block.youtube_id_0_75 = '0_75'
self.store.update_item(self.block, self.user.id)
request = Request.blank(f'/translation/uk?videoId={subs_id}')
request = _create_djangowebobrequest_object_for_url(f'/translation/uk?videoId={subs_id}')
response = self.block.transcript(request=request, dispatch='translation/uk')
self.assertDictEqual(json.loads(response.body.decode('utf-8')), subs)

# 0_75 subs are exist
request = Request.blank('/translation/uk?videoId={}'.format('0_75'))
request = _create_djangowebobrequest_object_for_url('/translation/uk?videoId={}'.format('0_75'))
response = self.block.transcript(request=request, dispatch='translation/uk')
calculated_0_75 = {
'end': [75],
Expand All @@ -674,7 +688,7 @@ def test_translation_non_en_youtube_success(self):
# 1_5 will be generated from 1_0
self.block.youtube_id_1_5 = '1_5'
self.store.update_item(self.block, self.user.id)
request = Request.blank('/translation/uk?videoId={}'.format('1_5'))
request = _create_djangowebobrequest_object_for_url('/translation/uk?videoId={}'.format('1_5'))
response = self.block.transcript(request=request, dispatch='translation/uk')
calculated_1_5 = {
'end': [150],
Expand All @@ -696,7 +710,7 @@ def test_translaton_en_html5_success(self, url, dispatch, attach):

attach(self.block, subs_id)
self.store.update_item(self.block, self.user.id)
request = Request.blank(url)
request = _create_djangowebobrequest_object_for_url(url)
response = self.block.transcript(request=request, dispatch=dispatch)
self.assertDictEqual(json.loads(response.body.decode('utf-8')), TRANSCRIPT)

Expand All @@ -713,7 +727,7 @@ def test_translaton_non_en_html5_success(self):

# manually clean youtube_id_1_0, as it has default value
self.block.youtube_id_1_0 = ""
request = Request.blank('/translation/uk')
request = _create_djangowebobrequest_object_for_url('/translation/uk')
response = self.block.transcript(request=request, dispatch='translation/uk')
self.assertDictEqual(json.loads(response.body.decode('utf-8')), subs)

Expand All @@ -731,21 +745,21 @@ def test_translation_static_transcript_xml_with_data_dirc(self):
self.block.runtime.modulestore = test_modulestore

# Test youtube style en
request = Request.blank('/translation/en?videoId=12345')
request = _create_djangowebobrequest_object_for_url('/translation/en?videoId=12345')
response = self.block.transcript(request=request, dispatch='translation/en')
assert response.status == '307 Temporary Redirect'
assert ('Location', '/static/dummy/static/subs_12345.srt.sjson') in response.headerlist

# Test HTML5 video style
self.block.sub = 'OEoXaMPEzfM'
request = Request.blank('/translation/en')
request = _create_djangowebobrequest_object_for_url('/translation/en')
response = self.block.transcript(request=request, dispatch='translation/en')
assert response.status == '307 Temporary Redirect'
assert ('Location', '/static/dummy/static/subs_OEoXaMPEzfM.srt.sjson') in response.headerlist

# Test different language to ensure we are just ignoring it since we can't
# translate with static fallback
request = Request.blank('/translation/uk')
request = _create_djangowebobrequest_object_for_url('/translation/uk')
response = self.block.transcript(request=request, dispatch='translation/uk')
assert response.status == '404 Not Found'

Expand Down Expand Up @@ -773,7 +787,7 @@ def test_translation_static_transcript(self, url, dispatch, status_code, sub=Non

if attach:
attach(self.block, sub)
request = Request.blank(url)
request = _create_djangowebobrequest_object_for_url(url)
response = self.block.transcript(request=request, dispatch=dispatch)
assert response.status == status_code
if sub:
Expand All @@ -788,7 +802,7 @@ def test_translation_static_non_course(self, __):
self._set_static_asset_path()

# When course_id is not mocked out, these values would result in 307, as tested above.
request = Request.blank('/translation/en?videoId=12345')
request = _create_djangowebobrequest_object_for_url('/translation/en?videoId=12345')
response = self.block.transcript(request=request, dispatch='translation/en')
assert response.status == '404 Not Found'

Expand Down Expand Up @@ -819,7 +833,8 @@ def test_translation_fallback_transcript(self, mock_get_video_transcript_data):
mock_get_video_transcript_data.return_value = transcript

# Make request to XModule transcript handler
response = self.block.transcript(request=Request.blank('/translation/en'), dispatch='translation/en')
request = _create_djangowebobrequest_object_for_url('/translation/en')
response = self.block.transcript(request=request, dispatch='translation/en')

# Expected headers
expected_headers = {
Expand All @@ -840,7 +855,10 @@ def test_translation_fallback_transcript_feature_disabled(self):
Verify that val transcript is not returned when its feature is disabled.
"""
# Make request to XModule transcript handler
response = self.block.transcript(request=Request.blank('/translation/en'), dispatch='translation/en')
response = self.block.transcript(
request=_create_djangowebobrequest_object_for_url('/translation/en'),
dispatch='translation/en'
)
# Assert the actual response
assert response.status_code == 404

Expand Down
7 changes: 7 additions & 0 deletions xmodule/video_block/video_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,12 @@ def transcript(self, request, dispatch):
if dispatch.startswith('translation'):
language = dispatch.replace('translation', '').strip('/')

# Because scrapers hit video blocks, verify that a user exists.
# use the _request attr to get the django request object.
if not request._request.user: # pylint: disable=protected-access
log.info("Transcript: user must be logged or public view enabled to get transcript")
return Response(status=403)

if not language:
log.info("Invalid /translation request: no language.")
return Response(status=400)
Expand All @@ -324,6 +330,7 @@ def transcript(self, request, dispatch):
return Response(status=404)

if language != self.transcript_language:

self.transcript_language = language

try:
Expand Down

0 comments on commit efaef45

Please sign in to comment.