From c3b702066097c4ff24a1e2ca7dd82b73716e69f1 Mon Sep 17 00:00:00 2001 From: muhammadali124 <109604645+muhammadali124@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:43:35 +0500 Subject: [PATCH] video module transcripts fix (#543) --- .../contentstore/views/transcripts_ajax.py | 32 +++-- cms/envs/common.py | 13 +- .../xmodule/video_module/transcripts_utils.py | 128 +++++++++++++----- 3 files changed, 114 insertions(+), 59 deletions(-) diff --git a/cms/djangoapps/contentstore/views/transcripts_ajax.py b/cms/djangoapps/contentstore/views/transcripts_ajax.py index 461fd9deb230..f0b12d350084 100644 --- a/cms/djangoapps/contentstore/views/transcripts_ajax.py +++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py @@ -42,8 +42,9 @@ get_transcript, get_transcript_for_video, get_transcript_from_val, - get_transcripts_from_youtube, - youtube_video_transcript_name + get_transcript_from_youtube, + get_transcript_link_from_youtube, + get_transcript_links_from_youtube, ) __all__ = [ @@ -234,6 +235,8 @@ def upload_transcripts(request): file_data=ContentFile(sjson_subs), ) + video.transcripts['en'] = f"{edx_video_id}-en.srt" + video.save_with_metadata(request.user) if transcript_created is None: response = JsonResponse({'status': 'Invalid Video ID'}, status=400) @@ -341,26 +344,22 @@ def check_transcripts(request): except NotFoundError: log.debug(u"Can't find transcripts in storage for youtube id: %s", youtube_id) - # youtube server - youtube_text_api = copy.deepcopy(settings.YOUTUBE['TEXT_API']) - youtube_text_api['params']['v'] = youtube_id - youtube_transcript_name = youtube_video_transcript_name(youtube_text_api) - if youtube_transcript_name: - youtube_text_api['params']['name'] = youtube_transcript_name - youtube_response = requests.get('http://' + youtube_text_api['url'], params=youtube_text_api['params']) - - if youtube_response.status_code == 200 and youtube_response.text: + if get_transcript_link_from_youtube(youtube_id): transcripts_presence['youtube_server'] = 
True #check youtube local and server transcripts for equality if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']: try: - youtube_server_subs = get_transcripts_from_youtube( + transcript_links = get_transcript_links_from_youtube( youtube_id, settings, item.runtime.service(item, "i18n") ) - if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality - transcripts_presence['youtube_diff'] = False + for (_, link) in transcript_links.items(): + youtube_server_subs = get_transcript_from_youtube( + link, youtube_id, item.runtime.service(item, "i18n") + ) + if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality + transcripts_presence['youtube_diff'] = False except GetTranscriptsFromYouTubeException: pass @@ -632,7 +631,10 @@ def replace_transcripts(request): edx_video_id = link_video_to_component(video, request.user) # 3. Upload YT transcript to DS for the linked video ID. - success = save_video_transcript(edx_video_id, Transcript.SJSON, transcript_content, language_code=u'en') + success = True + for transcript in transcript_content: + [language_code, json_content] = transcript + success = save_video_transcript(edx_video_id, Transcript.SJSON, json_content, language_code) if success: response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200) else: diff --git a/cms/envs/common.py b/cms/envs/common.py index b6eada7a11ba..fe82a03a6a87 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -1377,14 +1377,11 @@ # URL to get YouTube metadata 'METADATA_URL': 'https://www.googleapis.com/youtube/v3/videos', - # Current youtube api for requesting transcripts. - # For example: http://video.google.com/timedtext?lang=en&v=j_jEn79vS3g. 
- 'TEXT_API': { - 'url': 'video.google.com/timedtext', - 'params': { - 'lang': 'en', - 'v': 'set_youtube_id_of_11_symbols_here', - }, + # Web page mechanism for scraping transcript information from youtube video pages + 'TRANSCRIPTS': { + 'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)", + 'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=', + 'ALLOWED_LANGUAGE_CODES': ["en", "en-US", "en-GB"], }, 'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080 diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py index 781b2ce5f1a0..dae0ed28541e 100644 --- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py +++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py @@ -9,6 +9,7 @@ import logging import os from functools import wraps +import re import requests import simplejson as json @@ -133,30 +134,72 @@ def save_subs_to_store(subs, subs_id, item, language='en'): return save_to_store(filedata, filename, 'application/json', item.location) -def youtube_video_transcript_name(youtube_text_api): +def get_transcript_link_from_youtube(youtube_id): """ - Get the transcript name from available transcripts of video - with respect to language from youtube server - """ - utf8_parser = etree.XMLParser(encoding='utf-8') + Get the link for YouTube transcript by parsing the source of the YouTube webpage. + Inside the webpage, the details of the transcripts are located in a JSON object. 
+ After prettifying the object, it looks like: + + "captions": { + "playerCaptionsTracklistRenderer": { + "captionTracks": [ + { + "baseUrl": "...", + "name": { + "simpleText": "(Japanese in local language)" + }, + "vssId": ".ja", + "languageCode": "ja", + "isTranslatable": true + }, + { + "baseUrl": "...", + "name": { + "simpleText": "(French in local language)" + }, + "vssId": ".fr", + "languageCode": "fr", + "isTranslatable": true + }, + { + "baseUrl": "...", + "name": { + "simpleText": "(English in local language)" + }, + "vssId": ".en", + "languageCode": "en", + "isTranslatable": true + }, + ... + ], + "audioTracks": [...] + "translationLanguages": ... + }, + ... + } - transcripts_param = {'type': 'list', 'v': youtube_text_api['params']['v']} - lang = youtube_text_api['params']['lang'] - # get list of transcripts of specific video - # url-form - # http://video.google.com/timedtext?type=list&v={VideoId} - youtube_response = requests.get('http://' + youtube_text_api['url'], params=transcripts_param) - if youtube_response.status_code == 200 and youtube_response.text: - youtube_data = etree.fromstring(youtube_response.text.encode('utf-8'), parser=utf8_parser) - # iterate all transcripts information from youtube server - for element in youtube_data: - # search specific language code such as 'en' in transcripts info list - if element.tag == 'track' and element.get('lang_code', '') == lang: - return element.get('name') - return None - - -def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): + So we use a regex to find the captionTracks JavaScript array, and then convert it + to a Python dict and return the link for en caption + """ + youtube_url_base = settings.YOUTUBE['TRANSCRIPTS']['YOUTUBE_URL_BASE'] + try: + youtube_html = requests.get(f"{youtube_url_base}{youtube_id}") + caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX'] + caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8")) + if 
caption_matched: + caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]') + caption_links = {} + for caption in caption_tracks: + language_code = caption.get('languageCode', None) + if language_code and not language_code == 'None': + link = caption.get("baseUrl") + caption_links[language_code] = link + return None if not caption_links else caption_links + return None + except ConnectionError: + return None + +def get_transcript_links_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name """ Gets transcripts from youtube for youtube_id. @@ -165,19 +208,29 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_ Returns (status, transcripts): bool, dict. """ - _ = i18n.ugettext + _ = i18n.gettext + transcript_links = get_transcript_link_from_youtube(youtube_id) - utf8_parser = etree.XMLParser(encoding='utf-8') + if not transcript_links: + msg = _("Can't get transcript link from Youtube for {youtube_id}.").format( + youtube_id=youtube_id, + ) + raise GetTranscriptsFromYouTubeException(msg) + + return transcript_links - youtube_text_api = copy.deepcopy(settings.YOUTUBE['TEXT_API']) - youtube_text_api['params']['v'] = youtube_id - # if the transcript name is not empty on youtube server we have to pass - # name param in url in order to get transcript - # example http://video.google.com/timedtext?lang=en&v={VideoId}&name={transcript_name} - youtube_transcript_name = youtube_video_transcript_name(youtube_text_api) - if youtube_transcript_name: - youtube_text_api['params']['name'] = youtube_transcript_name - data = requests.get('http://' + youtube_text_api['url'], params=youtube_text_api['params']) +def get_transcript_from_youtube(link, youtube_id, i18n): + """ + Gets transcripts from youtube for youtube_id. + + Parses only utf-8 encoded transcripts. + Other encodings are not supported at the moment. + + Returns (status, transcripts): bool, dict. 
+ """ + _ = i18n.gettext + utf8_parser = etree.XMLParser(encoding='utf-8') + data = requests.get(link) if data.status_code != 200 or not data.text: msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format( @@ -222,9 +275,12 @@ def download_youtube_subs(youtube_id, video_descriptor, settings): """ i18n = video_descriptor.runtime.service(video_descriptor, "i18n") _ = i18n.ugettext - - subs = get_transcripts_from_youtube(youtube_id, settings, i18n) - return json.dumps(subs, indent=2) + transcript_links = get_transcript_links_from_youtube(youtube_id, settings, i18n) + subs = [] + for (language_code, link) in transcript_links.items(): + sub = get_transcript_from_youtube(link, youtube_id, i18n) + subs.append([language_code, json.dumps(sub, indent=2)]) + return subs def remove_subs_from_store(subs_id, item, lang='en'):