From 205ad29cb0763dd263a5940cdcb3059d189bbfe7 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 29 Nov 2019 18:36:27 -0800 Subject: Extraction: Add general subtitle extraction and translation --- youtube/watch.py | 132 +++++++++++++++++++++++++++++++-------------- youtube/yt_data_extract.py | 63 +++++++++++----------- 2 files changed, 126 insertions(+), 69 deletions(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index 8a396a7..fa697ba 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -44,50 +44,104 @@ def get_video_sources(info): return video_sources +def make_caption_src(info, lang, auto=False, trans_lang=None): + label = lang + if auto: + label += ' (Automatic)' + if trans_lang: + label += ' -> ' + trans_lang + return { + 'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang), + 'label': label, + 'srclang': trans_lang[0:2] if trans_lang else lang[0:2], + 'on': False, + } + +def lang_in(lang, sequence): + '''Tests if the language is in sequence, with e.g. en and en-US considered the same''' + lang = lang[0:2] + return lang in (l[0:2] for l in sequence) + +def lang_eq(lang1, lang2): + '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same. + Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model''' + return lang1[0:2] == lang2[0:2] + +def equiv_lang_in(lang, sequence): + '''Extracts a language in sequence which is equivalent to lang. + e.g. if lang is en, extracts en-GB from sequence. + Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. 
Need to get the available variant.''' + lang = lang[0:2] + for l in sequence: + if l[0:2] == lang: + return l + return None + def get_subtitle_sources(info): + '''Returns these sources, ordered from least to most intelligible: + native_video_lang (Automatic) + foreign_langs (Manual) + native_video_lang (Automatic) -> pref_lang + foreign_langs (Manual) -> pref_lang + native_video_lang (Manual) -> pref_lang + pref_lang (Automatic) + pref_lang (Manual)''' sources = [] - default_found = False - default = None - for language, formats in info['subtitles'].items(): - for format in formats: - if format['ext'] == 'vtt': - source = { - 'url': '/' + format['url'], - 'label': language, - 'srclang': language, - - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': language == settings.subtitles_language and settings.subtitles_mode > 0, - } - - if language == settings.subtitles_language: - default_found = True - default = source - else: - sources.append(source) - break - - # Put it at the end to avoid browser bug when there are too many languages - # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - if default_found: - sources.append(default) + pref_lang = settings.subtitles_language + native_video_lang = None + if info['automatic_caption_languages']: + native_video_lang = info['automatic_caption_languages'][0] - try: - formats = info['automatic_captions'][settings.subtitles_language] - except KeyError: - pass - else: - for format in formats: - if format['ext'] == 'vtt': - sources.append({ - 'url': '/' + format['url'], - 'label': settings.subtitles_language + ' - Automatic', - 'srclang': settings.subtitles_language, + highest_fidelity_is_manual = False - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': settings.subtitles_mode == 2 and not default_found, + # Sources are added in very specific order outlined 
above + # More intelligible sources are put further down to avoid browser bug when there are too many languages + # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - - }) + # native_video_lang (Automatic) + if native_video_lang and not lang_eq(native_video_lang, pref_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True)) + + # foreign_langs (Manual) + for lang in info['manual_caption_languages']: + if not lang_eq(lang, pref_lang): + sources.append(make_caption_src(info, lang)) + + if (lang_in(pref_lang, info['translation_languages']) + and not lang_in(pref_lang, info['automatic_caption_languages']) + and not lang_in(pref_lang, info['manual_caption_languages'])): + # native_video_lang (Automatic) -> pref_lang + if native_video_lang and not lang_eq(pref_lang, native_video_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang)) + + # foreign_langs (Manual) -> pref_lang + for lang in info['manual_caption_languages']: + if not lang_eq(lang, native_video_lang): + sources.append(make_caption_src(info, lang, trans_lang=pref_lang)) + + # native_video_lang (Manual) -> pref_lang + if lang_in(native_video_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang)) + + # pref_lang (Automatic) + if lang_in(pref_lang, info['automatic_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True)) + + # pref_lang (Manual) + if lang_in(pref_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages']))) + highest_fidelity_is_manual = True + + if sources and sources[-1]['srclang'] == pref_lang: + # set as on by default since it's manual and a default-on subtitles mode is in settings + if highest_fidelity_is_manual and settings.subtitles_mode > 0: + 
sources[-1]['on'] = True + # set as on by default since settings indicate to set it as such even if it's not manual + elif settings.subtitles_mode == 2: + sources[-1]['on'] = True + + if len(sources) == 0: + assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0 return sources diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 15ab706..7c2b717 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -309,6 +309,8 @@ def ajax_info(item_json): youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') def normalize_url(url): + if url is None: + return None match = youtube_url_re.fullmatch(url) if match is None: raise Exception() @@ -1042,7 +1044,18 @@ def extract_watch_info_desktop(top_level): return info -_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') +def get_caption_url(info, language, format, automatic=False, translation_language=None): + '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. 
If automatic is true and translation_language is given, the automatic captions will be translated.''' + url = info['_captions_base_url'] + url += '&lang=' + language + url += '&fmt=' + format + if automatic: + url += '&kind=asr' + if translation_language: + url += '&tlang=' + translation_language + return url + +SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1072,34 +1085,25 @@ def extract_watch_info(polymer_json): if playability_status not in (None, 'OK'): info['playability_error'] = playability_reason - # automatic captions - - # adapted from youtube_dl: - # https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523 - info['automatic_captions'] = {} - - renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={}) - base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl') - - if base_url and '?' in base_url: - base_url = normalize_url(base_url) - base_url_path, base_url_query_string = base_url.split('?') - url_info = urllib.parse.parse_qs(base_url_query_string) - - for lang in renderer.get('translationLanguages', []): - lang_code = lang.get('languageCode') - if not lang_code: - continue - formats_for_this_lang = [] - for ext in _SUBTITLE_FORMATS: - url_info['tlang'] = [lang_code] - url_info['fmt'] = [ext] - url = base_url_path + '?' 
+ urllib.parse.urlencode(url_info, doseq=True) - formats_for_this_lang.append({ - 'url': url, - 'ext': ext, - }) - info['automatic_captions'][lang_code] = formats_for_this_lang + # captions + info['automatic_caption_languages'] = [] + info['manual_caption_languages'] = [] + info['translation_languages'] = [] + captions_info = player_response.get('captions', {}) + info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): + lang_code = caption_track.get('languageCode') + if lang_code: + if caption_track.get('kind') == 'asr': + info['automatic_caption_languages'].append(lang_code) + else: + info['manual_caption_languages'].append(lang_code) + for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): + lang_code = translation_lang_info.get('languageCode') + if lang_code: + info['translation_languages'].append(lang_code) + if translation_lang_info.get('isTranslatable') == False: + print('WARNING: Found non-translatable caption language') # formats streaming_data = player_response.get('streamingData', {}) @@ -1157,5 +1161,4 @@ def extract_watch_info(polymer_json): # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None - info['subtitles'] = {} # TODO return info -- cgit v1.2.3