aboutsummaryrefslogtreecommitdiffstats
path: root/youtube
diff options
context:
space:
mode:
author: James Taylor <user234683@users.noreply.github.com> 2019-11-29 18:36:27 -0800
committer: James Taylor <user234683@users.noreply.github.com> 2019-11-29 18:36:27 -0800
commit: 205ad29cb0763dd263a5940cdcb3059d189bbfe7 (patch)
tree: 45405ed2f1d6c6ee5630eea8b0571aee2c2a8622 /youtube
parent: 95da24a2060fe2575a4edd10140a3426424978d4 (diff)
download: yt-local-205ad29cb0763dd263a5940cdcb3059d189bbfe7.tar.lz
yt-local-205ad29cb0763dd263a5940cdcb3059d189bbfe7.tar.xz
yt-local-205ad29cb0763dd263a5940cdcb3059d189bbfe7.zip
Extraction: Add general subtitle extraction and translation
Diffstat (limited to 'youtube')
-rw-r--r-- youtube/watch.py 132
-rw-r--r-- youtube/yt_data_extract.py 63
2 files changed, 126 insertions, 69 deletions
diff --git a/youtube/watch.py b/youtube/watch.py
index 8a396a7..fa697ba 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -44,50 +44,104 @@ def get_video_sources(info):
return video_sources
def make_caption_src(info, lang, auto=False, trans_lang=None):
    '''Builds the dictionary describing a single caption source.

    info: extracted video info, handed on to yt_data_extract.get_caption_url
    lang: language code of the captions to request
    auto: if True, the automatic captions for lang are requested
    trans_lang: if given, the captions are requested translated into this language

    Returns a dict with the keys 'url', 'label', 'srclang' and 'on'.
    'on' is always False here; the caller decides which source is enabled.'''
    label_parts = [lang]
    if auto:
        label_parts.append(' (Automatic)')
    if trans_lang:
        label_parts.append(' -> ' + trans_lang)
    label = ''.join(label_parts)

    # srclang describes the language the viewer ends up reading:
    # the translation target if there is one, otherwise the caption language.
    # Only the first two characters of the code are kept.
    shown_lang = trans_lang if trans_lang else lang

    caption_url = yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang)
    return {
        'url': '/' + caption_url,
        'label': label,
        'srclang': shown_lang[0:2],
        'on': False,
    }
+
def lang_in(lang, sequence):
    '''Tests if the language is in sequence, with e.g. en and en-US considered the same.

    Only the first two characters of each code are compared.
    Returns False if lang is None (no language known), so an optional language can be passed without checking it first.'''
    if lang is None:
        return False
    prefix = lang[0:2]
    return any(candidate[0:2] == prefix for candidate in sequence)
+
def lang_eq(lang1, lang2):
    '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same.
    Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model.

    Returns False if either code is None: an unknown language is not considered equal to anything.'''
    if lang1 is None or lang2 is None:
        return False
    return lang1[0:2] == lang2[0:2]
+
def equiv_lang_in(lang, sequence):
    '''Extracts a language in sequence which is equivalent to lang.
    e.g. if lang is en, extracts en-GB from sequence.
    Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. Need to get the available variant.

    Returns the first entry of sequence whose first two characters match those of lang.
    Returns None if there is no such entry, or if lang itself is None.'''
    if lang is None:
        return None
    prefix = lang[0:2]
    return next((candidate for candidate in sequence if candidate[0:2] == prefix), None)
+
def get_subtitle_sources(info):
    '''Returns these sources, ordered from least to most intelligible:
    native_video_lang (Automatic)
    foreign_langs (Manual)
    native_video_lang (Automatic) -> pref_lang
    foreign_langs (Manual) -> pref_lang
    native_video_lang (Manual) -> pref_lang
    pref_lang (Automatic)
    pref_lang (Manual)

    info must provide the lists 'automatic_caption_languages', 'manual_caption_languages' and 'translation_languages'.
    pref_lang is settings.subtitles_language. Language codes are compared by their first two characters (en and en-US count as the same).

    Each source is a dict as built by make_caption_src. Only the last source can be switched on, and only if it is in pref_lang:
    a manual pref_lang track is switched on when settings.subtitles_mode > 0, any other last source only when settings.subtitles_mode == 2.'''
    sources = []
    pref_lang = settings.subtitles_language
    automatic_langs = info['automatic_caption_languages']
    manual_langs = info['manual_caption_languages']

    # The first automatic caption language is treated as the video's native language.
    # It stays None when the video has no automatic captions.
    native_video_lang = automatic_langs[0] if automatic_langs else None

    highest_fidelity_is_manual = False

    # Sources are added in very specific order outlined above
    # More intelligible sources are put further down to avoid browser bug when there are too many languages
    # (in firefox, it is impossible to select a language near the top of the list because it is cut off)

    # native_video_lang (Automatic)
    if native_video_lang and not lang_eq(native_video_lang, pref_lang):
        sources.append(make_caption_src(info, native_video_lang, auto=True))

    # foreign_langs (Manual)
    for lang in manual_langs:
        if not lang_eq(lang, pref_lang):
            sources.append(make_caption_src(info, lang))

    # Translations are only offered when pref_lang is a translation target
    # and no captions (automatic or manual) exist in pref_lang itself
    if (lang_in(pref_lang, info['translation_languages'])
            and not lang_in(pref_lang, automatic_langs)
            and not lang_in(pref_lang, manual_langs)):
        # native_video_lang (Automatic) -> pref_lang
        if native_video_lang and not lang_eq(pref_lang, native_video_lang):
            sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang))

        # foreign_langs (Manual) -> pref_lang
        # Without a native language every manual track counts as foreign;
        # the explicit check avoids comparing against None
        for lang in manual_langs:
            if not native_video_lang or not lang_eq(lang, native_video_lang):
                sources.append(make_caption_src(info, lang, trans_lang=pref_lang))

        # native_video_lang (Manual) -> pref_lang
        # Request the variant that is actually in the manual list (e.g. en-GB),
        # which may differ from the code found in the automatic list (e.g. en)
        if native_video_lang and lang_in(native_video_lang, manual_langs):
            native_manual_lang = equiv_lang_in(native_video_lang, manual_langs)
            sources.append(make_caption_src(info, native_manual_lang, trans_lang=pref_lang))

    # pref_lang (Automatic)
    if lang_in(pref_lang, automatic_langs):
        sources.append(make_caption_src(info, equiv_lang_in(pref_lang, automatic_langs), auto=True))

    # pref_lang (Manual)
    if lang_in(pref_lang, manual_langs):
        sources.append(make_caption_src(info, equiv_lang_in(pref_lang, manual_langs)))
        highest_fidelity_is_manual = True

    # srclang holds only the first two characters of the code, so it is compared
    # with lang_eq: a regional preference such as en-US must still match en
    if sources and lang_eq(sources[-1]['srclang'], pref_lang):
        # set as on by default since it's manual and a default-on subtitles mode is in settings
        if highest_fidelity_is_manual and settings.subtitles_mode > 0:
            sources[-1]['on'] = True
        # set as on by default since settings indicate to set it as such even if it's not manual
        elif settings.subtitles_mode == 2:
            sources[-1]['on'] = True

    # Sanity check: every available caption language should have produced a source
    if not sources:
        assert len(automatic_langs) == 0 and len(manual_langs) == 0
    return sources
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 15ab706..7c2b717 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -309,6 +309,8 @@ def ajax_info(item_json):
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
def normalize_url(url):
+ if url is None:
+ return None
match = youtube_url_re.fullmatch(url)
if match is None:
raise Exception()
@@ -1042,7 +1044,18 @@ def extract_watch_info_desktop(top_level):
return info
-_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def get_caption_url(info, language, format, automatic=False, translation_language=None):
    '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.

    The url is info['_captions_base_url'] with the query parameters lang, fmt and, when requested, kind=asr and tlang appended in that order.
    Parameter values are percent-encoded. The parameters are joined with '&' if the base url already has a query string and with '?' otherwise.

    Raises KeyError if info has no '_captions_base_url' and TypeError if that value is None.'''
    # Imported locally so the function does not depend on the module's import list
    from urllib.parse import urlencode

    base_url = info['_captions_base_url']

    # A list of pairs keeps the parameter order fixed
    params = [('lang', language), ('fmt', format)]
    if automatic:
        params.append(('kind', 'asr'))
    if translation_language:
        params.append(('tlang', translation_language))

    separator = '&' if '?' in base_url else '?'
    return base_url + separator + urlencode(params)
+
+SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json):
info = {'playability_error': None, 'error': None}
@@ -1072,34 +1085,25 @@ def extract_watch_info(polymer_json):
if playability_status not in (None, 'OK'):
info['playability_error'] = playability_reason
- # automatic captions
-
- # adapted from youtube_dl:
- # https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523
- info['automatic_captions'] = {}
-
- renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={})
- base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl')
-
- if base_url and '?' in base_url:
- base_url = normalize_url(base_url)
- base_url_path, base_url_query_string = base_url.split('?')
- url_info = urllib.parse.parse_qs(base_url_query_string)
-
- for lang in renderer.get('translationLanguages', []):
- lang_code = lang.get('languageCode')
- if not lang_code:
- continue
- formats_for_this_lang = []
- for ext in _SUBTITLE_FORMATS:
- url_info['tlang'] = [lang_code]
- url_info['fmt'] = [ext]
- url = base_url_path + '?' + urllib.parse.urlencode(url_info, doseq=True)
- formats_for_this_lang.append({
- 'url': url,
- 'ext': ext,
- })
- info['automatic_captions'][lang_code] = formats_for_this_lang
+ # captions
+ info['automatic_caption_languages'] = []
+ info['manual_caption_languages'] = []
+ info['translation_languages'] = []
+ captions_info = player_response.get('captions', {})
+ info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
+ for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
+ lang_code = caption_track.get('languageCode')
+ if lang_code:
+ if caption_track.get('kind') == 'asr':
+ info['automatic_caption_languages'].append(lang_code)
+ else:
+ info['manual_caption_languages'].append(lang_code)
+ for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
+ lang_code = translation_lang_info.get('languageCode')
+ if lang_code:
+ info['translation_languages'].append(lang_code)
+ if translation_lang_info.get('isTranslatable') == False:
+ print('WARNING: Found non-translatable caption language')
# formats
streaming_data = player_response.get('streamingData', {})
@@ -1157,5 +1161,4 @@ def extract_watch_info(polymer_json):
# other stuff
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
- info['subtitles'] = {} # TODO
return info