diff options
-rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 31f6466..6b1b30d 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -561,6 +561,25 @@ def extract_watch_info(polymer_json): info['translation_languages'] = [] captions_info = player_response.get('captions', {}) info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + # Sometimes the above playerCaptionsRenderer is randomly missing + # Extract base_url from one of the captions by removing lang specifiers + if not info['_captions_base_url']: + base_url = normalize_url(deep_get( + captions_info, + 'playerCaptionsTracklistRenderer', + 'captionTracks', + 0, + 'baseUrl' + )) + if base_url: + url_parts = urllib.parse.urlparse(base_url) + qs = urllib.parse.parse_qs(url_parts.query) + for key in ('tlang', 'lang', 'name', 'kind', 'fmt'): + if key in qs: + del qs[key] + base_url = urllib.parse.urlunparse(url_parts._replace( + query=urllib.parse.urlencode(qs, doseq=True))) + info['_captions_base_url'] = base_url for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): lang_code = caption_track.get('languageCode') if not lang_code: |