From 79fd2966cdd53dd969b225de9e56f17cc3895206 Mon Sep 17 00:00:00 2001
From: James Taylor <28744867+user234683@users.noreply.github.com>
Date: Fri, 25 Mar 2022 22:02:05 -0700
Subject: Extract captions base_url using different method when missing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The base url will be randomly missing. Take one of the listed captions
urls, which already has the &lang and automatic specifiers, and then
remove these specifiers.

Signed-off-by: Jesús
---
 youtube/yt_data_extract/watch_extraction.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'youtube/yt_data_extract')

diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 31f6466..6b1b30d 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -561,6 +561,25 @@ def extract_watch_info(polymer_json):
     info['translation_languages'] = []
     captions_info = player_response.get('captions', {})
     info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
+    # Sometimes the above playerCaptionsRenderer is randomly missing
+    # Extract base_url from one of the captions by removing lang specifiers
+    if not info['_captions_base_url']:
+        base_url = normalize_url(deep_get(
+            captions_info,
+            'playerCaptionsTracklistRenderer',
+            'captionTracks',
+            0,
+            'baseUrl'
+        ))
+        if base_url:
+            url_parts = urllib.parse.urlparse(base_url)
+            qs = urllib.parse.parse_qs(url_parts.query)
+            for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
+                if key in qs:
+                    del qs[key]
+            base_url = urllib.parse.urlunparse(url_parts._replace(
+                query=urllib.parse.urlencode(qs, doseq=True)))
+            info['_captions_base_url'] = base_url
     for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
         lang_code = caption_track.get('languageCode')
        if not lang_code:
--
cgit v1.2.3
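
As a rough, standalone sketch of the query-stripping approach used in this
patch: the snippet below removes the same lang/format specifiers from a
caption track URL to recover a reusable base URL. The example URL and the
helper name strip_caption_specifiers are made up for illustration only.

import urllib.parse

def strip_caption_specifiers(url):
    # Parse the URL, drop the language/format query parameters, and rebuild it
    parts = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parts.query)
    for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
        qs.pop(key, None)  # remove the specifier if present
    return urllib.parse.urlunparse(
        parts._replace(query=urllib.parse.urlencode(qs, doseq=True)))

example = ('https://www.youtube.com/api/timedtext'
           '?v=XXXXXXXXXXX&caps=asr&kind=asr&lang=en&fmt=srv3')
print(strip_caption_specifiers(example))
# https://www.youtube.com/api/timedtext?v=XXXXXXXXXXX&caps=asr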