From 79fd2966cdd53dd969b225de9e56f17cc3895206 Mon Sep 17 00:00:00 2001
From: James Taylor <28744867+user234683@users.noreply.github.com>
Date: Fri, 25 Mar 2022 22:02:05 -0700
Subject: Extract captions base_url using different method when missing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The base url will be randomly missing. Take one of the listed captions
urls, which already has the &lang and automatic specifiers, and then
remove these specifiers.

Signed-off-by: Jesús
---
 youtube/yt_data_extract/watch_extraction.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'youtube/yt_data_extract')

diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 31f6466..6b1b30d 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -561,6 +561,25 @@ def extract_watch_info(polymer_json):
     info['translation_languages'] = []
     captions_info = player_response.get('captions', {})
     info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
+    # Sometimes the above playerCaptionsRenderer is randomly missing
+    # Extract base_url from one of the captions by removing lang specifiers
+    if not info['_captions_base_url']:
+        base_url = normalize_url(deep_get(
+            captions_info,
+            'playerCaptionsTracklistRenderer',
+            'captionTracks',
+            0,
+            'baseUrl'
+        ))
+        if base_url:
+            url_parts = urllib.parse.urlparse(base_url)
+            qs = urllib.parse.parse_qs(url_parts.query)
+            for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
+                if key in qs:
+                    del qs[key]
+            base_url = urllib.parse.urlunparse(url_parts._replace(
+                query=urllib.parse.urlencode(qs, doseq=True)))
+            info['_captions_base_url'] = base_url
     for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
         lang_code = caption_track.get('languageCode')
        if not lang_code:
--
cgit v1.2.3
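
As a rough, standalone sketch of the query-stripping approach used in this
patch: the snippet below removes the same lang/format specifiers from a
caption track URL to recover a reusable base URL. The example URL and the
helper name strip_caption_specifiers are made up for illustration only.

import urllib.parse

def strip_caption_specifiers(url):
    # Parse the URL, drop the language/format query parameters, and rebuild it
    parts = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parts.query)
    for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
        qs.pop(key, None)  # remove the specifier if present
    return urllib.parse.urlunparse(
        parts._replace(query=urllib.parse.urlencode(qs, doseq=True)))

example = ('https://www.youtube.com/api/timedtext'
           '?v=XXXXXXXXXXX&caps=asr&kind=asr&lang=en&fmt=srv3')
print(strip_caption_specifiers(example))
# https://www.youtube.com/api/timedtext?v=XXXXXXXXXXX&caps=asr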