aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract
diff options
context:
space:
mode:
author: James Taylor <28744867+user234683@users.noreply.github.com> 2022-03-25 22:02:05 -0700
committer: Jesús <heckyel@hyperbola.info> 2022-03-30 00:41:30 +0800
commit: 79fd2966cdd53dd969b225de9e56f17cc3895206 (patch)
tree: 35732a6cdf45f2cd0c87db5d7dabad7f86d1441b /youtube/yt_data_extract
parent: dcd4b0f0aeee19755d3d732695e94c51be54522c (diff)
download: yt-local-79fd2966cdd53dd969b225de9e56f17cc3895206.tar.lz
yt-local-79fd2966cdd53dd969b225de9e56f17cc3895206.tar.xz
yt-local-79fd2966cdd53dd969b225de9e56f17cc3895206.zip
Extract captions base_url using different method when missing
The captions base URL is sometimes randomly missing from the player response. When it is, take the URL of one of the listed caption tracks, which already carries the &lang and automatic-caption specifiers, and remove those specifiers to recover the base URL. Signed-off-by: Jesús <heckyel@hyperbola.info>
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- youtube/yt_data_extract/watch_extraction.py 19
1 files changed, 19 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 31f6466..6b1b30d 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -561,6 +561,25 @@ def extract_watch_info(polymer_json):
info['translation_languages'] = []
captions_info = player_response.get('captions', {})
info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
+ # Sometimes the above playerCaptionsRenderer is randomly missing
+ # Extract base_url from one of the captions by removing lang specifiers
+ if not info['_captions_base_url']:
+ base_url = normalize_url(deep_get(
+ captions_info,
+ 'playerCaptionsTracklistRenderer',
+ 'captionTracks',
+ 0,
+ 'baseUrl'
+ ))
+ if base_url:
+ url_parts = urllib.parse.urlparse(base_url)
+ qs = urllib.parse.parse_qs(url_parts.query)
+ for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
+ if key in qs:
+ del qs[key]
+ base_url = urllib.parse.urlunparse(url_parts._replace(
+ query=urllib.parse.urlencode(qs, doseq=True)))
+ info['_captions_base_url'] = base_url
for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
lang_code = caption_track.get('languageCode')
if not lang_code: