diff options
author | James Taylor <28744867+user234683@users.noreply.github.com> | 2022-03-25 22:02:05 -0700 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2022-03-30 00:41:30 +0800 |
commit | 79fd2966cdd53dd969b225de9e56f17cc3895206 (patch) | |
tree | 35732a6cdf45f2cd0c87db5d7dabad7f86d1441b /youtube/yt_data_extract | |
parent | dcd4b0f0aeee19755d3d732695e94c51be54522c (diff) | |
download | yt-local-79fd2966cdd53dd969b225de9e56f17cc3895206.tar.lz yt-local-79fd2966cdd53dd969b225de9e56f17cc3895206.tar.xz yt-local-79fd2966cdd53dd969b225de9e56f17cc3895206.zip |
Extract captions base_url using different method when missing
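The captions base URL is sometimes missing from the player response.
When that happens, take the URL of one of the listed caption tracks,
which already carries the language and automatic-caption query parameters
(tlang, lang, name, kind, fmt), and strip those parameters to recover the base URL.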
Signed-off-by: Jesús <heckyel@hyperbola.info>
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 31f6466..6b1b30d 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -561,6 +561,25 @@ def extract_watch_info(polymer_json): info['translation_languages'] = [] captions_info = player_response.get('captions', {}) info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + # Sometimes the above playerCaptionsRender is randomly missing + # Extract base_url from one of the captions by removing lang specifiers + if not info['_captions_base_url']: + base_url = normalize_url(deep_get( + captions_info, + 'playerCaptionsTracklistRenderer', + 'captionTracks', + 0, + 'baseUrl' + )) + if base_url: + url_parts = urllib.parse.urlparse(base_url) + qs = urllib.parse.parse_qs(url_parts.query) + for key in ('tlang', 'lang', 'name', 'kind', 'fmt'): + if key in qs: + del qs[key] + base_url = urllib.parse.urlunparse(url_parts._replace( + query=urllib.parse.urlencode(qs, doseq=True))) + info['_captions_base_url'] = base_url for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): lang_code = caption_track.get('languageCode') if not lang_code: |