From 6443cedf6299ba5a516df5a00cc78fb0b6800a1e Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Wed, 9 Dec 2020 17:08:12 -0800
Subject: Retrieve base.js url from html watch page when it's missing

Fixes failure mode 3 in #22
---
 youtube/yt_data_extract/__init__.py         |  3 ++-
 youtube/yt_data_extract/watch_extraction.py | 13 +++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'youtube/yt_data_extract')

diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py
index 8934f74..697e003 100644
--- a/youtube/yt_data_extract/__init__.py
+++ b/youtube/yt_data_extract/__init__.py
@@ -9,4 +9,5 @@ from .everything_else import (extract_channel_info, extract_search_info,
 from .watch_extraction import (extract_watch_info, get_caption_url,
     update_with_age_restricted_info, requires_decryption,
     extract_decryption_function, decrypt_signatures, _formats,
-    update_format_with_type_info, extract_hls_formats)
+    update_format_with_type_info, extract_hls_formats,
+    update_with_missing_base_js)
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index aa7efa9..75fa206 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -602,6 +602,19 @@ def update_with_age_restricted_info(info, video_info_page):
     _extract_formats(info, player_response)
     _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
 
+html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base.js)"')
+def update_with_missing_base_js(info, html_watch_page):
+    '''Extracts base_js url and player_name from html watch page. return err
+    Use when base_js is missing from the json page.'''
+    match = html_watch_page_base_js_re.search(html_watch_page)
+    if match:
+        info['base_js'] = normalize_url(match.group(1))
+        # must uniquely identify url
+        info['player_name'] = urllib.parse.urlparse(info['base_js']).path
+        return False
+    else:
+        return 'Could not find base_js url in watch page html'
+
 def requires_decryption(info):
     return ('formats' in info) and info['formats'] and info['formats'][0]['s']
 
-- 
cgit v1.2.3