aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract/watch_extraction.py
diff options
context:
space:
mode:
author James Taylor <user234683@users.noreply.github.com> 2020-12-09 17:08:12 -0800
committer James Taylor <user234683@users.noreply.github.com> 2020-12-09 17:08:12 -0800
commit 6443cedf6299ba5a516df5a00cc78fb0b6800a1e (patch)
tree 42646b65c7ce267bdb12b7b7c382dafb73d8e489 /youtube/yt_data_extract/watch_extraction.py
parent 1a7ed0a981f653c7eba85e57d9914501ade127fe (diff)
download yt-local-6443cedf6299ba5a516df5a00cc78fb0b6800a1e.tar.lz
yt-local-6443cedf6299ba5a516df5a00cc78fb0b6800a1e.tar.xz
yt-local-6443cedf6299ba5a516df5a00cc78fb0b6800a1e.zip
Retrieve base.js url from html watch page when it's missing
Fixes failure mode 3 in #22
Diffstat (limited to 'youtube/yt_data_extract/watch_extraction.py')
-rw-r--r-- youtube/yt_data_extract/watch_extraction.py 13
1 files changed, 13 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index aa7efa9..75fa206 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -602,6 +602,19 @@ def update_with_age_restricted_info(info, video_info_page):
_extract_formats(info, player_response)
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
+html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base.js)"')  # NOTE(review): the '.' in 'base.js' is unescaped, so it matches any character
+def update_with_missing_base_js(info, html_watch_page):
+ '''Sets info['base_js'] and info['player_name'] from the html watch page.
+ Use when base_js is missing from the json page. Returns False on success, otherwise an error message string.'''
+ match = html_watch_page_base_js_re.search(html_watch_page)
+ if match:
+ info['base_js'] = normalize_url(match.group(1))
+ # player_name must uniquely identify the url, so the path component of the url is used
+ info['player_name'] = urllib.parse.urlparse(info['base_js']).path
+ return False
+ else:
+ return 'Could not find base_js url in watch page html'
+
def requires_decryption(info):
return ('formats' in info) and info['formats'] and info['formats'][0]['s']