about | summary | refs | log | tree | commit | diff | stats
path: root/youtube
diff options
context:
space:
mode:
author James Taylor <user234683@users.noreply.github.com> 2020-12-09 17:08:12 -0800
committer James Taylor <user234683@users.noreply.github.com> 2020-12-09 17:08:12 -0800
commit 6443cedf6299ba5a516df5a00cc78fb0b6800a1e (patch)
tree 42646b65c7ce267bdb12b7b7c382dafb73d8e489 /youtube
parent 1a7ed0a981f653c7eba85e57d9914501ade127fe (diff)
download yt-local-6443cedf6299ba5a516df5a00cc78fb0b6800a1e.tar.lz
yt-local-6443cedf6299ba5a516df5a00cc78fb0b6800a1e.tar.xz
yt-local-6443cedf6299ba5a516df5a00cc78fb0b6800a1e.zip
Retrieve base.js url from html watch page when it's missing
Fixes failure mode 3 in #22
Diffstat (limited to 'youtube')
-rw-r--r-- youtube/watch.py 38
-rw-r--r-- youtube/yt_data_extract/__init__.py 3
-rw-r--r-- youtube/yt_data_extract/watch_extraction.py 13
3 files changed, 40 insertions, 14 deletions
diff --git a/youtube/watch.py b/youtube/watch.py
index 34deb01..f7b8051 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -177,14 +177,32 @@ def save_decrypt_cache():
f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True))
f.close()
-def decrypt_signatures(info):
+watch_headers = (
+ ('Accept', '*/*'),
+ ('Accept-Language', 'en-US,en;q=0.5'),
+ ('X-YouTube-Client-Name', '2'),
+ ('X-YouTube-Client-Version', '2.20180830'),
+) + util.mobile_ua
+
+def decrypt_signatures(info, video_id):
'''return error string, or False if no errors'''
if not yt_data_extract.requires_decryption(info):
return False
if not info['player_name']:
- return 'Could not find player name'
- if not info['base_js']:
- return 'Failed to find base.js'
+ # base.js urls missing. Usually this is because there is no
+ # embedded player response; instead it's in the json as playerResponse,
+ # but there's no base.js key.
+ # Example: https://www.youtube.com/watch?v=W6iQPK3F16U
+ # See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+ url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
+ html_watch_page = util.fetch_url(
+ url,
+ headers=watch_headers,
+ report_text='Fetching html watch page to retrieve missing base.js',
+ debug_name='watch_page_html').decode('utf-8')
+ err = yt_data_extract.update_with_missing_base_js(info, html_watch_page)
+ if err:
+ return err
player_name = info['player_name']
if player_name in decrypt_cache:
@@ -201,13 +219,6 @@ def decrypt_signatures(info):
err = yt_data_extract.decrypt_signatures(info)
return err
-headers = (
- ('Accept', '*/*'),
- ('Accept-Language', 'en-US,en;q=0.5'),
- ('X-YouTube-Client-Name', '2'),
- ('X-YouTube-Client-Version', '2.20180830'),
-) + util.mobile_ua
-
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
# videos
@@ -216,7 +227,8 @@ def extract_info(video_id, use_invidious, playlist_id=None, index=None):
url += '&list=' + playlist_id
if index:
url += '&index=' + index
- polymer_json = util.fetch_url(url, headers=headers, debug_name='watch')
+ polymer_json = util.fetch_url(url, headers=watch_headers,
+ debug_name='watch')
polymer_json = polymer_json.decode('utf-8')
# TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
try:
@@ -242,7 +254,7 @@ def extract_info(video_id, use_invidious, playlist_id=None, index=None):
yt_data_extract.update_with_age_restricted_info(info, video_info_page)
# signature decryption
- decryption_error = decrypt_signatures(info)
+ decryption_error = decrypt_signatures(info, video_id)
if decryption_error:
decryption_error = 'Error decrypting url signatures: ' + decryption_error
info['playability_error'] = decryption_error
diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py
index 8934f74..697e003 100644
--- a/youtube/yt_data_extract/__init__.py
+++ b/youtube/yt_data_extract/__init__.py
@@ -9,4 +9,5 @@ from .everything_else import (extract_channel_info, extract_search_info,
from .watch_extraction import (extract_watch_info, get_caption_url,
update_with_age_restricted_info, requires_decryption,
extract_decryption_function, decrypt_signatures, _formats,
- update_format_with_type_info, extract_hls_formats)
+ update_format_with_type_info, extract_hls_formats,
+ update_with_missing_base_js)
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index aa7efa9..75fa206 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -602,6 +602,19 @@ def update_with_age_restricted_info(info, video_info_page):
_extract_formats(info, player_response)
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
+html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base.js)"')
+def update_with_missing_base_js(info, html_watch_page):
+ '''Extracts base_js url and player_name from html watch page. return err
+ Use when base_js is missing from the json page.'''
+ match = html_watch_page_base_js_re.search(html_watch_page)
+ if match:
+ info['base_js'] = normalize_url(match.group(1))
+ # must uniquely identify url
+ info['player_name'] = urllib.parse.urlparse(info['base_js']).path
+ return False
+ else:
+ return 'Could not find base_js url in watch page html'
+
def requires_decryption(info):
return ('formats' in info) and info['formats'] and info['formats'][0]['s']