From 6443cedf6299ba5a516df5a00cc78fb0b6800a1e Mon Sep 17 00:00:00 2001
From: James Taylor <user234683@users.noreply.github.com>
Date: Wed, 9 Dec 2020 17:08:12 -0800
Subject: Retrieve base.js url from html watch page when it's missing

Fixes failure mode 3 in #22
---
 youtube/watch.py                            | 38 +++++++++++++++++++----------
 youtube/yt_data_extract/__init__.py         |  3 ++-
 youtube/yt_data_extract/watch_extraction.py | 13 ++++++++++
 3 files changed, 40 insertions(+), 14 deletions(-)

(limited to 'youtube')

diff --git a/youtube/watch.py b/youtube/watch.py
index 34deb01..f7b8051 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -177,14 +177,32 @@ def save_decrypt_cache():
     f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True))
     f.close()
 
-def decrypt_signatures(info):
+watch_headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '2'),
+    ('X-YouTube-Client-Version', '2.20180830'),
+) + util.mobile_ua
+
+def decrypt_signatures(info, video_id):
     '''return error string, or False if no errors'''
     if not yt_data_extract.requires_decryption(info):
         return False
     if not info['player_name']:
-        return 'Could not find player name'
-    if not info['base_js']:
-        return 'Failed to find base.js'
+        # base.js url missing. Usually this is because there is no embedded
+        # player response; the player response is instead in the json as
+        # playerResponse, which has no base.js key.
+        # Example: https://www.youtube.com/watch?v=W6iQPK3F16U
+        # See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+        url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
+        html_watch_page = util.fetch_url(
+            url,
+            headers=watch_headers,
+            report_text='Fetching html watch page to retrieve missing base.js',
+            debug_name='watch_page_html').decode('utf-8')
+        err = yt_data_extract.update_with_missing_base_js(info, html_watch_page)
+        if err:
+            return err
 
     player_name = info['player_name']
     if player_name in decrypt_cache:
@@ -201,13 +219,6 @@ def decrypt_signatures(info):
     err = yt_data_extract.decrypt_signatures(info)
     return err
 
-headers = (
-    ('Accept', '*/*'),
-    ('Accept-Language', 'en-US,en;q=0.5'),
-    ('X-YouTube-Client-Name', '2'),
-    ('X-YouTube-Client-Version', '2.20180830'),
-) + util.mobile_ua
-
 def extract_info(video_id, use_invidious, playlist_id=None, index=None):
     # bpctr=9999999999 will bypass are-you-sure dialogs for controversial
     # videos
@@ -216,7 +227,8 @@ def extract_info(video_id, use_invidious, playlist_id=None, index=None):
         url += '&list=' + playlist_id
     if index:
         url += '&index=' + index
-    polymer_json = util.fetch_url(url, headers=headers, debug_name='watch')
+    polymer_json = util.fetch_url(url, headers=watch_headers,
+                                  debug_name='watch')
     polymer_json = polymer_json.decode('utf-8')
     # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
     try:
@@ -242,7 +254,7 @@ def extract_info(video_id, use_invidious, playlist_id=None, index=None):
         yt_data_extract.update_with_age_restricted_info(info, video_info_page)
 
     # signature decryption
-    decryption_error = decrypt_signatures(info)
+    decryption_error = decrypt_signatures(info, video_id)
     if decryption_error:
         decryption_error = 'Error decrypting url signatures: ' + decryption_error
         info['playability_error'] = decryption_error
diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py
index 8934f74..697e003 100644
--- a/youtube/yt_data_extract/__init__.py
+++ b/youtube/yt_data_extract/__init__.py
@@ -9,4 +9,5 @@ from .everything_else import (extract_channel_info, extract_search_info,
 from .watch_extraction import (extract_watch_info, get_caption_url,
     update_with_age_restricted_info, requires_decryption,
     extract_decryption_function, decrypt_signatures, _formats,
-    update_format_with_type_info, extract_hls_formats)
+    update_format_with_type_info, extract_hls_formats,
+    update_with_missing_base_js)
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index aa7efa9..75fa206 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -602,6 +602,19 @@ def update_with_age_restricted_info(info, video_info_page):
     _extract_formats(info, player_response)
     _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
 
+html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base\.js)"')
+def update_with_missing_base_js(info, html_watch_page):
+    '''Sets info['base_js'] and info['player_name'] from the html watch page.
+    Use when base_js is missing from the json page.
+    Returns an error string, or False if the base.js url was found.'''
+    match = html_watch_page_base_js_re.search(html_watch_page)
+    if not match:
+        return 'Could not find base_js url in watch page html'
+    info['base_js'] = normalize_url(match.group(1))
+    # player_name is the url path; it must uniquely identify the url
+    info['player_name'] = urllib.parse.urlparse(info['base_js']).path
+    return False
+
 def requires_decryption(info):
     return ('formats' in info) and info['formats'] and info['formats'][0]['s']
 
-- 
cgit v1.2.3