3 files changed, 77 insertions, 39 deletions
diff --git a/youtube/watch.py b/youtube/watch.py
index f7b8051..7f3b5be 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -189,20 +189,7 @@ def decrypt_signatures(info, video_id):
     if not yt_data_extract.requires_decryption(info):
         return False
     if not info['player_name']:
-        # base.js urls missing. Usually this is because there is no
-        # embedded player response; instead it's in the json as playerResponse,
-        # but there's no base.js key.
-        # Example: https://www.youtube.com/watch?v=W6iQPK3F16U
-        # See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
-        url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
-        html_watch_page = util.fetch_url(
-            url,
-            headers=watch_headers,
-            report_text='Fetching html watch page to retrieve missing base.js',
-            debug_name='watch_page_html').decode('utf-8')
-        err = yt_data_extract.update_with_missing_base_js(info, html_watch_page)
-        if err:
-            return err
+        return 'Could not find player name'
 
     player_name = info['player_name']
     if player_name in decrypt_cache:
@@ -222,21 +209,15 @@ def decrypt_signatures(info, video_id):
 def extract_info(video_id, use_invidious, playlist_id=None, index=None):
     # bpctr=9999999999 will bypass are-you-sure dialogs for controversial
     # videos
-    url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999'
+    url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
     if playlist_id:
         url += '&list=' + playlist_id
     if index:
         url += '&index=' + index
-    polymer_json = util.fetch_url(url, headers=watch_headers,
-                                  debug_name='watch')
-    polymer_json = polymer_json.decode('utf-8')
-    # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
-    try:
-        polymer_json = json.loads(polymer_json)
-    except json.decoder.JSONDecodeError:
-        traceback.print_exc()
-        return {'error': 'Failed to parse json response'}
-    info = yt_data_extract.extract_watch_info(polymer_json)
+    watch_page = util.fetch_url(url, headers=watch_headers,
+                                debug_name='watch')
+    watch_page = watch_page.decode('utf-8')
+    info = yt_data_extract.extract_watch_info_from_html(watch_page)
 
     # request player urls if it's missing
     # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py
index 697e003..ad7bd03 100644
--- a/youtube/yt_data_extract/__init__.py
+++ b/youtube/yt_data_extract/__init__.py
@@ -10,4 +10,4 @@ from .watch_extraction import (extract_watch_info, get_caption_url,
     update_with_age_restricted_info, requires_decryption,
     extract_decryption_function, decrypt_signatures, _formats,
     update_format_with_type_info, extract_hls_formats,
-    update_with_missing_base_js)
+    extract_watch_info_from_html)
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 75fa206..c304d23 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -569,6 +569,76 @@ def extract_watch_info(polymer_json):
     info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
     return info
 
+# Maps the character following a backslash in a js string to what it denotes
+single_char_codes = {
+    '\\': '\\',
+    '"': '"',
+    "'": "'",
+    'b': '\b',
+    'f': '\f',
+    'n': '\n',
+    'r': '\r',
+    't': '\t',
+    'v': '\x0b',
+    '0': '\x00',
+    '\n': '', # backslash followed by literal newline joins lines
+}
+def js_escape_replace(match):
+    r'''Returns the replacement text for one matched javascript escape (\x.., \u...., \n, ...)'''
+    # The watch page html escapes some js-string characters needlessly; see
+    # https://mathiasbynens.be/notes/javascript-escapes
+    sequence = match.group(1)
+    kind, digits = sequence[0], sequence[1:]
+    if kind == 'x' or kind == 'u':
+        # \xHH and \uHHHH carry the code point as hexadecimal digits
+        return chr(int(digits, 16))
+    # Any other escaped character stands for itself in javascript ("\a" = "a")
+    return single_char_codes.get(sequence, sequence)
+
+# Patterns for pulling the embedded player data out of the watch page html
+PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>var ytInitialPlayerResponse = ({.*?});</script>')
+INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")
+BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base\.js)"')
+JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
+def extract_watch_info_from_html(watch_html):
+    '''Extracts watch info from the html of a watch page.
+
+    Returns the result of extract_watch_info, or a dict with only an 'error'
+    key if ytInitialPlayerResponse is missing or is not valid json.'''
+    base_js_match = BASE_JS_RE.search(watch_html)
+    player_response_match = PLAYER_RESPONSE_RE.search(watch_html)
+    initial_data_match = INITIAL_DATA_RE.search(watch_html)
+
+    base_js_url = base_js_match.group(1) if base_js_match else None
+
+    if player_response_match is None:
+        return {'error': 'Could not find ytInitialPlayerResponse'}
+    try:
+        player_response = json.loads(player_response_match.group(1))
+    except ValueError:   # json.JSONDecodeError subclasses ValueError
+        return {'error': 'Failed to parse json response'}
+
+    initial_data = None   # page data is optional; carry on without it
+    if initial_data_match is None:
+        print('extract_watch_info_from_html: failed to find initialData')
+    else:
+        try:
+            # ytInitialData is a js string literal holding json; unescape it first
+            unescaped = JS_STRING_ESCAPE_RE.sub(js_escape_replace, initial_data_match.group(1))
+            initial_data = json.loads(unescaped)
+        except ValueError:   # malformed \x.. / \u.... escape or invalid json
+            print('extract_watch_info_from_html: failed to parse initialData')
+
+    # imitate old format expected by extract_watch_info
+    fake_polymer_json = {
+        'player': {'args': {}, 'assets': {'js': base_js_url}},
+        'playerResponse': player_response,
+        'response': initial_data,
+    }
+    return extract_watch_info(fake_polymer_json)
+
+
+
 def get_caption_url(info, language, format, automatic=False, translation_language=None):
     '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
     url = info['_captions_base_url']
@@ -602,19 +672,6 @@ def update_with_age_restricted_info(info, video_info_page):
     _extract_formats(info, player_response)
     _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
 
-html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base.js)"')
-def update_with_missing_base_js(info, html_watch_page):
-    '''Extracts base_js url and player_name from html watch page. return err
-    Use when base_js is missing from the json page.'''
-    match = html_watch_page_base_js_re.search(html_watch_page)
-    if match:
-        info['base_js'] = normalize_url(match.group(1))
-        # must uniquely identify url
-        info['player_name'] = urllib.parse.urlparse(info['base_js']).path
-        return False
-    else:
-        return 'Could not find base_js url in watch page html'
-
 def requires_decryption(info):
     return ('formats' in info) and info['formats'] and info['formats'][0]['s']