diff options
Diffstat (limited to 'youtube')
-rw-r--r-- | youtube/watch.py | 31 | ||||
-rw-r--r-- | youtube/yt_data_extract/__init__.py | 2 | ||||
-rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 83 |
3 files changed, 77 insertions, 39 deletions
diff --git a/youtube/watch.py b/youtube/watch.py index f7b8051..7f3b5be 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -189,20 +189,7 @@ def decrypt_signatures(info, video_id): if not yt_data_extract.requires_decryption(info): return False if not info['player_name']: - # base.js urls missing. Usually this is because there is no - # embedded player response; instead it's in the json as playerResponse, - # but there's no base.js key. - # Example: https://www.youtube.com/watch?v=W6iQPK3F16U - # See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160 - url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999' - html_watch_page = util.fetch_url( - url, - headers=watch_headers, - report_text='Fetching html watch page to retrieve missing base.js', - debug_name='watch_page_html').decode('utf-8') - err = yt_data_extract.update_with_missing_base_js(info, html_watch_page) - if err: - return err + return 'Could not find player name' player_name = info['player_name'] if player_name in decrypt_cache: @@ -222,21 +209,15 @@ def decrypt_signatures(info, video_id): def extract_info(video_id, use_invidious, playlist_id=None, index=None): # bpctr=9999999999 will bypass are-you-sure dialogs for controversial # videos - url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999' + url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999' if playlist_id: url += '&list=' + playlist_id if index: url += '&index=' + index - polymer_json = util.fetch_url(url, headers=watch_headers, - debug_name='watch') - polymer_json = polymer_json.decode('utf-8') - # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info - try: - polymer_json = json.loads(polymer_json) - except json.decoder.JSONDecodeError: - traceback.print_exc() - return {'error': 'Failed to parse json response'} - info = yt_data_extract.extract_watch_info(polymer_json) + watch_page = util.fetch_url(url, headers=watch_headers, + debug_name='watch') + watch_page = watch_page.decode('utf-8') + info = yt_data_extract.extract_watch_info_from_html(watch_page) # request player urls if it's missing # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160 diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py index 697e003..ad7bd03 100644 --- a/youtube/yt_data_extract/__init__.py +++ b/youtube/yt_data_extract/__init__.py @@ -10,4 +10,4 @@ from .watch_extraction import (extract_watch_info, get_caption_url, update_with_age_restricted_info, requires_decryption, extract_decryption_function, decrypt_signatures, _formats, update_format_with_type_info, extract_hls_formats, - update_with_missing_base_js) + extract_watch_info_from_html) diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 75fa206..c304d23 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -569,6 +569,76 @@ def extract_watch_info(polymer_json): info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None return info +single_char_codes = { + 'n': '\n', + '\\': '\\', + '"': '"', + "'": "'", + 'b': '\b', + 'f': '\f', + 'n': '\n', + 'r': '\r', + 't': '\t', + 'v': '\x0b', + '0': '\x00', + '\n': '', # backslash followed by literal newline joins lines +} +def js_escape_replace(match): + r'''Resolves javascript string escape sequences such as \x..''' + # some js-strings in the watch page html include them for no reason + # https://mathiasbynens.be/notes/javascript-escapes + escaped_sequence = match.group(1) + if escaped_sequence[0] in ('x', 'u'): + return chr(int(escaped_sequence[1:], base=16)) + + # In javascript, if it's not one of those escape codes, it's just the + # literal character. e.g., "\a" = "a" + return single_char_codes.get(escaped_sequence, escaped_sequence) + +PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>var ytInitialPlayerResponse = ({.*?});</script>') +INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';") +BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"') +JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)') +def extract_watch_info_from_html(watch_html): + base_js_match = BASE_JS_RE.search(watch_html) + player_response_match = PLAYER_RESPONSE_RE.search(watch_html) + initial_data_match = INITIAL_DATA_RE.search(watch_html) + + if base_js_match is not None: + base_js_url = base_js_match.group(1) + else: + base_js_url = None + + if player_response_match is not None: + player_response = json.loads(player_response_match.group(1)) + else: + return {'error': 'Could not find ytInitialPlayerResponse'} + player_response = None + + if initial_data_match is not None: + initial_data = initial_data_match.group(1) + initial_data = JS_STRING_ESCAPE_RE.sub(js_escape_replace, initial_data) + initial_data = json.loads(initial_data) + else: + print('extract_watch_info_from_html: failed to find initialData') + initial_data = None + + # imitate old format expected by extract_watch_info + fake_polymer_json = { + 'player': { + 'args': {}, + 'assets': { + 'js': base_js_url + } + }, + 'playerResponse': player_response, + 'response': initial_data, + } + + return extract_watch_info(fake_polymer_json) + + + def get_caption_url(info, language, format, automatic=False, translation_language=None): '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' url = info['_captions_base_url'] @@ -602,19 +672,6 @@ def update_with_age_restricted_info(info, video_info_page): _extract_formats(info, player_response) _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) -html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base.js)"') -def update_with_missing_base_js(info, html_watch_page): - '''Extracts base_js url and player_name from html watch page. return err - Use when base_js is missing from the json page.''' - match = html_watch_page_base_js_re.search(html_watch_page) - if match: - info['base_js'] = normalize_url(match.group(1)) - # must uniquely identify url - info['player_name'] = urllib.parse.urlparse(info['base_js']).path - return False - else: - return 'Could not find base_js url in watch page html' - def requires_decryption(info): return ('formats' in info) and info['formats'] and info['formats'][0]['s'] |