From 5edcaa4f9dcfd005f3ed633ddcb5b432e954877e Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 16 Dec 2020 19:23:39 -0800 Subject: Improve ytInitialPlayerResponse extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes it work if there are additional JavaScript statements after the playerResponse variable Signed-off-by: Jesús --- youtube/yt_data_extract/watch_extraction.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index c304d23..2304cce 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -595,7 +595,13 @@ def js_escape_replace(match): # literal character. e.g., "\a" = "a" return single_char_codes.get(escaped_sequence, escaped_sequence) -PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>var ytInitialPlayerResponse = ({.*?});</script>') +# works but complicated and unsafe: +#PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({(?:"(?:[^"\\]|\\.)*?"|[^"])+?});</script>') + +# Because there are sometimes additional statements after the json object +# so we just capture all of those until end of script and tell json decoder +# to ignore extra stuff after the json object +PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({.*?)</script>') INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';") BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"') JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)') @@ -610,7 +616,9 @@ def extract_watch_info_from_html(watch_html): base_js_url = None if player_response_match is not None: - player_response = json.loads(player_response_match.group(1)) + decoder = json.JSONDecoder() + # this will make it ignore extra stuff after end of object + player_response = decoder.raw_decode(player_response_match.group(1))[0] else: return {'error': 'Could not find 
ytInitialPlayerResponse'} player_response = None -- cgit v1.2.3