Improve ytInitialPlayerResponse extraction

Makes it work if there are additional JavaScript statements after the playerResponse variable.

Signed-off-by: Jesús <heckyel@hyperbola.info>
author: James Taylor <user234683@users.noreply.github.com> 2020-12-16 19:23:39 -0800
committer: Jesús <heckyel@hyperbola.info> 2020-12-17 11:00:04 -0500
commit: 5edcaa4f9dcfd005f3ed633ddcb5b432e954877e (patch)
tree: 6173dc1615a799c0f872b6b951de74942ae495d7 /youtube
parent: 8443063fc48ef5d4a9d517c9999e17477b7a7123 (diff)
download: yt-local-5edcaa4f9dcfd005f3ed633ddcb5b432e954877e.tar.lz
yt-local-5edcaa4f9dcfd005f3ed633ddcb5b432e954877e.tar.xz
yt-local-5edcaa4f9dcfd005f3ed633ddcb5b432e954877e.zip
1 files changed, 10 insertions, 2 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index c304d23..2304cce 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -595,7 +595,13 @@ def js_escape_replace(match):
     # literal character. e.g., "\a" = "a"
     return single_char_codes.get(escaped_sequence, escaped_sequence)
 
-PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>var ytInitialPlayerResponse = ({.*?});</script>')
+# works but complicated and unsafe:
+#PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({(?:"(?:[^"\\]|\\.)*?"|[^"])+?});')
+
+# Sometimes there are additional statements after the JSON object, so we
+# capture everything up to the end of the script and tell the JSON decoder
+# to ignore the extra content after the JSON object.
+PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({.*?)</script>')
 INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")
 BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"')
 JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
@@ -610,7 +616,9 @@ def extract_watch_info_from_html(watch_html):
         base_js_url = None
 
     if player_response_match is not None:
-        player_response = json.loads(player_response_match.group(1))
+        decoder = json.JSONDecoder()
+        # this will make it ignore extra stuff after end of object
+        player_response = decoder.raw_decode(player_response_match.group(1))[0]
     else:
         return {'error': 'Could not find ytInitialPlayerResponse'}
         player_response = None
author	James Taylor <user234683@users.noreply.github.com>	2020-12-16 19:23:39 -0800
committer	Jesús <heckyel@hyperbola.info>	2020-12-17 11:00:04 -0500
commit	5edcaa4f9dcfd005f3ed633ddcb5b432e954877e (patch)
tree	6173dc1615a799c0f872b6b951de74942ae495d7 /youtube
parent	8443063fc48ef5d4a9d517c9999e17477b7a7123 (diff)
download	yt-local-5edcaa4f9dcfd005f3ed633ddcb5b432e954877e.tar.lz yt-local-5edcaa4f9dcfd005f3ed633ddcb5b432e954877e.tar.xz yt-local-5edcaa4f9dcfd005f3ed633ddcb5b432e954877e.zip