[extractor] Add `_search_json`

All fetching of JSON objects should eventually be done with this function, but only `youtube` is being refactored for now.
author: pukkandan <pukkandan.ytdlp@gmail.com> 2022-06-03 21:02:31 +0530
committer: pukkandan <pukkandan.ytdlp@gmail.com> 2022-06-06 19:46:45 +0530
commit: b7c47b743871cdf3e0de75b17e4454d987384bf9 (patch)
tree: 928081bf818ee21df40d465c8ffce0bc9ae0ddc2 /yt_dlp/extractor/archiveorg.py
parent: 00bbc5f17710367adc7508062e155547b35edd20 (diff)
download: hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.tar.lz
hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.tar.xz
hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.zip
1 file changed, 9 insertions, 12 deletions
diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index c85d5297d..c1c9b0adf 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -442,9 +442,10 @@ class YoutubeWebArchiveIE(InfoExtractor):
             'only_matching': True
         },
     ]
-    _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
-    _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
-    _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
+    _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
+    _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x)
+        (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*|
+        {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}'''
 
     _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com']  # thumbnails most likely archived on these servers
     _YT_ALL_THUMB_SERVERS = orderedSet(
@@ -474,11 +475,6 @@ class YoutubeWebArchiveIE(InfoExtractor):
         elif not isinstance(res, list) or len(res) != 0:
             self.report_warning('Error while parsing CDX API response' + bug_reports_message())
 
-    def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
-        return self._parse_json(self._search_regex(
-            (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}',
-             regex), webpage, name, default='{}'), video_id, fatal=False)
-
     def _extract_webpage_title(self, webpage):
         page_title = self._html_extract_title(webpage, default='')
         # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
@@ -488,10 +484,11 @@ class YoutubeWebArchiveIE(InfoExtractor):
 
     def _extract_metadata(self, video_id, webpage):
         search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
-        player_response = self._extract_yt_initial_variable(
-            webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
-        initial_data = self._extract_yt_initial_variable(
-            webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {}
+        player_response = self._search_json(
+            self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response',
+            video_id, fatal=False)
+        initial_data = self._search_json(
+            self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, fatal=False)
 
         initial_data_video = traverse_obj(
             initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
author	pukkandan <pukkandan.ytdlp@gmail.com>	2022-06-03 21:02:31 +0530
committer	pukkandan <pukkandan.ytdlp@gmail.com>	2022-06-06 19:46:45 +0530
commit	b7c47b743871cdf3e0de75b17e4454d987384bf9 (patch)
tree	928081bf818ee21df40d465c8ffce0bc9ae0ddc2 /yt_dlp/extractor/archiveorg.py
parent	00bbc5f17710367adc7508062e155547b35edd20 (diff)
download	hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.tar.lz hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.tar.xz hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.zip