aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- yt_dlp/extractor/archiveorg.py | 21
-rw-r--r-- yt_dlp/extractor/common.py | 24
-rw-r--r-- yt_dlp/extractor/youtube.py | 23
-rw-r--r-- yt_dlp/utils.py | 13
4 files changed, 42 insertions, 39 deletions
diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index c85d5297d..c1c9b0adf 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -442,9 +442,10 @@ class YoutubeWebArchiveIE(InfoExtractor):
'only_matching': True
},
]
- _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
- _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
- _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
+ # Prefix pattern reused unchanged from the base YouTube extractor.
+ _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
+ # Both alternatives are enclosed in a scoped verbose group `(?x:...)`.
+ # `_search_json` appends its JSON-object group directly after this pattern, so a
+ # bare top-level `|` would attach that group to the last alternative only, and a
+ # match on the first alternative would carry no JSON at all.
+ _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x:
+     (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*|
+     {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}
+ )'''
_YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers
_YT_ALL_THUMB_SERVERS = orderedSet(
@@ -474,11 +475,6 @@ class YoutubeWebArchiveIE(InfoExtractor):
elif not isinstance(res, list) or len(res) != 0:
self.report_warning('Error while parsing CDX API response' + bug_reports_message())
- def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
- return self._parse_json(self._search_regex(
- (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}',
- regex), webpage, name, default='{}'), video_id, fatal=False)
-
def _extract_webpage_title(self, webpage):
page_title = self._html_extract_title(webpage, default='')
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
@@ -488,10 +484,11 @@ class YoutubeWebArchiveIE(InfoExtractor):
def _extract_metadata(self, video_id, webpage):
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
- player_response = self._extract_yt_initial_variable(
- webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
- initial_data = self._extract_yt_initial_variable(
- webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {}
+ player_response = self._search_json(
+ self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response',
+ video_id, fatal=False)
+ initial_data = self._search_json(
+ self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, fatal=False)
initial_data_video = traverse_obj(
initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index c0b1fa9e0..316b58ce3 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -35,6 +35,7 @@ from ..utils import (
ExtractorError,
GeoRestrictedError,
GeoUtils,
+ LenientJSONDecoder,
RegexNotFoundError,
UnsupportedError,
age_restricted,
@@ -930,19 +931,10 @@ class InfoExtractor:
else:
self.report_warning(errmsg + str(ve))
- def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
- if transform_source:
- json_string = transform_source(json_string)
+ def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
try:
- try:
- return json.loads(json_string, strict=False)
- except json.JSONDecodeError as e:
- if not lenient:
- raise
- try:
- return json.loads(json_string[:e.pos], strict=False)
- except ValueError:
- raise e
+ return json.loads(
+ json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
except ValueError as ve:
errmsg = f'{video_id}: Failed to parse JSON'
if fatal:
@@ -1196,6 +1188,14 @@ class InfoExtractor:
self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
+ def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', fatal=True, **kwargs):
+     """
+     Search string for the JSON object that follows start_pattern and parse it.
+
+     @param start_pattern  Regex matching the text immediately before the JSON object.
+                           It is interpolated as-is (it may begin with global inline
+                           flags such as `(?x)`), so a pattern containing a top-level
+                           alternation must group it itself
+     @param string         Text to search
+     @param name           Human-readable name of the value, passed to _search_regex
+     @param video_id       Identifier passed to _parse_json
+     @param end_pattern    Optional regex expected after the JSON object
+     @param fatal          Passed to both _search_regex and _parse_json
+     @param kwargs         Extra keyword arguments passed to _parse_json
+     @returns              The parsed value, or {} when the search or the parse
+                           produced nothing
+     """
+     # NB: end_pattern is only used to reduce the size of the initial match
+     # It is wrapped in a non-capturing group so that an alternation inside it
+     # cannot turn everything before it into a single alternation branch
+     json_string = self._search_regex(
+         rf'{start_pattern}\s*(?P<json>{{.+}})\s*(?:{end_pattern})',
+         string, name, group='json', fatal=fatal)
+     # ignore_extra=True: the greedy `{.+}` may capture text past the end of the object
+     return self._parse_json(
+         json_string or '{}', video_id, fatal=fatal, ignore_extra=True, **kwargs) or {}
+
+
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 8b2332dc1..c8541c664 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -397,9 +397,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if self._LOGIN_REQUIRED and not self._cookies_passed:
self.raise_login_required('Login details are needed to download this content', method='cookies')
- _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+})\s*;'
- _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+})\s*;'
- _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
+ # Prefix patterns only: each matches up to and including the `=` of the assignment.
+ # The JSON object that follows is located and parsed by `_search_json`.
+ _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*='
+ _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*='
def _get_default_ytcfg(self, client='web'):
return copy.deepcopy(INNERTUBE_CLIENTS[client])
@@ -476,12 +475,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
data=json.dumps(data).encode('utf8'), headers=real_headers,
query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'})
- def extract_yt_initial_data(self, item_id, webpage, fatal=True):
- data = self._search_regex(
- (fr'{self._YT_INITIAL_DATA_RE}\s*{self._YT_INITIAL_BOUNDARY_RE}',
- self._YT_INITIAL_DATA_RE), webpage, 'yt initial data', fatal=fatal)
- if data:
- return self._parse_json(data, item_id, fatal=fatal)
+ def extract_yt_initial_data(self, item_id, webpage, fatal=True):
+     """
+     Extract and parse the `ytInitialData` object embedded in webpage.
+
+     The `fatal` parameter of the previous signature is kept so that callers
+     passing it keep working; it is forwarded to `_search_json`.
+     NOTE: `_search_json` falls back to {} where the previous implementation
+     returned None when nothing was matched; both are falsy.
+     """
+     return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal)
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
return self._parse_json(self._search_regex(
@@ -3052,9 +3047,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
initial_pr = None
if webpage:
- initial_pr = self._extract_yt_initial_variable(
- webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
- video_id, 'initial player response')
+ initial_pr = self._search_json(
+ self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
all_clients = set(clients)
clients = clients[::-1]
@@ -3678,9 +3672,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
initial_data = None
if webpage:
- initial_data = self._extract_yt_initial_variable(
- webpage, self._YT_INITIAL_DATA_RE, video_id,
- 'yt initial data')
+ initial_data = self._search_json(
+ self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', video_id, fatal=False)
if not initial_data:
query = {'videoId': video_id}
query.update(self._get_checkok_params())
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 00721eb46..777b8b3ea 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -594,6 +594,19 @@ def clean_html(html):
return html.strip()
+class LenientJSONDecoder(json.JSONDecoder):
+    """
+    JSON decoder with two optional relaxations.
+
+    transform_source: callable applied to the input string before decoding
+    ignore_extra:     when true, decode only the first JSON value and discard
+                      whatever text follows it
+    """
+
+    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.transform_source = transform_source
+        self.ignore_extra = ignore_extra
+
+    def decode(self, s):
+        if self.transform_source:
+            s = self.transform_source(s)
+        if not self.ignore_extra:
+            return super().decode(s)
+        # raw_decode() does not skip leading whitespace itself, hence the lstrip()
+        value, _end = self.raw_decode(s.lstrip())
+        return value
+
+
def sanitize_open(filename, open_mode):
"""Try to open the given filename, and slightly tweak it if this fails.