diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-06-03 21:02:31 +0530 |
---|---|---|
committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-06-06 19:46:45 +0530 |
commit | b7c47b743871cdf3e0de75b17e4454d987384bf9 (patch) | |
tree | 928081bf818ee21df40d465c8ffce0bc9ae0ddc2 /yt_dlp/extractor/common.py | |
parent | 00bbc5f17710367adc7508062e155547b35edd20 (diff) | |
download | hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.tar.lz hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.tar.xz hypervideo-pre-b7c47b743871cdf3e0de75b17e4454d987384bf9.zip |
[extractor] Add `_search_json`
All fetching of JSON objects should eventually be done with this function,
but only `youtube` is being refactored for now.
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r-- | yt_dlp/extractor/common.py | 24 |
1 file changed, 12 insertions, 12 deletions
```diff
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index c0b1fa9e0..316b58ce3 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -35,6 +35,7 @@ from ..utils import (
     ExtractorError,
     GeoRestrictedError,
     GeoUtils,
+    LenientJSONDecoder,
     RegexNotFoundError,
     UnsupportedError,
     age_restricted,
@@ -930,19 +931,10 @@ class InfoExtractor:
             else:
                 self.report_warning(errmsg + str(ve))
 
-    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
-        if transform_source:
-            json_string = transform_source(json_string)
+    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
         try:
-            try:
-                return json.loads(json_string, strict=False)
-            except json.JSONDecodeError as e:
-                if not lenient:
-                    raise
-                try:
-                    return json.loads(json_string[:e.pos], strict=False)
-                except ValueError:
-                    raise e
+            return json.loads(
+                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
         except ValueError as ve:
             errmsg = f'{video_id}: Failed to parse JSON'
             if fatal:
@@ -1196,6 +1188,14 @@ class InfoExtractor:
             self.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
+    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', fatal=True, **kwargs):
+        """Searches string for the JSON object specified by start_pattern"""
+        # NB: end_pattern is only used to reduce the size of the initial match
+        return self._parse_json(
+            self._search_regex(rf'{start_pattern}\s*(?P<json>{{.+}})\s*{end_pattern}',
+                               string, name, group='json', fatal=fatal) or '{}',
+            video_id, fatal=fatal, ignore_extra=True, **kwargs) or {}
+
     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
```