diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- | youtube_dl/extractor/common.py | 47 |
1 files changed, 11 insertions, 36 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a61753b17..eaae5e484 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,7 +15,7 @@ import time import math from ..compat import ( - compat_cookiejar_Cookie, + compat_cookiejar, compat_cookies, compat_etree_Element, compat_etree_fromstring, @@ -1182,33 +1182,16 @@ class InfoExtractor(object): 'twitter card player') def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) + json_ld = self._search_regex( + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) + if not json_ld: + return default if default is not NO_DEFAULT else {} # JSON-LD may be malformed and thus `fatal` should be respected. # At the same time `default` may be passed that assumes `fatal=False` # for _search_regex. Let's simulate the same behavior here as well. fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld - if default is not NO_DEFAULT: - return default - elif fatal: - raise RegexNotFoundError('Unable to extract JSON-LD') - else: - self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message()) - return {} + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): @@ -1273,10 +1256,10 @@ class InfoExtractor(object): extract_interaction_statistic(e) for e in json_ld: - if '@context' in e: + if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: - continue + return info if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1310,17 +1293,11 @@ class InfoExtractor(object): }) elif item_type == 'VideoObject': extract_video_object(e) - if expected_type is None: - continue - else: - break + continue video = e.get('video') if isinstance(video, dict) and video.get('@type') == 'VideoObject': extract_video_object(video) - if expected_type is None: - continue - else: - break + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -2363,8 +2340,6 @@ class InfoExtractor(object): if res is False: return [] ism_doc, urlh = res - if ism_doc is None: - return [] return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) @@ -2843,7 +2818,7 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( + cookie = compat_cookiejar.Cookie( 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) |