aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--yt_dlp/extractor/common.py42
-rw-r--r--yt_dlp/extractor/npr.py16
2 files changed, 25 insertions, 33 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 90af41575..f4c34f43c 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1392,27 +1392,25 @@ class InfoExtractor:
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
- json_ld_list = list(re.finditer(JSON_LD_RE, html))
- default = kwargs.get('default', NO_DEFAULT)
- # JSON-LD may be malformed and thus `fatal` should be respected.
- # At the same time `default` may be passed that assumes `fatal=False`
- # for _search_regex. Let's simulate the same behavior here as well.
- fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
- json_ld = []
- for mobj in json_ld_list:
- json_ld_item = self._parse_json(
- mobj.group('json_ld'), video_id, fatal=fatal)
- if not json_ld_item:
- continue
- if isinstance(json_ld_item, dict):
- json_ld.append(json_ld_item)
- elif isinstance(json_ld_item, (list, tuple)):
- json_ld.extend(json_ld_item)
- if json_ld:
- json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
- if json_ld:
- return json_ld
+ def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
+ """Yield all json ld objects in the html"""
+ if default is not NO_DEFAULT:
+ fatal = False
+ for mobj in re.finditer(JSON_LD_RE, html):
+ json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
+ for json_ld in variadic(json_ld_item):
+ if isinstance(json_ld, dict):
+ yield json_ld
+
+ def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
+ """Search for a video in any json ld in the html"""
+ if default is not NO_DEFAULT:
+ fatal = False
+ info = self._json_ld(
+ list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
+ video_id, fatal=fatal, expected_type=expected_type)
+ if info:
+ return info
if default is not NO_DEFAULT:
return default
elif fatal:
@@ -1500,7 +1498,7 @@ class InfoExtractor:
assert is_type(e, 'VideoObject')
author = e.get('author')
info.update({
- 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
+ 'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
'thumbnails': [{'url': url}
diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py
index e4ff8d6c2..e677e862d 100644
--- a/yt_dlp/extractor/npr.py
+++ b/yt_dlp/extractor/npr.py
@@ -1,9 +1,5 @@
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- qualities,
- url_or_none,
-)
+from ..utils import int_or_none, qualities, traverse_obj, url_or_none
class NprIE(InfoExtractor):
@@ -74,10 +70,6 @@ class NprIE(InfoExtractor):
})['list']['story'][0]
playlist_title = story.get('title', {}).get('$text')
- # Fetch the JSON-LD from the npr page.
- json_ld = self._search_json_ld(
- self._download_webpage(url, playlist_id), playlist_id, 'NewsArticle', fatal=False)
-
KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3')
quality = qualities(KNOWN_FORMATS)
@@ -124,8 +116,10 @@ class NprIE(InfoExtractor):
stream_url, stream_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
- if not formats and json_ld.get('url'):
- formats.extend(self._extract_m3u8_formats(json_ld['url'], media_id, 'mp4', m3u8_id='hls', fatal=False))
+ if not formats:
+ raw_json_ld = self._yield_json_ld(self._download_webpage(url, playlist_id), playlist_id, fatal=False)
+ m3u8_url = traverse_obj(list(raw_json_ld), (..., 'subjectOf', ..., 'embedUrl'), get_all=False)
+ formats = self._extract_m3u8_formats(m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)
self._sort_formats(formats)