about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author siddharth ravikumar <s@ricketyspace.net> 2022-06-02 20:51:11 -0400
committer GitHub <noreply@github.com> 2022-06-02 17:51:11 -0700
commit e50c3500b43d80e4492569c4b4523c4379c6fbb2 (patch)
tree f60c19bce5de56a449e41516bedd1501fea1ddb2
parent 09d02ea4294fd5b284a18a904b8b08f3c9ec1fd9 (diff)
download hypervideo-pre-e50c3500b43d80e4492569c4b4523c4379c6fbb2.tar.lz
hypervideo-pre-e50c3500b43d80e4492569c4b4523c4379c6fbb2.tar.xz
hypervideo-pre-e50c3500b43d80e4492569c4b4523c4379c6fbb2.zip
[extractor/npr] Use stream url from json-ld (#3455)
Closes #1934
Authored by: r5d
-rw-r--r-- yt_dlp/extractor/common.py 4
-rw-r--r-- yt_dlp/extractor/npr.py 17
2 files changed, 20 insertions, 1 deletion
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d88d5e6f9..71e982f02 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1461,7 +1461,7 @@ class InfoExtractor:
assert e['@type'] == 'VideoObject'
author = e.get('author')
info.update({
- 'url': url_or_none(e.get('contentUrl')),
+ 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
'thumbnails': [{'url': url}
@@ -1529,6 +1529,8 @@ class InfoExtractor:
})
if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
extract_video_object(e['video'][0])
+ elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject':
+ extract_video_object(e['subjectOf'][0])
elif item_type == 'VideoObject':
extract_video_object(e)
if expected_type is None:
diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py
index 6d93f154c..e4ff8d6c2 100644
--- a/yt_dlp/extractor/npr.py
+++ b/yt_dlp/extractor/npr.py
@@ -51,6 +51,15 @@ class NprIE(InfoExtractor):
# multimedia, no formats, stream
'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert',
'only_matching': True,
+ }, {
+ 'url': 'https://www.npr.org/2022/03/15/1084896560/bonobo-tiny-desk-home-concert',
+ 'info_dict': {
+ 'id': '1086468851',
+ 'ext': 'mp4',
+ 'title': 'Bonobo: Tiny Desk (Home) Concert',
+ 'duration': 1061,
+ 'thumbnail': r're:^https?://media.npr.org/assets/img/.*\.jpg$',
+ },
}]
def _real_extract(self, url):
@@ -65,6 +74,10 @@ class NprIE(InfoExtractor):
})['list']['story'][0]
playlist_title = story.get('title', {}).get('$text')
+ # Fetch the JSON-LD from the npr page.
+ json_ld = self._search_json_ld(
+ self._download_webpage(url, playlist_id), playlist_id, 'NewsArticle', fatal=False)
+
KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3')
quality = qualities(KNOWN_FORMATS)
@@ -110,6 +123,10 @@ class NprIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
stream_url, stream_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
+
+ if not formats and json_ld.get('url'):
+ formats.extend(self._extract_m3u8_formats(json_ld['url'], media_id, 'mp4', m3u8_id='hls', fatal=False))
+
self._sort_formats(formats)
entries.append({