diff options
author | bashonly <88596187+bashonly@users.noreply.github.com> | 2023-01-14 10:40:42 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-01-14 16:40:42 +0000 |
commit | cb73b8460c3ce6d37ab651a4e44bb23b10056154 (patch) | |
tree | 5de56f4ec455c4becb96da6f9c4aeb6a51ef3fa3 | |
parent | 7481998b169b2a52049fc33bff82034d6563ead4 (diff) | |
download | hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.tar.lz hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.tar.xz hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.zip |
[extractor/nbc] Fix `NBC` and `NBCStations` extractors (#6033)
Improve `InfoExtractor._parse_smil_formats` extension detection
Closes #6019
Authored by: bashonly
-rw-r--r-- | yt_dlp/extractor/common.py | 5 | ||||
-rw-r--r-- | yt_dlp/extractor/nbc.py | 249 |
2 files changed, 151 insertions, 103 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ef9759974..e37595ffd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -32,6 +32,7 @@ from ..utils import ( FormatSorter, GeoRestrictedError, GeoUtils, + HEADRequest, LenientJSONDecoder, RegexNotFoundError, RetryManager, @@ -80,6 +81,7 @@ from ..utils import ( update_Request, update_url_query, url_basename, + urlhandle_detect_ext, url_or_none, urljoin, variadic, @@ -2311,7 +2313,8 @@ class InfoExtractor: height = int_or_none(medium.get('height')) proto = medium.get('proto') ext = medium.get('ext') - src_ext = determine_ext(src) + src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext( + self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False)) streamer = medium.get('streamer') or base if proto == 'rtmp' or streamer.startswith('rtmp'): diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 00c592cc3..82d759f75 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -8,24 +8,26 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, + HEADRequest, + RegexNotFoundError, + UserNotLive, + clean_html, int_or_none, parse_age_limit, parse_duration, - RegexNotFoundError, smuggle_url, - str_or_none, traverse_obj, try_get, - unified_strdate, + unescapeHTML, unified_timestamp, update_url_query, url_basename, - variadic, + xpath_attr, ) class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))' _TESTS = [ { @@ -38,10 +40,18 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'timestamp': 1424246400, 'upload_date': '20150218', 'uploader': 'NBCU-COM', + 'episode': 'Jimmy 
Fallon Surprises Fans at Ben & Jerry\'s', + 'episode_number': 86, + 'season': 'Season 2', + 'season_number': 2, + 'series': 'Tonight Show: Jimmy Fallon', + 'duration': 237.0, + 'chapters': 'count:1', + 'tags': 'count:4', + 'thumbnail': r're:https?://.+\.jpg', }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { @@ -55,11 +65,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20141206', 'uploader': 'NBCU-COM', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Only works from US', + 'skip': 'page not found', }, { # HLS streams requires the 'hdnea3' cookie @@ -73,10 +79,59 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20090315', 'uploader': 'NBCU-COM', }, + 'skip': 'page not found', + }, + { + # manifest url does not have extension + 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', + 'info_dict': { + 'id': '3646439', + 'ext': 'mp4', + 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode_number': 1, + 'season': 'Season 75', + 'season_number': 75, + 'series': 'The Golden Globe Awards', + 'description': 'Oprah Winfrey receives the Cecil B. 
de Mille Award at the 75th Annual Golden Globe Awards.', + 'uploader': 'NBCU-COM', + 'upload_date': '20180107', + 'timestamp': 1515312000, + 'duration': 570.0, + 'tags': 'count:8', + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:1', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + # new video_id format + 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'info_dict': { + 'id': 'NBCE125189978', + 'ext': 'mp4', + 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', + 'uploader': 'NBCU-COM', + 'series': 'Quantum Leap', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'episode_number': 1, + 'duration': 170.171, + 'chapters': [], + 'timestamp': 1663956155, + 'upload_date': '20220923', + 'tags': 'count:10', + 'age_limit': 0, + 'thumbnail': r're:https?://.+\.jpg', + }, + 'expected_warnings': ['Ignoring subtitle tracks'], 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, - 'skip': 'Only works from US', }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', @@ -600,32 +655,36 @@ class NBCStationsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', - 'md5': '462041d91bd762ef5a38b7d85d6dc18f', 'info_dict': { 'id': '2968618', 'ext': 'mp4', 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', - 'description': None, + 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', 'timestamp': 1661135892, - 'upload_date': '20220821', + 'upload_date': '20220822', 'uploader': 'NBC 4', - 'uploader_id': 'KNBC', + 'channel_id': 'KNBC', 'channel': 'nbclosangeles', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', 
- 'md5': '0917dcf7885be1023a9220630d415f67', 'info_dict': { 'id': '2247002', 'ext': 'mp4', - 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', 'timestamp': 1660886507, 'upload_date': '20220819', 'uploader': 'Telemundo Arizona', - 'uploader_id': 'KTAZ', + 'channel_id': 'KTAZ', 'channel': 'telemundoarizona', }, + 'params': { + 'skip_download': 'm3u8', + }, }] _RESOLUTIONS = { @@ -644,48 +703,39 @@ class NBCStationsIE(InfoExtractor): r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id) pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) - fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') - video_data = self._parse_json(self._html_search_regex( - r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id) - video_data = variadic(video_data)[0] - video_data.update(self._parse_json(self._html_search_regex( - r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id)) + video_data = self._search_json( + r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML) + video_data.update(self._search_json( + r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML)) + if not video_data: + raise ExtractorError('No video metadata found in webpage', expected=True) - formats = [] + info, formats, subtitles = {}, [], {} + is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1 + query = { + 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3', + 'format': 'SMIL', + 'fwsitesection': fw_ssid, + 'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'), + 'pprofile': 'ots_desktop_html', + 'sensitive': 'false', + 'w': '1920', + 'h': '1080', + 'mode': 'LIVE' if is_live else 'on-demand', + 'vpaid': 'script', + 'schema': 
'2.0', + 'sdk': 'PDK 6.1.3', + } - if video_data.get('mpx_is_livestream') == '1': - live = True - player_id = traverse_obj( - video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid', - ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium') - query = { - 'mbr': 'true', - 'assetTypes': 'LegacyRelease', - 'fwsitesection': fw_ssid, - 'fwNetworkID': fw_network_id, - 'pprofile': 'ots_desktop_html', - 'sensitive': 'false', - 'w': '1920', - 'h': '1080', - 'rnd': '1660303', - 'mode': 'LIVE', - 'format': 'SMIL', - 'tracking': 'true', - 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', - 'vpaid': 'script', - 'schema': '2.0', - 'SDK': 'PDK+6.1.3', - } - info = { - 'title': f'{channel} livestream', - } + if is_live: + player_id = traverse_obj(video_data, ((None, ('video', 'meta')), ( + 'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False) + info['title'] = f'{channel} livestream' else: - live = False - player_id = traverse_obj( - video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high', - ('video', 'meta', 'mpx_pid'), 'mpx_pid') + player_id = traverse_obj(video_data, ( + (None, ('video', 'meta')), ('pid_streaming_web_high', 'mpx_pid')), get_all=False) date_string = traverse_obj(video_data, 'date_string', 'date_gmt') if date_string: @@ -693,63 +743,58 @@ class NBCStationsIE(InfoExtractor): r'datetime="([^"]+)"', date_string, 'date string', fatal=False) else: date_string = traverse_obj( - nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'), - ('dataLayer', 'adobe', 'eVar59')) + nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False) - video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url') + video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False) if video_url: - height = url_basename(video_url).split('-')[1].split('p')[0] + height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None) 
formats.append({ 'url': video_url, 'ext': 'mp4', 'width': int_or_none(self._RESOLUTIONS.get(height)), 'height': int_or_none(height), - 'format_id': f'http-{height}', + 'format_id': 'http-mp4', }) - query = { - 'mbr': 'true', - 'assetTypes': 'LegacyRelease', - 'fwsitesection': fw_ssid, - 'fwNetworkID': fw_network_id, - 'format': 'redirect', - 'manifest': 'm3u', - 'Tracking': 'true', - 'Embedded': 'true', - 'formats': 'MPEG4', - } - info = { - 'title': video_data.get('title') or traverse_obj( - nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'), - ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')), - 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'), - 'upload_date': str_or_none(unified_strdate(date_string)), - 'timestamp': int_or_none(unified_timestamp(date_string)), - } - - if not player_id: - raise ExtractorError( - 'No video player ID or livestream player ID found in webpage', expected=True) - - headers = {'Origin': f'https://www.{channel}.com'} - manifest, urlh = self._download_webpage_handle( - f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, - headers=headers, query=query, note='Downloading manifest') - if live: - manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL') - else: - manifest_url = urlh.geturl() + info.update({ + 'title': video_data.get('title') or traverse_obj(nbc_data, ( + 'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False), + 'description': + traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text') + or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))), + 'timestamp': unified_timestamp(date_string), + }) - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls', - fatal=live, live=live, errnote='No HLS formats found')) + smil = None + if player_id and fw_ssid: + smil = self._download_xml( + f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', 
video_id, + note='Downloading SMIL data', query=query, fatal=is_live) + if smil: + manifest_url = xpath_attr(smil, './/{*}video', 'src', fatal=is_live) + subtitles = self._parse_smil_subtitles(smil, '*') + fmts, subs = self._extract_m3u8_formats_and_subtitles( + manifest_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live, + live=is_live, errnote='No HLS formats found') + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + if not formats: + self.raise_no_formats('No video content found in webpage', expected=True) + elif is_live: + try: + self._request_webpage( + HEADRequest(formats[0]['url']), video_id, note='Checking live status') + except ExtractorError: + raise UserNotLive(video_id=channel) return { - 'id': str_or_none(video_id), + 'id': video_id, 'channel': channel, - 'uploader': str_or_none(nbc_data.get('on_air_name')), - 'uploader_id': str_or_none(nbc_data.get('callLetters')), + 'channel_id': nbc_data.get('callLetters'), + 'uploader': nbc_data.get('on_air_name'), 'formats': formats, - 'is_live': live, + 'subtitles': subtitles, + 'is_live': is_live, **info, } |