diff options
Diffstat (limited to 'hypervideo_dl/extractor/nbc.py')
-rw-r--r-- | hypervideo_dl/extractor/nbc.py | 288 |
1 files changed, 183 insertions, 105 deletions
diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py index 1ea6355..b3c28ab 100644 --- a/hypervideo_dl/extractor/nbc.py +++ b/hypervideo_dl/extractor/nbc.py @@ -3,29 +3,34 @@ import json import re from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformIE, default_ns from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote +from ..networking import HEADRequest from ..utils import ( ExtractorError, + RegexNotFoundError, + UserNotLive, + clean_html, + determine_ext, + float_or_none, int_or_none, + mimetype2ext, parse_age_limit, parse_duration, - RegexNotFoundError, + remove_end, smuggle_url, - str_or_none, traverse_obj, try_get, - unified_strdate, + unescapeHTML, unified_timestamp, update_url_query, url_basename, - variadic, ) class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))' _TESTS = [ { @@ -38,10 +43,18 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'timestamp': 1424246400, 'upload_date': '20150218', 'uploader': 'NBCU-COM', + 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', + 'episode_number': 86, + 'season': 'Season 2', + 'season_number': 2, + 'series': 'Tonight Show: Jimmy Fallon', + 'duration': 237.0, + 'chapters': 'count:1', + 'tags': 'count:4', + 'thumbnail': r're:https?://.+\.jpg', }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { @@ -55,11 +68,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20141206', 'uploader': 'NBCU-COM', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Only works from US', + 'skip': 'page not found', }, { # HLS streams requires the 'hdnea3' cookie @@ -73,10 +82,58 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20090315', 'uploader': 'NBCU-COM', }, + 'skip': 'page not found', + }, + { + # manifest url does not have extension + 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', + 'info_dict': { + 'id': '3646439', + 'ext': 'mp4', + 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode_number': 1, + 'season': 'Season 75', + 'season_number': 75, + 'series': 'The Golden Globe Awards', + 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', + 'uploader': 'NBCU-COM', + 'upload_date': '20180107', + 'timestamp': 1515312000, + 'duration': 570.0, + 'tags': 'count:8', + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:1', + }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', + }, + }, + { + # new video_id format + 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'info_dict': { + 'id': 'NBCE125189978', + 'ext': 'mp4', + 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', + 'uploader': 'NBCU-COM', + 'series': 'Quantum Leap', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'episode_number': 1, + 'duration': 170.171, + 'chapters': [], + 'timestamp': 1663956155, + 'upload_date': '20220923', + 'tags': 'count:10', + 'age_limit': 0, + 'thumbnail': r're:https?://.+\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', }, - 'skip': 'Only works from US', }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', @@ -136,6 +193,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE query = { 'mbr': 'true', 'manifest': 'm3u', + 'switch': 'HLSServiceSecure', } video_id = video_data['mpxGuid'] tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id) @@ -599,32 +657,54 @@ class NBCStationsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', - 'md5': '462041d91bd762ef5a38b7d85d6dc18f', 'info_dict': { 'id': '2968618', 'ext': 'mp4', 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', - 'description': None, + 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', + 'duration': 112.513, 'timestamp': 1661135892, - 'upload_date': '20220821', + 'upload_date': '20220822', 'uploader': 'NBC 4', - 'uploader_id': 'KNBC', + 'channel_id': 'KNBC', 'channel': 'nbclosangeles', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', - 'md5': '0917dcf7885be1023a9220630d415f67', 'info_dict': { 'id': '2247002', 'ext': 'mp4', - 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'duration': 172.406, 'timestamp': 1660886507, 'upload_date': '20220819', 'uploader': 'Telemundo Arizona', - 'uploader_id': 'KTAZ', + 'channel_id': 'KTAZ', 'channel': 'telemundoarizona', }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # direct mp4 link + 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/', + 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85', + 'info_dict': { + 'id': '2961135', + 'ext': 'mp4', + 'title': 'Highs Near Freezing in Boston on Wednesday', + 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b', + 'duration': 235.669, + 'timestamp': 1675268656, + 'upload_date': '20230201', + 'uploader': '', + 'channel_id': 'WBTS', + 'channel': 'nbcboston', + }, }] _RESOLUTIONS = { @@ -640,51 +720,42 @@ class NBCStationsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) nbc_data = self._search_json( - r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id) + r'<script>\s*var\s+nbc\s*=', webpage, 'NBC JSON data', video_id) pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) - fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') - video_data = self._parse_json(self._html_search_regex( - r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id) - video_data = variadic(video_data)[0] - video_data.update(self._parse_json(self._html_search_regex( - r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id)) + video_data = self._search_json( + r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML) + video_data.update(self._search_json( + r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML)) + if not video_data: + raise ExtractorError('No video metadata found in webpage', expected=True) - formats = [] + info, formats = {}, [] + is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1 + query = { + 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3', + 'format': 'SMIL', + 'fwsitesection': fw_ssid, + 'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'), + 'pprofile': 'ots_desktop_html', + 'sensitive': 'false', + 'w': '1920', + 'h': '1080', + 'mode': 'LIVE' if is_live else 'on-demand', + 'vpaid': 'script', + 'schema': '2.0', + 'sdk': 'PDK 6.1.3', + } - if video_data.get('mpx_is_livestream') == '1': - live = True - player_id = traverse_obj( - video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid', - ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium') - query = { - 'mbr': 'true', - 'assetTypes': 'LegacyRelease', - 'fwsitesection': fw_ssid, - 'fwNetworkID': fw_network_id, - 'pprofile': 'ots_desktop_html', - 'sensitive': 'false', - 'w': '1920', - 'h': '1080', - 'rnd': '1660303', - 'mode': 'LIVE', - 'format': 'SMIL', - 'tracking': 'true', - 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', - 'vpaid': 'script', - 'schema': '2.0', - 'SDK': 'PDK+6.1.3', - } - info = { - 'title': f'{channel} livestream', - } + if is_live: + player_id = traverse_obj(video_data, ((None, ('video', 'meta')), ( + 'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False) + info['title'] = f'{channel} livestream' else: - live = False - player_id = traverse_obj( - video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high', - ('video', 'meta', 'mpx_pid'), 'mpx_pid') + player_id = traverse_obj(video_data, ( + (None, ('video', 'meta')), ('pid_streaming_web_high', 'mpx_pid')), get_all=False) date_string = traverse_obj(video_data, 'date_string', 'date_gmt') if date_string: @@ -692,63 +763,70 @@ class NBCStationsIE(InfoExtractor): r'datetime="([^"]+)"', date_string, 'date string', fatal=False) else: date_string = traverse_obj( - nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'), - ('dataLayer', 'adobe', 'eVar59')) + nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False) - video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url') + video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False) if video_url: - height = url_basename(video_url).split('-')[1].split('p')[0] + ext = determine_ext(video_url) + height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None) formats.append({ 'url': video_url, - 'ext': 'mp4', + 'ext': ext, 'width': int_or_none(self._RESOLUTIONS.get(height)), 'height': int_or_none(height), - 'format_id': f'http-{height}', + 'format_id': f'http-{ext}', }) - query = { - 'mbr': 'true', - 'assetTypes': 'LegacyRelease', - 'fwsitesection': fw_ssid, - 'fwNetworkID': fw_network_id, - 'format': 'redirect', - 'manifest': 'm3u', - 'Tracking': 'true', - 'Embedded': 'true', - 'formats': 'MPEG4', - } - info = { - 'title': video_data.get('title') or traverse_obj( - nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'), - ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')), - 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'), - 'upload_date': str_or_none(unified_strdate(date_string)), - 'timestamp': int_or_none(unified_timestamp(date_string)), - } - - if not player_id: - raise ExtractorError( - 'No video player ID or livestream player ID found in webpage', expected=True) - - headers = {'Origin': f'https://www.{channel}.com'} - manifest, urlh = self._download_webpage_handle( - f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, - headers=headers, query=query, note='Downloading manifest') - if live: - manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL') - else: - manifest_url = urlh.geturl() + info.update({ + 'title': video_data.get('title') or traverse_obj(nbc_data, ( + 'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False), + 'description': + traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text') + or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))), + 'timestamp': unified_timestamp(date_string), + }) + + smil = None + if player_id and fw_ssid: + smil = self._download_xml( + f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, + note='Downloading SMIL data', query=query, fatal=is_live) + subtitles = self._parse_smil_subtitles(smil, default_ns) if smil else {} + for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil else []: + info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000) + video_src_url = video.get('src') + ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url)) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live, + live=is_live, errnote='No HLS formats found') + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif video_src_url: + formats.append({ + 'url': video_src_url, + 'format_id': f'https-{ext}', + 'ext': ext, + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + }) - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls', - fatal=live, live=live, errnote='No HLS formats found')) + if not formats: + self.raise_no_formats('No video content found in webpage', expected=True) + elif is_live: + try: + self._request_webpage( + HEADRequest(formats[0]['url']), video_id, note='Checking live status') + except ExtractorError: + raise UserNotLive(video_id=channel) return { - 'id': str_or_none(video_id), + 'id': video_id, 'channel': channel, - 'uploader': str_or_none(nbc_data.get('on_air_name')), - 'uploader_id': str_or_none(nbc_data.get('callLetters')), + 'channel_id': nbc_data.get('callLetters'), + 'uploader': nbc_data.get('on_air_name'), 'formats': formats, - 'is_live': live, + 'subtitles': subtitles, + 'is_live': is_live, **info, } |