[extractor/nbc] Fix `NBC` and `NBCStations` extractors (#6033)

Improve `InfoExtractor._parse_smil_formats` extension detection. Closes #6019. Authored by: bashonly
author: bashonly <88596187+bashonly@users.noreply.github.com> 2023-01-14 10:40:42 -0600
committer: GitHub <noreply@github.com> 2023-01-14 16:40:42 +0000
commit: cb73b8460c3ce6d37ab651a4e44bb23b10056154 (patch)
tree: 5de56f4ec455c4becb96da6f9c4aeb6a51ef3fa3
parent: 7481998b169b2a52049fc33bff82034d6563ead4 (diff)
download: hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.tar.lz
hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.tar.xz
hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.zip
2 files changed, 151 insertions, 103 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index ef9759974..e37595ffd 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -32,6 +32,7 @@ from ..utils import (
     FormatSorter,
     GeoRestrictedError,
     GeoUtils,
+    HEADRequest,
     LenientJSONDecoder,
     RegexNotFoundError,
     RetryManager,
@@ -80,6 +81,7 @@ from ..utils import (
     update_Request,
     update_url_query,
     url_basename,
+    urlhandle_detect_ext,
     url_or_none,
     urljoin,
     variadic,
@@ -2311,7 +2313,8 @@ class InfoExtractor:
             height = int_or_none(medium.get('height'))
             proto = medium.get('proto')
             ext = medium.get('ext')
-            src_ext = determine_ext(src)
+            src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
+                self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
             streamer = medium.get('streamer') or base
 
             if proto == 'rtmp' or streamer.startswith('rtmp'):
diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py
index 00c592cc3..82d759f75 100644
--- a/yt_dlp/extractor/nbc.py
+++ b/yt_dlp/extractor/nbc.py
@@ -8,24 +8,26 @@ from .adobepass import AdobePassIE
 from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     ExtractorError,
+    HEADRequest,
+    RegexNotFoundError,
+    UserNotLive,
+    clean_html,
     int_or_none,
     parse_age_limit,
     parse_duration,
-    RegexNotFoundError,
     smuggle_url,
-    str_or_none,
     traverse_obj,
     try_get,
-    unified_strdate,
+    unescapeHTML,
     unified_timestamp,
     update_url_query,
     url_basename,
-    variadic,
+    xpath_attr,
 )
 
 
 class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
-    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))'
+    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))'
 
     _TESTS = [
         {
@@ -38,10 +40,18 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'timestamp': 1424246400,
                 'upload_date': '20150218',
                 'uploader': 'NBCU-COM',
+                'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
+                'episode_number': 86,
+                'season': 'Season 2',
+                'season_number': 2,
+                'series': 'Tonight Show: Jimmy Fallon',
+                'duration': 237.0,
+                'chapters': 'count:1',
+                'tags': 'count:4',
+                'thumbnail': r're:https?://.+\.jpg',
             },
             'params': {
-                # m3u8 download
-                'skip_download': True,
+                'skip_download': 'm3u8',
             },
         },
         {
@@ -55,11 +65,7 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'upload_date': '20141206',
                 'uploader': 'NBCU-COM',
             },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-            'skip': 'Only works from US',
+            'skip': 'page not found',
         },
         {
             # HLS streams requires the 'hdnea3' cookie
@@ -73,10 +79,59 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'upload_date': '20090315',
                 'uploader': 'NBCU-COM',
             },
+            'skip': 'page not found',
+        },
+        {
+            # manifest url does not have extension
+            'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439',
+            'info_dict': {
+                'id': '3646439',
+                'ext': 'mp4',
+                'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
+                'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
+                'episode_number': 1,
+                'season': 'Season 75',
+                'season_number': 75,
+                'series': 'The Golden Globe Awards',
+                'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.',
+                'uploader': 'NBCU-COM',
+                'upload_date': '20180107',
+                'timestamp': 1515312000,
+                'duration': 570.0,
+                'tags': 'count:8',
+                'thumbnail': r're:https?://.+\.jpg',
+                'chapters': 'count:1',
+            },
+            'params': {
+                'skip_download': 'm3u8',
+            },
+        },
+        {
+            # new video_id format
+            'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978',
+            'info_dict': {
+                'id': 'NBCE125189978',
+                'ext': 'mp4',
+                'title': 'Ben\'s First Leap | NBC\'s Quantum Leap',
+                'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e',
+                'uploader': 'NBCU-COM',
+                'series': 'Quantum Leap',
+                'season': 'Season 1',
+                'season_number': 1,
+                'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap',
+                'episode_number': 1,
+                'duration': 170.171,
+                'chapters': [],
+                'timestamp': 1663956155,
+                'upload_date': '20220923',
+                'tags': 'count:10',
+                'age_limit': 0,
+                'thumbnail': r're:https?://.+\.jpg',
+            },
+            'expected_warnings': ['Ignoring subtitle tracks'],
             'params': {
-                'skip_download': True,
+                'skip_download': 'm3u8',
             },
-            'skip': 'Only works from US',
         },
         {
             'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310',
@@ -600,32 +655,36 @@ class NBCStationsIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/',
-        'md5': '462041d91bd762ef5a38b7d85d6dc18f',
         'info_dict': {
             'id': '2968618',
             'ext': 'mp4',
             'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory',
-            'description': None,
+            'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182',
             'timestamp': 1661135892,
-            'upload_date': '20220821',
+            'upload_date': '20220822',
             'uploader': 'NBC 4',
-            'uploader_id': 'KNBC',
+            'channel_id': 'KNBC',
             'channel': 'nbclosangeles',
         },
+        'params': {
+            'skip_download': 'm3u8',
+        },
     }, {
         'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/',
-        'md5': '0917dcf7885be1023a9220630d415f67',
         'info_dict': {
             'id': '2247002',
             'ext': 'mp4',
-            'title': 'Huracán complica que televidente de Tucson reciba reembolso',
+            'title': 'Huracán complica que televidente de Tucson reciba  reembolso',
             'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf',
             'timestamp': 1660886507,
             'upload_date': '20220819',
             'uploader': 'Telemundo Arizona',
-            'uploader_id': 'KTAZ',
+            'channel_id': 'KTAZ',
             'channel': 'telemundoarizona',
         },
+        'params': {
+            'skip_download': 'm3u8',
+        },
     }]
 
     _RESOLUTIONS = {
@@ -644,48 +703,39 @@ class NBCStationsIE(InfoExtractor):
             r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id)
         pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC'
         fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID'))
-        fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114')
 
-        video_data = self._parse_json(self._html_search_regex(
-            r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id)
-        video_data = variadic(video_data)[0]
-        video_data.update(self._parse_json(self._html_search_regex(
-            r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id))
+        video_data = self._search_json(
+            r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML)
+        video_data.update(self._search_json(
+            r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML))
+        if not video_data:
+            raise ExtractorError('No video metadata found in webpage', expected=True)
 
-        formats = []
+        info, formats, subtitles = {}, [], {}
+        is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1
+        query = {
+            'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3',
+            'format': 'SMIL',
+            'fwsitesection': fw_ssid,
+            'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'),
+            'pprofile': 'ots_desktop_html',
+            'sensitive': 'false',
+            'w': '1920',
+            'h': '1080',
+            'mode': 'LIVE' if is_live else 'on-demand',
+            'vpaid': 'script',
+            'schema': '2.0',
+            'sdk': 'PDK 6.1.3',
+        }
 
-        if video_data.get('mpx_is_livestream') == '1':
-            live = True
-            player_id = traverse_obj(
-                video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid',
-                ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium')
-            query = {
-                'mbr': 'true',
-                'assetTypes': 'LegacyRelease',
-                'fwsitesection': fw_ssid,
-                'fwNetworkID': fw_network_id,
-                'pprofile': 'ots_desktop_html',
-                'sensitive': 'false',
-                'w': '1920',
-                'h': '1080',
-                'rnd': '1660303',
-                'mode': 'LIVE',
-                'format': 'SMIL',
-                'tracking': 'true',
-                'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3',
-                'vpaid': 'script',
-                'schema': '2.0',
-                'SDK': 'PDK+6.1.3',
-            }
-            info = {
-                'title': f'{channel} livestream',
-            }
+        if is_live:
+            player_id = traverse_obj(video_data, ((None, ('video', 'meta')), (
+                'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False)
+            info['title'] = f'{channel} livestream'
 
         else:
-            live = False
-            player_id = traverse_obj(
-                video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high',
-                ('video', 'meta', 'mpx_pid'), 'mpx_pid')
+            player_id = traverse_obj(video_data, (
+                (None, ('video', 'meta')), ('pid_streaming_web_high', 'mpx_pid')), get_all=False)
 
             date_string = traverse_obj(video_data, 'date_string', 'date_gmt')
             if date_string:
@@ -693,63 +743,58 @@ class NBCStationsIE(InfoExtractor):
                     r'datetime="([^"]+)"', date_string, 'date string', fatal=False)
             else:
                 date_string = traverse_obj(
-                    nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'),
-                    ('dataLayer', 'adobe', 'eVar59'))
+                    nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False)
 
-            video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url')
+            video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False)
             if video_url:
-                height = url_basename(video_url).split('-')[1].split('p')[0]
+                height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None)
                 formats.append({
                     'url': video_url,
                     'ext': 'mp4',
                     'width': int_or_none(self._RESOLUTIONS.get(height)),
                     'height': int_or_none(height),
-                    'format_id': f'http-{height}',
+                    'format_id': 'http-mp4',
                 })
 
-            query = {
-                'mbr': 'true',
-                'assetTypes': 'LegacyRelease',
-                'fwsitesection': fw_ssid,
-                'fwNetworkID': fw_network_id,
-                'format': 'redirect',
-                'manifest': 'm3u',
-                'Tracking': 'true',
-                'Embedded': 'true',
-                'formats': 'MPEG4',
-            }
-            info = {
-                'title': video_data.get('title') or traverse_obj(
-                    nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'),
-                    ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')),
-                'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'),
-                'upload_date': str_or_none(unified_strdate(date_string)),
-                'timestamp': int_or_none(unified_timestamp(date_string)),
-            }
-
-        if not player_id:
-            raise ExtractorError(
-                'No video player ID or livestream player ID found in webpage', expected=True)
-
-        headers = {'Origin': f'https://www.{channel}.com'}
-        manifest, urlh = self._download_webpage_handle(
-            f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
-            headers=headers, query=query, note='Downloading manifest')
-        if live:
-            manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL')
-        else:
-            manifest_url = urlh.geturl()
+            info.update({
+                'title': video_data.get('title') or traverse_obj(nbc_data, (
+                    'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False),
+                'description':
+                    traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text')
+                    or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))),
+                'timestamp': unified_timestamp(date_string),
+            })
 
-        formats.extend(self._extract_m3u8_formats(
-            manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls',
-            fatal=live, live=live, errnote='No HLS formats found'))
+        smil = None
+        if player_id and fw_ssid:
+            smil = self._download_xml(
+                f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
+                note='Downloading SMIL data', query=query, fatal=is_live)
+        if smil:
+            manifest_url = xpath_attr(smil, './/{*}video', 'src', fatal=is_live)
+            subtitles = self._parse_smil_subtitles(smil, '*')
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                manifest_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
+                live=is_live, errnote='No HLS formats found')
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+
+        if not formats:
+            self.raise_no_formats('No video content found in webpage', expected=True)
+        elif is_live:
+            try:
+                self._request_webpage(
+                    HEADRequest(formats[0]['url']), video_id, note='Checking live status')
+            except ExtractorError:
+                raise UserNotLive(video_id=channel)
 
         return {
-            'id': str_or_none(video_id),
+            'id': video_id,
             'channel': channel,
-            'uploader': str_or_none(nbc_data.get('on_air_name')),
-            'uploader_id': str_or_none(nbc_data.get('callLetters')),
+            'channel_id': nbc_data.get('callLetters'),
+            'uploader': nbc_data.get('on_air_name'),
             'formats': formats,
-            'is_live': live,
+            'subtitles': subtitles,
+            'is_live': is_live,
             **info,
         }
author	bashonly <88596187+bashonly@users.noreply.github.com>	2023-01-14 10:40:42 -0600
committer	GitHub <noreply@github.com>	2023-01-14 16:40:42 +0000
commit	cb73b8460c3ce6d37ab651a4e44bb23b10056154 (patch)
tree	5de56f4ec455c4becb96da6f9c4aeb6a51ef3fa3
parent	7481998b169b2a52049fc33bff82034d6563ead4 (diff)
download	hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.tar.lz hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.tar.xz hypervideo-pre-cb73b8460c3ce6d37ab651a4e44bb23b10056154.zip