diff options
Diffstat (limited to 'yt_dlp/extractor/gdcvault.py')
-rw-r--r-- | yt_dlp/extractor/gdcvault.py | 105 |
1 files changed, 84 insertions, 21 deletions
diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py index a248a170d..acc6478b8 100644 --- a/yt_dlp/extractor/gdcvault.py +++ b/yt_dlp/extractor/gdcvault.py @@ -5,7 +5,10 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( + HEADRequest, + remove_start, sanitized_Request, + smuggle_url, urlencode_postdata, ) @@ -100,6 +103,26 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -120,38 +143,78 @@ class GDCVaultIE(InfoExtractor): request = sanitized_Request(login_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._download_webpage(request, display_id, 'Logging in') - webpage = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') + start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') self._download_webpage(logout_url, display_id, 'Logging out') - return webpage + return start_page def _real_extract(self, url): video_id, name = re.match(self._VALID_URL, url).groups() display_id = name or video_id - webpage = self._download_webpage(url, display_id) - - title = self._html_search_regex( - r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>', - webpage, 'title') - - PLAYER_REGEX = r'<iframe src=\"(?P<manifest_url>.*?)\".*?</iframe>' - manifest_url = self._html_search_regex( - PLAYER_REGEX, webpage, 'manifest_url') - - partner_id = self._search_regex( - r'/p(?:artner_id)?/(\d+)', manifest_url, 'partner id', - default='1670711') + webpage_url = 'http://www.gdcvault.com/play/' + video_id + start_page = self._download_webpage(webpage_url, display_id) + + direct_url = self._search_regex( + r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', + start_page, 'url', default=None) + if direct_url: + title = self._html_search_regex( + r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>', + start_page, 'title') + video_url = 'http://www.gdcvault.com' + direct_url + # resolve the url so that we can detect the correct extension + video_url = self._request_webpage( + HEADRequest(video_url), video_id).geturl() + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + } - kaltura_id = self._search_regex( - r'entry_id=(?P<id>(?:[^&])+)', manifest_url, - 'kaltura id', group='id') + embed_url = KalturaIE._extract_url(start_page) + if embed_url: + embed_url = smuggle_url(embed_url, {'source_url': url}) + ie_key = 'Kaltura' + else: + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' + + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root', default=None) + if xml_root is None: + # Probably need to authenticate + login_res = self._login(webpage_url, display_id) + if login_res is None: + self.report_warning('Could not login.') + else: + start_page = login_res + # Grab the url from the authenticated page + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root') + + xml_name = self._html_search_regex( + r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>', + start_page, 'xml filename', default=None) + if not xml_name: + info = self._parse_html5_media_entries(url, start_page, video_id)[0] + info.update({ + 'title': remove_start(self._search_regex( + r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page, + 'title', default=None) or self._og_search_title( + start_page, default=None), 'GDC Vault - '), + 'id': video_id, + 'display_id': display_id, + }) + return info + embed_url = '%s/xml/%s' % (xml_root, xml_name) + ie_key = 'DigitallySpeaking' return { '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': KalturaIE.ie_key(), 'id': video_id, 'display_id': display_id, - 'title': title, + 'url': embed_url, + 'ie_key': ie_key, } |