diff options
Diffstat (limited to 'yt_dlp/extractor')
26 files changed, 1190 insertions, 301 deletions
diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 66b12c72f..360fa4699 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -8,10 +8,6 @@ import struct from base64 import urlsafe_b64encode from binascii import unhexlify -import typing -if typing.TYPE_CHECKING: - from ..YoutubeDL import YoutubeDL - from .common import InfoExtractor from ..aes import aes_ecb_decrypt from ..compat import ( @@ -36,15 +32,15 @@ from ..utils import ( # NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862) -def add_opener(self: 'YoutubeDL', handler): +def add_opener(ydl, handler): ''' Add a handler for opening URLs, like _download_webpage ''' # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - assert isinstance(self._opener, compat_urllib_request.OpenerDirector) - self._opener.add_handler(handler) + assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + ydl._opener.add_handler(handler) -def remove_opener(self: 'YoutubeDL', handler): +def remove_opener(ydl, handler): ''' Remove handler(s) for opening URLs @param handler Either handler object itself or handler type. @@ -52,8 +48,8 @@ def remove_opener(self: 'YoutubeDL', handler): ''' # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - opener = self._opener - assert isinstance(self._opener, compat_urllib_request.OpenerDirector) + opener = ydl._opener + assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) if isinstance(handler, (type, tuple)): find_cp = lambda x: isinstance(x, handler) else: diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index bebcafa6b..f0eba8844 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1345,6 +1345,11 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'Suddenlink': { + 'name': 'Suddenlink', + 'username_field': 'username', + 'password_field': 'password', + }, } @@ -1636,6 +1641,52 @@ class AdobePassIE(InfoExtractor): query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') + elif mso_id == 'Suddenlink': + # Suddenlink is similar to SlingTV in using a tab history count and a meta refresh, + # but they also do a dynmaic redirect using javascript that has to be followed as well + first_bookend_page, urlh = post_form( + provider_redirect_page_res, 'Pressing Continue...') + + hidden_data = self._hidden_inputs(first_bookend_page) + hidden_data['history_val'] = 1 + + provider_login_redirect_page = self._download_webpage( + urlh.geturl(), video_id, 'Sending First Bookend', + query=hidden_data) + + provider_tryauth_url = self._html_search_regex( + r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl') + + provider_tryauth_page = self._download_webpage( + provider_tryauth_url, video_id, 'Submitting TryAuth', + query=hidden_data) + + provider_login_page_res = self._download_webpage_handle( + f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}', + video_id, 'Getting Login Page', + query=hidden_data) + + provider_association_redirect, urlh = post_form( + provider_login_page_res, 'Logging in', { + mso_info['username_field']: username, + mso_info['password_field']: password + }) + + provider_refresh_redirect_url = extract_redirect_url( + provider_association_redirect, url=urlh.geturl()) + + last_bookend_page, urlh = self._download_webpage_handle( + provider_refresh_redirect_url, video_id, + 'Downloading Auth Association Redirect Page') + + hidden_data = self._hidden_inputs(last_bookend_page) + hidden_data['history_val'] = 3 + + mvpd_confirm_page_res = self._download_webpage_handle( + urlh.geturl(), video_id, 'Sending Final Bookend', + query=hidden_data) + + post_form(mvpd_confirm_page_res, 'Confirming Login') else: # Some providers (e.g. DIRECTV NOW) have another meta refresh # based redirect that should be followed. diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py index 7d70e0427..1075b461e 100644 --- a/yt_dlp/extractor/ant1newsgr.py +++ b/yt_dlp/extractor/ant1newsgr.py @@ -97,8 +97,8 @@ class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) if not embed_urls: raise ExtractorError('no videos found for %s' % video_id, expected=True) - return self.url_result_or_playlist_from_matches( - embed_urls, video_id, info['title'], ie=Ant1NewsGrEmbedIE.ie_key(), + return self.playlist_from_matches( + embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(), video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')}) diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 4ad5d6ddd..7ea339b39 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -407,8 +407,9 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): (?:(?:beta|www)\.)?ardmediathek\.de/ (?:(?P<client>[^/]+)/)? (?:player|live|video|(?P<playlist>sendung|sammlung))/ - (?:(?P<display_id>[^?#]+)/)? - (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)''' + (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)? + (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) + (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', @@ -437,6 +438,13 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20211108', }, }, { + 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', + 'playlist_count': 6, + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', + 'title': 'beforeigners/beforeigners/staffel-1', + }, + }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, }, { @@ -561,14 +569,15 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): break pageNumber = pageNumber + 1 - return self.playlist_result(entries, playlist_title=display_id) + return self.playlist_result(entries, playlist_id, playlist_title=display_id) def _real_extract(self, url): - video_id, display_id, playlist_type, client = self._match_valid_url(url).group( - 'id', 'display_id', 'playlist', 'client') + video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client', 'season') display_id, client = display_id or video_id, client or 'ard' if playlist_type: + # TODO: Extract only specified season return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( diff --git a/yt_dlp/extractor/ccma.py b/yt_dlp/extractor/ccma.py index ea98f8688..9dbaabfa0 100644 --- a/yt_dlp/extractor/ccma.py +++ b/yt_dlp/extractor/ccma.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .common import InfoExtractor from ..utils import ( clean_html, - extract_timezone, int_or_none, parse_duration, parse_resolution, try_get, + unified_timestamp, url_or_none, ) @@ -95,14 +92,8 @@ class CCMAIE(InfoExtractor): duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) tematica = try_get(informacio, lambda x: x['tematica']['text']) - timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) - try: - timezone, data_utc = extract_timezone(data_utc) - timestamp = calendar.timegm((datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) - except TypeError: - pass + timestamp = unified_timestamp(data_utc) subtitles = {} subtitols = media.get('subtitols') or [] diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f86e7cb3e..354814433 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -226,6 +226,7 @@ class InfoExtractor(object): The following fields are optional: + direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is @@ -274,7 +275,7 @@ class InfoExtractor(object): * "url": A URL pointing to the subtitles file It can optionally also have: * "name": Name or description of the subtitles - * http_headers: A dictionary of additional HTTP headers + * "http_headers": A dictionary of additional HTTP headers to add to the request. "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles'; contains automatically generated @@ -425,8 +426,8 @@ class InfoExtractor(object): title, description etc. - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. + Subclasses of this should define a _VALID_URL regexp and, re-define the + _real_extract() and (optionally) _real_initialize() methods. Probably, they should also be added to the list of extractors. Subclasses may also override suitable() if necessary, but ensure the function @@ -661,7 +662,7 @@ class InfoExtractor(object): return False def set_downloader(self, downloader): - """Sets the downloader for this IE.""" + """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader def _real_initialize(self): @@ -670,7 +671,7 @@ class InfoExtractor(object): def _real_extract(self, url): """Real extraction process. Redefine in subclasses.""" - pass + raise NotImplementedError('This method must be implemented by subclasses') @classmethod def ie_key(cls): @@ -749,7 +750,7 @@ class InfoExtractor(object): errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + raise ExtractorError(errmsg, cause=err) else: self.report_warning(errmsg) return False @@ -1661,31 +1662,31 @@ class InfoExtractor(object): 'format_id': {'type': 'alias', 'field': 'id'}, 'preference': {'type': 'alias', 'field': 'ie_pref'}, 'language_preference': {'type': 'alias', 'field': 'lang'}, - - # Deprecated - 'dimension': {'type': 'alias', 'field': 'res'}, - 'resolution': {'type': 'alias', 'field': 'res'}, - 'extension': {'type': 'alias', 'field': 'ext'}, - 'bitrate': {'type': 'alias', 'field': 'br'}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr'}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr'}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr'}, - 'framerate': {'type': 'alias', 'field': 'fps'}, - 'protocol': {'type': 'alias', 'field': 'proto'}, 'source_preference': {'type': 'alias', 'field': 'source'}, + 'protocol': {'type': 'alias', 'field': 'proto'}, 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - 'filesize_estimate': {'type': 'alias', 'field': 'size'}, - 'samplerate': {'type': 'alias', 'field': 'asr'}, - 'video_ext': {'type': 'alias', 'field': 'vext'}, - 'audio_ext': {'type': 'alias', 'field': 'aext'}, - 'video_codec': {'type': 'alias', 'field': 'vcodec'}, - 'audio_codec': {'type': 'alias', 'field': 'acodec'}, - 'video': {'type': 'alias', 'field': 'hasvid'}, - 'has_video': {'type': 'alias', 'field': 'hasvid'}, - 'audio': {'type': 'alias', 'field': 'hasaud'}, - 'has_audio': {'type': 'alias', 'field': 'hasaud'}, - 'extractor': {'type': 'alias', 'field': 'ie_pref'}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'}, + + # Deprecated + 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, + 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, + 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, + 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, + 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, + 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, + 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, + 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, + 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, + 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, + 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, + 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, + 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, } def __init__(self, ie, field_preference): @@ -1785,7 +1786,7 @@ class InfoExtractor(object): continue if self._get_field_setting(field, 'type') == 'alias': alias, field = field, self._get_field_setting(field, 'field') - if alias not in ('format_id', 'preference', 'language_preference'): + if self._get_field_setting(alias, 'deprecated'): self.ydl.deprecation_warning( f'Format sorting alias {alias} is deprecated ' f'and may be removed in a future version. Please use {field} instead') diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 5448acf01..09b795c56 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -520,6 +520,7 @@ from .foxnews import ( FoxNewsArticleIE, ) from .foxsports import FoxSportsIE +from .fptplay import FptplayIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( @@ -848,6 +849,7 @@ from .microsoftvirtualacademy import ( from .mildom import ( MildomIE, MildomVodIE, + MildomClipIE, MildomUserVodIE, ) from .minds import ( @@ -1150,6 +1152,11 @@ from .palcomp3 import ( PalcoMP3VideoIE, ) from .pandoratv import PandoraTVIE +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) from .paramountplus import ( ParamountPlusIE, ParamountPlusSeriesIE, @@ -1218,6 +1225,7 @@ from .podomatic import PodomaticIE from .pokemon import ( PokemonIE, PokemonWatchIE, + PokemonSoundLibraryIE, ) from .pokergo import ( PokerGoIE, @@ -2010,6 +2018,7 @@ from .ximalaya import ( XimalayaIE, XimalayaAlbumIE ) +from .xinpianchang import XinpianchangIE from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index d39dcc058..ef57b221c 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -18,6 +18,7 @@ from ..utils import ( ExtractorError, float_or_none, get_element_by_id, + get_first, int_or_none, js_to_json, merge_dicts, @@ -405,11 +406,9 @@ class FacebookIE(InfoExtractor): ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or [] if str(m.get('id')) == video_id and m.get('__typename') == 'Video'] - title = traverse_obj(media, (..., 'title', 'text'), get_all=False) - description = traverse_obj(media, ( - ..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False) - uploader_data = (traverse_obj(media, (..., 'owner'), get_all=False) - or traverse_obj(post, (..., 'node', 'actors', ...), get_all=False) or {}) + title = get_first(media, ('title', 'text')) + description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) + uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} page_title = title or self._html_search_regex(( r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>', diff --git a/yt_dlp/extractor/fptplay.py b/yt_dlp/extractor/fptplay.py new file mode 100644 index 000000000..a34e90bb1 --- /dev/null +++ b/yt_dlp/extractor/fptplay.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import time +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + join_nonempty, +) + + +class FptplayIE(InfoExtractor): + _VALID_URL = r'https?://fptplay\.vn/(?P<type>xem-video)/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>[^/]+)?/?(?:[?#]|$)|)' + _GEO_COUNTRIES = ['VN'] + IE_NAME = 'fptplay' + IE_DESC = 'fptplay.vn' + _TESTS = [{ + 'url': 'https://fptplay.vn/xem-video/nhan-duyen-dai-nhan-xin-dung-buoc-621a123016f369ebbde55945', + 'md5': 'ca0ee9bc63446c0c3e9a90186f7d6b33', + 'info_dict': { + 'id': '621a123016f369ebbde55945', + 'ext': 'mp4', + 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Ms. Cupid In Love', + 'description': 'md5:23cf7d1ce0ade8e21e76ae482e6a8c6c', + }, + }, { + 'url': 'https://fptplay.vn/xem-video/ma-toi-la-dai-gia-61f3aa8a6b3b1d2e73c60eb5/tap-3', + 'md5': 'b35be968c909b3e4e1e20ca45dd261b1', + 'info_dict': { + 'id': '61f3aa8a6b3b1d2e73c60eb5', + 'ext': 'mp4', + 'title': 'Má Tôi Là Đại Gia - 3', + 'description': 'md5:ff8ba62fb6e98ef8875c42edff641d1c', + }, + }, { + 'url': 'https://fptplay.vn/xem-video/nha-co-chuyen-hi-alls-well-ends-well-1997-6218995f6af792ee370459f0', + 'only_matching': True, + }] + + def _real_extract(self, url): + type_url, video_id, episode = self._match_valid_url(url).group('type', 'id', 'episode') + webpage = self._download_webpage(url, video_id=video_id, fatal=False) + info = self._download_json(self.get_api_with_st_token(video_id, episode or 0), video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(info['data']['url'], video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': join_nonempty( + self._html_search_meta(('og:title', 'twitter:title'), webpage), episode, delim=' - '), + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'formats': formats, + 'subtitles': subtitles, + } + + def get_api_with_st_token(self, video_id, episode): + path = f'/api/v6.2_w/stream/vod/{video_id}/{episode}/auto_vip' + timestamp = int(time.time()) + 10800 + + t = hashlib.md5(f'WEBv6Dkdsad90dasdjlALDDDS{timestamp}{path}'.encode()).hexdigest().upper() + r = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/' + n = [int(f'0x{t[2 * o: 2 * o + 2]}', 16) for o in range(len(t) // 2)] + + def convert(e): + t = '' + n = 0 + i = [0, 0, 0] + a = [0, 0, 0, 0] + s = len(e) + c = 0 + for z in range(s, 0, -1): + if n <= 3: + i[n] = e[c] + n += 1 + c += 1 + if 3 == n: + a[0] = (252 & i[0]) >> 2 + a[1] = ((3 & i[0]) << 4) + ((240 & i[1]) >> 4) + a[2] = ((15 & i[1]) << 2) + ((192 & i[2]) >> 6) + a[3] = (63 & i[2]) + for v in range(4): + t += r[a[v]] + n = 0 + if n: + for o in range(n, 3): + i[o] = 0 + + for o in range(n + 1): + a[0] = (252 & i[0]) >> 2 + a[1] = ((3 & i[0]) << 4) + ((240 & i[1]) >> 4) + a[2] = ((15 & i[1]) << 2) + ((192 & i[2]) >> 6) + a[3] = (63 & i[2]) + t += r[a[o]] + n += 1 + while n < 3: + t += '' + n += 1 + return t + + st_token = convert(n).replace('+', '-').replace('/', '_').replace('=', '') + return f'https://api.fptplay.net{path}?{urllib.parse.urlencode({"st": st_token, "e": timestamp})}' diff --git a/yt_dlp/extractor/frontendmasters.py b/yt_dlp/extractor/frontendmasters.py index 40b8cb0b4..0d29da29b 100644 --- a/yt_dlp/extractor/frontendmasters.py +++ b/yt_dlp/extractor/frontendmasters.py @@ -252,9 +252,9 @@ class FrontendMastersCourseIE(FrontendMastersPageBaseIE): entries = [] for lesson in lessons: lesson_name = lesson.get('slug') - if not lesson_name: - continue lesson_id = lesson.get('hash') or lesson.get('statsId') + if not lesson_id or not lesson_name: + continue entries.append(self._extract_lesson(chapters, lesson_id, lesson)) title = course.get('title') diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 0ddd050ff..6a8b8543b 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -146,6 +146,7 @@ from .tvp import TVPEmbedIE from .blogger import BloggerIE from .mainstreaming import MainStreamingIE from .gfycat import GfycatIE +from .panopto import PanoptoBaseIE class GenericIE(InfoExtractor): @@ -2498,6 +2499,15 @@ class GenericIE(InfoExtractor): 'id': '?vid=2295' }, 'playlist_count': 9 + }, + { + # Panopto embeds + 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video', + 'info_dict': { + 'title': 'Insert a quiz into a Panopto video', + 'id': 'insert-a-quiz-into-a-panopto-video' + }, + 'playlist_count': 1 } ] @@ -3723,6 +3733,9 @@ class GenericIE(InfoExtractor): if gfycat_urls: return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key()) + panopto_urls = PanoptoBaseIE._extract_urls(webpage) + if panopto_urls: + return self.playlist_from_matches(panopto_urls, video_id, video_title) # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index cab3aa045..4ac70ea57 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -13,12 +13,15 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + try_get, + url_or_none, ) class MGTVIE(InfoExtractor): _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' + IE_NAME = 'MangoTV' _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -31,6 +34,32 @@ class MGTVIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, { + 'url': 'https://w.mgtv.com/b/427837/15588271.html', + 'info_dict': { + 'id': '15588271', + 'ext': 'mp4', + 'title': '春日迟迟再出发 沉浸版', + 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 4026, + }, + }, { + 'url': 'https://w.mgtv.com/b/333652/7329822.html', + 'info_dict': { + 'id': '7329822', + 'ext': 'mp4', + 'title': '拜托,请你爱我', + 'description': 'md5:cd81be6499bafe32e4d143abd822bf9c', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 2656, + }, + }, { + 'url': 'https://w.mgtv.com/b/427837/15591647.html', + 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/388252/15634192.html?fpa=33318&fpos=4&lastp=ch_home', + 'only_matching': True, + }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, }, { @@ -40,12 +69,14 @@ class MGTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] + tk2 = base64.urlsafe_b64encode( + f'did={compat_str(uuid.uuid4()).encode()}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ 'tk2': tk2, 'video_id': video_id, + 'type': 'pch5' }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: @@ -61,6 +92,7 @@ class MGTVIE(InfoExtractor): 'pm2': api_data['atc']['pm2'], 'tk2': tk2, 'video_id': video_id, + 'src': 'intelmgtv', }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] @@ -71,7 +103,7 @@ class MGTVIE(InfoExtractor): continue format_data = self._download_json( stream_domain + stream_path, video_id, - note='Download video info for format #%d' % idx) + note=f'Download video info for format #{idx}') format_url = format_data.get('info') if not format_url: continue @@ -79,7 +111,7 @@ class MGTVIE(InfoExtractor): r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ 'format_id': compat_str(tbr or idx), - 'url': format_url, + 'url': url_or_none(format_url), 'ext': 'mp4', 'tbr': tbr, 'protocol': 'm3u8_native', @@ -97,4 +129,25 @@ class MGTVIE(InfoExtractor): 'description': info.get('desc'), 'duration': int_or_none(info.get('duration')), 'thumbnail': info.get('thumb'), + 'subtitles': self.extract_subtitles(video_id, stream_domain), } + + def _get_subtitles(self, video_id, domain): + info = self._download_json(f'https://pcweb.api.mgtv.com/video/title?videoId={video_id}', + video_id, fatal=False) or {} + subtitles = {} + for sub in try_get(info, lambda x: x['data']['title']) or []: + url_sub = sub.get('url') + if not url_sub: + continue + locale = sub.get('captionCountrySimpleName') + sub = self._download_json(f'{domain}{url_sub}', video_id, fatal=False, + note=f'Download subtitle for locale {sub.get("name")} ({locale})') or {} + sub_url = url_or_none(sub.get('info')) + if not sub_url: + continue + subtitles.setdefault(locale or 'en', []).append({ + 'url': sub_url, + 'ext': 'srt' + }) + return subtitles diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py index b5a2e17f2..5f2df29c6 100644 --- a/yt_dlp/extractor/mildom.py +++ b/yt_dlp/extractor/mildom.py @@ -1,102 +1,42 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 -from datetime import datetime -import itertools +import functools import json from .common import InfoExtractor from ..utils import ( - update_url_query, - random_uuidv4, - try_get, + determine_ext, + dict_get, + ExtractorError, float_or_none, - dict_get -) -from ..compat import ( - compat_str, + OnDemandPagedList, + random_uuidv4, + traverse_obj, ) class MildomBaseIE(InfoExtractor): _GUEST_ID = None - _DISPATCHER_CONFIG = None - - def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', init=False): - query = query or {} - if query: - query['__platform'] = 'web' - url = update_url_query(url, self._common_queries(query, init=init)) - content = self._download_json(url, video_id, note=note) - if content['code'] == 0: - return content['body'] - else: - self.raise_no_formats( - f'Video not found or premium content. {content["code"]} - {content["message"]}', - expected=True) - def _common_queries(self, query={}, init=False): - dc = self._fetch_dispatcher_config() - r = { - 'timestamp': self.iso_timestamp(), - '__guest_id': '' if init else self.guest_id(), - '__location': dc['location'], - '__country': dc['country'], - '__cluster': dc['cluster'], - '__platform': 'web', - '__la': self.lang_code(), - '__pcv': 'v2.9.44', - 'sfr': 'pc', - 'accessToken': '', - } - r.update(query) - return r - - def _fetch_dispatcher_config(self): - if not self._DISPATCHER_CONFIG: - tmp = self._download_json( - 'https://disp.mildom.com/serverListV2', 'initialization', - note='Downloading dispatcher_config', data=json.dumps({ - 'protover': 0, - 'data': base64.b64encode(json.dumps({ - 'fr': 'web', - 'sfr': 'pc', - 'devi': 'Windows', - 'la': 'ja', - 'gid': None, - 'loc': '', - 'clu': '', - 'wh': '1919*810', - 'rtm': self.iso_timestamp(), - 'ua': self.get_param('http_headers')['User-Agent'], - }).encode('utf8')).decode('utf8').replace('\n', ''), - }).encode('utf8')) - self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization') - return self._DISPATCHER_CONFIG - - @staticmethod - def iso_timestamp(): - 'new Date().toISOString()' - return datetime.utcnow().isoformat()[0:-3] + 'Z' - - def guest_id(self): - 'getGuestId' - if self._GUEST_ID: - return self._GUEST_ID - self._GUEST_ID = try_get( - self, ( - lambda x: x._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization', - note='Downloading guest token', init=True)['guest_id'] or None, - lambda x: x._get_cookies('https://www.mildom.com').get('gid').value, - lambda x: x._get_cookies('https://m.mildom.com').get('gid').value, - ), compat_str) or '' - return self._GUEST_ID - - def lang_code(self): - 'getCurrentLangCode' - return 'ja' + def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): + if not self._GUEST_ID: + self._GUEST_ID = f'pc-gp-{random_uuidv4()}' + + content = self._download_json( + url, video_id, note=note, data=json.dumps(body).encode() if body else None, + headers={'Content-Type': 'application/json'} if body else {}, + query={ + '__guest_id': self._GUEST_ID, + '__platform': 'web', + **(query or {}), + }) + + if content['code'] != 0: + raise ExtractorError( + f'Mildom says: {content["message"]} (code {content["code"]})', + expected=True) + return content['body'] class MildomIE(MildomBaseIE): @@ -106,31 +46,13 @@ class MildomIE(MildomBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://www.mildom.com/%s' % video_id - - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id) enterstudio = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, note='Downloading live metadata', query={'user_id': video_id}) result_video_id = enterstudio.get('log_id', video_id) - title = try_get( - enterstudio, ( - lambda x: self._html_search_meta('twitter:description', webpage), - lambda x: x['anchor_intro'], - ), compat_str) - description = try_get( - enterstudio, ( - lambda x: x['intro'], - lambda x: x['live_intro'], - ), compat_str) - uploader = try_get( - enterstudio, ( - lambda x: self._html_search_meta('twitter:title', webpage), - lambda x: x['loginname'], - ), compat_str) - servers = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, note='Downloading live server list', query={ @@ -138,17 +60,20 @@ class MildomIE(MildomBaseIE): 'live_server_type': 'hls', }) - stream_query = self._common_queries({ - 'streamReqId': random_uuidv4(), - 'is_lhls': '0', - }) - m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query) - formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={ - 'Referer': 'https://www.mildom.com/', - 'Origin': 'https://www.mildom.com', - }, note='Downloading m3u8 information') - - del stream_query['streamReqId'], stream_query['timestamp'] + playback_token = self._call_api( + 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id, + note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'}) + playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False) + if not playback_token: + raise ExtractorError('Failed to obtain live playback token') + + formats = self._extract_m3u8_formats( + f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}', + result_video_id, 'mp4', headers={ + 'Referer': 'https://www.mildom.com/', + 'Origin': 'https://www.mildom.com', + }) + for fmt in formats: fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' @@ -156,10 +81,10 @@ class MildomIE(MildomBaseIE): return { 'id': result_video_id, - 'title': title, - 'description': description, + 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), + 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str), 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), - 'uploader': uploader, + 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'), 'uploader_id': video_id, 'formats': formats, 'is_live': True, @@ -168,7 +93,7 @@ class MildomIE(MildomBaseIE): class MildomVodIE(MildomBaseIE): IE_NAME = 'mildom:vod' - IE_DESC = 'Download a VOD in Mildom' + IE_DESC = 'VOD in Mildom' _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' _TESTS = [{ 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', @@ -215,11 +140,8 @@ class MildomVodIE(MildomBaseIE): }] def _real_extract(self, url): - m = self._match_valid_url(url) - user_id, video_id = m.group('user_id'), m.group('id') - url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id) - - webpage = self._download_webpage(url, video_id) + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id) autoplay = self._call_api( 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, @@ -227,20 +149,6 @@ class MildomVodIE(MildomBaseIE): 'v_id': video_id, })['playback'] - title = try_get( - autoplay, ( - lambda x: self._html_search_meta('og:description', webpage), - lambda x: x['title'], - ), compat_str) - description = try_get( - autoplay, ( - lambda x: x['video_intro'], - ), compat_str) - uploader = try_get( - autoplay, ( - lambda x: x['author_info']['login_name'], - ), compat_str) - formats = [{ 'url': autoplay['audio_url'], 'format_id': 'audio', @@ -265,17 +173,81 @@ class MildomVodIE(MildomBaseIE): return { 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': float_or_none(autoplay['publish_time'], scale=1000), - 'duration': float_or_none(autoplay['video_length'], scale=1000), + 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'), + 'description': traverse_obj(autoplay, 'video_intro'), + 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000), + 'duration': float_or_none(autoplay.get('video_length'), scale=1000), 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')), - 'uploader': uploader, + 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')), 'uploader_id': user_id, 'formats': formats, } +class MildomClipIE(MildomBaseIE): + IE_NAME = 'mildom:clip' + IE_DESC = 'Clip in Mildom' + _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9', + 'info_dict': { + 'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9', + 'title': '全然違ったよ', + 'timestamp': 1619181890, + 'duration': 59, + 'thumbnail': r're:https?://.+', + 'uploader': 'ざきんぽ', + 'uploader_id': '10042245', + }, + }, { + 'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864', + 'info_dict': { + 'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864', + 'title': 'かっこいい', + 'timestamp': 1621094003, + 'duration': 59, + 'thumbnail': r're:https?://.+', + 'uploader': '(ルーキー', + 'uploader_id': '10111524', + }, + }, { + 'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', + 'info_dict': { + 'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', + 'title': 'あ', + 'timestamp': 1614769431, + 'duration': 31, + 'thumbnail': r're:https?://.+', + 'uploader': 'ドルゴルスレンギーン=ダグワドルジ', + 'uploader_id': '10660174', + }, + }] + + def _real_extract(self, url): + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id) + + clip_detail = self._call_api( + 'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id, + note='Downloading playback metadata', query={ + 'clip_id': video_id, + }) + + return { + 'id': video_id, + 'title': self._html_search_meta( + ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'), + 'timestamp': float_or_none(clip_detail.get('create_time')), + 'duration': float_or_none(clip_detail.get('length')), + 'thumbnail': clip_detail.get('cover'), + 'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')), + 'uploader_id': user_id, + + 'url': clip_detail['url'], + 'ext': determine_ext(clip_detail.get('url'), 'mp4'), + } + + class MildomUserVodIE(MildomBaseIE): IE_NAME = 'mildom:user:vod' IE_DESC = 'Download all VODs from specific user in Mildom' @@ -286,29 +258,32 @@ class MildomUserVodIE(MildomBaseIE): 'id': '10093333', 'title': 'Uploads from ねこばたけ', }, - 'playlist_mincount': 351, + 'playlist_mincount': 732, }, { 'url': 'https://www.mildom.com/profile/10882672', 'info_dict': { 'id': '10882672', 'title': 'Uploads from kson組長(けいそん)', }, - 'playlist_mincount': 191, + 'playlist_mincount': 201, }] - def _entries(self, user_id): - for page in itertools.count(1): - reply = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', - user_id, note='Downloading page %d' % page, query={ - 'user_id': user_id, - 'page': page, - 'limit': '30', - }) - if not reply: - break - for x in reply: - yield self.url_result('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id'])) + def _fetch_page(self, user_id, page): + page += 1 + reply = self._call_api( + 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', + user_id, note=f'Downloading page {page}', query={ + 'user_id': user_id, + 'page': page, + 'limit': '30', + }) + if not reply: + return + for x in reply: + v_id = x.get('v_id') + if not v_id: + continue + yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}') def _real_extract(self, url): user_id = self._match_id(url) @@ -319,4 +294,5 @@ class MildomUserVodIE(MildomBaseIE): query={'user_id': user_id}, note='Downloading user profile')['user_info'] return self.playlist_result( - self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname']) + OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30), + user_id, f'Uploads from {profile["loginname"]}') diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index 49d58a685..4d723e886 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -8,6 +8,7 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + compat_HTTPError, determine_ext, ExtractorError, int_or_none, @@ -147,10 +148,14 @@ class NRKIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url).split('/')[-1] - path_templ = 'playback/%s/program/' + video_id - def call_playback_api(item, query=None): - return self._call_api(path_templ % item, video_id, item, query=query) + try: + return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query) + raise + # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index 0525b4830..b476c0986 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -4,10 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + get_first, int_or_none, traverse_obj, unified_strdate, - unified_timestamp + unified_timestamp, ) from ..compat import compat_str @@ -19,42 +20,34 @@ class OpenRecBaseIE(InfoExtractor): def _extract_movie(self, webpage, video_id, name, is_live): window_stores = self._extract_pagestore(webpage, video_id) - movie_store = traverse_obj( - window_stores, - ('v8', 'state', 'movie'), - ('v8', 'movie'), - expected_type=dict) - if not movie_store: + movie_stores = [ + # extract all three important data (most of data are duplicated each other, but slightly different!) + traverse_obj(window_stores, ('v8', 'state', 'movie'), expected_type=dict), + traverse_obj(window_stores, ('v8', 'movie'), expected_type=dict), + traverse_obj(window_stores, 'movieStore', expected_type=dict), + ] + if not any(movie_stores): raise ExtractorError(f'Failed to extract {name} info') - title = movie_store.get('title') - description = movie_store.get('introduction') - thumbnail = movie_store.get('thumbnailUrl') - - uploader = traverse_obj(movie_store, ('channel', 'user', 'name'), expected_type=compat_str) - uploader_id = traverse_obj(movie_store, ('channel', 'user', 'id'), expected_type=compat_str) - - timestamp = int_or_none(traverse_obj(movie_store, ('publishedAt', 'time')), scale=1000) - - m3u8_playlists = movie_store.get('media') or {} + m3u8_playlists = get_first(movie_stores, 'media') or {} formats = [] for name, m3u8_url in m3u8_playlists.items(): if not m3u8_url: continue formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id='hls-%s' % name)) + m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id=name)) self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': get_first(movie_stores, 'title'), + 'description': get_first(movie_stores, 'introduction'), + 'thumbnail': get_first(movie_stores, 'thumbnailUrl'), 'formats': formats, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'timestamp': timestamp, + 'uploader': get_first(movie_stores, ('channel', 'user', 'name')), + 'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')), + 'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')), 'is_live': is_live, } @@ -72,7 +65,7 @@ class OpenRecIE(OpenRecBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id) + webpage = self._download_webpage(f'https://www.openrec.tv/live/{video_id}', video_id) return self._extract_movie(webpage, video_id, 'live', True) @@ -96,7 +89,7 @@ class OpenRecCaptureIE(OpenRecBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://www.openrec.tv/capture/%s' % video_id, video_id) + webpage = self._download_webpage(f'https://www.openrec.tv/capture/{video_id}', video_id) window_stores = self._extract_pagestore(webpage, video_id) movie_store = window_stores.get('movie') @@ -104,15 +97,6 @@ class OpenRecCaptureIE(OpenRecBaseIE): capture_data = window_stores.get('capture') if not capture_data: raise ExtractorError('Cannot extract title') - title = capture_data.get('title') - thumbnail = capture_data.get('thumbnailUrl') - upload_date = unified_strdate(capture_data.get('createdAt')) - - uploader = traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str) - uploader_id = traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str) - - timestamp = traverse_obj(movie_store, 'createdAt', expected_type=compat_str) - timestamp = unified_timestamp(timestamp) formats = self._extract_m3u8_formats( capture_data.get('source'), video_id, ext='mp4') @@ -120,13 +104,13 @@ class OpenRecCaptureIE(OpenRecBaseIE): return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, + 'title': capture_data.get('title'), + 'thumbnail': capture_data.get('thumbnailUrl'), 'formats': formats, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'timestamp': unified_timestamp(traverse_obj(movie_store, 'createdAt', expected_type=compat_str)), + 'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str), + 'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str), + 'upload_date': unified_strdate(capture_data.get('createdAt')), } @@ -148,6 +132,6 @@ class OpenRecMovieIE(OpenRecBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://www.openrec.tv/movie/%s' % video_id, video_id) + webpage = self._download_webpage(f'https://www.openrec.tv/movie/{video_id}', video_id) return self._extract_movie(webpage, video_id, 'movie', False) diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py new file mode 100644 index 000000000..d458dfe50 --- /dev/null +++ b/yt_dlp/extractor/panopto.py @@ -0,0 +1,445 @@ +import re +import calendar +import json +import functools +from datetime import datetime +from random import random + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_urlparse +) + +from ..utils import ( + bug_reports_message, + ExtractorError, + get_first, + int_or_none, + OnDemandPagedList, + parse_qs, + traverse_obj, +) + + +class PanoptoBaseIE(InfoExtractor): + BASE_URL_RE = r'(?P<base_url>https?://[\w.]+\.panopto.(?:com|eu)/Panopto)' + + def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs): + response = self._download_json( + base_url + path, video_id, data=json.dumps(data).encode('utf8') if data else None, + fatal=fatal, headers={'accept': 'application/json', 'content-type': 'application/json'}, **kwargs) + if not response: + return + error_code = response.get('ErrorCode') + if error_code == 2: + self.raise_login_required(method='cookies') + elif error_code is not None: + msg = f'Panopto said: {response.get("ErrorMessage")}' + if fatal: + raise ExtractorError(msg, video_id=video_id, expected=True) + else: + self.report_warning(msg, video_id=video_id) + return response + + @staticmethod + def _parse_fragment(url): + return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE, + webpage)] + + +class PanoptoIE(PanoptoBaseIE): + _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)' + _TESTS = [ + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', + 'info_dict': { + 'id': '26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', + 'title': 'Panopto for Business - Use Cases', + 'timestamp': 1459184200, + 'thumbnail': r're:https://demo\.hosted\.panopto\.com/Panopto/Services/FrameGrabber\.svc/FrameRedirect\?objectId=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb&mode=Delivery&random=[\d.]+', + 'upload_date': '20160328', + 'ext': 'mp4', + 'cast': [], + 'duration': 88.17099999999999, + 'average_rating': int, + 'uploader_id': '2db6b718-47a0-4b0b-9e17-ab0b00f42b1e', + 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', + 'channel': 'Showcase Videos' + }, + }, + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59', + 'info_dict': { + 'id': 'ed01b077-c9e5-4c7b-b8ff-15fa306d7a59', + 'title': 'Overcoming Top 4 Challenges of Enterprise Video', + 'uploader': 'Panopto Support', + 'timestamp': 1449409251, + 'thumbnail': r're:https://demo\.hosted\.panopto\.com/Panopto/Services/FrameGrabber\.svc/FrameRedirect\?objectId=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59&mode=Delivery&random=[\d.]+', + 'upload_date': '20151206', + 'ext': 'mp4', + 'chapters': 'count:21', + 'cast': ['Panopto Support'], + 'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c', + 'average_rating': int, + 'description': 'md5:4391837802b3fc856dadf630c4b375d1', + 'duration': 1088.2659999999998, + 'channel_id': '9f3c1921-43bb-4bda-8b3a-b8d2f05a8546', + 'channel': 'Webcasts', + }, + }, + { + # Extra params in URL + 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?randomparam=thisisnotreal&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true', + 'info_dict': { + 'id': '5fa74e93-3d87-4694-b60e-aaa4012214ed', + 'ext': 'mp4', + 'duration': 129.513, + 'cast': ['Kathryn Kelly'], + 'uploader_id': '316a0a58-7fa2-4cd9-be1c-64270d284a56', + 'timestamp': 1569845768, + 'tags': ['Viewer', 'Enterprise'], + 'upload_date': '20190930', + 'thumbnail': r're:https://howtovideos\.hosted\.panopto\.com/Panopto/Services/FrameGrabber.svc/FrameRedirect\?objectId=5fa74e93-3d87-4694-b60e-aaa4012214ed&mode=Delivery&random=[\d.]+', + 'description': 'md5:2d844aaa1b1a14ad0e2601a0993b431f', + 'title': 'Getting Started: View a Video', + 'average_rating': int, + 'uploader': 'Kathryn Kelly', + 'channel_id': 'fb93bc3c-6750-4b80-a05b-a921013735d3', + 'channel': 'Getting Started', + } + }, + { + # Does not allow normal Viewer.aspx. AUDIO livestream has no url, so should be skipped and only give one stream. + 'url': 'https://unisa.au.panopto.com/Panopto/Pages/Embed.aspx?id=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4', + 'info_dict': { + 'id': '9d9a0fa3-e99a-4ebd-a281-aac2017f4da4', + 'ext': 'mp4', + 'cast': ['LTS CLI Script'], + 'duration': 2178.45, + 'description': 'md5:ee5cf653919f55b72bce2dbcf829c9fa', + 'channel_id': 'b23e673f-c287-4cb1-8344-aae9005a69f8', + 'average_rating': int, + 'uploader_id': '38377323-6a23-41e2-9ff6-a8e8004bf6f7', + 'uploader': 'LTS CLI Script', + 'timestamp': 1572458134, + 'title': 'WW2 Vets Interview 3 Ronald Stanley George', + 'thumbnail': r're:https://unisa\.au\.panopto\.com/Panopto/Services/FrameGrabber.svc/FrameRedirect\?objectId=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4&mode=Delivery&random=[\d.]+', + 'channel': 'World War II Veteran Interviews', + 'upload_date': '20191030', + }, + }, + { + 'url': 'https://ucc.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=0e8484a4-4ceb-4d98-a63f-ac0200b455cb', + 'only_matching': True + }, + { + 'url': 'https://brown.hosted.panopto.com/Panopto/Pages/Embed.aspx?id=0b3ff73b-36a0-46c5-8455-aadf010a3638', + 'only_matching': True + }, + ] + + @classmethod + def suitable(cls, url): + return False if PanoptoPlaylistIE.suitable(url) else super().suitable(url) + + def _mark_watched(self, base_url, video_id, delivery_info): + duration = traverse_obj(delivery_info, ('Delivery', 'Duration'), expected_type=float) + invocation_id = delivery_info.get('InvocationId') + stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) + if invocation_id and stream_id and duration: + timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' + data = { + 'streamRequests': [ + { + 'ClientTimeStamp': timestamp_str, + 'ID': 0, + 'InvocationID': invocation_id, + 'PlaybackSpeed': 1, + 'SecondsListened': duration - 1, + 'SecondsRejected': 0, + 'StartPosition': 0, + 'StartReason': 2, + 'StopReason': None, + 'StreamID': stream_id, + 'TimeStamp': timestamp_str, + 'UpdatesRejected': 0 + }, + ]} + + self._download_webpage( + base_url + '/Services/Analytics.svc/AddStreamRequests', video_id, + fatal=False, data=json.dumps(data).encode('utf8'), headers={'content-type': 'application/json'}, + note='Marking watched', errnote='Unable to mark watched') + + @staticmethod + def _extract_chapters(delivery): + chapters = [] + for timestamp in delivery.get('Timestamps', []): + start, duration = int_or_none(timestamp.get('Time')), int_or_none(timestamp.get('Duration')) + if start is None or duration is None: + continue + chapters.append({ + 'start_time': start, + 'end_time': start + duration, + 'title': timestamp.get('Caption') + }) + return chapters + + def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs): + formats = [] + subtitles = {} + for stream in streams or []: + stream_formats = [] + http_stream_url = stream.get('StreamHttpUrl') + stream_url = stream.get('StreamUrl') + + if http_stream_url: + stream_formats.append({'url': http_stream_url}) + + if stream_url: + media_type = stream.get('ViewerMediaFileTypeName') + if media_type in ('hls', ): + m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) + stream_formats.extend(m3u8_formats) + subtitles = self._merge_subtitles(subtitles, stream_subtitles) + else: + stream_formats.append({ + 'url': stream_url + }) + for fmt in stream_formats: + fmt.update({ + 'format_note': stream.get('Tag'), + **fmt_kwargs + }) + formats.extend(stream_formats) + + return formats, subtitles + + def _real_extract(self, url): + base_url, video_id = self._match_valid_url(url).group('base_url', 'id') + delivery_info = self._call_api( + base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id, + query={ + 'deliveryId': video_id, + 'invocationId': '', + 'isLiveNotes': 'false', + 'refreshAuthCookie': 'true', + 'isActiveBroadcast': 'false', + 'isEditing': 'false', + 'isKollectiveAgentInstalled': 'false', + 'isEmbed': 'false', + 'responseType': 'json', + } + ) + + delivery = delivery_info['Delivery'] + session_start_time = int_or_none(delivery.get('SessionStartTime')) + + # Podcast stream is usually the combined streams. We will prefer that by default. + podcast_formats, podcast_subtitles = self._extract_streams_formats_and_subtitles( + video_id, delivery.get('PodcastStreams'), format_note='PODCAST') + + streams_formats, streams_subtitles = self._extract_streams_formats_and_subtitles( + video_id, delivery.get('Streams'), preference=-10) + + formats = podcast_formats + streams_formats + subtitles = self._merge_subtitles(podcast_subtitles, streams_subtitles) + self._sort_formats(formats) + + self.mark_watched(base_url, video_id, delivery_info) + + return { + 'id': video_id, + 'title': delivery.get('SessionName'), + 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None), + 'timestamp': session_start_time - 11640000000 if session_start_time else None, + 'duration': delivery.get('Duration'), + 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}', + 'average_rating': delivery.get('AverageRating'), + 'chapters': self._extract_chapters(delivery) or None, + 'uploader': delivery.get('OwnerDisplayName') or None, + 'uploader_id': delivery.get('OwnerId'), + 'description': delivery.get('SessionAbstract'), + 'tags': traverse_obj(delivery, ('Tags', ..., 'Content')), + 'channel_id': delivery.get('SessionGroupPublicID'), + 'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False), + 'formats': formats, + 'subtitles': subtitles + } + + +class PanoptoPlaylistIE(PanoptoBaseIE): + _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)pid=(?P<id>[a-f0-9-]+)' + _TESTS = [ + { + 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=f3b39fcf-882f-4849-93d6-a9f401236d36&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true', + 'info_dict': { + 'title': 'Featured Video Tutorials', + 'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36', + 'description': '', + }, + 'playlist_mincount': 36 + }, + { + 'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190', + 'info_dict': { + 'title': 'Library Website Introduction Playlist', + 'id': 'e2900555-3ad4-4bdb-854d-ad2401686190', + 'description': 'md5:f958bca50a1cbda15fdc1e20d32b3ecb', + }, + 'playlist_mincount': 4 + }, + + ] + + def _entries(self, base_url, playlist_id, session_list_id): + session_list_info = self._call_api( + base_url, f'/Api/SessionLists/{session_list_id}?collections[0].maxCount=500&collections[0].name=items', playlist_id) + + items = session_list_info['Items'] + for item in items: + if item.get('TypeName') != 'Session': + self.report_warning('Got an item in the playlist that is not a Session' + bug_reports_message(), only_once=True) + continue + yield { + '_type': 'url', + 'id': item.get('Id'), + 'url': item.get('ViewerUri'), + 'title': item.get('Name'), + 'description': item.get('Description'), + 'duration': item.get('Duration'), + 'channel': traverse_obj(item, ('Parent', 'Name')), + 'channel_id': traverse_obj(item, ('Parent', 'Id')) + } + + def _real_extract(self, url): + base_url, playlist_id = self._match_valid_url(url).group('base_url', 'id') + + video_id = get_first(parse_qs(url), 'id') + if video_id: + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(base_url + f'/Pages/Viewer.aspx?id={video_id}', ie_key=PanoptoIE.ie_key(), video_id=video_id) + else: + self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') + + playlist_info = self._call_api(base_url, f'/Api/Playlists/{playlist_id}', playlist_id) + return self.playlist_result( + self._entries(base_url, playlist_id, playlist_info['SessionListId']), + playlist_id=playlist_id, playlist_title=playlist_info.get('Name'), + playlist_description=playlist_info.get('Description')) + + +class PanoptoListIE(PanoptoBaseIE): + _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/Sessions/List\.aspx' + _PAGE_SIZE = 250 + _TESTS = [ + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#folderID=%22e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a%22', + 'info_dict': { + 'id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', + 'title': 'Showcase Videos' + }, + 'playlist_mincount': 140 + + }, + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#view=2&maxResults=250', + 'info_dict': { + 'id': 'panopto_list', + 'title': 'panopto_list' + }, + 'playlist_mincount': 300 + }, + { + # Folder that contains 8 folders and a playlist + 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx?noredirect=true#folderID=%224b9de7ae-0080-4158-8496-a9ba01692c2e%22', + 'info_dict': { + 'id': '4b9de7ae-0080-4158-8496-a9ba01692c2e', + 'title': 'Video Tutorials' + }, + 'playlist_mincount': 9 + } + + ] + + def _fetch_page(self, base_url, query_params, display_id, page): + + params = { + 'sortColumn': 1, + 'getFolderData': True, + 'includePlaylists': True, + **query_params, + 'page': page, + 'maxResults': self._PAGE_SIZE, + } + + response = self._call_api( + base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}', + data={'queryParameters': params}, fatal=False) + + for result in get_first(response, 'Results', default=[]): + # This could be a video, playlist (or maybe something else) + item_id = result.get('DeliveryID') + yield { + '_type': 'url', + 'id': item_id, + 'title': result.get('SessionName'), + 'url': traverse_obj(result, 'ViewerUrl', 'EmbedUrl', get_all=False) or (base_url + f'/Pages/Viewer.aspx?id={item_id}'), + 'duration': result.get('Duration'), + 'channel': result.get('FolderName'), + 'channel_id': result.get('FolderID'), + } + + for folder in get_first(response, 'Subfolders', default=[]): + folder_id = folder.get('ID') + yield self.url_result( + base_url + f'/Pages/Sessions/List.aspx#folderID="{folder_id}"', + ie_key=PanoptoListIE.ie_key(), video_id=folder_id, title=folder.get('Name')) + + def _extract_folder_metadata(self, base_url, folder_id): + response = self._call_api( + base_url, '/Services/Data.svc/GetFolderInfo', folder_id, + data={'folderID': folder_id}, fatal=False) + return { + 'title': get_first(response, 'Name', default=[]) + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + base_url = mobj.group('base_url') + + query_params = self._parse_fragment(url) + folder_id, display_id = query_params.get('folderID'), 'panopto_list' + + if query_params.get('isSubscriptionsPage'): + display_id = 'subscriptions' + if not query_params.get('subscribableTypes'): + query_params['subscribableTypes'] = [0, 1, 2] + elif query_params.get('isSharedWithMe'): + display_id = 'sharedwithme' + elif folder_id: + display_id = folder_id + + query = query_params.get('query') + if query: + display_id += f': query "{query}"' + + info = { + '_type': 'playlist', + 'id': display_id, + 'title': display_id, + } + if folder_id: + info.update(self._extract_folder_metadata(base_url, folder_id)) + + info['entries'] = OnDemandPagedList( + functools.partial(self._fetch_page, base_url, query_params, display_id), self._PAGE_SIZE) + + return info diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py index e0b2ab982..9d6b82178 100644 --- a/yt_dlp/extractor/peertube.py +++ b/yt_dlp/extractor/peertube.py @@ -87,6 +87,7 @@ class PeerTubeIE(InfoExtractor): maindreieck-tv\.de| mani\.tube| manicphase\.me| + media\.fsfe\.org| media\.gzevd\.de| media\.inno3\.cricket| media\.kaitaia\.life| diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py index b93a02b7d..1a292b8ac 100644 --- a/yt_dlp/extractor/periscope.py +++ b/yt_dlp/extractor/periscope.py @@ -33,7 +33,7 @@ class PeriscopeBaseIE(InfoExtractor): return { 'id': broadcast.get('id') or video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'timestamp': parse_iso8601(broadcast.get('created_at')), 'uploader': uploader, 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py index 402b574a7..b411390e2 100644 --- a/yt_dlp/extractor/pokemon.py +++ b/yt_dlp/extractor/pokemon.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( @@ -138,3 +139,42 @@ class PokemonWatchIE(InfoExtractor): 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episode')), }) + + +class PokemonSoundLibraryIE(InfoExtractor): + _VALID_URL = r'https?://soundlibrary\.pokemon\.co\.jp' + + _TESTS = [{ + 'url': 'https://soundlibrary.pokemon.co.jp/', + 'info_dict': { + 'title': 'Pokémon Diamond and Pearl Sound Tracks', + }, + 'playlist_mincount': 149, + }] + + def _real_extract(self, url): + musicbox_webpage = self._download_webpage( + 'https://soundlibrary.pokemon.co.jp/musicbox', None, + 'Downloading list of songs') + song_titles = [x.group(1) for x in re.finditer(r'<span>([^>]+?)</span><br/>をてもち曲に加えます。', musicbox_webpage)] + song_titles = song_titles[4::2] + + # each songs don't have permalink; instead we return all songs at once + song_entries = [{ + 'id': f'pokemon-soundlibrary-{song_id}', + 'url': f'https://soundlibrary.pokemon.co.jp/api/assets/signing/sounds/wav/{song_id}.wav', + # note: the server always serves MP3 files, despite its extension of the URL above + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', + 'title': song_title, + 'track': song_title, + 'artist': 'Nintendo / Creatures Inc. / GAME FREAK inc.', + 'uploader': 'Pokémon', + 'release_year': 2006, + 'release_date': '20060928', + 'track_number': song_id, + 'album': 'Pokémon Diamond and Pearl', + } for song_id, song_title in enumerate(song_titles, 1)] + + return self.playlist_result(song_entries, playlist_title='Pokémon Diamond and Pearl Sound Tracks') diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py index 79a5b2336..0fd65db4b 100644 --- a/yt_dlp/extractor/rokfin.py +++ b/yt_dlp/extractor/rokfin.py @@ -100,7 +100,7 @@ class RokfinIE(InfoExtractor): video_url, video_id, fatal=False, live=live_status == 'is_live') if not formats: - if metadata.get('premiumPlan'): + if traverse_obj(metadata, 'premiumPlan', 'premium'): self.raise_login_required('This video is only available to premium users', True, method='cookies') elif scheduled: self.raise_no_formats( @@ -129,7 +129,7 @@ class RokfinIE(InfoExtractor): 'tags': traverse_obj(metadata, ('tags', ..., 'title'), expected_type=str_or_none), 'live_status': live_status, 'availability': self._availability( - needs_premium=bool(metadata.get('premiumPlan')), + needs_premium=bool(traverse_obj(metadata, 'premiumPlan', 'premium')), is_private=False, needs_subscription=False, needs_auth=False, is_unlisted=False), # 'comment_count': metadata.get('numComments'), # Data provided by website is wrong '__post_extractor': self.extract_comments(video_id) if video_type == 'post' else None, diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 8146b3ef5..64b8a71b6 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -59,8 +59,16 @@ class SoundcloudEmbedIE(InfoExtractor): class SoundcloudBaseIE(InfoExtractor): + _NETRC_MACHINE = 'soundcloud' + _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' + _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' + _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' + _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' + _access_token = None + _HEADERS = {} def _store_client_id(self, client_id): self._downloader.cache.store('soundcloud', 'client_id', client_id) @@ -103,14 +111,6 @@ class SoundcloudBaseIE(InfoExtractor): self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' self._login() - _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' - _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' - _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' - _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' - _access_token = None - _HEADERS = {} - _NETRC_MACHINE = 'soundcloud' - def _login(self): username, password = self._get_login_info() if username is None: diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index daf1c7450..4bc2263f0 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -67,6 +67,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'series': 'The Witcher', 'season': 'Misc', 'episode_number': 13, + 'episode': 'Episode 13', }, }, { @@ -92,6 +93,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'series': 'Arma 3', 'season': 'Zeus Games', 'episode_number': 3, + 'episode': 'Episode 3', }, }, ] diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 620973a9f..56cc2dcc6 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -15,6 +15,7 @@ from ..compat import ( from ..utils import ( ExtractorError, HEADRequest, + get_first, int_or_none, join_nonempty, LazyList, @@ -816,8 +817,7 @@ class DouyinIE(TikTokIE): render_data = self._parse_json( render_data_json, video_id, transform_source=compat_urllib_parse_unquote) - return self._parse_aweme_video_web( - traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), url) + return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url) class TikTokVMIE(InfoExtractor): diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py new file mode 100644 index 000000000..9832d2398 --- /dev/null +++ b/yt_dlp/extractor/xinpianchang.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + update_url_query, + url_or_none, +) + + +class XinpianchangIE(InfoExtractor): + _VALID_URL = r'https?://www\.xinpianchang\.com/(?P<id>[^/]+?)(?:\D|$)' + IE_NAME = 'xinpianchang' + IE_DESC = 'xinpianchang.com' + _TESTS = [{ + 'url': 'https://www.xinpianchang.com/a11766551', + 'info_dict': { + 'id': 'a11766551', + 'ext': 'mp4', + 'title': '北京2022冬奥会闭幕式再见短片-冰墩墩下班了', + 'description': 'md5:4a730c10639a82190fabe921c0fa4b87', + 'duration': 151, + 'thumbnail': r're:^https?://oss-xpc0\.xpccdn\.com.+/assets/', + 'uploader': '正时文创', + 'uploader_id': 10357277, + 'categories': ['宣传片', '国家城市', '广告', '其他'], + 'keywords': ['北京冬奥会', '冰墩墩', '再见', '告别', '冰墩墩哭了', '感动', '闭幕式', '熄火'] + }, + }, { + 'url': 'https://www.xinpianchang.com/a11762904', + 'info_dict': { + 'id': 'a11762904', + 'ext': 'mp4', + 'title': '冬奥会决胜时刻《法国派出三只鸡?》', + 'description': 'md5:55cb139ef8f48f0c877932d1f196df8b', + 'duration': 136, + 'thumbnail': r're:^https?://oss-xpc0\.xpccdn\.com.+/assets/', + 'uploader': '精品动画', + 'uploader_id': 10858927, + 'categories': ['动画', '三维CG'], + 'keywords': ['France Télévisions', '法国3台', '蠢萌', '冬奥会'] + }, + }, { + 'url': 'https://www.xinpianchang.com/a11779743?from=IndexPick&part=%E7%BC%96%E8%BE%91%E7%B2%BE%E9%80%89&index=2', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id=video_id) + domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) + vid = self.find_value_with_regex(var='vid', webpage=webpage) + app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) + api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) + data = self._download_json(api, video_id=video_id)['data'] + formats, subtitles = [], {} + for k, v in data.get('resource').items(): + if k in ('dash', 'hls'): + v_url = v.get('url') + if not v_url: + continue + if k == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles(v_url, video_id=video_id) + elif k == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(v_url, video_id=video_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif k == 'progressive': + formats.extend([{ + 'url': url_or_none(prog.get('url')), + 'width': int_or_none(prog.get('width')), + 'height': int_or_none(prog.get('height')), + 'ext': 'mp4', + } for prog in v if prog.get('url') or []]) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('title'), + 'description': data.get('description'), + 'duration': int_or_none(data.get('duration')), + 'categories': data.get('categories'), + 'keywords': data.get('keywords'), + 'thumbnail': data.get('cover'), + 'uploader': try_get(data, lambda x: x['owner']['username']), + 'uploader_id': try_get(data, lambda x: x['owner']['id']), + 'formats': formats, + 'subtitles': subtitles, + } + + def find_value_with_regex(self, var, webpage): + return self._search_regex(rf'var\s{var}\s=\s\"(?P<vid>[^\"]+)\"', webpage, name=var) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ee0277fd7..66bb8d9f0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -39,6 +39,7 @@ from ..utils import ( ExtractorError, float_or_none, format_field, + get_first, int_or_none, is_html, join_nonempty, @@ -72,10 +73,6 @@ from ..utils import ( ) -def get_first(obj, keys, **kwargs): - return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) - - # any clients starting with _ cannot be explicity requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -2081,7 +2078,93 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 0, 'channel_follower_count': int }, 'params': {'format': 'mhtml', 'skip_download': True} - } + }, { + # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) + 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4', + 'info_dict': { + 'id': '2NUZ8W2llS4', + 'ext': 'mp4', + 'title': 'The NP that test your phone performance 🙂', + 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d', + 'uploader': 'Leon Nguyen', + 'uploader_id': 'VNSXIII', + 'uploader_url': 'http://www.youtube.com/user/VNSXIII', + 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA', + 'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA', + 'duration': 21, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Gaming'], + 'tags': 'count:23', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'upload_date': '20220103', + 'like_count': int, + 'availability': 'public', + 'channel': 'Leon Nguyen', + 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'channel_follower_count': int + } + }, { + # date text is premiered video, ensure upload date in UTC (published 1641172509) + 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM', + 'info_dict': { + 'id': 'mzZzzBU6lrM', + 'ext': 'mp4', + 'title': 'I Met GeorgeNotFound In Real Life...', + 'description': 'md5:cca98a355c7184e750f711f3a1b22c84', + 'uploader': 'Quackity', + 'uploader_id': 'QuackityHQ', + 'uploader_url': 'http://www.youtube.com/user/QuackityHQ', + 'channel_id': 'UC_8NknAFiyhOUaZqHR3lq3Q', + 'channel_url': 'https://www.youtube.com/channel/UC_8NknAFiyhOUaZqHR3lq3Q', + 'duration': 955, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Entertainment'], + 'tags': 'count:26', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'release_timestamp': 1641172509, + 'release_date': '20220103', + 'upload_date': '20220103', + 'like_count': int, + 'availability': 'public', + 'channel': 'Quackity', + 'thumbnail': 'https://i.ytimg.com/vi/mzZzzBU6lrM/maxresdefault.jpg', + 'channel_follower_count': int + } + }, + { # continuous livestream. Microformat upload date should be preferred. + # Upload date was 2021-06-19 (not UTC), while stream start is 2021-11-27 + 'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU', + 'info_dict': { + 'id': 'kgx4WGK0oNU', + 'title': r're:jazz\/lofi hip hop radio🌱chill beats to relax\/study to \[LIVE 24\/7\] \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA', + 'availability': 'public', + 'age_limit': 0, + 'release_timestamp': 1637975704, + 'upload_date': '20210619', + 'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA', + 'live_status': 'is_live', + 'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg', + 'uploader': '阿鲍Abao', + 'uploader_url': 'http://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA', + 'channel': 'Abao in Tokyo', + 'channel_follower_count': int, + 'release_date': '20211127', + 'tags': 'count:39', + 'categories': ['People & Blogs'], + 'like_count': int, + 'uploader_id': 'UC84whx2xxsiA1gXHXXqKGOA', + 'view_count': int, + 'playable_in_embed': True, + 'description': 'md5:2ef1d002cad520f65825346e2084e49d', + }, + 'params': {'skip_download': True} + }, ] @classmethod @@ -3008,6 +3091,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Some formats may have much smaller duration than others (possibly damaged during encoding) # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) < approx_duration - 10000) + if is_damaged: + self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -3027,7 +3112,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'language': join_nonempty(audio_track.get('id', '').split('.')[0], 'desc' if language_preference < -1 else ''), 'language_preference': language_preference, - 'preference': -10 if is_damaged else None, + # Strictly de-prioritize damaged and 3gp formats + 'preference': -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -3336,9 +3422,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # URL checking if user don't care about getting the best possible thumbnail 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')), 'description': video_description, - 'upload_date': unified_strdate( - get_first(microformats, 'uploadDate') - or search_meta('uploadDate')), 'uploader': get_first(video_details, 'author'), 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, 'uploader_url': owner_profile_url, @@ -3489,6 +3572,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for content in contents: vpir = content.get('videoPrimaryInfoRenderer') if vpir: + info['upload_date'] = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') stl = vpir.get('superTitleLink') if stl: stl = self._get_text(stl) @@ -3567,6 +3651,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_id': 'uploader_id', 'channel_url': 'uploader_url', } + + # The upload date for scheduled and current live streams / premieres in microformats + # is generally the true upload date. Although not in UTC, we will prefer that in this case. + # Note this changes to the published date when the stream/premiere has finished. + # See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139 + if not info.get('upload_date') or info.get('is_live') or info.get('live_status') == 'is_upcoming': + info['upload_date'] = ( + unified_strdate(get_first(microformats, 'uploadDate')) + or unified_strdate(search_meta('uploadDate')) + or info.get('upload_date')) + for to, frm in fallbacks.items(): if not info.get(to): info[to] = info.get(frm) diff --git a/yt_dlp/extractor/zingmp3.py b/yt_dlp/extractor/zingmp3.py index f84ba5cff..419bf30d8 100644 --- a/yt_dlp/extractor/zingmp3.py +++ b/yt_dlp/extractor/zingmp3.py @@ -9,7 +9,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, traverse_obj, - HEADRequest, ) @@ -106,18 +105,17 @@ class ZingMp3BaseIE(InfoExtractor): def _real_initialize(self): if not self.get_param('cookiefile') and not self.get_param('cookiesfrombrowser'): - self._request_webpage(HEADRequest(self._DOMAIN), None, note='Updating cookies') + self._request_webpage(self.get_api_with_signature(name_api=self._SLUG_API['bai-hat'], param={'id': ''}), + None, note='Updating cookies') def _real_extract(self, url): song_id, type_url = self._match_valid_url(url).group('id', 'type') - api = self.get_api_with_signature(name_api=self._SLUG_API[type_url], param={'id': song_id}) - return self._process_data(self._download_json(api, song_id)['data'], song_id, type_url) def get_api_with_signature(self, name_api, param): - sha256 = hashlib.sha256(''.join(f'{k}={v}' for k, v in param.items()).encode('utf-8')).hexdigest() - + param.update({'ctime': '1'}) + sha256 = hashlib.sha256(''.join(f'{i}={param[i]}' for i in sorted(param)).encode('utf-8')).hexdigest() data = { 'apiKey': self._API_KEY, 'sig': hmac.new(self._SECRET_KEY, f'{name_api}{sha256}'.encode('utf-8'), hashlib.sha512).hexdigest(), @@ -149,7 +147,7 @@ class ZingMp3IE(ZingMp3BaseIE): }, }, { 'url': 'https://zingmp3.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', - 'md5': 'e9c972b693aa88301ef981c8151c4343', + 'md5': 'c7f23d971ac1a4f675456ed13c9b9612', 'info_dict': { 'id': 'ZO8ZF7C7', 'title': 'Sương Hoa Đưa Lối', @@ -158,6 +156,22 @@ class ZingMp3IE(ZingMp3BaseIE): 'duration': 207, 'track': 'Sương Hoa Đưa Lối', 'artist': 'K-ICM, RYO', + 'album': 'Sương Hoa Đưa Lối (Single)', + 'album_artist': 'K-ICM, RYO', + }, + }, { + 'url': 'https://zingmp3.vn/bai-hat/Nguoi-Yeu-Toi-Lanh-Lung-Sat-Da-Mr-Siro/ZZ6IW7OU.html', + 'md5': '3e9f7a9bd0d965573dbff8d7c68b629d', + 'info_dict': { + 'id': 'ZZ6IW7OU', + 'title': 'Người Yêu Tôi Lạnh Lùng Sắt Đá', + 'ext': 'mp3', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 303, + 'track': 'Người Yêu Tôi Lạnh Lùng Sắt Đá', + 'artist': 'Mr. Siro', + 'album': 'Người Yêu Tôi Lạnh Lùng Sắt Đá (Single)', + 'album_artist': 'Mr. Siro', }, }, { 'url': 'https://zingmp3.vn/embed/song/ZWZEI76B?start=false', @@ -184,6 +198,14 @@ class ZingMp3AlbumIE(ZingMp3BaseIE): }, 'playlist_count': 9, }, { + 'url': 'https://zingmp3.vn/album/Nhung-Bai-Hat-Hay-Nhat-Cua-Mr-Siro-Mr-Siro/ZWZAEZZD.html', + 'info_dict': { + '_type': 'playlist', + 'id': 'ZWZAEZZD', + 'title': 'Những Bài Hát Hay Nhất Của Mr. Siro', + }, + 'playlist_count': 49, + }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, }, { |